Project

General

Profile

1
package eu.dnetlib.data.transform.xml;
2

    
3
import java.util.HashMap;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Set;
7

    
8
import com.google.common.collect.Lists;
9
import com.google.common.collect.Maps;
10
import com.google.common.collect.Sets;
11
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
12
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
13
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
14
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
15
import eu.dnetlib.data.proto.OafProtos.Oaf;
16
import eu.dnetlib.data.proto.OafProtos.OafEntity;
17
import eu.dnetlib.data.proto.ResultProtos.Result;
18
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
19
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
20
import eu.dnetlib.data.proto.ResultProtos.Result.Metadata.Builder;
21
import eu.dnetlib.data.proto.TypeProtos.Type;
22
import org.apache.commons.lang3.StringUtils;
23
import org.w3c.dom.Element;
24
import org.w3c.dom.NamedNodeMap;
25
import org.w3c.dom.Node;
26
import org.w3c.dom.NodeList;
27

    
28
public class OdfToHbaseXsltFunctions extends CommonDNetXsltFunctions {
29

    
30
	private static Map<String, String> mappingAccess = Maps.newHashMap();
31

    
32
	static {
33

    
34
		mappingAccess.put("info:eu-repo/semantics/openAccess", "OPEN");
35
		mappingAccess.put("info:eu-repo/semantics/closedAccess", "CLOSED");
36
		mappingAccess.put("info:eu-repo/semantics/restrictedAccess", "RESTRICTED");
37
		mappingAccess.put("info:eu-repo/semantics/embargoedAccess", "EMBARGO");
38

    
39
		// Transformator now maps the access rights into proper values, not sure if it does for all datasets.
40
		mappingAccess.put("OPEN", "OPEN");
41
		mappingAccess.put("CLOSED", "CLOSED");
42
		mappingAccess.put("RESTRICTED", "RESTRICTED");
43
		mappingAccess.put("EMBARGO", "EMBARGO");
44
		mappingAccess.put("OPEN SOURCE", "OPEN SOURCE");
45

    
46
	}
47

    
48
	public static String odfResult(
49
			final String resultId,
50
			final boolean invisible,
51
			final NodeList about,
52
			final NodeList metadata,
53
			final NodeList titles,
54
			final NodeList creators,
55
			final NodeList subjects,
56
			final NodeList publisher,
57
			final NodeList descriptions,
58
			final NodeList dates,
59
			final NodeList dateaccepted,
60
			final NodeList resourceTypes,
61
			final NodeList formats,
62
			final NodeList sizes,
63
			final NodeList languages,
64
			final NodeList cobjcategory,
65
			final NodeList contributors,
66
			final NodeList rights,
67
			final NodeList license,
68
			final NodeList version,
69
			final NodeList pidList,
70
			final String provenance,
71
			final String trust,
72
			final NodeList hostedby,
73
			final NodeList collectedfrom,
74
			final NodeList originalIds,
75
			final String instanceUri,
76
			final String landingPage,
77
			final NodeList distributionlocation,
78
			final NodeList documentationUrl,
79
			final String dateOfCollection,
80
			final String dateOfTransformation) {
81

    
82
		try {
83
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
84

    
85
			final Result.Builder result = Result.newBuilder();
86
			Result.Metadata.Builder metadataProto = Result.Metadata.newBuilder();
87

    
88
			// subject
89
			for (int i = 0; i < subjects.getLength(); i++) {
90
				Node currentNode = subjects.item(i);
91
				NodeList childNodes = currentNode.getChildNodes();
92
				if (childNodes.getLength() > 0) {
93
					String subjectValue = childNodes.item(0).getNodeValue();
94
					String schemeName = "keyword";
95
					String schemeURI ="keyword";
96
					if (currentNode.hasAttributes()) {
97
						NamedNodeMap attributes = currentNode.getAttributes();
98
						Node schemeNameNode = attributes.getNamedItem("subjectScheme");
99
						Node schemeURINode = attributes.getNamedItem("schemeURI");
100
						if(schemeNameNode != null) schemeName = schemeNameNode.getTextContent();
101
						if(schemeURINode != null) schemeURI = schemeURINode.getTextContent();
102
						if(schemeNameNode != null && schemeURINode == null) schemeURI = schemeName;
103
						if(schemeURINode != null && schemeNameNode == null) schemeName = schemeURI;
104
					}
105
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("subject"),
106
							getStructuredProperty(subjectValue, schemeURI, schemeName, "dnet:subject_classification_typologies", "dnet:subject_classification_typologies"));
107
				}
108
			}
109

    
110
			// title
111
			for (int i = 0; i < titles.getLength(); i++) {
112
				Node currentNode = titles.item(i);
113
				NodeList childNodes = currentNode.getChildNodes();
114
				if (childNodes.getLength() > 0) {
115
					String titleValue = childNodes.item(0).getNodeValue();
116
					String classname = "main title";
117
					String classid = "main title";
118
					if (currentNode.hasAttributes()) {
119
						NamedNodeMap attributes = currentNode.getAttributes();
120
						Node titleType = attributes.getNamedItem("titleType");
121

    
122
						if (titleType != null && titleType.getNodeValue().equals("AlternativeTitle")) {
123
							classname = "alternative title";
124
							classid = "alternative title";
125
						}
126
						if (titleType != null && titleType.getNodeValue().equals("Subtitle")) {
127
							classname = "subtitle";
128
							classid = "subtitle";
129
						}
130
						if (titleType != null && titleType.getNodeValue().equals("TranslatedTitle")) {
131
							classname = "translated title";
132
							classid = "translated title";
133
						}
134
					}
135
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("title"),
136
							getStructuredProperty(titleValue, classname, classid, "dnet:dataCite_title", "dnet:dataCite_title"));
137
				}
138
			}
139

    
140
			// creators
141
			for (int i = 0; i < creators.getLength(); i++) {
142
				final Element creator = (Element) creators.item(i);
143
				if (creator != null && creator.hasChildNodes()) {
144

    
145
					final NodeList creatorNames = creator.getElementsByTagName("creatorName");
146
					if (creatorNames.getLength() > 0) {
147
						createAuthor(metadataProto, i, creator, creatorNames);
148
					} else{
149
						//handle authors with namespaceprefix
150
						final NodeList creatorNamesNs = creator.getElementsByTagNameNS("http://datacite.org/schema/kernel-4", "creatorName");
151
						if (creatorNamesNs.getLength() > 0) {
152
							createAuthor(metadataProto, i, creator, creatorNamesNs);
153
						}
154

    
155
					}
156
				}
157
			}
158

    
159
			// description
160
			for (int i = 0; i < descriptions.getLength(); i++) {
161
				Element currentNode = (Element) descriptions.item(i);
162
				if (currentNode != null && currentNode.hasChildNodes()) {
163
					String descriptionValue = currentNode.getChildNodes().item(0).getNodeValue();
164

    
165
					final String descriptionType = currentNode.getAttribute("descriptionType");
166
					if (StringUtils.isNotBlank(descriptionType)) {
167
						switch (descriptionType) {
168
						case "TechnicalInfo":
169
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("tool"), descriptionValue);
170
							break;
171
						case "Abstract":
172
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
173
							break;
174
						case "DistributionForm":
175
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), descriptionValue);
176
							break;
177
						}
178
					} else {
179
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
180
					}
181
				}
182
			}
183

    
184
			// contributors
185
			for (int i = 0; i < contributors.getLength(); i++) {
186
				final Element contributor = (Element) contributors.item(i);
187
				if (contributor != null && contributor.hasChildNodes()) {
188

    
189
					NodeList contributorNames = contributor.getElementsByTagName("contributorName");
190
					if (contributorNames != null) {
191
						Element contributorName = (Element) contributorNames.item(0);
192
						if (contributorName != null) {
193
							final String contributorValue = contributorName.getTextContent();
194
							final String contributorType = contributor.getAttribute("contributorType");
195

    
196
							if (StringUtils.isNotBlank(contributorType)) {
197
								switch (contributorType) {
198
								case "ContactPerson":
199
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactperson"), contributorValue);
200
									break;
201
								case "ContactGroup":
202
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactgroup"), contributorValue);
203
									break;
204
								}
205
							} else {
206
								addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contributor"), contributorValue);
207
							}
208
						}
209
					}
210
				}
211
			}
212

    
213
			// publisher
214
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("publisher"), getFirstItem(publisher));
215

    
216

    
217

    
218
			// dates
219
			for (int i = 0; i < dates.getLength(); i++) {
220
				Node currentNode = dates.item(i);
221
				if (currentNode != null && currentNode.hasAttributes() && currentNode.hasChildNodes()) {
222
					String dateAttribute = currentNode.getAttributes().getNamedItem("dateType").getNodeValue();
223
					String dateValue = currentNode.getChildNodes().item(0).getNodeValue();
224
					String protoAttribute = "relevantdate";
225
					if ("Accepted".equals(dateAttribute)) {
226
						protoAttribute = "dateofacceptance";
227
					} else if ("Issued".equals(dateAttribute)) {
228
						protoAttribute = "storagedate";
229
					} else if ("Updated".equals(dateAttribute)) {
230
						protoAttribute = "lastmetadataupdate";
231
					} else if ("Available".equals(dateAttribute)) {
232
						protoAttribute = "embargoenddate";
233
					}
234
					if (protoAttribute.equals("relevantdate") == false) {
235
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute), dateValue);
236
					} else {
237
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute),
238
								getStructuredProperty(dateValue, "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date"));
239
					}
240
				}
241
			}
242

    
243
			//license
244
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("license"), getFirstItem(license));
245
			// dateofacceptance
246
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
247

    
248
			// size
249
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("size"), getFirstItem(sizes));
250

    
251
			// version
252
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("version"), getFirstItem(version));
253

    
254
			// language
255
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("language"),
256
					setQualifier(getDefaultQualifier("dnet:languages"), Lists.newArrayList(getFirstItem(languages))));
257

    
258
			// resource type
259
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resourcetype"),
260
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(getFirstItem(resourceTypes))));
261

    
262
			// resultType
263
			final String cobjcategoryCode = getFirstItem(cobjcategory);
264
			final String resulttype = getResultType(cobjcategory);
265
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resulttype"), getSimpleQualifier(resulttype, "dnet:result_typologies"));
266

    
267
			switch (resulttype) {
268
			case "software" :
269
				// format
270
				addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("programmingLanguage"),
271
						getSimpleQualifier(getFirstItem(formats), "dnet:programming_languages"));
272
				break;
273
			case "dataset":
274
				for (int i = 0; i < formats.getLength(); i++) {
275
					Node currentNode = formats.item(i);
276
					NodeList childNodes = currentNode.getChildNodes();
277
					if (childNodes.getLength() > 0) {
278
						String formatValue = childNodes.item(0).getNodeValue();
279
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), formatValue);
280
					}
281
				}
282
				break;
283
			case "other":
284

    
285
				break;
286
			}
287

    
288
			// documentationUrl
289
			for (int i = 0; i < documentationUrl.getLength(); i++) {
290
				final Element docUrl = (Element) documentationUrl.item(i);
291
				if (docUrl != null && docUrl.hasChildNodes()) {
292
					final String value = docUrl.getTextContent();
293
					if (StringUtils.isNotBlank(value)) {
294
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("documentationUrl"), value);
295
					}
296
				}
297
			}
298

    
299
			ValueMap values = ValueMap.parseNodeList(metadata);
300
			// contexts
301
			if (values.get("concept") != null) {
302
				for (final eu.dnetlib.data.transform.xml.Element e : values.get("concept")) {
303
					final String id = e.getAttributes().get("id");
304
					if (StringUtils.isBlank(id)) throw new IllegalArgumentException("Context id cannot be blank");
305
					metadataProto.addContext(Context.newBuilder().setId(id));
306
				}
307
			}
308

    
309
			//journal
310
			if (values.containsKey("journal")) {
311
				for (final eu.dnetlib.data.transform.xml.Element journal : values.get("journal")) {
312
					addJournal(metadataProto, journal);
313

    
314
				}
315
			}
316

    
317
			final List<KeyValue> hostedBys = getKeyValues(ValueMap.parseNodeList(hostedby), "hostedby", Type.datasource);
318
			final List<KeyValue> collectedFroms = getKeyValues(ValueMap.parseNodeList(collectedfrom), "collectedfrom", Type.datasource);
319

    
320
			final Instance.Builder instance = Instance.newBuilder();
321

    
322
			String tmpRigths = "UNKNOWN";
323
			final String firstRight = getFirstItem(rights);
324
			if (mappingAccess.containsKey(firstRight)) {
325
				tmpRigths = mappingAccess.get(firstRight);
326
			}
327

    
328
			addField(instance, Instance.getDescriptor().findFieldByName("license"), getFirstItem(license));
329
			addField(instance, Instance.getDescriptor().findFieldByName("hostedby"), hostedBys);
330

    
331
			addField(instance, Instance.getDescriptor().findFieldByName("accessright"),
332
					setQualifier(getDefaultQualifier("dnet:access_modes"), Lists.newArrayList(tmpRigths)));
333

    
334
			addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
335
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(cobjcategoryCode)));
336

    
337
			if (StringUtils.isNotBlank(landingPage)) {
338
				addField(instance, Instance.getDescriptor().findFieldByName("url"), landingPage);
339
			}
340
			//sometimes the instanceUri is blank...
341
			if (StringUtils.isNotBlank(instanceUri)) {
342
				addField(instance, Instance.getDescriptor().findFieldByName("url"), instanceUri);
343
			}
344

    
345
			addField(instance, Instance.getDescriptor().findFieldByName("distributionlocation"), getFirstItem(distributionlocation));
346

    
347
			addField(instance, Instance.getDescriptor().findFieldByName("collectedfrom"), collectedFroms);
348
			addField(instance, Instance.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
349

    
350
			result.addInstance(instance);
351

    
352
			List<StructuredProperty> pids = parsePids(pidList);
353

    
354
			// original ids
355
			final Set<String> originalIdList = Sets.newHashSet();
356
			for (int i = 0; i < originalIds.getLength(); i++) {
357
				Node currentNode = originalIds.item(i);
358
				if (currentNode != null && currentNode.hasChildNodes()) {
359
					originalIdList.add(currentNode.getChildNodes().item(0).getNodeValue());
360
				}
361
			}
362

    
363
			OafEntity.Builder entity =
364
					getEntity(Type.result, entityId, collectedFroms, originalIdList, dateOfCollection, dateOfTransformation, pids).setResult(
365
							result.setMetadata(metadataProto));
366

    
367
			entity.setOaiprovenance(getOAIProvenance(about));
368

    
369
			Oaf oaf = getOaf(entity, getDataInfo(invisible, about, provenance, trust, false, false));
370
			return base64(oaf.toByteArray());
371
		} catch (Exception e) {
372
			e.printStackTrace(System.err);
373
			throw new RuntimeException(e);
374
		}
375

    
376
	}
377

    
378
	private static void createAuthor(final Builder metadataProto, final int i, final Element creator, final NodeList creatorNames) {
379
		final Element creatorName = (Element) creatorNames.item(0);
380

    
381
		final Author.Builder author = Author.newBuilder();
382
		author.setRank(i+1);
383
		final String fullname = StringUtils.trim(creatorName.getTextContent());
384

    
385
		author.setFullname(fullname);
386

    
387
		final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
388
		if (p.isAccurate()) {
389
			author.setName(p.getNormalisedFirstName());
390
			author.setSurname(p.getNormalisedSurname());
391
		}
392
		final NodeList nameIdentifiers = creator.getElementsByTagName("nameIdentifier");
393
		if (nameIdentifiers.getLength() > 0) {
394
			final Element nameIdentifier = (Element) nameIdentifiers.item(0);
395
			final String nameIdentifierScheme = nameIdentifier.getAttribute("nameIdentifierScheme");
396
			final String id = StringUtils.trim(nameIdentifier.getTextContent());
397
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(nameIdentifierScheme)) {
398
				author.addPid(getKV(nameIdentifierScheme, id));
399
			}
400
		}
401

    
402
		addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("author"), author);
403
	}
404

    
405
	private static String getResultType(final NodeList cobjcategoryNode) {
406

    
407
		final ValueMap values = ValueMap.parseNodeList(cobjcategoryNode);
408

    
409
		final eu.dnetlib.data.transform.xml.Element cobjcategory = values.get("cobjcategory").stream()
410
				.map(e -> StringUtils.isNotBlank(e.getText()) ? e : new eu.dnetlib.data.transform.xml.Element("0000", e.getAttributes()))
411
				.findFirst()
412
				.orElse(new eu.dnetlib.data.transform.xml.Element("0000", new HashMap<>()));
413

    
414
		final String resulttype = cobjcategory.getAttributeValue("type");
415
		if (StringUtils.isNotBlank(resulttype)) {
416
			return resulttype;
417
		}
418

    
419
		return getDefaultResulttype(cobjcategory);
420
	}
421

    
422
	public static String getFirstItem(final NodeList list) {
423
		String out = "";
424
		if (list != null) {
425

    
426
			if (list.getLength() > 0 && list.item(0).getChildNodes() != null && list.item(0).getChildNodes().getLength() > 0) {
427
				out = list.item(0).getChildNodes().item(0).getNodeValue();
428
			}
429
		}
430
		return out;
431
	}
432

    
433
}
(8-8/10)