Project

General

Profile

1
package eu.dnetlib.data.transform.xml;
2

    
3
import java.util.HashMap;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Set;
7

    
8
import com.google.common.collect.Lists;
9
import com.google.common.collect.Maps;
10
import com.google.common.collect.Sets;
11
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
12
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
13
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
14
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
15
import eu.dnetlib.data.proto.OafProtos.Oaf;
16
import eu.dnetlib.data.proto.OafProtos.OafEntity;
17
import eu.dnetlib.data.proto.ResultProtos.Result;
18
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
19
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
20
import eu.dnetlib.data.proto.ResultProtos.Result.Metadata.Builder;
21
import eu.dnetlib.data.proto.TypeProtos.Type;
22
import org.apache.commons.lang3.StringUtils;
23
import org.w3c.dom.Element;
24
import org.w3c.dom.NamedNodeMap;
25
import org.w3c.dom.Node;
26
import org.w3c.dom.NodeList;
27

    
28
public class OdfToHbaseXsltFunctions extends CommonDNetXsltFunctions {
29

    
30
	/**
	 * Lookup table translating the access-rights value found in a record into the access-mode code
	 * stored on the instance. Populated once in the static initializer below and only read afterwards
	 * by the code visible in this class.
	 */
	private static final Map<String, String> mappingAccess = new HashMap<>();

	static {
		// info:eu-repo access-rights vocabulary -> access-mode code
		mappingAccess.put("info:eu-repo/semantics/openAccess", "OPEN");
		mappingAccess.put("info:eu-repo/semantics/closedAccess", "CLOSED");
		mappingAccess.put("info:eu-repo/semantics/restrictedAccess", "RESTRICTED");
		mappingAccess.put("info:eu-repo/semantics/embargoedAccess", "EMBARGO");

		// The transformator now maps the access rights into proper values, but it is not certain that it
		// does so for all datasets: already-mapped codes are therefore accepted as they are.
		mappingAccess.put("OPEN", "OPEN");
		mappingAccess.put("CLOSED", "CLOSED");
		mappingAccess.put("RESTRICTED", "RESTRICTED");
		mappingAccess.put("EMBARGO", "EMBARGO");
		mappingAccess.put("OPEN SOURCE", "OPEN SOURCE");
	}
47

    
48
	public static String odfResult(
49
			final String resultId,
50
			final boolean invisible,
51
			final NodeList about,
52
			final NodeList metadata,
53
			final NodeList titles,
54
			final NodeList creators,
55
			final NodeList subjects,
56
			final NodeList publisher,
57
			final NodeList descriptions,
58
			final NodeList dates,
59
			final NodeList dateaccepted,
60
			final NodeList resourceTypes,
61
			final NodeList formats,
62
			final NodeList sizes,
63
			final NodeList languages,
64
			final NodeList cobjcategory,
65
			final NodeList contributors,
66
			final NodeList rights,
67
			final NodeList license,
68
			final NodeList version,
69
			final NodeList pidList,
70
			final String provenance,
71
			final String trust,
72
			final NodeList hostedby,
73
			final NodeList collectedfrom,
74
			final NodeList originalIds,
75
			final String instanceUri,
76
			final String landingPage,
77
			final NodeList distributionlocation,
78
			final NodeList documentationUrl,
79
			final String dateOfCollection,
80
			final String dateOfTransformation) {
81

    
82
		try {
83
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
84

    
85
			final Result.Builder result = Result.newBuilder();
86
			Result.Metadata.Builder metadataProto = Result.Metadata.newBuilder();
87

    
88
			// subject
89
			for (int i = 0; i < subjects.getLength(); i++) {
90
				Node currentNode = subjects.item(i);
91
				NodeList childNodes = currentNode.getChildNodes();
92
				if (childNodes.getLength() > 0) {
93
					String subjectValue = childNodes.item(0).getNodeValue();
94
					String schemeName = "keyword";
95
					String schemeURI ="keyword";
96
					if (currentNode.hasAttributes()) {
97
						NamedNodeMap attributes = currentNode.getAttributes();
98
						Node schemeNameNode = attributes.getNamedItem("subjectScheme");
99
						Node schemeURINode = attributes.getNamedItem("schemeURI");
100
						if(schemeNameNode != null) schemeName = schemeNameNode.getTextContent();
101
						if(schemeURINode != null) schemeURI = schemeURINode.getTextContent();
102
						if(schemeNameNode != null && schemeURINode == null) schemeURI = schemeName;
103
						if(schemeURINode != null && schemeNameNode == null) schemeName = schemeURI;
104
					}
105
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("subject"),
106
							getStructuredProperty(subjectValue, schemeURI, schemeName, "dnet:subject_classification_typologies", "dnet:subject_classification_typologies"));
107
				}
108
			}
109

    
110
			// title
111
			for (int i = 0; i < titles.getLength(); i++) {
112
				Node currentNode = titles.item(i);
113
				NodeList childNodes = currentNode.getChildNodes();
114
				if (childNodes.getLength() > 0) {
115
					String titleValue = childNodes.item(0).getNodeValue();
116
					String classname = "main title";
117
					String classid = "main title";
118
					if (currentNode.hasAttributes()) {
119
						NamedNodeMap attributes = currentNode.getAttributes();
120
						Node titleType = attributes.getNamedItem("titleType");
121

    
122
						if (titleType != null && titleType.getNodeValue().equals("AlternativeTitle")) {
123
							classname = "alternative title";
124
							classid = "alternative title";
125
						}
126
						if (titleType != null && titleType.getNodeValue().equals("Subtitle")) {
127
							classname = "subtitle";
128
							classid = "subtitle";
129
						}
130
						if (titleType != null && titleType.getNodeValue().equals("TranslatedTitle")) {
131
							classname = "translated title";
132
							classid = "translated title";
133
						}
134
					}
135
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("title"),
136
							getStructuredProperty(titleValue, classname, classid, "dnet:dataCite_title", "dnet:dataCite_title"));
137
				}
138
			}
139

    
140
			// creators
141
			for (int i = 0; i < creators.getLength(); i++) {
142
				final Element creator = (Element) creators.item(i);
143
				if (creator != null && creator.hasChildNodes()) {
144

    
145
					final NodeList creatorNames = creator.getElementsByTagName("creatorName");
146
					if (creatorNames.getLength() > 0) {
147
						createAuthor(metadataProto, i, creator, creatorNames);
148
					} else{
149
						//handle authors with namespaceprefix
150
						final NodeList creatorNamesNs = creator.getElementsByTagNameNS("http://datacite.org/schema/kernel-4", "creatorName");
151
						if (creatorNamesNs.getLength() > 0) {
152
							createAuthor(metadataProto, i, creator, creatorNamesNs);
153
						}
154

    
155
					}
156
				}
157
			}
158

    
159
			// description
160
			for (int i = 0; i < descriptions.getLength(); i++) {
161
				Element currentNode = (Element) descriptions.item(i);
162
				if (currentNode != null && currentNode.hasChildNodes()) {
163
					String descriptionValue = currentNode.getChildNodes().item(0).getNodeValue();
164

    
165
					final String descriptionType = currentNode.getAttribute("descriptionType");
166
					if (StringUtils.isNotBlank(descriptionType)) {
167
						switch (descriptionType) {
168
						case "TechnicalInfo":
169
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("tool"), descriptionValue);
170
							break;
171
						case "Abstract":
172
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
173
							break;
174
						case "DistributionForm":
175
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), descriptionValue);
176
							break;
177
						}
178
					} else {
179
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
180
					}
181
				}
182
			}
183

    
184
			// contributors
185
			for (int i = 0; i < contributors.getLength(); i++) {
186
				final Element contributor = (Element) contributors.item(i);
187
				if (contributor != null && contributor.hasChildNodes()) {
188

    
189
					NodeList contributorNames = contributor.getElementsByTagName("contributorName");
190
					if (contributorNames != null) {
191
						Element contributorName = (Element) contributorNames.item(0);
192
						if (contributorName != null) {
193
							final String contributorValue = contributorName.getTextContent();
194
							final String contributorType = contributor.getAttribute("contributorType");
195

    
196
							if (StringUtils.isNotBlank(contributorType)) {
197
								switch (contributorType) {
198
								case "ContactPerson":
199
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactperson"), contributorValue);
200
									break;
201
								case "ContactGroup":
202
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactgroup"), contributorValue);
203
									break;
204
								}
205
							} else {
206
								addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contributor"), contributorValue);
207
							}
208
						}
209
					}
210
				}
211
			}
212

    
213
			// publisher
214
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("publisher"), getFirstItem(publisher));
215

    
216

    
217

    
218
			// dates
219
			for (int i = 0; i < dates.getLength(); i++) {
220
				Node currentNode = dates.item(i);
221
				if (currentNode != null && currentNode.hasAttributes() && currentNode.hasChildNodes()) {
222

    
223
					final NamedNodeMap attributes = currentNode.getAttributes();
224
					final Node dateType = attributes.getNamedItem("dateType") == null ? attributes.getNamedItem("datetype") : null;
225
					if (dateType != null) {
226

    
227
						String dateAttribute = dateType.getNodeValue();
228
						String dateValue = currentNode.getChildNodes().item(0).getNodeValue();
229
						String protoAttribute = "relevantdate";
230
						if ("Accepted".equals(dateAttribute)) {
231
							protoAttribute = "dateofacceptance";
232
						} else if ("Issued".equals(dateAttribute)) {
233
							protoAttribute = "storagedate";
234
						} else if ("Updated".equals(dateAttribute)) {
235
							protoAttribute = "lastmetadataupdate";
236
						} else if ("Available".equals(dateAttribute)) {
237
							protoAttribute = "embargoenddate";
238
						}
239
						if (protoAttribute.equals("relevantdate") == false) {
240
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute), dateValue);
241
						} else {
242
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute),
243
									getStructuredProperty(dateValue, "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date"));
244
						}
245
					}
246
				}
247
			}
248

    
249
			//license
250
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("license"), getFirstItem(license));
251
			// dateofacceptance
252
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
253

    
254
			// size
255
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("size"), getFirstItem(sizes));
256

    
257
			// version
258
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("version"), getFirstItem(version));
259

    
260
			// language
261
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("language"),
262
					setQualifier(getDefaultQualifier("dnet:languages"), Lists.newArrayList(getFirstItem(languages))));
263

    
264
			// resource type
265
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resourcetype"),
266
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(getFirstItem(resourceTypes))));
267

    
268
			// resultType
269
			final String cobjcategoryCode = getFirstItem(cobjcategory);
270
			final String resulttype = getResultType(cobjcategory);
271
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resulttype"), getSimpleQualifier(resulttype, "dnet:result_typologies"));
272

    
273
			switch (resulttype) {
274
			case "software" :
275
				// format
276
				addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("programmingLanguage"),
277
						getSimpleQualifier(getFirstItem(formats), "dnet:programming_languages"));
278
				break;
279
			case "dataset":
280
				for (int i = 0; i < formats.getLength(); i++) {
281
					Node currentNode = formats.item(i);
282
					NodeList childNodes = currentNode.getChildNodes();
283
					if (childNodes.getLength() > 0) {
284
						String formatValue = childNodes.item(0).getNodeValue();
285
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), formatValue);
286
					}
287
				}
288
				break;
289
			case "other":
290

    
291
				break;
292
			}
293

    
294
			// documentationUrl
295
			for (int i = 0; i < documentationUrl.getLength(); i++) {
296
				final Element docUrl = (Element) documentationUrl.item(i);
297
				if (docUrl != null && docUrl.hasChildNodes()) {
298
					final String value = docUrl.getTextContent();
299
					if (StringUtils.isNotBlank(value)) {
300
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("documentationUrl"), value);
301
					}
302
				}
303
			}
304

    
305
			ValueMap values = ValueMap.parseNodeList(metadata);
306
			// contexts
307
			if (values.get("concept") != null) {
308
				for (final eu.dnetlib.data.transform.xml.Element e : values.get("concept")) {
309
					final String id = e.getAttributes().get("id");
310
					if (StringUtils.isBlank(id)) throw new IllegalArgumentException("Context id cannot be blank");
311
					metadataProto.addContext(Context.newBuilder().setId(id));
312
				}
313
			}
314

    
315
			//journal
316
			if (values.containsKey("journal")) {
317
				for (final eu.dnetlib.data.transform.xml.Element journal : values.get("journal")) {
318
					addJournal(metadataProto, journal);
319

    
320
				}
321
			}
322

    
323
			final List<KeyValue> hostedBys = getKeyValues(ValueMap.parseNodeList(hostedby), "hostedby", Type.datasource);
324
			final List<KeyValue> collectedFroms = getKeyValues(ValueMap.parseNodeList(collectedfrom), "collectedfrom", Type.datasource);
325

    
326
			final Instance.Builder instance = Instance.newBuilder();
327

    
328
			String tmpRigths = "UNKNOWN";
329
			final String firstRight = getFirstItem(rights);
330
			if (mappingAccess.containsKey(firstRight)) {
331
				tmpRigths = mappingAccess.get(firstRight);
332
			}
333

    
334
			addField(instance, Instance.getDescriptor().findFieldByName("license"), getFirstItem(license));
335
			addField(instance, Instance.getDescriptor().findFieldByName("hostedby"), hostedBys);
336

    
337
			addField(instance, Instance.getDescriptor().findFieldByName("accessright"),
338
					setQualifier(getDefaultQualifier("dnet:access_modes"), Lists.newArrayList(tmpRigths)));
339

    
340
			addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
341
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(cobjcategoryCode)));
342

    
343
			if (StringUtils.isNotBlank(landingPage)) {
344
				addField(instance, Instance.getDescriptor().findFieldByName("url"), landingPage);
345
			}
346
			//sometimes the instanceUri is blank...
347
			if (StringUtils.isNotBlank(instanceUri)) {
348
				addField(instance, Instance.getDescriptor().findFieldByName("url"), instanceUri);
349
			}
350

    
351
			addField(instance, Instance.getDescriptor().findFieldByName("distributionlocation"), getFirstItem(distributionlocation));
352

    
353
			addField(instance, Instance.getDescriptor().findFieldByName("collectedfrom"), collectedFroms);
354
			addField(instance, Instance.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
355
			if (values.get("refereed") != null) {
356
				addField(instance, Instance.getDescriptor().findFieldByName("refereed"), values.get("refereed").listValues());
357
			}
358

    
359
			result.addInstance(instance);
360

    
361
			List<StructuredProperty> pids = parsePids(pidList);
362

    
363
			// original ids
364
			final Set<String> originalIdList = Sets.newHashSet();
365
			for (int i = 0; i < originalIds.getLength(); i++) {
366
				Node currentNode = originalIds.item(i);
367
				if (currentNode != null && currentNode.hasChildNodes()) {
368
					originalIdList.add(currentNode.getChildNodes().item(0).getNodeValue());
369
				}
370
			}
371

    
372
			OafEntity.Builder entity =
373
					getEntity(Type.result, entityId, collectedFroms, originalIdList, dateOfCollection, dateOfTransformation, pids).setResult(
374
							result.setMetadata(metadataProto));
375

    
376
			entity.setOaiprovenance(getOAIProvenance(about));
377

    
378
			Oaf oaf = getOaf(entity, getDataInfo(invisible, about, provenance, trust, false, false));
379
			return base64(oaf.toByteArray());
380
		} catch (Exception e) {
381
			e.printStackTrace(System.err);
382
			throw new RuntimeException(e);
383
		}
384

    
385
	}
386

    
387
	private static void createAuthor(final Builder metadataProto, final int i, final Element creator, final NodeList creatorNames) {
388
		final Element creatorName = (Element) creatorNames.item(0);
389

    
390
		final Author.Builder author = Author.newBuilder();
391
		author.setRank(i+1);
392
		final String fullname = StringUtils.trim(creatorName.getTextContent());
393

    
394
		author.setFullname(fullname);
395

    
396
		final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
397
		if (p.isAccurate()) {
398
			author.setName(p.getNormalisedFirstName());
399
			author.setSurname(p.getNormalisedSurname());
400
		}
401
		final NodeList nameIdentifiers = creator.getElementsByTagName("nameIdentifier");
402
		if (nameIdentifiers.getLength() > 0) {
403
			final Element nameIdentifier = (Element) nameIdentifiers.item(0);
404
			final String nameIdentifierScheme = nameIdentifier.getAttribute("nameIdentifierScheme");
405
			final String id = StringUtils.trim(nameIdentifier.getTextContent());
406
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(nameIdentifierScheme)) {
407
				author.addPid(getKV(nameIdentifierScheme, id));
408
			}
409
		}
410

    
411
		addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("author"), author);
412
	}
413

    
414
	private static String getResultType(final NodeList cobjcategoryNode) {
415

    
416
		final ValueMap values = ValueMap.parseNodeList(cobjcategoryNode);
417

    
418
		final eu.dnetlib.data.transform.xml.Element cobjcategory = values.get("cobjcategory").stream()
419
				.map(e -> StringUtils.isNotBlank(e.getText()) ? e : new eu.dnetlib.data.transform.xml.Element("0000", e.getAttributes()))
420
				.findFirst()
421
				.orElse(new eu.dnetlib.data.transform.xml.Element("0000", new HashMap<>()));
422

    
423
		final String resulttype = cobjcategory.getAttributeValue("type");
424
		if (StringUtils.isNotBlank(resulttype)) {
425
			return resulttype;
426
		}
427

    
428
		return getDefaultResulttype(cobjcategory);
429
	}
430

    
431
	public static String getFirstItem(final NodeList list) {
432
		String out = "";
433
		if (list != null) {
434

    
435
			if (list.getLength() > 0 && list.item(0).getChildNodes() != null && list.item(0).getChildNodes().getLength() > 0) {
436
				out = list.item(0).getChildNodes().item(0).getNodeValue();
437
			}
438
		}
439
		return out;
440
	}
441

    
442
}
(8-8/10)