Project

General

Profile

1
package eu.dnetlib.data.transform.xml;
2

    
3
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.ResultProtos.Result;
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
import eu.dnetlib.data.proto.TypeProtos.Type;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
26

    
27
public class OdfToHbaseXsltFunctions extends CommonDNetXsltFunctions {
28

    
29
	/**
	 * Lookup table from the access-right values found in the input records to the access codes
	 * used when building the instance "accessright" qualifier. Read-only after class initialisation.
	 */
	private static final Map<String, String> mappingAccess;

	static {
		final Map<String, String> accessCodes = new HashMap<>();

		// info:eu-repo access-rights vocabulary
		accessCodes.put("info:eu-repo/semantics/openAccess", "OPEN");
		accessCodes.put("info:eu-repo/semantics/closedAccess", "CLOSED");
		accessCodes.put("info:eu-repo/semantics/restrictedAccess", "RESTRICTED");
		accessCodes.put("info:eu-repo/semantics/embargoedAccess", "EMBARGO");

		// The upstream transformation may already have mapped the access rights to the final codes
		// (not confirmed for all datasets), so the codes themselves are accepted unchanged.
		accessCodes.put("OPEN", "OPEN");
		accessCodes.put("CLOSED", "CLOSED");
		accessCodes.put("RESTRICTED", "RESTRICTED");
		accessCodes.put("EMBARGO", "EMBARGO");
		accessCodes.put("OPEN SOURCE", "OPEN SOURCE");

		mappingAccess = Collections.unmodifiableMap(accessCodes);
	}
46

    
47
	public static String odfResult(
48
			final String resultId,
49
			final boolean invisible,
50
			final NodeList about,
51
			final NodeList metadata,
52
			final NodeList titles,
53
			final NodeList creators,
54
			final NodeList subjects,
55
			final NodeList publisher,
56
			final NodeList descriptions,
57
			final NodeList dates,
58
			final NodeList dateaccepted,
59
			final NodeList resourceTypes,
60
			final NodeList formats,
61
			final NodeList sizes,
62
			final NodeList languages,
63
			final NodeList cobjcategory,
64
			final NodeList contributors,
65
			final NodeList rights,
66
			final NodeList license,
67
			final NodeList version,
68
			final NodeList pidList,
69
			final String provenance,
70
			final String trust,
71
			final NodeList hostedby,
72
			final NodeList collectedfrom,
73
			final NodeList originalIds,
74
			final String instanceUri,
75
			final String landingPage,
76
			final NodeList distributionlocation,
77
			final NodeList documentationUrl,
78
			final String dateOfCollection,
79
			final String dateOfTransformation) {
80

    
81
		try {
82
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
83

    
84
			final Result.Builder result = Result.newBuilder();
85
			Result.Metadata.Builder metadataProto = Result.Metadata.newBuilder();
86

    
87
			// subject
88
			for (int i = 0; i < subjects.getLength(); i++) {
89
				Node currentNode = subjects.item(i);
90
				NodeList childNodes = currentNode.getChildNodes();
91
				if (childNodes.getLength() > 0) {
92
					String subjectValue = childNodes.item(0).getNodeValue();
93
					String schemeName = "keyword";
94
					String schemeURI ="keyword";
95
					if (currentNode.hasAttributes()) {
96
						NamedNodeMap attributes = currentNode.getAttributes();
97
						Node schemeNameNode = attributes.getNamedItem("subjectScheme");
98
						Node schemeURINode = attributes.getNamedItem("schemeURI");
99
						if(schemeNameNode != null) schemeName = schemeNameNode.getTextContent();
100
						if(schemeURINode != null) schemeURI = schemeURINode.getTextContent();
101
						if(schemeNameNode != null && schemeURINode == null) schemeURI = schemeName;
102
						if(schemeURINode != null && schemeNameNode == null) schemeName = schemeURI;
103
					}
104
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("subject"),
105
							getStructuredProperty(subjectValue, schemeURI, schemeName, "dnet:subject_classification_typologies", "dnet:subject_classification_typologies"));
106
				}
107
			}
108

    
109
			// title
110
			for (int i = 0; i < titles.getLength(); i++) {
111
				Node currentNode = titles.item(i);
112
				NodeList childNodes = currentNode.getChildNodes();
113
				if (childNodes.getLength() > 0) {
114
					String titleValue = childNodes.item(0).getNodeValue();
115
					String classname = "main title";
116
					String classid = "main title";
117
					if (currentNode.hasAttributes()) {
118
						NamedNodeMap attributes = currentNode.getAttributes();
119
						Node titleType = attributes.getNamedItem("titleType");
120

    
121
						if (titleType != null && titleType.getNodeValue().equals("AlternativeTitle")) {
122
							classname = "alternative title";
123
							classid = "alternative title";
124
						}
125
						if (titleType != null && titleType.getNodeValue().equals("Subtitle")) {
126
							classname = "subtitle";
127
							classid = "subtitle";
128
						}
129
						if (titleType != null && titleType.getNodeValue().equals("TranslatedTitle")) {
130
							classname = "translated title";
131
							classid = "translated title";
132
						}
133
					}
134
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("title"),
135
							getStructuredProperty(titleValue, classname, classid, "dnet:dataCite_title", "dnet:dataCite_title"));
136
				}
137
			}
138

    
139
			// creators
140
			for (int i = 0; i < creators.getLength(); i++) {
141
				final Element creator = (Element) creators.item(i);
142
				if (creator != null && creator.hasChildNodes()) {
143

    
144
					final NodeList creatorNames = creator.getElementsByTagName("creatorName");
145
					if (creatorNames.getLength() > 0) {
146
						final Element creatorName = (Element) creatorNames.item(0);
147

    
148
						final Author.Builder author = Author.newBuilder();
149
						author.setRank(i+1);
150
						final String fullname = StringUtils.trim(creatorName.getTextContent());
151

    
152
						author.setFullname(fullname);
153

    
154
						final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
155
						if (p.isAccurate()) {
156
							author.setName(p.getNormalisedFirstName());
157
							author.setSurname(p.getNormalisedSurname());
158
						}
159
						final NodeList nameIdentifiers = creator.getElementsByTagName("nameIdentifier");
160
						if (nameIdentifiers.getLength() > 0) {
161
							final Element nameIdentifier = (Element) nameIdentifiers.item(0);
162
							final String nameIdentifierScheme = nameIdentifier.getAttribute("nameIdentifierScheme");
163
							final String id = StringUtils.trim(nameIdentifier.getTextContent());
164
							if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(nameIdentifierScheme)) {
165
								author.addPid(getKV(nameIdentifierScheme, id));
166
							}
167
						}
168

    
169
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("author"), author);
170
					}
171
				}
172
			}
173

    
174
			// description
175
			for (int i = 0; i < descriptions.getLength(); i++) {
176
				Element currentNode = (Element) descriptions.item(i);
177
				if (currentNode != null && currentNode.hasChildNodes()) {
178
					String descriptionValue = currentNode.getChildNodes().item(0).getNodeValue();
179

    
180
					final String descriptionType = currentNode.getAttribute("descriptionType");
181
					if (StringUtils.isNotBlank(descriptionType)) {
182
						switch (descriptionType) {
183
						case "TechnicalInfo":
184
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("tool"), descriptionValue);
185
							break;
186
						case "Abstract":
187
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
188
							break;
189
						case "DistributionForm":
190
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), descriptionValue);
191
							break;
192
						}
193
					} else {
194
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
195
					}
196
				}
197
			}
198

    
199
			// contributors
200
			for (int i = 0; i < contributors.getLength(); i++) {
201
				final Element contributor = (Element) contributors.item(i);
202
				if (contributor != null && contributor.hasChildNodes()) {
203

    
204
					NodeList contributorNames = contributor.getElementsByTagName("contributorName");
205
					if (contributorNames != null) {
206
						Element contributorName = (Element) contributorNames.item(0);
207
						if (contributorName != null) {
208
							final String contributorValue = contributorName.getTextContent();
209
							final String contributorType = contributor.getAttribute("contributorType");
210

    
211
							if (StringUtils.isNotBlank(contributorType)) {
212
								switch (contributorType) {
213
								case "ContactPerson":
214
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactperson"), contributorValue);
215
									break;
216
								case "ContactGroup":
217
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactgroup"), contributorValue);
218
									break;
219
								}
220
							} else {
221
								addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contributor"), contributorValue);
222
							}
223
						}
224
					}
225
				}
226
			}
227

    
228
			// publisher
229
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("publisher"), getFirstItem(publisher));
230

    
231

    
232

    
233
			// dates
234
			for (int i = 0; i < dates.getLength(); i++) {
235
				Node currentNode = dates.item(i);
236
				if (currentNode != null && currentNode.hasAttributes() && currentNode.hasChildNodes()) {
237
					String dateAttribute = currentNode.getAttributes().getNamedItem("dateType").getNodeValue();
238
					String dateValue = currentNode.getChildNodes().item(0).getNodeValue();
239
					String protoAttribute = "relevantdate";
240
					if ("Accepted".equals(dateAttribute)) {
241
						protoAttribute = "dateofacceptance";
242
					} else if ("Issued".equals(dateAttribute)) {
243
						protoAttribute = "storagedate";
244
					} else if ("Updated".equals(dateAttribute)) {
245
						protoAttribute = "lastmetadataupdate";
246
					} else if ("Available".equals(dateAttribute)) {
247
						protoAttribute = "embargoenddate";
248
					}
249
					if (protoAttribute.equals("relevantdate") == false) {
250
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute), dateValue);
251
					} else {
252
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute),
253
								getStructuredProperty(dateValue, "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date"));
254
					}
255
				}
256
			}
257

    
258
			//license
259
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("license"), getFirstItem(license));
260
			// dateofacceptance
261
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
262

    
263
			// size
264
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("size"), getFirstItem(sizes));
265

    
266
			// version
267
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("version"), getFirstItem(version));
268

    
269
			// language
270
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("language"),
271
					setQualifier(getDefaultQualifier("dnet:languages"), Lists.newArrayList(getFirstItem(languages))));
272

    
273
			// resource type
274
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resourcetype"),
275
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(getFirstItem(resourceTypes))));
276

    
277
			// resultType
278
			final String cobjcategoryCode = getFirstItem(cobjcategory);
279
			final String resulttype = getResultType(cobjcategory);
280
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resulttype"), getSimpleQualifier(resulttype, "dnet:result_typologies"));
281

    
282
			switch (resulttype) {
283
			case "software" :
284
				// format
285
				addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("programmingLanguage"),
286
						getSimpleQualifier(getFirstItem(formats), "dnet:programming_languages"));
287
				break;
288
			case "dataset":
289
				for (int i = 0; i < formats.getLength(); i++) {
290
					Node currentNode = formats.item(i);
291
					NodeList childNodes = currentNode.getChildNodes();
292
					if (childNodes.getLength() > 0) {
293
						String formatValue = childNodes.item(0).getNodeValue();
294
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), formatValue);
295
					}
296
				}
297
				break;
298
			case "other":
299

    
300
				break;
301
			}
302

    
303
			// documentationUrl
304
			for (int i = 0; i < documentationUrl.getLength(); i++) {
305
				final Element docUrl = (Element) documentationUrl.item(i);
306
				if (docUrl != null && docUrl.hasChildNodes()) {
307
					final String value = docUrl.getTextContent();
308
					if (StringUtils.isNotBlank(value)) {
309
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("documentationUrl"), value);
310
					}
311
				}
312
			}
313

    
314
			ValueMap values = ValueMap.parseNodeList(metadata);
315
			// contexts
316
			if (values.get("concept") != null) {
317
				for (final eu.dnetlib.data.transform.xml.Element e : values.get("concept")) {
318
					final String id = e.getAttributes().get("id");
319
					if (StringUtils.isBlank(id)) throw new IllegalArgumentException("Context id cannot be blank");
320
					metadataProto.addContext(Context.newBuilder().setId(id));
321
				}
322
			}
323

    
324
			//journal
325
			if (values.containsKey("journal")) {
326
				for (final eu.dnetlib.data.transform.xml.Element journal : values.get("journal")) {
327
					addJournal(metadataProto, journal);
328

    
329
				}
330
			}
331

    
332
			final List<KeyValue> hostedBys = getKeyValues(ValueMap.parseNodeList(hostedby), "hostedby", Type.datasource);
333
			final List<KeyValue> collectedFroms = getKeyValues(ValueMap.parseNodeList(collectedfrom), "collectedfrom", Type.datasource);
334

    
335
			final Instance.Builder instance = Instance.newBuilder();
336

    
337
			String tmpRigths = "UNKNOWN";
338
			final String firstRight = getFirstItem(rights);
339
			if (mappingAccess.containsKey(firstRight)) {
340
				tmpRigths = mappingAccess.get(firstRight);
341
			}
342

    
343
			addField(instance, Instance.getDescriptor().findFieldByName("license"), getFirstItem(license));
344
			addField(instance, Instance.getDescriptor().findFieldByName("hostedby"), hostedBys);
345

    
346
			addField(instance, Instance.getDescriptor().findFieldByName("accessright"),
347
					setQualifier(getDefaultQualifier("dnet:access_modes"), Lists.newArrayList(tmpRigths)));
348

    
349
			addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
350
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(cobjcategoryCode)));
351

    
352
			if (StringUtils.isNotBlank(landingPage)) {
353
				addField(instance, Instance.getDescriptor().findFieldByName("url"), landingPage);
354
			}
355
			//sometimes the instanceUri is blank...
356
			if (StringUtils.isNotBlank(instanceUri)) {
357
				addField(instance, Instance.getDescriptor().findFieldByName("url"), instanceUri);
358
			}
359

    
360
			addField(instance, Instance.getDescriptor().findFieldByName("distributionlocation"), getFirstItem(distributionlocation));
361

    
362
			addField(instance, Instance.getDescriptor().findFieldByName("collectedfrom"), collectedFroms);
363
			addField(instance, Instance.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
364

    
365
			result.addInstance(instance);
366

    
367
			List<StructuredProperty> pids = parsePids(pidList);
368

    
369
			// original ids
370
			final Set<String> originalIdList = Sets.newHashSet();
371
			for (int i = 0; i < originalIds.getLength(); i++) {
372
				Node currentNode = originalIds.item(i);
373
				if (currentNode != null && currentNode.hasChildNodes()) {
374
					originalIdList.add(currentNode.getChildNodes().item(0).getNodeValue());
375
				}
376
			}
377

    
378
			OafEntity.Builder entity =
379
					getEntity(Type.result, entityId, collectedFroms, originalIdList, dateOfCollection, dateOfTransformation, pids).setResult(
380
							result.setMetadata(metadataProto));
381

    
382
			entity.setOaiprovenance(getOAIProvenance(about));
383

    
384
			Oaf oaf = getOaf(entity, getDataInfo(invisible, about, provenance, trust, false, false));
385
			return base64(oaf.toByteArray());
386
		} catch (Exception e) {
387
			e.printStackTrace(System.err);
388
			throw new RuntimeException(e);
389
		}
390

    
391
	}
392

    
393
	private static String getResultType(final NodeList cobjcategoryNode) {
394

    
395
		final ValueMap values = ValueMap.parseNodeList(cobjcategoryNode);
396

    
397
		final eu.dnetlib.data.transform.xml.Element cobjcategory = values.get("cobjcategory").stream()
398
				.map(e -> StringUtils.isNotBlank(e.getText()) ? e : new eu.dnetlib.data.transform.xml.Element("0000", e.getAttributes()))
399
				.findFirst()
400
				.orElse(new eu.dnetlib.data.transform.xml.Element("0000", new HashMap<>()));
401

    
402
		final String resulttype = cobjcategory.getAttributeValue("type");
403
		if (StringUtils.isNotBlank(resulttype)) {
404
			return resulttype;
405
		}
406

    
407
		return getDefaultResulttype(cobjcategory);
408
	}
409

    
410
	public static String getFirstItem(final NodeList list) {
411
		String out = "";
412
		if (list != null) {
413

    
414
			if (list.getLength() > 0 && list.item(0).getChildNodes() != null && list.item(0).getChildNodes().getLength() > 0) {
415
				out = list.item(0).getChildNodes().item(0).getNodeValue();
416
			}
417
		}
418
		return out;
419
	}
420

    
421
}
(8-8/10)