Project

General

Profile

1
package eu.dnetlib.data.transform.xml;
2

    
3
import java.util.HashMap;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Set;
7

    
8
import com.google.common.collect.Lists;
9
import com.google.common.collect.Maps;
10
import com.google.common.collect.Sets;
11
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
12
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
13
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
14
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
15
import eu.dnetlib.data.proto.OafProtos.Oaf;
16
import eu.dnetlib.data.proto.OafProtos.OafEntity;
17
import eu.dnetlib.data.proto.ResultProtos.Result;
18
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
19
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
20
import eu.dnetlib.data.proto.TypeProtos.Type;
21
import org.apache.commons.lang3.StringUtils;
22
import org.w3c.dom.Element;
23
import org.w3c.dom.NamedNodeMap;
24
import org.w3c.dom.Node;
25
import org.w3c.dom.NodeList;
26

    
27
public class OdfToHbaseXsltFunctions extends CommonDNetXsltFunctions {
28

    
29
	/**
	 * Lookup table used to translate the access-right value found in a record into the access mode
	 * code stored on the result instance. Values not present in this table are handled by the caller.
	 */
	private static Map<String, String> mappingAccess = Maps.newHashMap();

	static {
		// "info:eu-repo" semantics URIs and the code each one translates to.
		final String[][] semanticsUris = {
				{ "info:eu-repo/semantics/openAccess", "OPEN" },
				{ "info:eu-repo/semantics/closedAccess", "CLOSED" },
				{ "info:eu-repo/semantics/restrictedAccess", "RESTRICTED" },
				{ "info:eu-repo/semantics/embargoedAccess", "EMBARGO" }
		};
		for (final String[] pair : semanticsUris) {
			mappingAccess.put(pair[0], pair[1]);
		}

		// Transformator now maps the access rights into proper values, not sure if it does for all datasets.
		// Already-translated codes therefore map onto themselves.
		final String[] alreadyMapped = { "OPEN", "CLOSED", "RESTRICTED", "EMBARGO", "OPEN SOURCE" };
		for (final String code : alreadyMapped) {
			mappingAccess.put(code, code);
		}
	}
46

    
47
	public static String odfResult(
48
			final String resultId,
49
			final boolean invisible,
50
			final NodeList about,
51
			final NodeList metadata,
52
			final NodeList titles,
53
			final NodeList creators,
54
			final NodeList subjects,
55
			final NodeList publisher,
56
			final NodeList descriptions,
57
			final NodeList dates,
58
			final NodeList dateaccepted,
59
			final NodeList resourceTypes,
60
			final NodeList formats,
61
			final NodeList sizes,
62
			final NodeList languages,
63
			final NodeList cobjcategory,
64
			final NodeList contributors,
65
			final NodeList rights,
66
			final NodeList license,
67
			final NodeList version,
68
			final NodeList pidList,
69
			final String provenance,
70
			final String trust,
71
			final NodeList hostedby,
72
			final NodeList collectedfrom,
73
			final NodeList originalIds,
74
			final String instanceUri,
75
			final String landingPage,
76
			final NodeList distributionlocation,
77
			final NodeList documentationUrl,
78
			final String dateOfCollection,
79
			final String dateOfTransformation) {
80

    
81
		try {
82
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
83

    
84
			final Result.Builder result = Result.newBuilder();
85
			Result.Metadata.Builder metadataProto = Result.Metadata.newBuilder();
86

    
87
			// subject
88
			for (int i = 0; i < subjects.getLength(); i++) {
89
				Node currentNode = subjects.item(i);
90
				NodeList childNodes = currentNode.getChildNodes();
91
				if (childNodes.getLength() > 0) {
92
					String subjectValue = childNodes.item(0).getNodeValue();
93
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("subject"),
94
							getStructuredProperty(subjectValue, "keyword", "keyword", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies"));
95
				}
96
			}
97

    
98
			// title
99
			for (int i = 0; i < titles.getLength(); i++) {
100
				Node currentNode = titles.item(i);
101
				NodeList childNodes = currentNode.getChildNodes();
102
				if (childNodes.getLength() > 0) {
103
					String titleValue = childNodes.item(0).getNodeValue();
104
					String classname = "main title";
105
					String classid = "main title";
106
					if (currentNode.hasAttributes()) {
107
						NamedNodeMap attributes = currentNode.getAttributes();
108
						Node titleType = attributes.getNamedItem("titleType");
109

    
110
						if (titleType != null && titleType.getNodeValue().equals("AlternativeTitle")) {
111
							classname = "alternative title";
112
							classid = "alternative title";
113
						}
114
						if (titleType != null && titleType.getNodeValue().equals("Subtitle")) {
115
							classname = "subtitle";
116
							classid = "subtitle";
117
						}
118
						if (titleType != null && titleType.getNodeValue().equals("TranslatedTitle")) {
119
							classname = "translated title";
120
							classid = "translated title";
121
						}
122
					}
123
					addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("title"),
124
							getStructuredProperty(titleValue, classname, classid, "dnet:dataCite_title", "dnet:dataCite_title"));
125
				}
126
			}
127

    
128
			// creators
129
			for (int i = 0; i < creators.getLength(); i++) {
130
				final Element creator = (Element) creators.item(i);
131
				if (creator != null && creator.hasChildNodes()) {
132

    
133
					final NodeList creatorNames = creator.getElementsByTagName("creatorName");
134
					if (creatorNames.getLength() > 0) {
135
						final Element creatorName = (Element) creatorNames.item(0);
136

    
137
						final Author.Builder author = Author.newBuilder();
138
						author.setRank(i+1);
139
						final String fullname = StringUtils.trim(creatorName.getTextContent());
140

    
141
						author.setFullname(fullname);
142

    
143
						final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
144
						if (p.isAccurate()) {
145
							author.setName(p.getNormalisedFirstName());
146
							author.setSurname(p.getNormalisedSurname());
147
						}
148
						final NodeList nameIdentifiers = creator.getElementsByTagName("nameIdentifier");
149
						if (nameIdentifiers.getLength() > 0) {
150
							final Element nameIdentifier = (Element) nameIdentifiers.item(0);
151
							final String nameIdentifierScheme = nameIdentifier.getAttribute("nameIdentifierScheme");
152
							final String id = StringUtils.trim(nameIdentifier.getTextContent());
153
							if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(nameIdentifierScheme)) {
154
								author.addPid(getKV(nameIdentifierScheme, id));
155
							}
156
						}
157

    
158
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("author"), author);
159
					}
160
				}
161
			}
162

    
163
			// description
164
			for (int i = 0; i < descriptions.getLength(); i++) {
165
				Element currentNode = (Element) descriptions.item(i);
166
				if (currentNode != null && currentNode.hasChildNodes()) {
167
					String descriptionValue = currentNode.getChildNodes().item(0).getNodeValue();
168

    
169
					final String descriptionType = currentNode.getAttribute("descriptionType");
170
					if (StringUtils.isNotBlank(descriptionType)) {
171
						switch (descriptionType) {
172
						case "TechnicalInfo":
173
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("tool"), descriptionValue);
174
							break;
175
						case "Abstract":
176
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
177
							break;
178
						case "DistributionForm":
179
							addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), descriptionValue);
180
							break;
181
						}
182
					} else {
183
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("description"), descriptionValue);
184
					}
185
				}
186
			}
187

    
188
			// contributors
189
			for (int i = 0; i < contributors.getLength(); i++) {
190
				final Element contributor = (Element) contributors.item(i);
191
				if (contributor != null && contributor.hasChildNodes()) {
192

    
193
					NodeList contributorNames = contributor.getElementsByTagName("contributorName");
194
					if (contributorNames != null) {
195
						Element contributorName = (Element) contributorNames.item(0);
196
						if (contributorName != null) {
197
							final String contributorValue = contributorName.getTextContent();
198
							final String contributorType = contributor.getAttribute("contributorType");
199

    
200
							if (StringUtils.isNotBlank(contributorType)) {
201
								switch (contributorType) {
202
								case "ContactPerson":
203
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactperson"), contributorValue);
204
									break;
205
								case "ContactGroup":
206
									addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contactgroup"), contributorValue);
207
									break;
208
								}
209
							} else {
210
								addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("contributor"), contributorValue);
211
							}
212
						}
213
					}
214
				}
215
			}
216

    
217
			// publisher
218
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("publisher"), getFirstItem(publisher));
219

    
220
			// dates
221
			for (int i = 0; i < dates.getLength(); i++) {
222
				Node currentNode = dates.item(i);
223
				if (currentNode != null && currentNode.hasAttributes() && currentNode.hasChildNodes()) {
224
					String dateAttribute = currentNode.getAttributes().getNamedItem("dateType").getNodeValue();
225
					String dateValue = currentNode.getChildNodes().item(0).getNodeValue();
226
					String protoAttribute = "relevantdate";
227
					if ("Accepted".equals(dateAttribute)) {
228
						protoAttribute = "dateofacceptance";
229
					} else if ("Issued".equals(dateAttribute)) {
230
						protoAttribute = "storagedate";
231
					} else if ("Updated".equals(dateAttribute)) {
232
						protoAttribute = "lastmetadataupdate";
233
					} else if ("Available".equals(dateAttribute)) {
234
						protoAttribute = "embargoenddate";
235
					}
236
					if (protoAttribute.equals("relevantdate") == false) {
237
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute), dateValue);
238
					} else {
239
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName(protoAttribute),
240
								getStructuredProperty(dateValue, "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date"));
241
					}
242
				}
243
			}
244

    
245
			// dateofacceptance
246
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
247

    
248
			// size
249
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("size"), getFirstItem(sizes));
250

    
251
			// version
252
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("version"), getFirstItem(version));
253

    
254
			// language
255
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("language"),
256
					setQualifier(getDefaultQualifier("dnet:languages"), Lists.newArrayList(getFirstItem(languages))));
257

    
258
			// resource type
259
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resourcetype"),
260
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(getFirstItem(resourceTypes))));
261

    
262
			// resultType
263
			final String cobjcategoryCode = getFirstItem(cobjcategory);
264
			final String resulttype = getResultType(cobjcategory);
265
			addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("resulttype"), getSimpleQualifier(resulttype, "dnet:result_typologies"));
266

    
267
			switch (resulttype) {
268
			case "software" :
269
				// format
270
				addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("programmingLanguage"),
271
						getSimpleQualifier(getFirstItem(formats), "dnet:programming_languages"));
272
				break;
273
			case "dataset":
274
				for (int i = 0; i < formats.getLength(); i++) {
275
					Node currentNode = formats.item(i);
276
					NodeList childNodes = currentNode.getChildNodes();
277
					if (childNodes.getLength() > 0) {
278
						String formatValue = childNodes.item(0).getNodeValue();
279
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("format"), formatValue);
280
					}
281
				}
282
				break;
283
			case "other":
284

    
285
				break;
286
			}
287

    
288
			// documentationUrl
289
			for (int i = 0; i < documentationUrl.getLength(); i++) {
290
				final Element docUrl = (Element) documentationUrl.item(i);
291
				if (docUrl != null && docUrl.hasChildNodes()) {
292
					final String value = docUrl.getTextContent();
293
					if (StringUtils.isNotBlank(value)) {
294
						addField(metadataProto, Result.Metadata.getDescriptor().findFieldByName("documentationUrl"), value);
295
					}
296
				}
297
			}
298

    
299
			// contexts
300
			ValueMap values = ValueMap.parseNodeList(metadata);
301
			if (values.get("concept") != null) {
302
				for (final eu.dnetlib.data.transform.xml.Element e : values.get("concept")) {
303
					final String id = e.getAttributes().get("id");
304
					if (StringUtils.isBlank(id)) throw new IllegalArgumentException("Context id cannot be blank");
305
					metadataProto.addContext(Context.newBuilder().setId(id));
306
				}
307
			}
308

    
309
			final List<KeyValue> hostedBys = getKeyValues(ValueMap.parseNodeList(hostedby), "hostedby", Type.datasource);
310
			final List<KeyValue> collectedFroms = getKeyValues(ValueMap.parseNodeList(collectedfrom), "collectedfrom", Type.datasource);
311

    
312
			final Instance.Builder instance = Instance.newBuilder();
313

    
314
			String tmpRigths = "UNKNOWN";
315
			final String firstRight = getFirstItem(rights);
316
			if (mappingAccess.containsKey(firstRight)) {
317
				tmpRigths = mappingAccess.get(firstRight);
318
			}
319

    
320
			addField(instance, Instance.getDescriptor().findFieldByName("license"), getFirstItem(license));
321
			addField(instance, Instance.getDescriptor().findFieldByName("hostedby"), hostedBys);
322

    
323
			addField(instance, Instance.getDescriptor().findFieldByName("accessright"),
324
					setQualifier(getDefaultQualifier("dnet:access_modes"), Lists.newArrayList(tmpRigths)));
325

    
326
			addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
327
					setQualifier(getDefaultQualifier("dnet:dataCite_resource"), Lists.newArrayList(cobjcategoryCode)));
328

    
329
			addField(instance, Instance.getDescriptor().findFieldByName("url"), instanceUri);
330
			if (StringUtils.isNotBlank(landingPage)) {
331
				addField(instance, Instance.getDescriptor().findFieldByName("url"), landingPage);
332
			}
333
			addField(instance, Instance.getDescriptor().findFieldByName("distributionlocation"), getFirstItem(distributionlocation));
334

    
335
			addField(instance, Instance.getDescriptor().findFieldByName("collectedfrom"), collectedFroms);
336
			addField(instance, Instance.getDescriptor().findFieldByName("dateofacceptance"), getFirstItem(dateaccepted));
337

    
338
			result.addInstance(instance);
339

    
340
			List<StructuredProperty> pids = parsePids(pidList);
341

    
342
			// original ids
343
			final Set<String> originalIdList = Sets.newHashSet();
344
			for (int i = 0; i < originalIds.getLength(); i++) {
345
				Node currentNode = originalIds.item(i);
346
				if (currentNode != null && currentNode.hasChildNodes()) {
347
					originalIdList.add(currentNode.getChildNodes().item(0).getNodeValue());
348
				}
349
			}
350

    
351
			OafEntity.Builder entity =
352
					getEntity(Type.result, entityId, collectedFroms, originalIdList, dateOfCollection, dateOfTransformation, pids).setResult(
353
							result.setMetadata(metadataProto));
354

    
355
			entity.setOaiprovenance(getOAIProvenance(about));
356

    
357
			Oaf oaf = getOaf(entity, getDataInfo(invisible, about, provenance, trust, false, false));
358
			return base64(oaf.toByteArray());
359
		} catch (Exception e) {
360
			e.printStackTrace(System.err);
361
			throw new RuntimeException(e);
362
		}
363

    
364
	}
365

    
366
	private static String getResultType(final NodeList cobjcategoryNode) {
367

    
368
		final ValueMap values = ValueMap.parseNodeList(cobjcategoryNode);
369

    
370
		final eu.dnetlib.data.transform.xml.Element cobjcategory = values.get("cobjcategory").stream()
371
				.map(e -> StringUtils.isNotBlank(e.getText()) ? e : new eu.dnetlib.data.transform.xml.Element("0000", e.getAttributes()))
372
				.findFirst()
373
				.orElse(new eu.dnetlib.data.transform.xml.Element("0000", new HashMap<>()));
374

    
375
		final String resulttype = cobjcategory.getAttributeValue("type");
376
		if (StringUtils.isNotBlank(resulttype)) {
377
			return resulttype;
378
		}
379

    
380
		return getDefaultResulttype(cobjcategory);
381
	}
382

    
383
	public static String getFirstItem(final NodeList list) {
384
		String out = "";
385
		if (list != null) {
386

    
387
			if (list.getLength() > 0 && list.item(0).getChildNodes() != null && list.item(0).getChildNodes().getLength() > 0) {
388
				out = list.item(0).getChildNodes().item(0).getNodeValue();
389
			}
390
		}
391
		return out;
392
	}
393

    
394
}
(8-8/10)