Project

General

Profile

1
package eu.dnetlib.data.transform.xml2;
2

    
3
import java.util.List;
4
import java.util.Map;
5
import java.util.Map.Entry;
6
import java.util.Objects;
7
import java.util.function.Function;
8

    
9
import com.google.common.collect.Streams;
10
import com.google.protobuf.Descriptors.Descriptor;
11
import com.ximpleware.AutoPilot;
12
import com.ximpleware.VTDGen;
13
import com.ximpleware.VTDNav;
14
import eu.dnetlib.data.proto.FieldTypeProtos;
15
import eu.dnetlib.data.proto.FieldTypeProtos.*;
16
import eu.dnetlib.data.proto.FieldTypeProtos.OAIProvenance.OriginDescription;
17
import eu.dnetlib.data.proto.KindProtos.Kind;
18
import eu.dnetlib.data.proto.OafProtos.Oaf;
19
import eu.dnetlib.data.proto.OafProtos.OafEntity;
20
import eu.dnetlib.data.proto.OafProtos.OafRel;
21
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
22
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
23
import eu.dnetlib.data.proto.ResultProtos.Result;
24
import eu.dnetlib.data.proto.ResultProtos.Result.*;
25
import eu.dnetlib.data.proto.TypeProtos.Type;
26
import eu.dnetlib.miscutils.collections.Pair;
27
import eu.dnetlib.pace.model.Person;
28
import org.apache.commons.lang3.StringUtils;
29
import org.apache.commons.lang3.exception.ExceptionUtils;
30
import org.apache.commons.logging.Log;
31
import org.apache.commons.logging.LogFactory;
32

    
33
import static eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions.oafSimpleId;
34
import static eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions.oafSplitId;
35
import static eu.dnetlib.data.transform.xml2.Utils.*;
36
import static eu.dnetlib.data.transform.xml2.VtdUtilityParser.*;
37
import static java.lang.String.format;
38

    
39
public abstract class AbstractResultVtdParser implements Function<String, Oaf> {
40

    
41
	private static final Log log = LogFactory.getLog(AbstractResultVtdParser.class);
42

    
43
	protected boolean invisible = false;
44
	protected String provenance = "";
45
	protected String trust = "0.9";
46

    
47
	protected SpecificationMap specs;
48

    
49
	/**
	 * Creates a parser that keeps the default data-info settings of this class.
	 *
	 * @param fields per-field configuration, looked up by field name in {@link #buildSpecs(Map)}
	 *               (presumably the xpath each field is read from -- confirm against SpecificationDescriptor)
	 */
	public AbstractResultVtdParser(final Map<String, String> fields) {
		specs = buildSpecs(fields);
	}

	/**
	 * Creates a parser with explicit data-info defaults.
	 *
	 * @param invisible  default value of the invisible flag
	 * @param provenance default provenance action
	 * @param trust      default trust value
	 * @param fields     per-field configuration, see {@link #AbstractResultVtdParser(Map)}
	 */
	public AbstractResultVtdParser(final boolean invisible, final String provenance, final String trust, final Map<String, String> fields) {
		this(fields);
		this.trust = trust;
		this.provenance = provenance;
		this.invisible = invisible;
	}
59

    
60
	/**
	 * Resolves the result type of a record.
	 *
	 * @param cobjcategory text value of a node selected for the "resulttype" field
	 * @return the value that {@link #buildSpecs(Map)} turns into a qualifier of the DNET_RESULT_TYPOLOGIES scheme
	 */
	protected abstract String getResulttype(String cobjcategory);
61

    
62
	@Override
63
	public Oaf apply(final String xml) {
64
		try {
65
			final VTDGen vg = parseXml(xml);
66
			final VTDNav vn = vg.getNav();
67
			final AutoPilot ap = new AutoPilot(vn);
68

    
69
			final boolean skiprecord = Boolean.valueOf(getFirstValue(ap, vn, xpath("record", "header", "skipRecord")));
70
			int metadata = countNodes(ap, vn, format("count(%s)", xpath("record", "metadata")));
71

    
72
			if (metadata == 0 || skiprecord) {
73
				return null;
74
			}
75

    
76
			final String objIdentifier = oafSimpleId(Type.result.name(), getFirstValue(ap, vn, xpath("record", "header", "objIdentifier")));
77
			if (StringUtils.isBlank(objIdentifier)) {
78
				return null;
79
			}
80

    
81
			for(final Entry<Descriptor, SpecificationDescriptor> spec : specs.entrySet()) {
82
				final Descriptor d = spec.getKey();
83
				final SpecificationDescriptor md = spec.getValue();
84

    
85
				for(Entry<String, Pair<String, Function<List<Node>, Object>>> entry : md.getFields().entrySet()) {
86
					final String fieldName = entry.getKey();
87
					final Pair<String, Function<List<Node>, Object>> pair = entry.getValue();
88
					final String xpath = pair.getKey();
89
					final Function<List<Node>, Object> function = pair.getValue();
90
					try {
91
						addField(md.getBuilder(), d.findFieldByName(fieldName), function.apply(getNodes(ap, vn, xpath)));
92
					} catch (Throwable e) {
93
						throw new VtdException(String.format("Error mapping field '%s' from xpath '%s' for record '%s'", fieldName, xpath, objIdentifier), e);
94
					}
95
				}
96
			}
97

    
98
			return Oaf.newBuilder()
99
					.setKind(Kind.entity)
100
					.setDataInfo(ensureDataInfo(ap, vn, DataInfo.newBuilder()))
101
					.setEntity(((OafEntity.Builder) specs.get(OafEntity.getDescriptor())
102
							.getBuilder()
103
							.setField(
104
									OafEntity.getDescriptor().findFieldByName(Type.result.name()),
105
									((Result.Builder) specs.get(Result.getDescriptor()).getBuilder())
106
											.setMetadata((Metadata) specs.get(Metadata.getDescriptor()).getBuilder().build())
107
											.addInstance((Instance) specs.get(Instance.getDescriptor()).getBuilder().build())
108
											.build()))
109
							.setId(objIdentifier)
110
							.setOaiprovenance(getOaiProvenance(ap, vn))
111
							.build())
112
					.build();
113
		} catch (Throwable e) {
114
			log.error(xml);
115
			log.error(ExceptionUtils.getStackTrace(e));
116
			return null;
117
		}
118
	}
119

    
120
	public SpecificationMap buildSpecs(final Map<String, String> fields) {
121
		final SpecificationMap specs = new SpecificationMap();
122

    
123
		specs.put(Result.getDescriptor(), SpecificationDescriptor.newInstance())
124
				.setBuilder(Result.newBuilder())
125
				.put("externalReference", fields.get("externalReference"), nodes -> nodes.stream()
126
						.map(node -> {
127
							final ExternalReference.Builder extref = ExternalReference.newBuilder();
128
							if (StringUtils.isNotBlank(node.getTextValue())) {
129
								extref.setUrl(node.getTextValue());
130
							}
131
							final Map<String, String> a = node.getAttributes();
132
							final String source = a.get("source");
133
							if (StringUtils.isNotBlank(source)) {
134
								extref.setSitename(source);
135
							}
136
							final String identifier = a.get("identifier");
137
							if (StringUtils.isNotBlank(identifier)) {
138
								extref.setRefidentifier(identifier);
139
							}
140
							final String title = a.get("title");
141
							if (StringUtils.isNotBlank(title)) {
142
								extref.setLabel(title);
143
							}
144
							final String query = a.get("query");
145
							if (StringUtils.isNotBlank(query)) {
146
								extref.setQuery(query);
147
							}
148
							final String type = a.get("type");
149
							if (StringUtils.isNotBlank(type)) {
150
								extref.setQualifier(getSimpleQualifier(type, DNET_EXT_REF_TYPOLOGIES));
151
							}
152
							return extref.build();
153
						}));
154

    
155
		specs.put(Instance.getDescriptor(), SpecificationDescriptor.newInstance())
156
				.setBuilder(Instance.newBuilder())
157
				.put("license", fields.get("license"), nodes -> nodes.stream()
158
						.filter(node -> {
159
							final Map<String, String> a = node.getAttributes();
160
							switch (node.getName()) {
161
							case "rights":
162
								return a.containsKey(RIGHTS_URI) && a.get(RIGHTS_URI).matches(URL_REGEX);
163
							case "license":
164
								return true;
165
							default:
166
								return false;
167
							}
168
						})
169
						.map(Node::getTextValue))
170
				.put("accessright", fields.get("accessright"), nodes -> nodes.stream()
171
						.map(Node::getTextValue)
172
						.map(rights -> mappingAccess.containsKey(rights) ? mappingAccess.get(rights) : "UNKNOWN")
173
						.map(code -> getQualifier(code, getClassName(code), DNET_ACCESS_MODES, DNET_ACCESS_MODES)))
174
				.put("instancetype", fields.get("instancetype"), nodes -> nodes.stream()
175
						.map(Node::getTextValue)
176
						.map(code -> getQualifier(code, getClassName(code), DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)))
177
				.put("hostedby", fields.get("hostedby"), nodes -> nodes.stream()
178
						.map(node -> getKV(oafSplitId("datasource", node.getAttributes().get("id")), node.getAttributes().get("name"))))
179
				.put("url", fields.get("url"), nodes -> nodes.stream()
180
						.map(Node::getTextValue)
181
						.filter(s -> s.trim().matches(URL_REGEX)))
182
				.put("dateofacceptance", fields.get("dateofacceptance"), nodes -> nodes.stream()
183
						.map(Node::getTextValue));
184

    
185
		specs.put(Metadata.getDescriptor(), SpecificationDescriptor.newInstance())
186
				.setBuilder(Metadata.newBuilder())
187
				.put("title", fields.get("title"), nodes -> nodes.stream()
188
						.map(node -> {
189
							final Qualifier.Builder q = Qualifier.newBuilder().setSchemeid(DNET_TITLE_TYPOLOGIES).setSchemename(DNET_TITLE_TYPOLOGIES);
190
							switch (node.getAttributes().get(TITLE_TYPE) + "") {
191
							case "AlternativeTitle":
192
								q.setClassid("alternative title").setClassname("alternative title");
193
								break;
194
							case "Subtitle":
195
								q.setClassid("subtitle").setClassname("subtitle");
196
								break;
197
							case "TranslatedTitle":
198
								q.setClassid("translated title").setClassname("translated title");
199
								break;
200
							default:
201
								q.setClassid("main title").setClassname("main title");
202
								break;
203
							}
204
							return StructuredProperty.newBuilder().setValue(node.getTextValue()).setQualifier(q).build();
205
						}))
206
				.put("description", fields.get("description"), nodes -> nodes.stream()
207
						.map(Node::getTextValue))
208
				.put("storagedate", fields.get("storagedate"), nodes -> nodes.stream()
209
						.map(Node::getTextValue))
210
				.put("lastmetadataupdate", fields.get("lastmetadataupdate"), nodes -> nodes.stream()
211
						.map(Node::getTextValue))
212
				.put("embargoenddate", fields.get("embargoenddate"), nodes -> nodes.stream()
213
						.map(Node::getTextValue))
214
				.put("dateofacceptance", fields.get("dateofacceptance"), nodes -> nodes.stream()
215
						.map(Node::getTextValue))
216
				.put("author", fields.get("author"), nodes -> Streams.mapWithIndex(
217
						nodes.stream()
218
								.map(Node::getTextValue),
219
						(creator, i) -> new Pair<>(i, creator))
220
						.map(pair -> {
221
							final Author.Builder author = Author.newBuilder();
222
							author.setFullname(pair.getValue());
223
							author.setRank(pair.getKey().intValue() + 1);
224
							final Person p = new Person(pair.getValue(), false);
225
							if (p.isAccurate()) {
226
								author.setName(p.getNormalisedFirstName());
227
								author.setSurname(p.getNormalisedSurname());
228
							}
229
							return author.build();
230
						}))
231
				.put("contributor", fields.get("contributor"), nodes -> nodes.stream()
232
						.map(Node::getTextValue))
233
				.put("subject", fields.get("subject"), nodes -> nodes.stream()
234
						.map(node -> {
235
							final Map<String, String> a = node.getAttributes();
236
							final String classId = StringUtils.isNotBlank(a.get(CLASSID)) ? a.get(CLASSID) : KEYWORD;
237
							final String className = StringUtils.isNotBlank(a.get(CLASSNAME)) ? a.get(CLASSNAME) : KEYWORD;
238
							final String schemeId = StringUtils.isNotBlank(a.get(SCHEMEID)) ? a.get(SCHEMEID) : DNET_SUBJECT_TYPOLOGIES;
239
							final String schemeName = StringUtils.isNotBlank(a.get(SCHEMENAME)) ? a.get(SCHEMENAME) : DNET_SUBJECT_TYPOLOGIES;
240
							return getStructuredProperty(node.getTextValue(), classId, className, schemeId, schemeName);
241
						}))
242
				.put("format", fields.get("format"), nodes -> nodes.stream()
243
						.map(Node::getTextValue))
244
				.put("source", fields.get("source"), nodes -> nodes.stream()
245
						.map(Node::getTextValue))
246
				.put("size", fields.get("size"), nodes -> nodes.stream()
247
						.map(Node::getTextValue))
248
				.put("version", fields.get("version"), nodes -> nodes.stream()
249
						.map(Node::getTextValue))
250
				.put("publisher", fields.get("publisher"), nodes -> nodes.stream()
251
						.map(Node::getTextValue))
252
				.put("language", fields.get("language"), nodes -> nodes.stream()
253
						.map(Node::getTextValue)
254
						.map(code -> getQualifier(code, getClassName(code), DNET_LANGUAGES, DNET_LANGUAGES)))
255
				.put("resourcetype", fields.get("resourcetype"), nodes -> nodes.stream()
256
						.map(node -> node.getAttributes().get("resourceTypeGeneral"))
257
						.map(resourceType -> getSimpleQualifier(resourceType, DNET_DATA_CITE_RESOURCE)))
258
				.put("resulttype", fields.get("resulttype"), nodes -> nodes.stream()
259
						.map(Node::getTextValue)
260
						.map(cobjcategory -> getSimpleQualifier(getResulttype(cobjcategory), DNET_RESULT_TYPOLOGIES)))
261
				.put("concept", fields.get("concept"), nodes -> nodes.stream()
262
						.filter(node -> node.getAttributes() != null && StringUtils.isNotBlank(node.getAttributes().get("id")))
263
						.map(node -> Context.newBuilder().setId(node.getAttributes().get("id"))))
264
				.put("journal", fields.get("journal"), nodes -> nodes.stream()
265
						.map(node -> {
266
							final Journal.Builder journal = Journal.newBuilder();
267
							if (StringUtils.isNotBlank(node.getTextValue())) {
268
								journal.setName(node.getTextValue());
269
							}
270
							if (node.getAttributes() != null) {
271
								final Map<String, String> a = node.getAttributes();
272
								if (StringUtils.isNotBlank(a.get("issn"))) {
273
									journal.setIssnPrinted(a.get("issn"));
274
								}
275
								if (StringUtils.isNotBlank(a.get("eissn"))) {
276
									journal.setIssnOnline(a.get("eissn"));
277
								}
278
								if (StringUtils.isNotBlank(a.get("lissn"))) {
279
									journal.setIssnLinking(a.get("lissn"));
280
								}
281
								if (StringUtils.isNotBlank(a.get("sp"))) {
282
									journal.setSp(a.get("sp"));
283
								}
284
								if (StringUtils.isNotBlank(a.get("ep"))) {
285
									journal.setEp(a.get("ep"));
286
								}
287
								if (StringUtils.isNotBlank(a.get("iss"))) {
288
									journal.setIss(a.get("iss"));
289
								}
290
								if (StringUtils.isNotBlank(a.get("vol"))) {
291
									journal.setVol(a.get("vol"));
292
								}
293
							}
294
							return journal;
295
						}));
296

    
297
		specs.put(OafEntity.getDescriptor(), SpecificationDescriptor.newInstance())
298
				.setBuilder(OafEntity.newBuilder().setType(Type.result))
299
				.put("originalId", fields.get("originalId"), nodes -> nodes.stream()
300
						.map(Node::getTextValue)
301
						.map(s -> StringUtils.contains(s, ID_SEPARATOR) ? StringUtils.substringAfter(s, ID_SEPARATOR) : s)
302
						.filter(s -> !s.trim().matches(URL_REGEX)))
303
				.put("collectedfrom", fields.get("collectedfrom"), nodes -> nodes.stream()
304
						.map(node -> getKV(
305
								oafSplitId(Type.datasource.name(), node.getAttributes().get("id")),
306
								node.getAttributes().get("name"))))
307
				.put("pid", fields.get("pid"), nodes -> nodes.stream()
308
						.filter(pid -> {
309
							final Map<String, String> a = pid.getAttributes();
310
							return a.containsKey(IDENTIFIER_TYPE) || a.containsKey(ALTERNATE_IDENTIFIER_TYPE);
311
						})
312
						.filter(pid -> {
313
							final Map<String, String> a = pid.getAttributes();
314
							return !"url".equalsIgnoreCase(a.get(IDENTIFIER_TYPE)) && !"url".equalsIgnoreCase(a.get(ALTERNATE_IDENTIFIER_TYPE));
315
						})
316
						.map(pid -> {
317
							final Map<String, String> a = pid.getAttributes();
318
							final String identifierType = a.get(IDENTIFIER_TYPE);
319
							final String altIdentifierType = a.get(ALTERNATE_IDENTIFIER_TYPE);
320
							return StructuredProperty.newBuilder()
321
									.setValue(pid.getTextValue())
322
									.setQualifier(getSimpleQualifier(
323
											StringUtils.isNotBlank(identifierType) ?
324
													identifierType : altIdentifierType, DNET_PID_TYPES))
325
									.build();
326
						}))
327
				.put("dateofcollection", fields.get("dateofcollection"), nodes -> nodes.stream()
328
						.map(Node::getTextValue))
329
				.put("dateoftransformation", fields.get("dateoftransformation"), nodes -> nodes.stream()
330
						.map(Node::getTextValue))
331
				.put("cachedRel", fields.get("cachedRel"), nodes -> nodes.stream()
332
						.map(node -> getOafRel(node,
333
								OafRel.newBuilder()
334
										.setSource("")
335
										.setChild(false)))
336
						.filter(Objects::nonNull)
337
						.map(oafRel -> oafRel.build()));
338
		return specs;
339
	}
340

    
341
	private static OafRel.Builder getOafRel(final Node node, final OafRel.Builder oafRel) {
342
		final Map<String, String> a = node.getAttributes();
343

    
344
		switch (node.getName()) {
345
		case PROJECTID:
346
			if (StringUtils.isBlank(node.getTextValue())) {
347
				return null;
348
			}
349
			return oafRel
350
					.setTarget(oafSplitId(Type.project.name(), StringUtils.trim(node.getTextValue())))
351
					.setRelType(RelType.resultProject)
352
					.setSubRelType(SubRelType.outcome)
353
					.setRelClass("isProducedBy");
354

    
355
		case RELATED_PUBLICATION:
356
		case RELATED_DATASET:
357
			if (StringUtils.isBlank(a.get("id"))) {
358
				return null;
359
			}
360
			return oafRel
361
					.setTarget(oafSimpleId(Type.result.name(), StringUtils.trim(a.get("id"))))
362
					.setRelType(RelType.resultResult)
363
					.setSubRelType(SubRelType.publicationDataset)
364
					.setRelClass("isRelatedTo");
365

    
366
		case RELATED_IDENTIFIER:
367
			if (StringUtils.isBlank(node.getTextValue())) {
368
				return null;
369
			}
370
			return oafRel
371
					.setTarget(node.getTextValue())
372
					.setRelType(RelType.resultResult)
373
					.setSubRelType(SubRelType.relationship)
374
					.setRelClass(a.get(RELATION_TYPE))
375
					.setCachedTarget(
376
							OafEntity.newBuilder()
377
									.setType(Type.result)
378
									.setId("") //TODO
379
									.addPid(
380
											StructuredProperty.newBuilder()
381
													.setValue(node.getTextValue())
382
													.setQualifier(getSimpleQualifier(a.get(RELATED_IDENTIFIER_TYPE), DNET_PID_TYPES))
383
													.build()));
384
		default:
385
			return null;
386
		}
387
	}
388

    
389
	private OriginDescription getOriginDescription(final AutoPilot ap, final VTDNav vn, final String basePath) throws VtdException {
390
		final OriginDescription.Builder od = OriginDescription.newBuilder();
391
		if (getNodes(ap, vn, basePath).isEmpty()) {
392
			return od.build();
393
		}
394
		final Map<String, String> odAttr = getNode(ap, vn, basePath).getAttributes();
395

    
396
		final String harvestDate = odAttr.get("harvestDate");
397
		if (StringUtils.isNotBlank(harvestDate)) {
398
			od.setHarvestDate(harvestDate);
399
		}
400
		final String altered = odAttr.get("altered");
401
		if (StringUtils.isNotBlank(altered)) {
402
			od.setAltered(Boolean.valueOf(altered));
403
		}
404
		final String baseUrl = getFirstValue(ap, vn, basePath + xpath("baseURL"));
405
		if (StringUtils.isNotBlank(basePath)) {
406
			od.setBaseURL(baseUrl);
407
		}
408
		final String identifier = getFirstValue(ap, vn, basePath + xpath("identifier"));
409
		if (StringUtils.isNotBlank(identifier)) {
410
			od.setIdentifier(identifier);
411
		}
412
		final String datestamp = getFirstValue(ap, vn, basePath + xpath("datestamp"));
413
		if (StringUtils.isNotBlank(datestamp)) {
414
			od.setDatestamp(datestamp);
415
		}
416
		final String metadataNamespace = getFirstValue(ap, vn, basePath + xpath("metadataNamespace"));
417
		if (StringUtils.isNotBlank(metadataNamespace)) {
418
			od.setMetadataNamespace(metadataNamespace);
419
		}
420
		final OriginDescription originDescription = getOriginDescription(ap, vn, basePath + xpath("originDescription"));
421
		if (originDescription.hasHarvestDate()) {
422
			od.setOriginDescription(originDescription);
423
		}
424

    
425
		return od.build();
426
	}
427

    
428
	private OAIProvenance getOaiProvenance(final AutoPilot ap, final VTDNav vn) throws VtdException {
429
		return OAIProvenance.newBuilder()
430
				.setOriginDescription(getOriginDescription(ap, vn, xpath("record", "about", "provenance", "originDescription")))
431
				.build();
432
	}
433

    
434
	private FieldTypeProtos.DataInfo.Builder ensureDataInfo(
435
    		final AutoPilot ap, final VTDNav vn,
436
            final DataInfo.Builder info) throws VtdException {
437

    
438
        if (info.isInitialized()) return info;
439
        return buildDataInfo( ap, vn, invisible, provenance, trust, false, false);
440
    }
441

    
442
	private FieldTypeProtos.DataInfo.Builder buildDataInfo(
443
            final AutoPilot ap,
444
            final VTDNav vn,
445
            final boolean invisible,
446
            final String defaultProvenanceaction,
447
            final String defaultTrust,
448
            final boolean defaultDeletedbyinference,
449
            final boolean defaultInferred) throws VtdException {
450

    
451
		final DataInfo.Builder dataInfoBuilder = FieldTypeProtos.DataInfo.newBuilder()
452
            .setInvisible(invisible)
453
			.setInferred(defaultInferred)
454
            .setDeletedbyinference(defaultDeletedbyinference)
455
            .setTrust(defaultTrust)
456
	        .setProvenanceaction(getSimpleQualifier(defaultProvenanceaction, DNET_PROVENANCE_ACTIONS));
457

    
458
        // checking instanceof because when receiving an empty <oaf:datainfo> we don't want to parse it.
459

    
460
	    final String xpath = xpath("record", "about", "datainfo");
461
	    if (getNodes(ap, vn, xpath).size() > 0) {
462
		    final Map<String, String> provAction = getNode(ap, vn, xpath + xpath("provenanceaction")).getAttributes();
463
		    dataInfoBuilder
464
				    .setInvisible(Boolean.valueOf(getValue(getNode(ap, vn, xpath + xpath("invisible")), String.valueOf(invisible))))
465
				    .setInferred(Boolean.valueOf(getValue(getNode(ap, vn, xpath + xpath("inferred")), String.valueOf(defaultInferred))))
466
				    .setDeletedbyinference(Boolean.valueOf(
467
						    getValue(getNode(ap, vn, xpath + xpath("deletedbyinference")), String.valueOf(defaultDeletedbyinference))))
468
				    .setTrust(getValue(getNode(ap, vn, xpath + xpath("trust")), defaultTrust))
469
				    .setInferenceprovenance(getValue(getNode(ap, vn, xpath + xpath("inferenceprovenance")), ""))
470
				    .setProvenanceaction(getSimpleQualifier(
471
						    getValue(provAction.get(CLASSID), defaultProvenanceaction),
472
						    DNET_PROVENANCE_ACTIONS));
473
	    }
474

    
475
	    return dataInfoBuilder;
476
    }
477

    
478
}
(2-2/11)