Project

General

Profile

1
package eu.dnetlib.data.transform.xml;
2

    
3
import java.util.List;
4
import java.util.Map;
5

    
6
import org.apache.commons.lang.StringUtils;
7
import org.w3c.dom.Node;
8
import org.w3c.dom.NodeList;
9

    
10
import com.google.common.collect.Iterables;
11
import com.google.common.collect.Lists;
12
import com.google.protobuf.Descriptors.Descriptor;
13

    
14
import eu.dnetlib.data.mapreduce.util.OafRowKeyDecoder;
15
import eu.dnetlib.data.proto.FieldTypeProtos.OAIProvenance;
16
import eu.dnetlib.data.proto.FieldTypeProtos.OAIProvenance.OriginDescription;
17
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
18
import eu.dnetlib.data.proto.OafProtos.Oaf;
19
import eu.dnetlib.data.proto.OafProtos.OafEntity;
20
import eu.dnetlib.data.proto.OafProtos.OafRel;
21
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
22
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson.CoAuthorship;
23
import eu.dnetlib.data.proto.PersonProtos.Person;
24
import eu.dnetlib.data.proto.PersonProtos.Person.CoAuthor;
25
import eu.dnetlib.data.proto.PersonResultProtos.PersonResult;
26
import eu.dnetlib.data.proto.PersonResultProtos.PersonResult.Authorship;
27
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
28
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
29
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
30
import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject;
31
import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome;
32
import eu.dnetlib.data.proto.ResultProtos.Result;
33
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
34
import eu.dnetlib.data.proto.ResultProtos.Result.ExternalReference;
35
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
36
import eu.dnetlib.data.proto.ResultProtos.Result.Journal;
37
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
38
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.PublicationDataset;
39
import eu.dnetlib.data.proto.TypeProtos.Type;
40

    
41
public class DmfToHbaseXsltFunctions extends AbstractDNetOafXsltFunctions {
42

    
43
	/** Upper bound on how many entries of the authors node list are scanned when collecting the co-authors of a person. */
	private static final int MAX_COAUTHORS = 50;
44

    
45
	// dnet:oafPersonResultFromDMF($resultId, $oafPerson, position(), "sysimport:crosswalk:repository", "0.9")
46
	public static String oafPersonResult_Authorship_FromDMF(final String source,
47
			final String target,
48
			final int rank,
49
			final String relClass,
50
			final String provenanceAction,
51
			final String trust,
52
			final NodeList about) {
53
		try {
54
			final String eSource = OafRowKeyDecoder.decode(source).getKey();
55
			final String eTarget = OafRowKeyDecoder.decode(target).getKey();
56

    
57
			final Authorship.Builder auth = Authorship.newBuilder().setRanking("" + rank)
58
					.setRelMetadata(RelMetadata.newBuilder().setSemantics(getSimpleQualifier(relClass, "dnet:personroles")));
59

    
60
			final OafRel.Builder rel = getRel(eSource, eTarget, RelType.personResult, SubRelType.authorship, relClass, false).setPersonResult(
61
					PersonResult.newBuilder().setAuthorship(auth));
62

    
63
			return base64(getOaf(rel, getDataInfo(about, provenanceAction, trust, false, false)).toByteArray());
64
		} catch (final Throwable e) {
65
			System.err.println("source: " + source);
66
			System.err.println("target: " + target);
67
			System.err.println("provenanceAction: " + provenanceAction);
68
			System.err.println("trust: " + trust);
69
			System.err.println("rank: " + rank);
70
			e.printStackTrace();
71
			throw new RuntimeException(e);
72
		}
73
	}
74

    
75
	// dnet:oafPersonPersonFromMDStore($personId, $coauthorId)
76
	public static String oafPersonPerson_CoAuthorship_FromDMF(final String source,
77
			final String target,
78
			final String relClass,
79
			final String provenanceAction,
80
			final String trust,
81
			final NodeList about) {
82
		try {
83
			final String eSource = OafRowKeyDecoder.decode(source).getKey();
84
			final String eTarget = OafRowKeyDecoder.decode(target).getKey();
85

    
86
			final PersonPerson.Builder pp = PersonPerson.newBuilder();
87
			final CoAuthorship.Builder coauth = CoAuthorship.newBuilder().setRelMetadata(
88
					RelMetadata.newBuilder().setSemantics(getSimpleQualifier(relClass, "dnet:personroles")));
89

    
90
			final OafRel.Builder rel = getRel(eSource, eTarget, RelType.personPerson, SubRelType.coauthorship, relClass, false).setPersonPerson(
91
					pp.setCoauthorship(coauth));
92

    
93
			return base64(getOaf(rel, getDataInfo(about, provenanceAction, trust, false, false)).toByteArray());
94
		} catch (final Throwable e) {
95
			System.err.println("source: " + source);
96
			System.err.println("target: " + target);
97
			System.err.println("provenanceAction: " + provenanceAction);
98
			System.err.println("trust: " + trust);
99
			e.printStackTrace();
100
			throw new RuntimeException(e);
101
		}
102
	}
103

    
104
	// dnet:oafPersonFromDMF($personId, ., "sysimport:crosswalk:repository", "0.9")
105

    
106
	// collectedFromId, collectedFromName, originalId, dateOfCollection
107
	public static String oafPerson_FromDMF(final String personId,
108
			final String fullname,
109
			final NodeList authors,
110
			final String namespaceprefix,
111
			final String objIdentifier,
112
			final Map<String, Object> map,
113
			final String provenanceAction,
114
			final String trust,
115
			final NodeList about,
116
			final String collectedFromId,
117
			final String collectedFromName,
118
			final String originalId,
119
			final String dateOfCollection) {
120
		try {
121
			final String entityId = OafRowKeyDecoder.decode(personId).getKey();
122

    
123
			final Person.Builder person = Person.newBuilder();
124
			final Person.Metadata.Builder metadata = getMetadata(fullname);
125

    
126
			if (authors != null) {
127
				for (int i = 0; (i < authors.getLength()) && (i < MAX_COAUTHORS); i++) {
128
					final Node node = authors.item(i);
129

    
130
					final String name = StringUtils.trim(node.getTextContent());
131
					if (!name.equals(fullname)) {
132

    
133
						final CoAuthor.Builder coAuthor = CoAuthor.newBuilder();
134
						coAuthor.setId(oafPersonId("person", namespaceprefix, objIdentifier, name, map));
135
						coAuthor.setMetadata(getMetadata(name));
136

    
137
						person.addCoauthor(coAuthor);
138
					}
139
				}
140
			}
141

    
142
			// metadata.setNationality(getSimpleQualifier("UNKNOWN", "dnet:countries"));
143
			final List<StructuredProperty> pids = Lists.newArrayList(getStructuredProperty(originalId, "oai", "oai", "dnet:pid_types", "dnet:pid_types"));
144
			final OafEntity.Builder entity = getEntity(Type.person, entityId, getKV(collectedFromId, collectedFromName), originalId, dateOfCollection, pids)
145
					.setPerson(person.setMetadata(metadata));
146

    
147
			final Oaf oaf = getOaf(entity, getDataInfo(about, provenanceAction, trust, false, false));
148
			return base64(oaf.toByteArray());
149
		} catch (final Throwable e) {
150
			System.err.println("personId: " + personId);
151
			System.err.println("fullname: " + fullname);
152
			System.err.println("provenanceAction: " + provenanceAction);
153
			System.err.println("trust: " + trust);
154
			System.err.println("collectedFromId: " + collectedFromId);
155
			System.err.println("collectedFromName: " + collectedFromName);
156
			System.err.println("originalId: " + originalId);
157
			System.err.println("dateOfCollection: " + dateOfCollection);
158
			e.printStackTrace();
159
			throw new RuntimeException(e);
160
		}
161
	}
162

    
163
	private static Person.Metadata.Builder getMetadata(final String fullname) {
164
		final Person.Metadata.Builder metadata = Person.Metadata.newBuilder();
165

    
166
		metadata.setFullname(sf(fullname));
167

    
168
		final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
169
		if (p.isAccurate()) {
170
			metadata.setFirstname(sf(p.getNormalisedFirstName()));
171
			metadata.clearSecondnames().addSecondnames(sf(p.getNormalisedSurname()));
172
			// metadata.setFullname(sf(p.getNormalisedFullname()));
173
		}
174
		return metadata;
175
	}
176

    
177
	// dnet:oafResultProjectFromDMF($resultId, $projectId, "sysimport:crosswalk:repository", "0.9")
178
	public static String oafResultProject_Outcome_FromDMF(final String source,
179
			final String target,
180
			final String relClass,
181
			final String provenanceAction,
182
			final String trust,
183
			final NodeList about) {
184
		try {
185
			final String eSource = OafRowKeyDecoder.decode(source).getKey();
186
			final String eTarget = OafRowKeyDecoder.decode(target).getKey();
187

    
188
			final Outcome.Builder outcome = Outcome.newBuilder().setRelMetadata(
189
					RelMetadata.newBuilder().setSemantics(getSimpleQualifier(relClass, "dnet:result_project_relations")));
190

    
191
			final ResultProject.Builder rp = ResultProject.newBuilder().setOutcome(outcome);
192

    
193
			final OafRel.Builder rel = getRel(eSource, eTarget, RelType.resultProject, SubRelType.outcome, relClass, false).setResultProject(rp);
194

    
195
			return base64(getOaf(rel, getDataInfo(about, provenanceAction, trust, false, false)).toByteArray());
196
		} catch (final Throwable e) {
197
			System.err.println("source: " + source);
198
			System.err.println("target: " + target);
199
			System.err.println("provenanceAction: " + provenanceAction);
200
			System.err.println("trust: " + trust);
201

    
202
			e.printStackTrace();
203
			throw new RuntimeException(e);
204
		}
205
	}
206

    
207
	// dnet:oafResultProjectFromDMF($resultId, $projectId, "sysimport:crosswalk:repository", "0.9")
208
	public static String oafResultProject_Outcome_FromDMF(final String source,
209
			final String target,
210
			final String relClass,
211
			final String provenanceAction,
212
			final String trust) {
213
		return oafResultProject_Outcome_FromDMF(source, target, relClass, provenanceAction, trust, null);
214
	}
215

    
216
	public static String oafResultResult_PublicationDataset_FromDMF(final String source,
217
			final String target,
218
			final String relClass,
219
			final String provenanceAction,
220
			final String trust) {
221
		return oafResultResult_PublicationDataset_FromDMF(source, target, relClass, provenanceAction, trust, null);
222
	}
223

    
224
	public static String oafResultResult_PublicationDataset_FromDMF(final String source,
225
			final String target,
226
			final String relClass,
227
			final String provenanceAction,
228
			final String trust,
229
			final NodeList about) {
230
		try {
231
			final String eSource = OafRowKeyDecoder.decode(source).getKey();
232
			final String eTarget = OafRowKeyDecoder.decode(target).getKey();
233

    
234
			final PublicationDataset.Builder pd = PublicationDataset.newBuilder().setRelMetadata(
235
					RelMetadata.newBuilder().setSemantics(getSimpleQualifier(relClass, "dnet:result_result_relations")));
236

    
237
			final ResultResult.Builder rr = ResultResult.newBuilder().setPublicationDataset(pd);
238

    
239
			final OafRel.Builder rel = getRel(eSource, eTarget, RelType.resultResult, SubRelType.publicationDataset, relClass, false).setResultResult(rr);
240

    
241
			return base64(getOaf(rel, getDataInfo(about, provenanceAction, trust, false, false)).toByteArray());
242
		} catch (final Throwable e) {
243
			System.err.println("source: " + source);
244
			System.err.println("target: " + target);
245
			System.err.println("provenanceAction: " + provenanceAction);
246
			System.err.println("trust: " + trust);
247

    
248
			e.printStackTrace();
249
			throw new RuntimeException(e);
250
		}
251
	}
252

    
253
	public static String oafResult_FromDMF(final String resultId,
254
			final String provenanceAction,
255
			final String trust,
256
			final NodeList about,
257
			final String hostedbyId,
258
			final String hostedbyName,
259
			final String collectedFromId,
260
			final String collectedFromName,
261
			final String originalId,
262
			final String dateOfCollection,
263
			final NodeList nodelist) {
264
		try {
265
			final String entityId = OafRowKeyDecoder.decode(resultId).getKey();
266

    
267
			final Result.Builder result = Result.newBuilder();
268

    
269
			final ValueMap values = ValueMap.parseNodeList(nodelist);
270

    
271
			final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
272
			final Descriptor mDesc = Result.Metadata.getDescriptor();
273

    
274
			if (values.get("creator") != null) {
275
				for (final String fullname : Iterables.limit(values.get("creator").listValues(), 10)) {
276

    
277
					final Person.Metadata.Builder authorMetadata = Person.Metadata.newBuilder();
278

    
279
					authorMetadata.setFullname(sf(fullname));
280

    
281
					final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(fullname, false);
282
					if (p.isAccurate()) {
283
						authorMetadata.setFirstname(sf(p.getNormalisedFirstName()));
284
						authorMetadata.clearSecondnames().addSecondnames(sf(p.getNormalisedSurname()));
285
						authorMetadata.setFullname(sf(p.getNormalisedFullname()));
286
					}
287

    
288
					result.addAuthor(Person.newBuilder().setMetadata(authorMetadata));
289
				}
290
			}
291

    
292
			addStructuredProps(metadata, mDesc.findFieldByName("subject"), values.get("subject").listValues(), "keyword", "dnet:result_subject");
293
			addStructuredProps(metadata, mDesc.findFieldByName("title"), values.get("title").listValues(), "main title", "dnet:dataCite_title");
294

    
295
			for (final String fieldname : Lists.newArrayList("description", "source")) {
296
				if (values.get(fieldname) != null) {
297
					for (final String s : values.get(fieldname).listValues()) {
298
						addField(metadata, mDesc.findFieldByName(fieldname), s);
299
					}
300
				}
301
			}
302

    
303
			addField(metadata, mDesc.findFieldByName("language"), setQualifier(getDefaultQualifier("dnet:languages"), values.get("language").listValues()));
304
			addField(metadata, mDesc.findFieldByName("dateofacceptance"), values.get("dateaccepted").listValues());
305
			addField(metadata, mDesc.findFieldByName("publisher"), values.get("publisher").listValues());
306
			addField(metadata, mDesc.findFieldByName("embargoenddate"), values.get("embargoenddate").listValues());
307
			addField(metadata, mDesc.findFieldByName("storagedate"), values.get("storagedate").listValues());
308

    
309
			addField(metadata, mDesc.findFieldByName("resulttype"), getSimpleQualifier("publication", "dnet:result_typologies"));
310

    
311
			addField(metadata, mDesc.findFieldByName("fulltext"), values.get("fulltext").listValues());
312
			addField(metadata, mDesc.findFieldByName("format"), values.get("format").listValues());
313

    
314
			// addField(metadata, Result.Metadata.getDescriptor().findFieldByName("provenanceaction"),
315
			// getSimpleQualifier("sysimport:crosswalk:repository", "dnet:provenanceActions").build());
316

    
317
			if (values.get("concept") != null) {
318
				for (final Element e : values.get("concept")) {
319
					final String id = e.getAttributes().get("id");
320
					if (StringUtils.isBlank(id)) throw new IllegalArgumentException("Context id cannot be blank");
321
					metadata.addContext(Context.newBuilder().setId(id));
322
				}
323
			}
324

    
325
			if (values.get("journal") != null) {
326
				for (final Element e : values.get("journal")) {
327

    
328
					final Journal.Builder journal = Journal.newBuilder();
329
					if (e.getText() != null) {
330
						journal.setName(e.getText());
331
					}
332

    
333
					final Map<String, String> attr = e.getAttributes();
334
					if (attr != null) {
335
						if (attr.get("issn") != null) {
336
							journal.setIssnPrinted(attr.get("issn"));
337
						}
338
						if (attr.get("eissn") != null) {
339
							journal.setIssnOnline(attr.get("eissn"));
340
						}
341
						if (attr.get("lissn") != null) {
342
							journal.setIssnLinking(attr.get("lissn"));
343
						}
344
					}
345
					metadata.setJournal(journal.build());
346
				}
347
			}
348

    
349
			final Instance.Builder instance = Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName));
350

    
351
			addField(instance, Instance.getDescriptor().findFieldByName("licence"),
352
					setQualifier(getDefaultQualifier("dnet:access_modes"), values.get("accessrights").listValues()));
353
			addField(instance, Instance.getDescriptor().findFieldByName("instancetype"),
354
					setQualifier(getDefaultQualifier("dnet:publication_resource"), values.get("cobjcategory").listValues()));
355

    
356
			if (values.get("identifier") != null) {
357
				addField(instance, Instance.getDescriptor().findFieldByName("url"),
358
						Lists.newArrayList(Iterables.filter(values.get("identifier").listValues(), urlFilter)));
359
			}
360

    
361
			result.addInstance(instance);
362

    
363
			final List<Element> extrefs = values.get("reference");
364
			if (!extrefs.isEmpty()) {
365
				final Descriptor extDesc = ExternalReference.getDescriptor();
366
				for (final Element element : extrefs) {
367
					final ExternalReference.Builder extref = ExternalReference.newBuilder();
368
					addField(extref, extDesc.findFieldByName("url"), element.getText());
369
					addField(extref, extDesc.findFieldByName("sitename"), element.getAttributes().get("source"));
370
					addField(extref, extDesc.findFieldByName("refidentifier"), element.getAttributes().get("identifier"));
371
					addField(extref, extDesc.findFieldByName("label"), element.getAttributes().get("title"));
372
					addField(extref, extDesc.findFieldByName("query"), element.getAttributes().get("query"));
373
					addField(extref, extDesc.findFieldByName("qualifier"),
374
							setQualifier(getDefaultQualifier("dnet:externalReference_typologies"), Lists.newArrayList(element.getAttributes().get("type")))
375
							.build());
376

    
377
					result.addExternalReference(extref);
378
				}
379
			}
380

    
381
			final List<StructuredProperty> pids = Lists.newArrayList();
382
			pids.addAll(parsePids(nodelist));
383
			pids.add(getStructuredProperty(originalId, "oai", getClassName("oai"), "dnet:pid_types", "dnet:pid_types"));
384

    
385
			final OafEntity.Builder entity = getEntity(Type.result, entityId, getKV(collectedFromId, collectedFromName), originalId, dateOfCollection, pids)
386
					.setResult(result.setMetadata(metadata));
387

    
388
			entity.setOaiprovenance(getOAIProvenance(about));
389

    
390
			final Oaf oaf = getOaf(entity, getDataInfo(about, provenanceAction, trust, false, false));
391
			return base64(oaf.toByteArray());
392
		} catch (final Throwable e) {
393
			System.err.println("resultId: " + resultId);
394
			System.err.println("hostedbyId: " + hostedbyId);
395
			System.err.println("hostedbyName: " + hostedbyName);
396
			System.err.println("provenanceAction: " + provenanceAction);
397
			System.err.println("trust: " + trust);
398
			System.err.println("collectedFromId: " + collectedFromId);
399
			System.err.println("collectedFromName: " + collectedFromName);
400
			System.err.println("originalId: " + originalId);
401
			System.err.println("dateOfCollection: " + dateOfCollection);
402
			e.printStackTrace();
403
			throw new RuntimeException(e);
404
		}
405
	}
406

    
407
	private static OAIProvenance getOAIProvenance(final NodeList about) {
408

    
409
		OAIProvenance.Builder oaiProv = OAIProvenance.newBuilder();
410

    
411
		if (((about != null) && (about.getLength() > 0))) {
412

    
413
			final org.w3c.dom.Element provenance = getDirectChild((org.w3c.dom.Element) about.item(0), "provenance");
414

    
415
			if (provenance != null) {
416
				final org.w3c.dom.Element origDesc = getDirectChild(provenance, "originDescription");
417
				oaiProv.setOriginDescription(buildOriginDescription(origDesc, OriginDescription.newBuilder()));
418
			}
419
		}
420

    
421
		return oaiProv.build();
422
	}
423

    
424
	private static OriginDescription buildOriginDescription(final org.w3c.dom.Element origDesc, final OriginDescription.Builder od) {
425
		od.setHarvestDate(origDesc.getAttribute("harvestDate")).setAltered(Boolean.valueOf(origDesc.getAttribute("altered")));
426

    
427
		org.w3c.dom.Element elem = getDirectChild(origDesc, "baseURL");
428
		od.setBaseURL(elem != null ? elem.getTextContent() : "");
429

    
430
		elem = getDirectChild(origDesc, "identifier");
431
		od.setIdentifier(elem != null ? elem.getTextContent() : "");
432

    
433
		elem = getDirectChild(origDesc, "datestamp");
434
		od.setDatestamp(elem != null ? elem.getTextContent() : "");
435

    
436
		elem = getDirectChild(origDesc, "metadataNamespace");
437
		od.setMetadataNamespace(elem != null ? elem.getTextContent() : "");
438

    
439
		elem = getDirectChild(origDesc, "originDescription");
440

    
441
		if (elem != null) {
442

    
443
			od.setOriginDescription(buildOriginDescription(elem, OriginDescription.newBuilder()));
444
		}
445

    
446
		return od.build();
447
	}
448

    
449
}
(6-6/9)