Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.ByteArrayOutputStream;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.util.*;
7
import java.util.concurrent.atomic.AtomicInteger;
8
import java.util.function.Function;
9
import java.util.stream.Collectors;
10
import java.util.stream.Stream;
11
import java.util.zip.Inflater;
12

    
13
import com.google.common.collect.Lists;
14
import com.google.gson.Gson;
15
import com.google.gson.JsonElement;
16
import com.google.gson.JsonObject;
17
import eu.dnetlib.actionmanager.actions.ActionFactory;
18
import eu.dnetlib.actionmanager.actions.AtomicAction;
19
import eu.dnetlib.actionmanager.common.Agent;
20
import eu.dnetlib.data.mapreduce.hbase.Reporter;
21
import eu.dnetlib.data.mapreduce.util.StreamUtils;
22
import eu.dnetlib.data.proto.*;
23
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
24
import eu.dnetlib.miscutils.collections.Pair;
25
import org.apache.commons.codec.binary.Base64;
26
import org.apache.commons.io.IOUtils;
27
import org.apache.commons.lang3.StringUtils;
28

    
29
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
30
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
31

    
32
public class DOIBoostToActions {
33

    
34
	public static final String MAG = "MAG";
35
	public static final String ORCID = "ORCID";
36
	public static final String CROSSREF = "Crossref";
37
	public static final String UNPAYWALL = "UnpayWall";
38

    
39
	public static final String GRID_AC = "grid.ac";
40
	public static final String WIKPEDIA = "wikpedia";
41

    
42
	public final static String doiBoostNSPREFIX = "doiboost____";
43
	public static final String OPENAIRE_PREFIX = "openaire____";
44

    
45
	public static final String SEPARATOR = "::";
46
	public static final String DNET_LANGUAGES = "dnet:languages";
47

    
48
	private static final List<String> DATE_TYPES = Lists.newArrayList("issued", "accepted", "published-online", "published-print");
49

    
50

    
51

    
52
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
53
		put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
54
		put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
55
		put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
56
		put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
57

    
58
	}};
59

    
60
	private static String decompressAbstract(final String abstractCompressed) {
61
		try {
62
			byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
63
			final Inflater decompresser = new Inflater();
64
			decompresser.setInput(byteArray);
65
			final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
66
			byte[] buffer = new byte[8192];
67
			while (!decompresser.finished()) {
68
				int size = decompresser.inflate(buffer);
69
				bos.write(buffer, 0, size);
70
			}
71
			byte[] unzippeddata = bos.toByteArray();
72
			decompresser.end();
73
			return new String(unzippeddata);
74
		} catch (Throwable e) {
75
			System.out.println("Wrong abstract:" + abstractCompressed);
76
			throw new RuntimeException(e);
77
		}
78
	}
79

    
80
	public static final String PID_TYPES = "dnet:pid_types";
81
	private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
82
		put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
83
				.setSchemeid(PID_TYPES).build());
84
		put(GRID_AC, getQualifier("grid", PID_TYPES));
85
		put(WIKPEDIA, getQualifier("urn", PID_TYPES));
86
	}};
87

    
88
	/**
	 * Mapping loaded from mapping_typologies.json: record type to a map of properties.
	 * The "value" and "cobj" properties are the ones read by this class.
	 */
	static Map<String, Map<String, String>> typologiesMapping;

	static {
		// try-with-resources so the classpath stream is closed once the mapping has been read
		try (InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json")) {
			final String tt = IOUtils.toString(is);
			typologiesMapping = new Gson().fromJson(tt, Map.class);
		} catch (IOException e) {
			// NOTE(review): on failure the mapping stays null and later lookups will fail; consider failing fast.
			e.printStackTrace();
		}
	}
99

    
100
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
101

    
102
		final String doi = getStringValue(rootElement, "doi");
103
		if (doi == null) {
104
			context.incrementCounter("filtered", "no_doi", 1);
105
			return false;
106
		}
107
		final String type = getStringValue(rootElement, "type");
108
		if (!typologiesMapping.containsKey(type)) {
109
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
110
			return false;
111
		}
112
		// fixes #4360 (test publisher)
113
		final String publisher = getStringValue(rootElement, "publisher");
114
		if (StringUtils.isNotBlank(publisher) && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
115
			context.incrementCounter("filtered", "test_publisher", 1);
116
			return false;
117
		}
118

    
119
		List<JsonObject> authors = getArrayObjects(rootElement, "authors");
120
		boolean hasAuthors = false;
121
		for (JsonObject author : authors) {
122
			final String given = getStringValue(author, "given");
123
			final String family = getStringValue(author, "family");
124
			String fullname = getStringValue(author, "fullname");
125
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
126
				fullname = String.format("%s %s", given, family);
127
			}
128
			// fixes #4368
129
			if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
130
				context.incrementCounter("invalid_author", "addiejackson", 1);
131
				context.incrementCounter("filtered", "invalid_authors", 1);
132
				return false;
133
			}
134
			if (isValidAuthorName(fullname, context)) hasAuthors = true;
135
		}
136

    
137
		if (!hasAuthors) {
138
			context.incrementCounter("filtered", "invalid_authors", 1);
139
			return false;
140
		}
141
		// fixes #4360
142
		if (getCleanedTitles(rootElement).isEmpty()) {
143
			context.incrementCounter("filtered", "invalid_title", 1);
144
			return false;
145
		}
146

    
147
		return true;
148
	}
149

    
150
	private static List<String> getCleanedTitles(final JsonObject rootElement) {
151
		List<String> titles = getArrayValues(rootElement, "title");
152
		return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
153
	}
154

    
155
	private static boolean isValidAuthorName(final String fullName, final Reporter context) {
156
		if (StringUtils.isBlank(fullName)) {
157
			if(context != null) context.incrementCounter("invalid_author", "blank", 1);
158
			return false;
159
		}
160
		// fixes #4391 and subtasks related to DOIBoost
161
		switch (StringUtils.lowerCase(fullName)) {
162
		case ",":
163
		case "none none":
164
		case "none, none":
165
		case "none &na;":
166
		case "(:null)":
167
		case "test test test":
168
		case "test test":
169
		case "test":
170
		case "&na; &na;": {
171
			if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
172
			return false;
173
			}
174
		}
175
		return true;
176
	}
177

    
178
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
179
			final ActionFactory factory,
180
			final String setName,
181
			final Agent agent,
182
			boolean invisible,
183
			final boolean onlyOrganization,
184
			final Reporter context) {
185

    
186
		if (!isValid(rootElement, context)) return null;
187

    
188
		//Create OAF Proto
189

    
190
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
191
		//Add Data Info
192
		oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
193
				.setInvisible(invisible)
194
				.setDeletedbyinference(false)
195
				.setInferred(false)
196
				.setTrust("0.9")
197
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
198
				.build());
199

    
200
		//Adding Kind
201
		oaf.setKind(KindProtos.Kind.entity);
202

    
203
		//creating Result Proto
204
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
205

    
206
		entity.setDateofcollection("2019-02-15");
207

    
208
		if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
209
			StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
210
					.map(JsonElement::getAsString)
211
					.forEach(cf -> {
212
								final String id = datasources.get(cf.toLowerCase()).getValue();
213
								final String name = datasources.get(cf.toLowerCase()).getKey();
214
								if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
215
									final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
216
											.setValue(name)
217
											.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
218
											.build();
219
									entity.addCollectedfrom(collectedFrom);
220
								}
221
							}
222
					);
223
		}
224
		//Adding identifier
225
		final String doi = getStringValue(rootElement, "doi");
226
		entity.addOriginalId(doi);
227

    
228
		final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
229
		entity.setId(sourceId);
230

    
231
		entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
232
				.setValue(doi)
233
				.setQualifier(getQualifier("doi", PID_TYPES))
234
				.build());
235

    
236
		//Create Result Field
237
		ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
238

    
239
		final String type = getStringValue(rootElement, "type");
240

    
241
		//Adding Instances
242
		final String typeValue = typologiesMapping.get(type).get("value");
243
		final String cobjValue = typologiesMapping.get(type).get("cobj");
244

    
245
		// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
246
		List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
247
			String provS = getStringValue(prov, "provenance");
248
			if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
249
			else return false;
250
		}).collect(Collectors.toList());
251

    
252
		Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
253
		{
254
			ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
255
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
256
					.setClassid(cobjValue)
257
					.setClassname(typeValue)
258
					.setSchemeid("dnet:publication_resource")
259
					.setSchemename("dnet:publication_resource")
260
					.build());
261
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
262
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
263
					.setValue("Unknown Repository")
264
					.build());
265

    
266
			String acc_class_id = it.get("access-rights").getAsString();
267
			String acc_class_value;
268
			switch (acc_class_id) {
269
			case "OPEN": {
270
				acc_class_value = "Open Access";
271
				break;
272
			}
273
			case "CLOSED":
274
			case "RESTRICTED": {
275
				//acc_class_value = "Closed Access";
276
				//4362#note-3
277
				acc_class_id = "RESTRICTED";
278
				acc_class_value = "Restricted";
279
				break;
280
			}
281
			case "EMBARGO":
282
				acc_class_value = "Embargo";
283
				break;
284
			default: {
285
				acc_class_value = "not available";
286
				acc_class_id = "UNKNOWN";
287
			}
288

    
289
			}
290

    
291
			instance.addUrl(it.get("url").getAsString());
292
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
293
					.setClassid(acc_class_id)
294
					.setClassname(acc_class_value)
295
					.setSchemeid("dnet:access_modes")
296
					.setSchemename("dnet:access_modes")
297
					.build());
298

    
299
			final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
300
			final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
301
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
302
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
303
						.setValue(name)
304
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
305
						.build();
306

    
307
				instance.setCollectedfrom(collectedFrom);
308
			}
309

    
310
			return instance.build();
311
		}).forEach(result::addInstance);
312

    
313
		//Adding DOI URL as  Instance
314
		final String doiURL = getStringValue(rootElement, "doi-url");
315
		JsonObject hostedByOpenAire = null;
316
		if (rootElement.has("hostedByOpenAire")) {
317
			hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
318
		}
319
		final String publisher = getStringValue(rootElement, "publisher");
320
		if (StringUtils.isNotBlank(doiURL)) {
321
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
322
			instance.addUrl(doiURL);
323
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
324
					.setClassid(cobjValue)
325
					.setClassname(typeValue)
326
					.setSchemeid("dnet:publication_resource")
327
					.setSchemename("dnet:publication_resource")
328
					.build());
329

    
330
			//#4362: if the publisher is Scielo, then the result is OPEN
331

    
332
			String accessModeId = "RESTRICTED";
333
			String accessModeName = "Restricted";
334
			if(publisher != null && publisher.equalsIgnoreCase("FapUNIFESP (SciELO)")){
335
				accessModeId = "OPEN";
336
				accessModeName = "Open Access";
337
			}
338
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
339
					.setClassid(accessModeId)
340
					.setClassname(accessModeName)
341
					.setSchemeid("dnet:access_modes")
342
					.setSchemename("dnet:access_modes")
343
					.build());
344
			instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
345
					.setValue(CROSSREF)
346
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
347
					.build());
348

    
349
			if (hostedByOpenAire == null)
350
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
351
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
352
						.setValue("Unknown Repository")
353
						.build());
354
			else {
355
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
356
						.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
357
						.setValue(hostedByOpenAire.get("name").getAsString())
358
						.build());
359
			}
360

    
361
			result.addInstance(instance);
362
		}
363

    
364
		//Create Metadata Proto
365
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
366

    
367
		Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
368

    
369
		if (authorsOrganizations.getKey().size() > 0) {
370
			metadata.addAllAuthor(authorsOrganizations.getKey());
371
		} else {
372
			//Should never enter here becasue of the isValid method at the beginning.
373
			context.incrementCounter("filtered", "unexpected_no_authors", 1);
374
			return null;
375
		}
376
		//adding Language
377
		metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
378
				.setClassid("und")
379
				.setClassname("Undetermined")
380
				.setSchemeid(DNET_LANGUAGES)
381
				.setSchemename(DNET_LANGUAGES)
382
				.build());
383

    
384
		//Adding subjects
385
		List<String> subjects = getArrayValues(rootElement, "subject");
386

    
387
		subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
388
				.setValue(s)
389
				.setQualifier(getQualifier("keyword", "dnet:subject"))
390
				.build()));
391

    
392
		List<String> titles = getCleanedTitles(rootElement);
393
		titles.forEach(t ->
394
				metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
395
						.setValue(t)
396
						.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
397
						.build()));
398

    
399

    
400
		final String firstValidDate = getFirstValidDate(rootElement);
401
		if (StringUtils.isNotBlank(firstValidDate)) {
402
			setDate(metadata, "issued", firstValidDate, true);
403
		} else {
404
			context.incrementCounter("filtered", "missing_date", 1);
405
			return null;
406
		}
407
		settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
408
		settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
409
		settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
410

    
411
		getArrayObjects(rootElement, "abstract").forEach(d ->
412
				{
413
					if (MAG.equals(d.get("provenance").getAsString()) && d.get("value")!= null && !d.get("value").isJsonNull())
414
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
415
					else if (d.get("value")!= null && !d.get("value").isJsonNull())
416
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
417
				}
418
		);
419

    
420
		//Adding Journal
421

    
422
		if (StringUtils.isNotBlank(publisher)) {
423

    
424
			final FieldTypeProtos.Journal.Builder journal = FieldTypeProtos.Journal.newBuilder().setName(publisher);
425

    
426
			if (hasJSONArrayField(rootElement, "issn")) {
427
				StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
428
						.map(JsonElement::getAsJsonObject)
429
						.forEach(it -> {
430
							final String issntype = getStringValue(it, "type");
431
							final String value = getStringValue(it, "value");
432
							if ("electronic".equals(issntype)) {
433
								journal.setIssnOnline(value);
434
							}
435
							if ("print".equals(issntype))
436
								journal.setIssnPrinted(value);
437
						});
438
			}
439
			metadata.setJournal(journal.build());
440
		}
441
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
442
		result.setMetadata(metadata.build());
443
		entity.setResult(result.build());
444
		oaf.setEntity(entity.build());
445

    
446
		//System.out.println(JsonFormat.printToString(oaf.build()));
447

    
448
		final List<AtomicAction> actionList = new ArrayList<>();
449

    
450
		if (!onlyOrganization)
451
			actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
452

    
453
		if (!authorsOrganizations.getValue().isEmpty()) {
454

    
455
			authorsOrganizations.getValue().forEach(o ->
456
			{
457

    
458
				actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
459
				if (!onlyOrganization)
460
					actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
461
				final String gridOrganization = getSimilarGridOrganization(o.getEntity());
462
				if (gridOrganization != null) {
463
					actionList.add(factory
464
							.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
465
									"".getBytes()));
466
					actionList.add(factory
467
							.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
468
									"".getBytes()));
469
				}
470
			});
471
		}
472
		return actionList;
473

    
474
	}
475

    
476
	private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
477

    
478
		final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
479
		if (pidList != null) {
480
			for (FieldTypeProtos.StructuredProperty p : pidList) {
481
				if (p.getQualifier().getClassname().equals("grid")) {
482
					return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
483
				}
484
			}
485
		}
486
		return null;
487

    
488
	}
489

    
490
	private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
491
			final OafProtos.Oaf organization,
492
			final ActionFactory factory,
493
			final String setName,
494
			final Agent agent) {
495

    
496
		List<AtomicAction> result = new ArrayList<>();
497

    
498
		final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
499
		roaf.setKind(KindProtos.Kind.relation);
500

    
501
		roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
502
				.setInvisible(false)
503
				.setDeletedbyinference(false)
504
				.setInferred(false)
505
				.setTrust("0.9")
506
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
507
				.build());
508

    
509
		final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
510

    
511
		rel.setRelType(RelTypeProtos.RelType.resultOrganization);
512
		rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
513

    
514
		//Create a relation Result --> Organization
515
		rel.setSource(publication.getEntity().getId());
516
		rel.setTarget(organization.getEntity().getId());
517
		rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
518

    
519
		final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
520

    
521
		final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
522
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
523
				.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
524
				.build());
525
		rel_instance.setAffiliation(affiliationRel.build());
526
		rel.setResultOrganization(rel_instance.build());
527

    
528
		rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
529
				.setValue(datasources.get(MAG.toLowerCase()).getKey())
530
				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
531
						.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
532
				.build());
533

    
534
		rel.setChild(false);
535
		roaf.setRel(rel.build());
536

    
537
		result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
538
				organization.getEntity().getId(), roaf.build().toByteArray()));
539

    
540
		//Create a relation Organization --> Result
541
		rel.setTarget(publication.getEntity().getId());
542
		rel.setSource(organization.getEntity().getId());
543
		rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
544

    
545
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
546
				.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
547
				.build());
548
		rel_instance.setAffiliation(affiliationRel.build());
549
		rel.setResultOrganization(rel_instance.build());
550
		roaf.setRel(rel.build());
551
		result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
552
				publication.getEntity().getId(), roaf.build().toByteArray()));
553

    
554
		return result;
555

    
556
	}
557

    
558
	private static boolean hasJSONArrayField(final JsonObject root, final String key) {
559
		return root.has(key) && root.get(key).isJsonArray();
560
	}
561

    
562
	private static String getFirstValidDate(final JsonObject root) {
563
		return DATE_TYPES.stream()
564
			.map(type -> getStringValue(root, type))
565
			.filter(Objects::nonNull)
566
			.filter(DumpToActionsUtility::isValidDate)
567
			.findFirst()
568
			.orElseGet(null);
569
	}
570

    
571
	private static void setDate(ResultProtos.Result.Metadata.Builder metadata,
572
											final String dictionaryKey,
573
											final String date,
574
											final boolean addToDateOfAcceptance) {
575
		if (date == null)
576
			return;
577
		if (addToDateOfAcceptance) {
578
			metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
579
		} else {
580
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
581
					.setValue(date)
582
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
583
					.build());
584
		}
585
	}
586

    
587
	private static void settingRelevantDate(JsonObject rootElement,
588
			ResultProtos.Result.Metadata.Builder metadata,
589
			final String jsonKey,
590
			final String dictionaryKey,
591
			final boolean addToDateOfAcceptance) {
592
		//Adding date
593
		String date = getStringValue(rootElement, jsonKey);
594
		if (date == null)
595
			return;
596
		if (date.length() == 4) {
597
			date += "-01-01";
598
		}
599
		if (isValidDate(date)) {
600
			if (addToDateOfAcceptance)
601
				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
602
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
603
					.setValue(date)
604
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
605
					.build());
606
		}
607
	}
608

    
609
	public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
610
		FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
611
		if (StringUtils.contains(value, "orcid.org")) {
612
			return pid.setValue(value.replaceAll("https://orcid.org/", "").replaceAll("http://orcid.org/",""))
613
					.setKey(ORCID).build();
614
		}
615
		if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
616
			return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
617
					.setKey("MAG Identifier").build();
618
		}
619
		return pid.setValue(value)
620
				.setKey("URL").build();
621
	}
622

    
623
	public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
624
		final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
625
		final List<String> magId = new ArrayList<>();
626
		getArrayObjects(affiliation, "identifiers").forEach(it -> {
627
			if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
628
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
629
				magId.add(it.get("value").getAsString());
630
			} else
631
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
632
		});
633
		if (magId.size() > 0) {
634
			final String microsoftID = magId.get(0);
635
			OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
636
			oaf.setKind(KindProtos.Kind.entity);
637
			OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
638
			entity.setType(TypeProtos.Type.organization);
639
			entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
640
			final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
641
			final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
642
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
643
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
644
						.setValue(name)
645
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
646
						.build();
647
				entity.addCollectedfrom(collectedFrom);
648
			} else {
649
				return null;
650
			}
651
			entity.addOriginalId(microsoftID);
652

    
653
			affiliationIdentifiers.forEach((key, value) -> entity.addPid(
654
					FieldTypeProtos.StructuredProperty.newBuilder()
655
							.setQualifier(value)
656
							.setValue(key)
657
							.build()));
658

    
659
			final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
660
			organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
661
					.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
662
					.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
663
					.build());
664

    
665
			entity.setOrganization(organization);
666
			oaf.setEntity(entity);
667
			oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
668
					.setInvisible(false)
669
					.setDeletedbyinference(false)
670
					.setInferred(false)
671
					.setTrust("0.9")
672
					.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
673
					.build());
674
			return oaf.build();
675
		}
676
		return null;
677
	}
678

    
679
	public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
680

    
681
		final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
682

    
683
		List<JsonObject> authors = getArrayObjects(root, "authors");
684

    
685
		final AtomicInteger counter = new AtomicInteger(1);
686

    
687
		List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
688
			final String given = getStringValue(author, "given");
689
			final String family = getStringValue(author, "family");
690
			String fullname = getStringValue(author, "fullname");
691

    
692
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
693
				fullname = String.format("%s %s", given, family);
694
			}
695

    
696
			if (!isValidAuthorName(fullname, null)) {
697
				return null;
698
			}
699
			final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
700

    
701
			if (StringUtils.isNotBlank(given))
702
				abuilder.setName(given);
703
			if (StringUtils.isNotBlank(family))
704
				abuilder.setSurname(family);
705
			if (StringUtils.isNotBlank(fullname))
706
				abuilder.setFullname(fullname);
707

    
708
			final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
709
			final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
710

    
711
			authorAffiliation.forEach(it ->
712
			{
713
				OafProtos.Oaf org = createOrganizationFromJSON(it);
714
				if (org != null) {
715
					affiliations.put(org.getEntity().getId(), org);
716
					abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
717
				}
718
			});
719
			identifiers.stream().map(id -> {
720
				final String value = id.get("value").getAsString();
721
				return extractIdentifier(value);
722
			}).collect(
723
					Collectors.toMap(
724
							FieldTypeProtos.KeyValue::getKey,
725
							Function.identity(),
726
							(a, b) -> a
727
					)).values().forEach(abuilder::addPid);
728
			abuilder.setRank(counter.getAndIncrement());
729

    
730
			return abuilder.build();
731

    
732
		}).filter(Objects::nonNull).collect(Collectors.toList());
733

    
734
		return new Pair<>(collect, affiliations.values());
735
	}
736

    
737
}
(4-4/18)