Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.ByteArrayOutputStream;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.util.*;
7
import java.util.concurrent.atomic.AtomicInteger;
8
import java.util.function.Function;
9
import java.util.stream.Collectors;
10
import java.util.stream.Stream;
11
import java.util.zip.Inflater;
12

    
13
import com.google.common.collect.Lists;
14
import com.google.gson.Gson;
15
import com.google.gson.JsonElement;
16
import com.google.gson.JsonObject;
17
import eu.dnetlib.actionmanager.actions.ActionFactory;
18
import eu.dnetlib.actionmanager.actions.AtomicAction;
19
import eu.dnetlib.actionmanager.common.Agent;
20
import eu.dnetlib.data.mapreduce.hbase.Reporter;
21
import eu.dnetlib.data.mapreduce.util.StreamUtils;
22
import eu.dnetlib.data.proto.*;
23
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
24
import eu.dnetlib.miscutils.collections.Pair;
25
import org.apache.commons.codec.binary.Base64;
26
import org.apache.commons.io.IOUtils;
27
import org.apache.commons.lang3.StringUtils;
28

    
29
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
30
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
31

    
32
public class DOIBoostToActions {
33

    
34
	// Provenance names as they appear in the DOIBoost dump.
	public static final String MAG = "MAG";
	public static final String ORCID = "ORCID";
	public static final String CROSSREF = "Crossref";
	public static final String UNPAYWALL = "UnpayWall";

	// Keys of the affiliation identifier schemas handled by affiliationPIDType.
	public static final String GRID_AC = "grid.ac";
	// NOTE(review): the value reads "wikpedia"; presumably it mirrors the spelling used in the input data - confirm before changing.
	public static final String WIKPEDIA = "wikpedia";

	// Namespace prefixes used when composing identifiers.
	public final static String doiBoostNSPREFIX = "doiboost____";
	public static final String OPENAIRE_PREFIX = "openaire____";

	public static final String SEPARATOR = "::";
	public static final String DNET_LANGUAGES = "dnet:languages";

	// JSON date fields inspected, in this order, by getFirstValidDate.
	private static final List<String> DATE_TYPES = Lists.newArrayList("issued", "accepted", "published-online", "published-print");

	// Lower-cased provenance name -> (display name, namespaced datasource id).
	private static Map<String, Pair<String, String>> datasources = new HashMap<>();

	static {
		final String idPrefix = OPENAIRE_PREFIX + SEPARATOR;
		datasources.put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", idPrefix + "microsoft"));
		datasources.put(ORCID.toLowerCase(), new Pair<>(ORCID, idPrefix + "orcid"));
		datasources.put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, idPrefix + "crossref"));
		datasources.put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, idPrefix + "unpaywall"));
	}
59

    
60
	private static String decompressAbstract(final String abstractCompressed) {
61
		try {
62
			byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
63
			final Inflater decompresser = new Inflater();
64
			decompresser.setInput(byteArray);
65
			final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
66
			byte[] buffer = new byte[8192];
67
			while (!decompresser.finished()) {
68
				int size = decompresser.inflate(buffer);
69
				bos.write(buffer, 0, size);
70
			}
71
			byte[] unzippeddata = bos.toByteArray();
72
			decompresser.end();
73
			return new String(unzippeddata);
74
		} catch (Throwable e) {
75
			System.out.println("Wrong abstract:" + abstractCompressed);
76
			throw new RuntimeException(e);
77
		}
78
	}
79

    
80
	public static final String PID_TYPES = "dnet:pid_types";

	// Identifier schema of an affiliation -> qualifier assigned to the corresponding organization pid.
	private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<>();

	static {
		final FieldTypeProtos.Qualifier magQualifier = FieldTypeProtos.Qualifier.newBuilder()
				.setClassid("mag_id")
				.setClassname("Microsoft Academic Graph Identifier")
				.setSchemename(PID_TYPES)
				.setSchemeid(PID_TYPES)
				.build();
		affiliationPIDType.put(MAG, magQualifier);
		affiliationPIDType.put(GRID_AC, getQualifier("grid", PID_TYPES));
		affiliationPIDType.put(WIKPEDIA, getQualifier("urn", PID_TYPES));
	}
87

    
88
	static Map<String, Map<String, String>> typologiesMapping;
89

    
90
	static {
91
		try {
92
			final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
93
			final String tt = IOUtils.toString(is);
94
			typologiesMapping = new Gson().fromJson(tt, Map.class);
95
		} catch (IOException e) {
96
			e.printStackTrace();
97
		}
98
	}
99

    
100
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
101

    
102
		final String doi = getStringValue(rootElement, "doi");
103
		if (doi == null) {
104
			context.incrementCounter("filtered", "no_doi", 1);
105
			return false;
106
		}
107
		final String type = getStringValue(rootElement, "type");
108
		if (!typologiesMapping.containsKey(type)) {
109
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
110
			return false;
111
		}
112
		// fixes #4360 (test publisher)
113
		final String publisher = getStringValue(rootElement, "publisher");
114
		if (StringUtils.isNotBlank(publisher) && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
115
			context.incrementCounter("filtered", "test_publisher", 1);
116
			return false;
117
		}
118

    
119
		List<JsonObject> authors = getArrayObjects(rootElement, "authors");
120
		boolean hasAuthors = false;
121
		for (JsonObject author : authors) {
122
			final String given = getStringValue(author, "given");
123
			final String family = getStringValue(author, "family");
124
			String fullname = getStringValue(author, "fullname");
125
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
126
				fullname = String.format("%s %s", given, family);
127
			}
128
			// fixes #4368
129
			if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
130
				context.incrementCounter("invalid_author", "addiejackson", 1);
131
				context.incrementCounter("filtered", "invalid_authors", 1);
132
				return false;
133
			}
134
			if (isValidAuthorName(fullname, context)) hasAuthors = true;
135
		}
136

    
137
		if (!hasAuthors) {
138
			context.incrementCounter("filtered", "invalid_authors", 1);
139
			return false;
140
		}
141
		// fixes #4360
142
		if (getCleanedTitles(rootElement).isEmpty()) {
143
			context.incrementCounter("filtered", "invalid_title", 1);
144
			return false;
145
		}
146

    
147
		return true;
148
	}
149

    
150
	private static List<String> getCleanedTitles(final JsonObject rootElement) {
151
		List<String> titles = getArrayValues(rootElement, "title");
152
		return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
153
	}
154

    
155
	private static boolean isValidAuthorName(final String fullName, final Reporter context) {
156
		if (StringUtils.isBlank(fullName)) {
157
			if(context != null) context.incrementCounter("invalid_author", "blank", 1);
158
			return false;
159
		}
160
		// fixes #4391 and subtasks related to DOIBoost
161
		switch (StringUtils.lowerCase(fullName)) {
162
		case ",":
163
		case "none none":
164
		case "none, none":
165
		case "none &na;":
166
		case "(:null)":
167
		case "test test test":
168
		case "test test":
169
		case "test":
170
		case "&na; &na;": {
171
			if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
172
			return false;
173
			}
174
		}
175
		return true;
176
	}
177

    
178
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
179
			final ActionFactory factory,
180
			final String setName,
181
			final Agent agent,
182
			boolean invisible,
183
			final boolean onlyOrganization,
184
			final Reporter context) {
185

    
186
		if (!isValid(rootElement, context)) return null;
187

    
188
		//Create OAF Proto
189

    
190
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
191
		//Add Data Info
192
		oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
193
				.setInvisible(invisible)
194
				.setDeletedbyinference(false)
195
				.setInferred(false)
196
				.setTrust("0.9")
197
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
198
				.build());
199

    
200
		//Adding Kind
201
		oaf.setKind(KindProtos.Kind.entity);
202

    
203
		//creating Result Proto
204
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
205

    
206
		entity.setDateofcollection("2019-02-15");
207

    
208
		if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
209
			StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
210
					.map(JsonElement::getAsString)
211
					.forEach(cf -> {
212
								final String id = datasources.get(cf.toLowerCase()).getValue();
213
								final String name = datasources.get(cf.toLowerCase()).getKey();
214
								if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
215
									final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
216
											.setValue(name)
217
											.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
218
											.build();
219
									entity.addCollectedfrom(collectedFrom);
220
								}
221
							}
222
					);
223
		}
224
		//Adding identifier
225
		final String doi = getStringValue(rootElement, "doi");
226
		entity.addOriginalId(doi);
227

    
228
		final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
229
		entity.setId(sourceId);
230

    
231
		entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
232
				.setValue(doi)
233
				.setQualifier(getQualifier("doi", PID_TYPES))
234
				.build());
235

    
236
		//Create Result Field
237
		ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
238

    
239
		final String type = getStringValue(rootElement, "type");
240

    
241
		//Adding Instances
242
		final String typeValue = typologiesMapping.get(type).get("value");
243
		final String cobjValue = typologiesMapping.get(type).get("cobj");
244

    
245
		// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
246
		List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
247
			String provS = getStringValue(prov, "provenance");
248
			if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
249
			else return false;
250
		}).collect(Collectors.toList());
251

    
252
		Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
253
		{
254
			ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
255
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
256
					.setClassid(cobjValue)
257
					.setClassname(typeValue)
258
					.setSchemeid("dnet:publication_resource")
259
					.setSchemename("dnet:publication_resource")
260
					.build());
261
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
262
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
263
					.setValue("Unknown Repository")
264
					.build());
265

    
266
			String acc_class_id = it.get("access-rights").getAsString();
267
			String acc_class_value;
268
			switch (acc_class_id) {
269
			case "OPEN": {
270
				acc_class_value = "Open Access";
271
				break;
272
			}
273
			case "CLOSED":
274
			case "RESTRICTED": {
275
				//acc_class_value = "Closed Access";
276
				//4362#note-3
277
				acc_class_id = "RESTRICTED";
278
				acc_class_value = "Restricted";
279
				break;
280
			}
281
			case "EMBARGO":
282
				acc_class_value = "Embargo";
283
				break;
284
			default: {
285
				acc_class_value = "not available";
286
				acc_class_id = "UNKNOWN";
287
			}
288

    
289
			}
290

    
291
			instance.addUrl(it.get("url").getAsString());
292
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
293
					.setClassid(acc_class_id)
294
					.setClassname(acc_class_value)
295
					.setSchemeid("dnet:access_modes")
296
					.setSchemename("dnet:access_modes")
297
					.build());
298

    
299
			final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
300
			final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
301
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
302
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
303
						.setValue(name)
304
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
305
						.build();
306

    
307
				instance.setCollectedfrom(collectedFrom);
308
			}
309

    
310
			return instance.build();
311
		}).forEach(result::addInstance);
312

    
313
		//Adding DOI URL as  Instance
314
		final String doiURL = getStringValue(rootElement, "doi-url");
315
		JsonObject hostedByOpenAire = null;
316
		if (rootElement.has("hostedByOpenAire")) {
317
			hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
318
		}
319
		final String publisher = getStringValue(rootElement, "publisher");
320
		if (StringUtils.isNotBlank(doiURL)) {
321
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
322
			instance.addUrl(doiURL);
323
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
324
					.setClassid(cobjValue)
325
					.setClassname(typeValue)
326
					.setSchemeid("dnet:publication_resource")
327
					.setSchemename("dnet:publication_resource")
328
					.build());
329

    
330
			//#4362: if the publisher is Scielo, then the result is OPEN
331

    
332
			String accessModeId = "RESTRICTED";
333
			String accessModeName = "Restricted";
334
			if(publisher != null && publisher.equalsIgnoreCase("FapUNIFESP (SciELO)")){
335
				accessModeId = "OPEN";
336
				accessModeName = "Open Access";
337
			}
338
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
339
					.setClassid(accessModeId)
340
					.setClassname(accessModeName)
341
					.setSchemeid("dnet:access_modes")
342
					.setSchemename("dnet:access_modes")
343
					.build());
344
			instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
345
					.setValue(CROSSREF)
346
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
347
					.build());
348

    
349
			if (hostedByOpenAire == null)
350
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
351
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
352
						.setValue("Unknown Repository")
353
						.build());
354
			else {
355
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
356
						.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
357
						.setValue(hostedByOpenAire.get("name").getAsString())
358
						.build());
359
			}
360

    
361
			result.addInstance(instance);
362
		}
363

    
364
		//Create Metadata Proto
365
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
366

    
367
		Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
368

    
369
		if (authorsOrganizations.getKey().size() > 0) {
370
			metadata.addAllAuthor(authorsOrganizations.getKey());
371
		} else {
372
			//Should never enter here becasue of the isValid method at the beginning.
373
			context.incrementCounter("filtered", "unexpected_no_authors", 1);
374
			return null;
375
		}
376
		//adding Language
377
		metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
378
				.setClassid("und")
379
				.setClassname("Undetermined")
380
				.setSchemeid(DNET_LANGUAGES)
381
				.setSchemename(DNET_LANGUAGES)
382
				.build());
383

    
384
		//Adding subjects
385
		List<String> subjects = getArrayValues(rootElement, "subject");
386

    
387
		subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
388
				.setValue(s)
389
				.setQualifier(getQualifier("keyword", "dnet:subject"))
390
				.build()));
391

    
392
		List<String> titles = getCleanedTitles(rootElement);
393
		titles.forEach(t ->
394
				metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
395
						.setValue(t)
396
						.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
397
						.build()));
398

    
399

    
400
		final String firstValidDate = getFirstValidDate(rootElement);
401
		if (StringUtils.isNotBlank(firstValidDate)) {
402
			setDate(metadata, "issued", firstValidDate, true);
403
		} else {
404
			context.incrementCounter("filtered", "missing_date", 1);
405
			return null;
406
		}
407
		settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
408
		settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
409
		settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
410

    
411
		getArrayObjects(rootElement, "abstract").forEach(d ->
412
				{
413
					if (MAG.equals(d.get("provenance").getAsString()) && d.get("value")!= null && !d.get("value").isJsonNull())
414
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
415
					else if (d.get("value")!= null && !d.get("value").isJsonNull())
416
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
417
				}
418
		);
419

    
420
		//Adding Journal and publisher
421
		//TODO: name of the journal is not the publisher: this needs to be fixed on DOIBoost side
422

    
423
		if (StringUtils.isNotBlank(publisher)) {
424
			metadata.setPublisher(FieldTypeProtos.StringField.newBuilder().setValue(publisher).build());
425
			final FieldTypeProtos.Journal.Builder journal = FieldTypeProtos.Journal.newBuilder().setName(publisher);
426

    
427
			if (hasJSONArrayField(rootElement, "issn")) {
428
				StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
429
						.map(JsonElement::getAsJsonObject)
430
						.forEach(it -> {
431
							final String issntype = getStringValue(it, "type");
432
							final String value = getStringValue(it, "value");
433
							if ("electronic".equals(issntype)) {
434
								journal.setIssnOnline(value);
435
							}
436
							if ("print".equals(issntype))
437
								journal.setIssnPrinted(value);
438
						});
439
			}
440
			metadata.setJournal(journal.build());
441
		}
442
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
443
		result.setMetadata(metadata.build());
444
		entity.setResult(result.build());
445
		oaf.setEntity(entity.build());
446

    
447
		//System.out.println(JsonFormat.printToString(oaf.build()));
448

    
449
		final List<AtomicAction> actionList = new ArrayList<>();
450

    
451
		if (!onlyOrganization)
452
			actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
453

    
454
		if (!authorsOrganizations.getValue().isEmpty()) {
455

    
456
			authorsOrganizations.getValue().forEach(o ->
457
			{
458

    
459
				actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
460
				if (!onlyOrganization)
461
					actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
462
				final String gridOrganization = getSimilarGridOrganization(o.getEntity());
463
				if (gridOrganization != null) {
464
					actionList.add(factory
465
							.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
466
									"".getBytes()));
467
					actionList.add(factory
468
							.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
469
									"".getBytes()));
470
				}
471
			});
472
		}
473
		return actionList;
474

    
475
	}
476

    
477
	private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
478

    
479
		final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
480
		if (pidList != null) {
481
			for (FieldTypeProtos.StructuredProperty p : pidList) {
482
				if (p.getQualifier().getClassname().equals("grid")) {
483
					return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
484
				}
485
			}
486
		}
487
		return null;
488

    
489
	}
490

    
491
	private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
492
			final OafProtos.Oaf organization,
493
			final ActionFactory factory,
494
			final String setName,
495
			final Agent agent) {
496

    
497
		List<AtomicAction> result = new ArrayList<>();
498

    
499
		final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
500
		roaf.setKind(KindProtos.Kind.relation);
501

    
502
		roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
503
				.setInvisible(false)
504
				.setDeletedbyinference(false)
505
				.setInferred(false)
506
				.setTrust("0.9")
507
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
508
				.build());
509

    
510
		final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
511

    
512
		rel.setRelType(RelTypeProtos.RelType.resultOrganization);
513
		rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
514

    
515
		//Create a relation Result --> Organization
516
		rel.setSource(publication.getEntity().getId());
517
		rel.setTarget(organization.getEntity().getId());
518
		rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
519

    
520
		final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
521

    
522
		final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
523
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
524
				.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
525
				.build());
526
		rel_instance.setAffiliation(affiliationRel.build());
527
		rel.setResultOrganization(rel_instance.build());
528

    
529
		rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
530
				.setValue(datasources.get(MAG.toLowerCase()).getKey())
531
				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
532
						.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
533
				.build());
534

    
535
		rel.setChild(false);
536
		roaf.setRel(rel.build());
537

    
538
		result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
539
				organization.getEntity().getId(), roaf.build().toByteArray()));
540

    
541
		//Create a relation Organization --> Result
542
		rel.setTarget(publication.getEntity().getId());
543
		rel.setSource(organization.getEntity().getId());
544
		rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
545

    
546
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
547
				.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
548
				.build());
549
		rel_instance.setAffiliation(affiliationRel.build());
550
		rel.setResultOrganization(rel_instance.build());
551
		roaf.setRel(rel.build());
552
		result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
553
				publication.getEntity().getId(), roaf.build().toByteArray()));
554

    
555
		return result;
556

    
557
	}
558

    
559
	private static boolean hasJSONArrayField(final JsonObject root, final String key) {
560
		return root.has(key) && root.get(key).isJsonArray();
561
	}
562

    
563
	private static String getFirstValidDate(final JsonObject root) {
564
		return DATE_TYPES.stream()
565
			.map(type -> getStringValue(root, type))
566
			.filter(Objects::nonNull)
567
			.filter(DumpToActionsUtility::isValidDate)
568
			.findFirst()
569
			.orElse("");
570
	}
571

    
572
	private static void setDate(ResultProtos.Result.Metadata.Builder metadata,
573
											final String dictionaryKey,
574
											final String date,
575
											final boolean addToDateOfAcceptance) {
576
		if (date == null)
577
			return;
578
		if (addToDateOfAcceptance) {
579
			metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
580
		} else {
581
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
582
					.setValue(date)
583
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
584
					.build());
585
		}
586
	}
587

    
588
	private static void settingRelevantDate(JsonObject rootElement,
589
			ResultProtos.Result.Metadata.Builder metadata,
590
			final String jsonKey,
591
			final String dictionaryKey,
592
			final boolean addToDateOfAcceptance) {
593
		//Adding date
594
		String date = getStringValue(rootElement, jsonKey);
595
		if (date == null)
596
			return;
597
		if (date.length() == 4) {
598
			date += "-01-01";
599
		}
600
		if (isValidDate(date)) {
601
			if (addToDateOfAcceptance)
602
				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
603
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
604
					.setValue(date)
605
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
606
					.build());
607
		}
608
	}
609

    
610
	public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
611
		FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
612
		if (StringUtils.contains(value, "orcid.org")) {
613
			return pid.setValue(value.replaceAll("https://orcid.org/", "").replaceAll("http://orcid.org/",""))
614
					.setKey(ORCID).build();
615
		}
616
		if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
617
			return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
618
					.setKey("MAG Identifier").build();
619
		}
620
		return pid.setValue(value)
621
				.setKey("URL").build();
622
	}
623

    
624
	public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
625
		final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
626
		final List<String> magId = new ArrayList<>();
627
		getArrayObjects(affiliation, "identifiers").forEach(it -> {
628
			if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
629
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
630
				magId.add(it.get("value").getAsString());
631
			} else
632
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
633
		});
634
		if (magId.size() > 0) {
635
			final String microsoftID = magId.get(0);
636
			OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
637
			oaf.setKind(KindProtos.Kind.entity);
638
			OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
639
			entity.setType(TypeProtos.Type.organization);
640
			entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
641
			final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
642
			final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
643
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
644
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
645
						.setValue(name)
646
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
647
						.build();
648
				entity.addCollectedfrom(collectedFrom);
649
			} else {
650
				return null;
651
			}
652
			entity.addOriginalId(microsoftID);
653

    
654
			affiliationIdentifiers.forEach((key, value) -> entity.addPid(
655
					FieldTypeProtos.StructuredProperty.newBuilder()
656
							.setQualifier(value)
657
							.setValue(key)
658
							.build()));
659

    
660
			final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
661
			organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
662
					.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
663
					.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
664
					.build());
665

    
666
			entity.setOrganization(organization);
667
			oaf.setEntity(entity);
668
			oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
669
					.setInvisible(false)
670
					.setDeletedbyinference(false)
671
					.setInferred(false)
672
					.setTrust("0.9")
673
					.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
674
					.build());
675
			return oaf.build();
676
		}
677
		return null;
678
	}
679

    
680
	public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
681

    
682
		final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
683

    
684
		List<JsonObject> authors = getArrayObjects(root, "authors");
685

    
686
		final AtomicInteger counter = new AtomicInteger(1);
687

    
688
		List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
689
			final String given = getStringValue(author, "given");
690
			final String family = getStringValue(author, "family");
691
			String fullname = getStringValue(author, "fullname");
692

    
693
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
694
				fullname = String.format("%s %s", given, family);
695
			}
696

    
697
			if (!isValidAuthorName(fullname, null)) {
698
				return null;
699
			}
700
			final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
701

    
702
			if (StringUtils.isNotBlank(given))
703
				abuilder.setName(given);
704
			if (StringUtils.isNotBlank(family))
705
				abuilder.setSurname(family);
706
			if (StringUtils.isNotBlank(fullname))
707
				abuilder.setFullname(fullname);
708

    
709
			final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
710
			final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
711

    
712
			authorAffiliation.forEach(it ->
713
			{
714
				OafProtos.Oaf org = createOrganizationFromJSON(it);
715
				if (org != null) {
716
					affiliations.put(org.getEntity().getId(), org);
717
					abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
718
				}
719
			});
720
			identifiers.stream().map(id -> {
721
				final String value = id.get("value").getAsString();
722
				return extractIdentifier(value);
723
			}).collect(
724
					Collectors.toMap(
725
							FieldTypeProtos.KeyValue::getKey,
726
							Function.identity(),
727
							(a, b) -> a
728
					)).values().forEach(abuilder::addPid);
729
			abuilder.setRank(counter.getAndIncrement());
730

    
731
			return abuilder.build();
732

    
733
		}).filter(Objects::nonNull).collect(Collectors.toList());
734

    
735
		return new Pair<>(collect, affiliations.values());
736
	}
737

    
738
}
(4-4/18)