Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.ByteArrayOutputStream;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.util.*;
7
import java.util.concurrent.atomic.AtomicInteger;
8
import java.util.function.Function;
9
import java.util.stream.Collectors;
10
import java.util.stream.Stream;
11
import java.util.zip.Inflater;
12

    
13
import com.google.gson.Gson;
14
import com.google.gson.JsonElement;
15
import com.google.gson.JsonObject;
16
import eu.dnetlib.actionmanager.actions.ActionFactory;
17
import eu.dnetlib.actionmanager.actions.AtomicAction;
18
import eu.dnetlib.actionmanager.common.Agent;
19
import eu.dnetlib.data.mapreduce.hbase.Reporter;
20
import eu.dnetlib.data.mapreduce.util.StreamUtils;
21
import eu.dnetlib.data.proto.*;
22
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
23
import eu.dnetlib.miscutils.collections.Pair;
24
import org.apache.commons.codec.binary.Base64;
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27

    
28
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
29
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
30

    
31
public class DOIBoostToActions {
32

    
33
	/** Provenance label for Microsoft Academic Graph; also the key of the MAG entry in the affiliation PID type map. */
	public static final String MAG = "MAG";
	/** Provenance label for ORCID; also used as the key of author ORCID identifiers. */
	public static final String ORCID = "ORCID";
	/** Provenance label (and display name) for Crossref. */
	public static final String CROSSREF = "Crossref";
	/** Provenance label (and display name) for UnpayWall. */
	public static final String UNPAYWALL = "UnpayWall";

	/** Affiliation identifier schema names; the literal values are compared against the dump content. */
	public static final String GRID_AC = "grid.ac";
	public static final String WIKPEDIA = "wikpedia";

	/** Namespace prefix of the result identifiers generated by this class. */
	public static final String doiBoostNSPREFIX = "doiboost____";
	/** Namespace prefix used when building datasource identifiers. */
	public static final String OPENAIRE_PREFIX = "openaire____";

	/** Separator between a namespace prefix and the local part of an identifier. */
	public static final String SEPARATOR = "::";
	public static final String DNET_LANGUAGES = "dnet:languages";

	/**
	 * Known datasources, keyed by lower-cased provenance label.
	 * The pair key is the datasource display name, the pair value its original identifier.
	 * A plain HashMap filled by a static block replaces the former double-brace initialization,
	 * which created an anonymous HashMap subclass.
	 */
	private static final Map<String, Pair<String, String>> datasources = new HashMap<>();

	static {
		datasources.put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
		datasources.put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
		datasources.put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
		datasources.put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
	}
54

    
55
	private static String decompressAbstract(final String abstractCompressed) {
56
		try {
57
			byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
58
			final Inflater decompresser = new Inflater();
59
			decompresser.setInput(byteArray);
60
			final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
61
			byte[] buffer = new byte[8192];
62
			while (!decompresser.finished()) {
63
				int size = decompresser.inflate(buffer);
64
				bos.write(buffer, 0, size);
65
			}
66
			byte[] unzippeddata = bos.toByteArray();
67
			decompresser.end();
68
			return new String(unzippeddata);
69
		} catch (Throwable e) {
70
			System.out.println("Wrong abstract:" + abstractCompressed);
71
			throw new RuntimeException(e);
72
		}
73
	}
74

    
75
	/** Scheme identifier (and name) of the persistent identifier type qualifiers. */
	public static final String PID_TYPES = "dnet:pid_types";

	/**
	 * Qualifiers assigned to organization identifiers, keyed by identifier schema.
	 * A plain HashMap filled by a static block replaces the former double-brace initialization,
	 * which created an anonymous HashMap subclass.
	 */
	private static final Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<>();

	static {
		affiliationPIDType.put(MAG, FieldTypeProtos.Qualifier.newBuilder()
				.setClassid("mag_id")
				.setClassname("Microsoft Academic Graph Identifier")
				.setSchemename(PID_TYPES)
				.setSchemeid(PID_TYPES)
				.build());
		affiliationPIDType.put(GRID_AC, getQualifier("grid", PID_TYPES));
		affiliationPIDType.put(WIKPEDIA, getQualifier("urn", PID_TYPES));
	}
82

    
83
	static Map<String, Map<String, String>> typologiesMapping;
84

    
85
	static {
86
		try {
87
			final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
88
			final String tt = IOUtils.toString(is);
89
			typologiesMapping = new Gson().fromJson(tt, Map.class);
90
		} catch (IOException e) {
91
			e.printStackTrace();
92
		}
93
	}
94

    
95
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
96

    
97
		final String doi = getStringValue(rootElement, "doi");
98
		if (doi == null) {
99
			context.incrementCounter("filtered", "no_doi", 1);
100
			return false;
101
		}
102
		final String type = getStringValue(rootElement, "type");
103
		if (!typologiesMapping.containsKey(type)) {
104
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
105
			return false;
106
		}
107
		// fixes #4360 (test publisher)
108
		final String publisher = getStringValue(rootElement, "publisher");
109
		if (StringUtils.isNotBlank(publisher) && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
110
			context.incrementCounter("filtered", "test_publisher", 1);
111
			return false;
112
		}
113

    
114
		List<JsonObject> authors = getArrayObjects(rootElement, "authors");
115
		boolean hasAuthors = false;
116
		for (JsonObject author : authors) {
117
			final String given = getStringValue(author, "given");
118
			final String family = getStringValue(author, "family");
119
			String fullname = getStringValue(author, "fullname");
120
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
121
				fullname = String.format("%s %s", given, family);
122
			}
123
			// fixes #4368
124
			if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
125
				context.incrementCounter("invalid_author", "addiejackson", 1);
126
				context.incrementCounter("filtered", "invalid_authors", 1);
127
				return false;
128
			}
129
			if (isValidAuthorName(fullname, context)) hasAuthors = true;
130
		}
131

    
132
		if (!hasAuthors) {
133
			context.incrementCounter("filtered", "invalid_authors", 1);
134
			return false;
135
		}
136
		// fixes #4360
137
		if (getCleanedTitles(rootElement).isEmpty()) {
138
			context.incrementCounter("filtered", "invalid_title", 1);
139
			return false;
140
		}
141

    
142
		return true;
143
	}
144

    
145
	private static List<String> getCleanedTitles(final JsonObject rootElement) {
146
		List<String> titles = getArrayValues(rootElement, "title");
147
		return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
148
	}
149

    
150
	private static boolean isValidAuthorName(final String fullName, final Reporter context) {
151
		if (StringUtils.isBlank(fullName)) {
152
			if(context != null) context.incrementCounter("invalid_author", "blank", 1);
153
			return false;
154
		}
155
		// fixes #4391 and subtasks related to DOIBoost
156
		switch (StringUtils.lowerCase(fullName)) {
157
		case ",":
158
		case "none none":
159
		case "none, none":
160
		case "none &na;":
161
		case "(:null)":
162
		case "test test test":
163
		case "test test":
164
		case "test":
165
		case "&na; &na;": {
166
			if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
167
			return false;
168
			}
169
		}
170
		return true;
171
	}
172

    
173
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
174
			final ActionFactory factory,
175
			final String setName,
176
			final Agent agent,
177
			boolean invisible,
178
			final boolean onlyOrganization,
179
			final Reporter context) {
180

    
181
		if (!isValid(rootElement, context)) return null;
182

    
183
		//Create OAF Proto
184

    
185
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
186
		//Add Data Info
187
		oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
188
				.setInvisible(invisible)
189
				.setDeletedbyinference(false)
190
				.setInferred(false)
191
				.setTrust("0.9")
192
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
193
				.build());
194

    
195
		//Adding Kind
196
		oaf.setKind(KindProtos.Kind.entity);
197

    
198
		//creating Result Proto
199
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
200

    
201
		entity.setDateofcollection("2019-02-15");
202

    
203
		if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
204
			StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
205
					.map(JsonElement::getAsString)
206
					.forEach(cf -> {
207
								final String id = datasources.get(cf.toLowerCase()).getValue();
208
								final String name = datasources.get(cf.toLowerCase()).getKey();
209
								if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
210
									final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
211
											.setValue(name)
212
											.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
213
											.build();
214
									entity.addCollectedfrom(collectedFrom);
215
								}
216
							}
217
					);
218
		}
219
		//Adding identifier
220
		final String doi = getStringValue(rootElement, "doi");
221
		entity.addOriginalId(doi);
222

    
223
		final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
224
		entity.setId(sourceId);
225

    
226
		entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
227
				.setValue(doi)
228
				.setQualifier(getQualifier("doi", PID_TYPES))
229
				.build());
230

    
231
		//Create Result Field
232
		ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
233

    
234
		final String type = getStringValue(rootElement, "type");
235

    
236
		//Adding Instances
237
		final String typeValue = typologiesMapping.get(type).get("value");
238
		final String cobjValue = typologiesMapping.get(type).get("cobj");
239

    
240
		// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
241
		List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
242
			String provS = getStringValue(prov, "provenance");
243
			if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
244
			else return false;
245
		}).collect(Collectors.toList());
246

    
247
		Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
248
		{
249
			ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
250
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
251
					.setClassid(cobjValue)
252
					.setClassname(typeValue)
253
					.setSchemeid("dnet:publication_resource")
254
					.setSchemename("dnet:publication_resource")
255
					.build());
256
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
257
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
258
					.setValue("Unknown Repository")
259
					.build());
260

    
261
			String acc_class_id = it.get("access-rights").getAsString();
262
			String acc_class_value;
263
			switch (acc_class_id) {
264
			case "OPEN": {
265
				acc_class_value = "Open Access";
266
				break;
267
			}
268
			case "CLOSED":
269
			case "RESTRICTED": {
270
				//acc_class_value = "Closed Access";
271
				//4362#note-3
272
				acc_class_id = "RESTRICTED";
273
				acc_class_value = "Restricted";
274
				break;
275
			}
276
			case "EMBARGO":
277
				acc_class_value = "Embargo";
278
				break;
279
			default: {
280
				acc_class_value = "not available";
281
				acc_class_id = "UNKNOWN";
282
			}
283

    
284
			}
285

    
286
			instance.addUrl(it.get("url").getAsString());
287
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
288
					.setClassid(acc_class_id)
289
					.setClassname(acc_class_value)
290
					.setSchemeid("dnet:access_modes")
291
					.setSchemename("dnet:access_modes")
292
					.build());
293

    
294
			final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
295
			final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
296
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
297
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
298
						.setValue(name)
299
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
300
						.build();
301

    
302
				instance.setCollectedfrom(collectedFrom);
303
			}
304

    
305
			return instance.build();
306
		}).forEach(result::addInstance);
307

    
308
		//Adding DOI URL as  Instance
309
		final String doiURL = getStringValue(rootElement, "doi-url");
310
		JsonObject hostedByOpenAire = null;
311
		if (rootElement.has("hostedByOpenAire")) {
312
			hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
313
		}
314
		final String publisher = getStringValue(rootElement, "publisher");
315
		if (StringUtils.isNotBlank(doiURL)) {
316
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
317
			instance.addUrl(doiURL);
318
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
319
					.setClassid(cobjValue)
320
					.setClassname(typeValue)
321
					.setSchemeid("dnet:publication_resource")
322
					.setSchemename("dnet:publication_resource")
323
					.build());
324

    
325
			//#4362: if the publisher is Scielo, then the result is OPEN
326

    
327
			String accessModeId = "RESTRICTED";
328
			String accessModeName = "Restricted";
329
			if(publisher.equalsIgnoreCase("FapUNIFESP (SciELO)")){
330
				accessModeId = "OPEN";
331
				accessModeName = "Open Access";
332
			}
333
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
334
					.setClassid(accessModeId)
335
					.setClassname(accessModeName)
336
					.setSchemeid("dnet:access_modes")
337
					.setSchemename("dnet:access_modes")
338
					.build());
339
			instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
340
					.setValue(CROSSREF)
341
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
342
					.build());
343

    
344
			if (hostedByOpenAire == null)
345
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
346
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
347
						.setValue("Unknown Repository")
348
						.build());
349
			else {
350
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
351
						.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
352
						.setValue(hostedByOpenAire.get("name").getAsString())
353
						.build());
354
			}
355

    
356
			result.addInstance(instance);
357
		}
358

    
359
		//Create Metadata Proto
360
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
361

    
362
		Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
363

    
364
		if (authorsOrganizations.getKey().size() > 0) {
365
			metadata.addAllAuthor(authorsOrganizations.getKey());
366
		} else {
367
			//Should never enter here becasue of the isValid method at the beginning.
368
			context.incrementCounter("filtered", "unexpected_no_authors", 1);
369
			return null;
370
		}
371
		//adding Language
372
		metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
373
				.setClassid("und")
374
				.setClassname("Undetermined")
375
				.setSchemeid(DNET_LANGUAGES)
376
				.setSchemename(DNET_LANGUAGES)
377
				.build());
378

    
379
		//Adding subjects
380
		List<String> subjects = getArrayValues(rootElement, "subject");
381

    
382
		subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
383
				.setValue(s)
384
				.setQualifier(getQualifier("keyword", "dnet:subject"))
385
				.build()));
386

    
387
		List<String> titles = getCleanedTitles(rootElement);
388
		titles.forEach(t ->
389
				metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
390
						.setValue(t)
391
						.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
392
						.build()));
393

    
394
		settingRelevantDate(rootElement, metadata, "issued", "issued", true);
395
		settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
396
		settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
397
		settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
398

    
399
		getArrayObjects(rootElement, "abstract").forEach(d ->
400
				{
401
					if (MAG.equals(d.get("provenance").getAsString()))
402
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
403
					else
404
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
405
				}
406
		);
407

    
408
		//Adding Journal
409

    
410
		if (StringUtils.isNotBlank(publisher)) {
411

    
412
			final FieldTypeProtos.Journal.Builder journal = FieldTypeProtos.Journal.newBuilder().setName(publisher);
413

    
414
			if (hasJSONArrayField(rootElement, "issn")) {
415
				StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
416
						.map(JsonElement::getAsJsonObject)
417
						.forEach(it -> {
418
							final String issntype = getStringValue(it, "type");
419
							final String value = getStringValue(it, "value");
420
							if ("electronic".equals(issntype)) {
421
								journal.setIssnOnline(value);
422
							}
423
							if ("print".equals(issntype))
424
								journal.setIssnPrinted(value);
425
						});
426
			}
427
			metadata.setJournal(journal.build());
428
		}
429
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
430
		result.setMetadata(metadata.build());
431
		entity.setResult(result.build());
432
		oaf.setEntity(entity.build());
433

    
434
		//System.out.println(JsonFormat.printToString(oaf.build()));
435

    
436
		final List<AtomicAction> actionList = new ArrayList<>();
437

    
438
		if (!onlyOrganization)
439
			actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
440

    
441
		if (!authorsOrganizations.getValue().isEmpty()) {
442

    
443
			authorsOrganizations.getValue().forEach(o ->
444
			{
445

    
446
				actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
447
				if (!onlyOrganization)
448
					actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
449
				final String gridOrganization = getSimilarGridOrganization(o.getEntity());
450
				if (gridOrganization != null) {
451
					actionList.add(factory
452
							.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
453
									"".getBytes()));
454
					actionList.add(factory
455
							.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
456
									"".getBytes()));
457
				}
458
			});
459
		}
460
		return actionList;
461

    
462
	}
463

    
464
	private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
465

    
466
		final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
467
		if (pidList != null) {
468
			for (FieldTypeProtos.StructuredProperty p : pidList) {
469
				if (p.getQualifier().getClassname().equals("grid")) {
470
					return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
471
				}
472
			}
473
		}
474
		return null;
475

    
476
	}
477

    
478
	private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
479
			final OafProtos.Oaf organization,
480
			final ActionFactory factory,
481
			final String setName,
482
			final Agent agent) {
483

    
484
		List<AtomicAction> result = new ArrayList<>();
485

    
486
		final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
487
		roaf.setKind(KindProtos.Kind.relation);
488

    
489
		roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
490
				.setInvisible(false)
491
				.setDeletedbyinference(false)
492
				.setInferred(false)
493
				.setTrust("0.9")
494
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
495
				.build());
496

    
497
		final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
498

    
499
		rel.setRelType(RelTypeProtos.RelType.resultOrganization);
500
		rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
501

    
502
		//Create a relation Result --> Organization
503
		rel.setSource(publication.getEntity().getId());
504
		rel.setTarget(organization.getEntity().getId());
505
		rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
506

    
507
		final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
508

    
509
		final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
510
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
511
				.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
512
				.build());
513
		rel_instance.setAffiliation(affiliationRel.build());
514
		rel.setResultOrganization(rel_instance.build());
515

    
516
		rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
517
				.setValue(datasources.get(MAG.toLowerCase()).getKey())
518
				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
519
						.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
520
				.build());
521

    
522
		rel.setChild(false);
523
		roaf.setRel(rel.build());
524

    
525
		result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
526
				organization.getEntity().getId(), roaf.build().toByteArray()));
527

    
528
		//Create a relation Organization --> Result
529
		rel.setTarget(publication.getEntity().getId());
530
		rel.setSource(organization.getEntity().getId());
531
		rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
532

    
533
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
534
				.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
535
				.build());
536
		rel_instance.setAffiliation(affiliationRel.build());
537
		rel.setResultOrganization(rel_instance.build());
538
		roaf.setRel(rel.build());
539
		result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
540
				publication.getEntity().getId(), roaf.build().toByteArray()));
541

    
542
		return result;
543

    
544
	}
545

    
546
	private static boolean hasJSONArrayField(final JsonObject root, final String key) {
547
		return root.has(key) && root.get(key).isJsonArray();
548
	}
549

    
550
	private static void settingRelevantDate(JsonObject rootElement,
551
			ResultProtos.Result.Metadata.Builder metadata,
552
			final String jsonKey,
553
			final String dictionaryKey,
554
			final boolean addToDateOfAcceptance) {
555
		//Adding date
556
		String date = getStringValue(rootElement, jsonKey);
557
		if (date == null)
558
			return;
559
		if (date.length() == 4) {
560
			date += "-01-01";
561
		}
562
		if (isValidDate(date)) {
563
			if (addToDateOfAcceptance)
564
				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
565
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
566
					.setValue(date)
567
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
568
					.build());
569
		}
570
	}
571

    
572
	public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
573
		FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
574
		if (StringUtils.contains(value, "orcid.org")) {
575
			return pid.setValue(value.replaceAll("https://orcid.org/", "").replaceAll("http://orcid.org/",""))
576
					.setKey(ORCID).build();
577
		}
578
		if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
579
			return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
580
					.setKey("MAG Identifier").build();
581
		}
582
		return pid.setValue(value)
583
				.setKey("URL").build();
584
	}
585

    
586
	public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
587
		final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
588
		final List<String> magId = new ArrayList<>();
589
		getArrayObjects(affiliation, "identifiers").forEach(it -> {
590
			if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
591
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
592
				magId.add(it.get("value").getAsString());
593
			} else
594
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
595
		});
596
		if (magId.size() > 0) {
597
			final String microsoftID = magId.get(0);
598
			OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
599
			oaf.setKind(KindProtos.Kind.entity);
600
			OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
601
			entity.setType(TypeProtos.Type.organization);
602
			entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
603
			final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
604
			final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
605
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
606
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
607
						.setValue(name)
608
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
609
						.build();
610
				entity.addCollectedfrom(collectedFrom);
611
			} else {
612
				return null;
613
			}
614
			entity.addOriginalId(microsoftID);
615

    
616
			affiliationIdentifiers.forEach((key, value) -> entity.addPid(
617
					FieldTypeProtos.StructuredProperty.newBuilder()
618
							.setQualifier(value)
619
							.setValue(key)
620
							.build()));
621

    
622
			final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
623
			organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
624
					.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
625
					.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
626
					.build());
627

    
628
			entity.setOrganization(organization);
629
			oaf.setEntity(entity);
630
			oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
631
					.setInvisible(false)
632
					.setDeletedbyinference(false)
633
					.setInferred(false)
634
					.setTrust("0.9")
635
					.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
636
					.build());
637
			return oaf.build();
638
		}
639
		return null;
640
	}
641

    
642
	public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
643

    
644
		final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
645

    
646
		List<JsonObject> authors = getArrayObjects(root, "authors");
647

    
648
		final AtomicInteger counter = new AtomicInteger(1);
649

    
650
		List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
651
			final String given = getStringValue(author, "given");
652
			final String family = getStringValue(author, "family");
653
			String fullname = getStringValue(author, "fullname");
654

    
655
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
656
				fullname = String.format("%s %s", given, family);
657
			}
658

    
659
			if (!isValidAuthorName(fullname, null)) {
660
				return null;
661
			}
662
			final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
663

    
664
			if (StringUtils.isNotBlank(given))
665
				abuilder.setName(given);
666
			if (StringUtils.isNotBlank(family))
667
				abuilder.setSurname(family);
668
			if (StringUtils.isNotBlank(fullname))
669
				abuilder.setFullname(fullname);
670

    
671
			final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
672
			final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
673

    
674
			authorAffiliation.forEach(it ->
675
			{
676
				OafProtos.Oaf org = createOrganizationFromJSON(it);
677
				if (org != null) {
678
					affiliations.put(org.getEntity().getId(), org);
679
					abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
680
				}
681
			});
682
			identifiers.stream().map(id -> {
683
				final String value = id.get("value").getAsString();
684
				return extractIdentifier(value);
685
			}).collect(
686
					Collectors.toMap(
687
							FieldTypeProtos.KeyValue::getKey,
688
							Function.identity(),
689
							(a, b) -> a
690
					)).values().forEach(abuilder::addPid);
691
			abuilder.setRank(counter.getAndIncrement());
692

    
693
			return abuilder.build();
694

    
695
		}).filter(Objects::nonNull).collect(Collectors.toList());
696

    
697
		return new Pair<>(collect, affiliations.values());
698
	}
699

    
700
}
(4-4/18)