Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
12

    
13
import com.google.gson.Gson;
14
import com.google.gson.JsonElement;
15
import com.google.gson.JsonObject;
16
import eu.dnetlib.actionmanager.actions.ActionFactory;
17
import eu.dnetlib.actionmanager.actions.AtomicAction;
18
import eu.dnetlib.actionmanager.common.Agent;
19
import eu.dnetlib.data.mapreduce.hbase.Reporter;
20
import eu.dnetlib.data.mapreduce.util.StreamUtils;
21
import eu.dnetlib.data.proto.*;
22
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
23
import eu.dnetlib.miscutils.collections.Pair;
24
import org.apache.commons.codec.binary.Base64;
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27

    
28
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
29
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
30

    
31
public class DOIBoostToActions {
32

    
33
	public static final String MAG = "MAG";
34
	public static final String ORCID = "ORCID";
35
	public static final String CROSSREF = "Crossref";
36
	public static final String UNPAYWALL = "UnpayWall";
37

    
38
	public static final String GRID_AC = "grid.ac";
39
	public static final String WIKPEDIA = "wikpedia";
40

    
41
	public final static String doiBoostNSPREFIX = "doiboost____";
42
	public static final String OPENAIRE_PREFIX = "openaire____";
43

    
44
	public static final String SEPARATOR = "::";
45
	public static final String DNET_LANGUAGES = "dnet:languages";
46

    
47
	private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
48
		put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
49
		put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
50
		put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
51
		put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
52

    
53
	}};
54

    
55
	private static String decompressAbstract(final String abstractCompressed) {
56
		try {
57
			byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
58
			final Inflater decompresser = new Inflater();
59
			decompresser.setInput(byteArray);
60
			final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
61
			byte[] buffer = new byte[8192];
62
			while (!decompresser.finished()) {
63
				int size = decompresser.inflate(buffer);
64
				bos.write(buffer, 0, size);
65
			}
66
			byte[] unzippeddata = bos.toByteArray();
67
			decompresser.end();
68
			return new String(unzippeddata);
69
		} catch (Throwable e) {
70
			System.out.println("Wrong abstract:" + abstractCompressed);
71
			throw new RuntimeException(e);
72
		}
73
	}
74

    
75
	public static final String PID_TYPES = "dnet:pid_types";
76
	private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
77
		put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
78
				.setSchemeid(PID_TYPES).build());
79
		put(GRID_AC, getQualifier("grid", PID_TYPES));
80
		put(WIKPEDIA, getQualifier("urn", PID_TYPES));
81
	}};
82

    
83
	static Map<String, Map<String, String>> typologiesMapping;
84

    
85
	static {
86
		try {
87
			final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
88
			final String tt = IOUtils.toString(is);
89
			typologiesMapping = new Gson().fromJson(tt, Map.class);
90
		} catch (IOException e) {
91
			e.printStackTrace();
92
		}
93
	}
94

    
95
	protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
96

    
97
		final String doi = getStringValue(rootElement, "doi");
98
		if (doi == null) {
99
			context.incrementCounter("filtered", "no_doi", 1);
100
			return false;
101
		}
102
		final String type = getStringValue(rootElement, "type");
103
		if (!typologiesMapping.containsKey(type)) {
104
			context.incrementCounter("filtered", "unknowntype_" + type, 1);
105
			return false;
106
		}
107
		// fixes #4360 (test publisher)
108
		final String publisher = getStringValue(rootElement, "publisher");
109
		if (StringUtils.isNotBlank(publisher) && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
110
			context.incrementCounter("filtered", "test_publisher", 1);
111
			return false;
112
		}
113

    
114
		List<JsonObject> authors = getArrayObjects(rootElement, "authors");
115
		boolean hasAuthors = false;
116
		for (JsonObject author : authors) {
117
			final String given = getStringValue(author, "given");
118
			final String family = getStringValue(author, "family");
119
			String fullname = getStringValue(author, "fullname");
120
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
121
				fullname = String.format("%s %s", given, family);
122
			}
123
			// fixes #4368
124
			if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
125
				context.incrementCounter("invalid_author", "addiejackson", 1);
126
				context.incrementCounter("filtered", "invalid_authors", 1);
127
				return false;
128
			}
129
			if (isValidAuthorName(fullname, context)) hasAuthors = true;
130
		}
131

    
132
		if (!hasAuthors) {
133
			context.incrementCounter("filtered", "invalid_authors", 1);
134
			return false;
135
		}
136
		// fixes #4360
137
		if (getCleanedTitles(rootElement).isEmpty()) {
138
			context.incrementCounter("filtered", "invalid_title", 1);
139
			return false;
140
		}
141

    
142
		return true;
143
	}
144

    
145
	private static List<String> getCleanedTitles(final JsonObject rootElement) {
146
		List<String> titles = getArrayValues(rootElement, "title");
147
		return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
148
	}
149

    
150
	private static boolean isValidAuthorName(final String fullName, final Reporter context) {
151
		if (StringUtils.isBlank(fullName)) {
152
			if(context != null) context.incrementCounter("invalid_author", "blank", 1);
153
			return false;
154
		}
155
		// fixes #4391 and subtasks related to DOIBoost
156
		switch (StringUtils.lowerCase(fullName)) {
157
		case ",":
158
		case "none none":
159
		case "none, none":
160
		case "none &na;":
161
		case "(:null)":
162
		case "test test test":
163
		case "test test":
164
		case "test":
165
		case "&na; &na;": {
166
			if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
167
			return false;
168
			}
169
		}
170
		return true;
171
	}
172

    
173
	public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
174
			final ActionFactory factory,
175
			final String setName,
176
			final Agent agent,
177
			boolean invisible,
178
			final boolean onlyOrganization,
179
			final Reporter context) {
180

    
181
		if (!isValid(rootElement, context)) return null;
182

    
183
		//Create OAF Proto
184

    
185
		final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
186
		//Add Data Info
187
		oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
188
				.setInvisible(invisible)
189
				.setDeletedbyinference(false)
190
				.setInferred(false)
191
				.setTrust("0.9")
192
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
193
				.build());
194

    
195
		//Adding Kind
196
		oaf.setKind(KindProtos.Kind.entity);
197

    
198
		//creating Result Proto
199
		final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
200

    
201
		entity.setDateofcollection("2019-02-15");
202

    
203
		if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
204
			StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
205
					.map(JsonElement::getAsString)
206
					.forEach(cf -> {
207
								final String id = datasources.get(cf.toLowerCase()).getValue();
208
								final String name = datasources.get(cf.toLowerCase()).getKey();
209
								if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
210
									final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
211
											.setValue(name)
212
											.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
213
											.build();
214
									entity.addCollectedfrom(collectedFrom);
215
								}
216
							}
217
					);
218
		}
219
		//Adding identifier
220
		final String doi = getStringValue(rootElement, "doi");
221
		entity.addOriginalId(doi);
222

    
223
		final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
224
		entity.setId(sourceId);
225

    
226
		entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
227
				.setValue(doi)
228
				.setQualifier(getQualifier("doi", PID_TYPES))
229
				.build());
230

    
231
		//Create Result Field
232
		ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
233

    
234
		final String type = getStringValue(rootElement, "type");
235

    
236
		//Adding Instances
237
		final String typeValue = typologiesMapping.get(type).get("value");
238
		final String cobjValue = typologiesMapping.get(type).get("cobj");
239

    
240
		// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
241
		List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
242
			String provS = getStringValue(prov, "provenance");
243
			if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
244
			else return false;
245
		}).collect(Collectors.toList());
246

    
247
		Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
248
		{
249
			ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
250
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
251
					.setClassid(cobjValue)
252
					.setClassname(typeValue)
253
					.setSchemeid("dnet:publication_resource")
254
					.setSchemename("dnet:publication_resource")
255
					.build());
256
			instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
257
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
258
					.setValue("Unknown Repository")
259
					.build());
260

    
261
			String acc_class_id = it.get("access-rights").getAsString();
262
			String acc_class_value;
263
			switch (acc_class_id) {
264
			case "OPEN": {
265
				acc_class_value = "Open Access";
266
				break;
267
			}
268
			case "CLOSED":
269
			case "RESTRICTED": {
270
				//acc_class_value = "Closed Access";
271
				//4362#note-3
272
				acc_class_id = "RESTRICTED";
273
				acc_class_value = "Restricted";
274
				break;
275
			}
276
			case "EMBARGO":
277
				acc_class_value = "Embargo";
278
				break;
279
			default: {
280
				acc_class_value = "not available";
281
				acc_class_id = "UNKNOWN";
282
			}
283

    
284
			}
285

    
286
			instance.addUrl(it.get("url").getAsString());
287
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
288
					.setClassid(acc_class_id)
289
					.setClassname(acc_class_value)
290
					.setSchemeid("dnet:access_modes")
291
					.setSchemename("dnet:access_modes")
292
					.build());
293

    
294
			final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
295
			final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
296
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
297
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
298
						.setValue(name)
299
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
300
						.build();
301

    
302
				instance.setCollectedfrom(collectedFrom);
303
			}
304

    
305
			return instance.build();
306
		}).forEach(result::addInstance);
307

    
308
		//Adding DOI URL as  Instance
309
		final String doiURL = getStringValue(rootElement, "doi-url");
310
		JsonObject hostedByOpenAire = null;
311
		if (rootElement.has("hostedByOpenAire")) {
312
			hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
313
		}
314

    
315
		if (StringUtils.isNotBlank(doiURL)) {
316
			final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
317
			instance.addUrl(doiURL);
318
			instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
319
					.setClassid(cobjValue)
320
					.setClassname(typeValue)
321
					.setSchemeid("dnet:publication_resource")
322
					.setSchemename("dnet:publication_resource")
323
					.build());
324
			instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
325
					.setClassid("CLOSED")
326
					.setClassname("Closed Access")
327
					.setSchemeid("dnet:access_modes")
328
					.setSchemename("dnet:access_modes")
329
					.build());
330
			instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
331
					.setValue(CROSSREF)
332
					.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
333
					.build());
334

    
335
			if (hostedByOpenAire == null)
336
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
337
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
338
						.setValue("Unknown Repository")
339
						.build());
340
			else {
341
				instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
342
						.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
343
						.setValue(hostedByOpenAire.get("name").getAsString())
344
						.build());
345
			}
346

    
347
			result.addInstance(instance);
348
		}
349

    
350
		//Create Metadata Proto
351
		final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
352

    
353
		Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
354

    
355
		if (authorsOrganizations.getKey().size() > 0) {
356
			metadata.addAllAuthor(authorsOrganizations.getKey());
357
		} else {
358
			//Should never enter here becasue of the isValid method at the beginning.
359
			context.incrementCounter("filtered", "unexpected_no_authors", 1);
360
			return null;
361
		}
362
		//adding Language
363
		metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
364
				.setClassid("und")
365
				.setClassname("Undetermined")
366
				.setSchemeid(DNET_LANGUAGES)
367
				.setSchemename(DNET_LANGUAGES)
368
				.build());
369

    
370
		//Adding subjects
371
		List<String> subjects = getArrayValues(rootElement, "subject");
372

    
373
		subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
374
				.setValue(s)
375
				.setQualifier(getQualifier("keyword", "dnet:subject"))
376
				.build()));
377

    
378
		List<String> titles = getCleanedTitles(rootElement);
379
		titles.forEach(t ->
380
				metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
381
						.setValue(t)
382
						.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
383
						.build()));
384

    
385
		settingRelevantDate(rootElement, metadata, "issued", "issued", true);
386
		settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
387
		settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
388
		settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
389

    
390
		getArrayObjects(rootElement, "abstract").forEach(d ->
391
				{
392
					if (MAG.equals(d.get("provenance").getAsString()))
393
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
394
					else
395
						metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
396
				}
397
		);
398

    
399
		//Adding Journal
400
		final String publisher = getStringValue(rootElement, "publisher");
401
		if (StringUtils.isNotBlank(publisher)) {
402

    
403
			final FieldTypeProtos.Journal.Builder journal = FieldTypeProtos.Journal.newBuilder().setName(publisher);
404

    
405
			if (hasJSONArrayField(rootElement, "issn")) {
406
				StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
407
						.map(JsonElement::getAsJsonObject)
408
						.forEach(it -> {
409
							final String issntype = getStringValue(it, "type");
410
							final String value = getStringValue(it, "value");
411
							if ("electronic".equals(issntype)) {
412
								journal.setIssnOnline(value);
413
							}
414
							if ("print".equals(issntype))
415
								journal.setIssnPrinted(value);
416
						});
417
			}
418
			metadata.setJournal(journal.build());
419
		}
420
		metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
421
		result.setMetadata(metadata.build());
422
		entity.setResult(result.build());
423
		oaf.setEntity(entity.build());
424

    
425
		//System.out.println(JsonFormat.printToString(oaf.build()));
426

    
427
		final List<AtomicAction> actionList = new ArrayList<>();
428

    
429
		if (!onlyOrganization)
430
			actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
431

    
432
		if (!authorsOrganizations.getValue().isEmpty()) {
433

    
434
			authorsOrganizations.getValue().forEach(o ->
435
			{
436

    
437
				actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
438
				if (!onlyOrganization)
439
					actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
440
				final String gridOrganization = getSimilarGridOrganization(o.getEntity());
441
				if (gridOrganization != null) {
442
					actionList.add(factory
443
							.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
444
									"".getBytes()));
445
					actionList.add(factory
446
							.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
447
									"".getBytes()));
448
				}
449
			});
450
		}
451
		return actionList;
452

    
453
	}
454

    
455
	private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
456

    
457
		final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
458
		if (pidList != null) {
459
			for (FieldTypeProtos.StructuredProperty p : pidList) {
460
				if (p.getQualifier().getClassname().equals("grid")) {
461
					return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
462
				}
463
			}
464
		}
465
		return null;
466

    
467
	}
468

    
469
	private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
470
			final OafProtos.Oaf organization,
471
			final ActionFactory factory,
472
			final String setName,
473
			final Agent agent) {
474

    
475
		List<AtomicAction> result = new ArrayList<>();
476

    
477
		final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
478
		roaf.setKind(KindProtos.Kind.relation);
479

    
480
		roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
481
				.setInvisible(false)
482
				.setDeletedbyinference(false)
483
				.setInferred(false)
484
				.setTrust("0.9")
485
				.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
486
				.build());
487

    
488
		final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
489

    
490
		rel.setRelType(RelTypeProtos.RelType.resultOrganization);
491
		rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
492

    
493
		//Create a relation Result --> Organization
494
		rel.setSource(publication.getEntity().getId());
495
		rel.setTarget(organization.getEntity().getId());
496
		rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
497

    
498
		final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
499

    
500
		final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
501
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
502
				.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
503
				.build());
504
		rel_instance.setAffiliation(affiliationRel.build());
505
		rel.setResultOrganization(rel_instance.build());
506

    
507
		rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
508
				.setValue(datasources.get(MAG.toLowerCase()).getKey())
509
				.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
510
						.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
511
				.build());
512

    
513
		rel.setChild(false);
514
		roaf.setRel(rel.build());
515

    
516
		result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
517
				organization.getEntity().getId(), roaf.build().toByteArray()));
518

    
519
		//Create a relation Organization --> Result
520
		rel.setTarget(publication.getEntity().getId());
521
		rel.setSource(organization.getEntity().getId());
522
		rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
523

    
524
		affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
525
				.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
526
				.build());
527
		rel_instance.setAffiliation(affiliationRel.build());
528
		rel.setResultOrganization(rel_instance.build());
529
		roaf.setRel(rel.build());
530
		result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
531
				publication.getEntity().getId(), roaf.build().toByteArray()));
532

    
533
		return result;
534

    
535
	}
536

    
537
	private static boolean hasJSONArrayField(final JsonObject root, final String key) {
538
		return root.has(key) && root.get(key).isJsonArray();
539
	}
540

    
541
	private static void settingRelevantDate(JsonObject rootElement,
542
			ResultProtos.Result.Metadata.Builder metadata,
543
			final String jsonKey,
544
			final String dictionaryKey,
545
			final boolean addToDateOfAcceptance) {
546
		//Adding date
547
		String date = getStringValue(rootElement, jsonKey);
548
		if (date == null)
549
			return;
550
		if (date.length() == 4) {
551
			date += "-01-01";
552
		}
553
		if (isValidDate(date)) {
554
			if (addToDateOfAcceptance)
555
				metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
556
			metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
557
					.setValue(date)
558
					.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
559
					.build());
560
		}
561
	}
562

    
563
	public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
564
		FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
565
		if (StringUtils.contains(value, "orcid.org")) {
566
			return pid.setValue(value.replaceAll("https://orcid.org/", "").replaceAll("http://orcid.org/",""))
567
					.setKey(ORCID).build();
568
		}
569
		if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
570
			return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
571
					.setKey("MAG Identifier").build();
572
		}
573
		return pid.setValue(value)
574
				.setKey("URL").build();
575
	}
576

    
577
	public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
578
		final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
579
		final List<String> magId = new ArrayList<>();
580
		getArrayObjects(affiliation, "identifiers").forEach(it -> {
581
			if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
582
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
583
				magId.add(it.get("value").getAsString());
584
			} else
585
				affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
586
		});
587
		if (magId.size() > 0) {
588
			final String microsoftID = magId.get(0);
589
			OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
590
			oaf.setKind(KindProtos.Kind.entity);
591
			OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
592
			entity.setType(TypeProtos.Type.organization);
593
			entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
594
			final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
595
			final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
596
			if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
597
				final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
598
						.setValue(name)
599
						.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
600
						.build();
601
				entity.addCollectedfrom(collectedFrom);
602
			} else {
603
				return null;
604
			}
605
			entity.addOriginalId(microsoftID);
606

    
607
			affiliationIdentifiers.forEach((key, value) -> entity.addPid(
608
					FieldTypeProtos.StructuredProperty.newBuilder()
609
							.setQualifier(value)
610
							.setValue(key)
611
							.build()));
612

    
613
			final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
614
			organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
615
					.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
616
					.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
617
					.build());
618

    
619
			entity.setOrganization(organization);
620
			oaf.setEntity(entity);
621
			oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
622
					.setInvisible(false)
623
					.setDeletedbyinference(false)
624
					.setInferred(false)
625
					.setTrust("0.9")
626
					.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
627
					.build());
628
			return oaf.build();
629
		}
630
		return null;
631
	}
632

    
633
	public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
634

    
635
		final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
636

    
637
		List<JsonObject> authors = getArrayObjects(root, "authors");
638

    
639
		final AtomicInteger counter = new AtomicInteger(1);
640

    
641
		List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
642
			final String given = getStringValue(author, "given");
643
			final String family = getStringValue(author, "family");
644
			String fullname = getStringValue(author, "fullname");
645

    
646
			if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
647
				fullname = String.format("%s %s", given, family);
648
			}
649

    
650
			if (!isValidAuthorName(fullname, null)) {
651
				return null;
652
			}
653
			final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
654

    
655
			if (StringUtils.isNotBlank(given))
656
				abuilder.setName(given);
657
			if (StringUtils.isNotBlank(family))
658
				abuilder.setSurname(family);
659
			if (StringUtils.isNotBlank(fullname))
660
				abuilder.setFullname(fullname);
661

    
662
			final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
663
			final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
664

    
665
			authorAffiliation.forEach(it ->
666
			{
667
				OafProtos.Oaf org = createOrganizationFromJSON(it);
668
				if (org != null) {
669
					affiliations.put(org.getEntity().getId(), org);
670
					abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
671
				}
672
			});
673
			identifiers.stream().map(id -> {
674
				final String value = id.get("value").getAsString();
675
				return extractIdentifier(value);
676
			}).collect(
677
					Collectors.toMap(
678
							FieldTypeProtos.KeyValue::getKey,
679
							Function.identity(),
680
							(a, b) -> a
681
					)).values().forEach(abuilder::addPid);
682
			abuilder.setRank(counter.getAndIncrement());
683

    
684
			return abuilder.build();
685

    
686
		}).filter(Objects::nonNull).collect(Collectors.toList());
687

    
688
		return new Pair<>(collect, affiliations.values());
689
	}
690

    
691
}
(4-4/18)