Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2

    
3
import java.io.ByteArrayOutputStream;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.util.*;
7
import java.util.concurrent.atomic.AtomicInteger;
8
import java.util.function.Function;
9
import java.util.stream.Collectors;
10
import java.util.stream.Stream;
11
import java.util.zip.Inflater;
12

    
13
import com.google.gson.Gson;
14
import com.google.gson.JsonElement;
15
import com.google.gson.JsonObject;
16
import eu.dnetlib.actionmanager.actions.ActionFactory;
17
import eu.dnetlib.actionmanager.actions.AtomicAction;
18
import eu.dnetlib.actionmanager.common.Agent;
19
import eu.dnetlib.data.mapreduce.hbase.Reporter;
20
import eu.dnetlib.data.mapreduce.util.StreamUtils;
21
import eu.dnetlib.data.proto.*;
22
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
23
import eu.dnetlib.miscutils.collections.Pair;
24
import org.apache.commons.codec.binary.Base64;
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27

    
28
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
29
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
30

    
31
public class DOIBoostToActions {
32

    
33
    public static final String MAG = "MAG";
34
    public static final String ORCID = "ORCID";
35
    public static final String CROSSREF = "Crossref";
36
    public static final String UNPAYWALL = "UnpayWall";
37

    
38
    public static final String GRID_AC = "grid.ac";
39
    public static final String WIKPEDIA = "wikpedia";
40

    
41
    public final static String doiBoostNSPREFIX = "doiboost____";
42
    public static final String OPENAIRE_PREFIX = "openaire____";
43

    
44
    public static final String SEPARATOR = "::";
45

    
46
    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
47
        put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
48
        put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
49
        put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
50
        put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
51

    
52
    }};
53

    
54
    private static String decompressAbstract(final String abstractCompressed) {
55
        try {
56
            byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
57
            final Inflater decompresser = new Inflater();
58
            decompresser.setInput(byteArray);
59
            final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
60
            byte[] buffer = new byte[8192];
61
            while (!decompresser.finished()) {
62
                int size = decompresser.inflate(buffer);
63
                bos.write(buffer, 0, size);
64
            }
65
            byte[] unzippeddata = bos.toByteArray();
66
            decompresser.end();
67
            return new String(unzippeddata);
68
        } catch (Throwable e) {
69
            System.out.println("Wrong abstract:" + abstractCompressed);
70
            throw new RuntimeException(e);
71
        }
72
    }
73

    
74
    public static final String PID_TYPES = "dnet:pid_types";
75
    private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
76
        put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
77
                .setSchemeid(PID_TYPES).build());
78
        put(GRID_AC, getQualifier("grid", PID_TYPES));
79
        put(WIKPEDIA, getQualifier("urn", PID_TYPES));
80
    }};
81

    
82
    static Map<String, Map<String, String>> typologiesMapping;
83

    
84
    static {
85
        try {
86
            final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
87
            final String tt = IOUtils.toString(is);
88
            typologiesMapping = new Gson().fromJson(tt, Map.class);
89
        } catch (IOException e) {
90
            e.printStackTrace();
91
        }
92
    }
93

    
94
    protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
95

    
96
        final String doi = getStringValue(rootElement, "doi");
97
        if (doi == null) {
98
            context.incrementCounter("filtered", "no_doi", 1);
99
            return false;
100
        }
101
        final String type = getStringValue(rootElement, "type");
102
        if (!typologiesMapping.containsKey(type)) {
103
            context.incrementCounter("filtered", "unknowntype_" + type, 1);
104
            return false;
105
        }
106
        // fixes #4360 (test publisher)
107
        final String publisher = getStringValue(rootElement, "publisher");
108
        if (StringUtils.isNotBlank(publisher) && publisher.equalsIgnoreCase("Test accounts")) {
109
            context.incrementCounter("filtered", "test_publisher", 1);
110
            return false;
111
        }
112

    
113
        List<JsonObject> authors = getArrayObjects(rootElement, "authors");
114
        boolean hasAuthors = false;
115
        for (JsonObject author : authors) {
116
            final String given = getStringValue(author, "given");
117
            final String family = getStringValue(author, "family");
118
            String fullname = getStringValue(author, "fullname");
119
            if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
120
                fullname = String.format("%s %s", given, family);
121
            }
122
            // fixes #4368
123
            if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
124
                context.incrementCounter("invalid_author", "addiejackson", 1);
125
                context.incrementCounter("filtered", "invalid_authors", 1);
126
                return false;
127
            }
128
            if (isValidAuthorName(fullname, context)) hasAuthors = true;
129
        }
130

    
131
        if (!hasAuthors) {
132
            context.incrementCounter("filtered", "invalid_authors", 1);
133
            return false;
134
        }
135
        // fixes #4360
136
        if (getCleanedTitles(rootElement).isEmpty()) {
137
            context.incrementCounter("filtered", "invalid_title", 1);
138
            return false;
139
        }
140

    
141
        return true;
142
    }
143

    
144
    private static List<String> getCleanedTitles(final JsonObject rootElement) {
145
        List<String> titles = getArrayValues(rootElement, "title");
146
        return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
147
    }
148

    
149
    private static boolean isValidAuthorName(final String fullName, final Reporter context) {
150
        if (StringUtils.isBlank(fullName)) {
151
            if(context != null) context.incrementCounter("invalid_author", "blank", 1);
152
            return false;
153
        }
154
        // fixes #4391 and subtasks related to DOIBoost
155
        switch (fullName) {
156
        case ",":
157
        case "none none":
158
        case "none &na;":
159
        case "(:null)":
160
        case "&na; &na;": {
161
            if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
162
            return false;
163
        }
164
        }
165
        return true;
166
    }
167

    
168
    public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
169
            final ActionFactory factory,
170
            final String setName,
171
            final Agent agent,
172
            boolean invisible,
173
            final boolean onlyOrganization,
174
            final Reporter context) {
175

    
176
        if (!isValid(rootElement, context)) return null;
177

    
178
        //Create OAF Proto
179

    
180
        final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
181
        //Add Data Info
182
        oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
183
                .setInvisible(invisible)
184
                .setDeletedbyinference(false)
185
                .setInferred(false)
186
                .setTrust("0.9")
187
                .setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
188
                .build());
189

    
190
        //Adding Kind
191
        oaf.setKind(KindProtos.Kind.entity);
192

    
193
        //creating Result Proto
194
        final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
195

    
196
        entity.setDateofcollection("2019-02-15");
197

    
198
        if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
199
            StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
200
                    .map(JsonElement::getAsString)
201
                    .forEach(cf -> {
202
                                final String id = datasources.get(cf.toLowerCase()).getValue();
203
                                final String name = datasources.get(cf.toLowerCase()).getKey();
204
                                if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
205
                                    final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
206
                                            .setValue(name)
207
                                            .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
208
                                            .build();
209
                                    entity.addCollectedfrom(collectedFrom);
210
                                }
211
                            }
212
                    );
213
        }
214
        //Adding identifier
215
        final String doi = getStringValue(rootElement, "doi");
216
        entity.addOriginalId(doi);
217

    
218
        final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
219
        entity.setId(sourceId);
220

    
221
        entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
222
                .setValue(doi)
223
                .setQualifier(getQualifier("doi", PID_TYPES))
224
                .build());
225

    
226
        //Create Result Field
227
        ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
228

    
229
        final String type = getStringValue(rootElement, "type");
230

    
231
        //Adding Instances
232
        final String typeValue = typologiesMapping.get(type).get("value");
233
        final String cobjValue = typologiesMapping.get(type).get("cobj");
234

    
235
        // TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
236
        List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
237
            String provS = getStringValue(prov, "provenance");
238
            if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
239
            else return false;
240
        }).collect(Collectors.toList());
241

    
242
        Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
243
        {
244
            ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
245
            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
246
                    .setClassid(cobjValue)
247
                    .setClassname(typeValue)
248
                    .setSchemeid("dnet:publication_resource")
249
                    .setSchemename("dnet:publication_resource")
250
                    .build());
251
            instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
252
                    .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
253
                    .setValue("Unknown Repository")
254
                    .build());
255

    
256
            final String acc_class_id = it.get("access-rights").getAsString();
257
            String acc_class_value;
258
            switch (acc_class_id) {
259
            case "OPEN": {
260
                acc_class_value = "open access";
261
                break;
262
            }
263
            case "CLOSED": {
264
                acc_class_value = "closed access";
265
                break;
266
            }
267
            default: {
268
                acc_class_value = "not available";
269
            }
270

    
271
            }
272

    
273
            instance.addUrl(it.get("url").getAsString());
274
            instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
275
                    .setClassid(acc_class_id)
276
                    .setClassname(acc_class_value)
277
                    .setSchemeid("dnet:access_modes")
278
                    .setSchemename("dnet:access_modes")
279
                    .build());
280

    
281
            final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
282
            final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
283
            if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
284
                final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
285
                        .setValue(name)
286
                        .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
287
                        .build();
288

    
289
                instance.setCollectedfrom(collectedFrom);
290
            }
291

    
292
            return instance.build();
293
        }).forEach(result::addInstance);
294

    
295
        //Adding DOI URL as  Instance
296
        final String doiURL = getStringValue(rootElement, "doi-url");
297
        JsonObject hostedByOpenAire = null;
298
        if (rootElement.has("hostedByOpenAire")) {
299
            hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
300
        }
301

    
302
        if (StringUtils.isNotBlank(doiURL)) {
303
            final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
304
            instance.addUrl(doiURL);
305
            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
306
                    .setClassid(cobjValue)
307
                    .setClassname(typeValue)
308
                    .setSchemeid("dnet:publication_resource")
309
                    .setSchemename("dnet:publication_resource")
310
                    .build());
311
            instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
312
                    .setClassid("CLOSED")
313
                    .setClassname("Closed Access")
314
                    .setSchemeid("dnet:access_modes")
315
                    .setSchemename("dnet:access_modes")
316
                    .build());
317
            instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
318
                    .setValue(CROSSREF)
319
                    .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
320
                    .build());
321

    
322
            if (hostedByOpenAire == null)
323
                instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
324
                        .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
325
                        .setValue("Unknown Repository")
326
                        .build());
327
            else {
328
                instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
329
                        .setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
330
                        .setValue(hostedByOpenAire.get("name").getAsString())
331
                        .build());
332
            }
333

    
334
            result.addInstance(instance);
335
        }
336

    
337
        //Create Metadata Proto
338
        final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
339

    
340
        Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
341

    
342
        if (authorsOrganizations.getKey().size() > 0) {
343
            metadata.addAllAuthor(authorsOrganizations.getKey());
344
        } else {
345
            //Should never enter here becasue of the isValid method at the beginning.
346
            context.incrementCounter("filtered", "unexpected_no_authors", 1);
347
            return null;
348
        }
349
        //adding Language
350
        metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
351
                .setClassid("und")
352
                .setClassname("Undetermined")
353
                .setSchemeid("dent:languages")
354
                .setSchemename("dent:languages")
355
                .build());
356

    
357
        //Adding subjects
358
        List<String> subjects = getArrayValues(rootElement, "subject");
359

    
360
        subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
361
                .setValue(s)
362
                .setQualifier(getQualifier("keyword", "dnet:subject"))
363
                .build()));
364

    
365
        List<String> titles = getCleanedTitles(rootElement);
366
        titles.forEach(t ->
367
                metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
368
                        .setValue(t)
369
                        .setQualifier(getQualifier("main title", "dnet:dataCite_title"))
370
                        .build()));
371

    
372
        settingRelevantDate(rootElement, metadata, "issued", "issued", true);
373
        settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
374
        settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
375
        settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
376

    
377
        getArrayObjects(rootElement, "abstract").forEach(d ->
378
                {
379
                    if (MAG.equals(d.get("provenance").getAsString()))
380
                        metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
381
                    else
382
                        metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
383
                }
384
        );
385

    
386
        //Adding Journal
387
        final String publisher = getStringValue(rootElement, "publisher");
388
        if (StringUtils.isNotBlank(publisher)) {
389

    
390
            final ResultProtos.Result.Journal.Builder journal = ResultProtos.Result.Journal.newBuilder().setName(publisher);
391

    
392
            if (hasJSONArrayField(rootElement, "issn")) {
393
                StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
394
                        .map(JsonElement::getAsJsonObject)
395
                        .forEach(it -> {
396
                            final String issntype = getStringValue(it, "type");
397
                            final String value = getStringValue(it, "value");
398
                            if ("electronic".equals(issntype)) {
399
                                journal.setIssnOnline(value);
400
                            }
401
                            if ("print".equals(issntype))
402
                                journal.setIssnPrinted(value);
403
                        });
404
            }
405
            metadata.setJournal(journal.build());
406
        }
407
        metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
408
        result.setMetadata(metadata.build());
409
        entity.setResult(result.build());
410
        oaf.setEntity(entity.build());
411

    
412
        //System.out.println(JsonFormat.printToString(oaf.build()));
413

    
414
        final List<AtomicAction> actionList = new ArrayList<>();
415

    
416
        if (!onlyOrganization)
417
            actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
418

    
419
        if (!authorsOrganizations.getValue().isEmpty()) {
420

    
421
            authorsOrganizations.getValue().forEach(o ->
422
            {
423

    
424
                actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
425
                if (!onlyOrganization)
426
                    actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
427
                final String gridOrganization = getSimilarGridOrganization(o.getEntity());
428
                if (gridOrganization != null) {
429
                    actionList.add(factory
430
                            .createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
431
                                    "".getBytes()));
432
                    actionList.add(factory
433
                            .createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
434
                                    "".getBytes()));
435
                }
436
            });
437
        }
438
        return actionList;
439

    
440
    }
441

    
442
    private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
443

    
444
        final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
445
        if (pidList != null) {
446
            for (FieldTypeProtos.StructuredProperty p : pidList) {
447
                if (p.getQualifier().getClassname().equals("grid")) {
448
                    return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
449
                }
450
            }
451
        }
452
        return null;
453

    
454
    }
455

    
456
    private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
457
            final OafProtos.Oaf organization,
458
            final ActionFactory factory,
459
            final String setName,
460
            final Agent agent) {
461

    
462
        List<AtomicAction> result = new ArrayList<>();
463

    
464
        final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
465
        roaf.setKind(KindProtos.Kind.relation);
466

    
467
        roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
468
                .setInvisible(false)
469
                .setDeletedbyinference(false)
470
                .setInferred(false)
471
                .setTrust("0.9")
472
                .setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
473
                .build());
474

    
475
        final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
476

    
477
        rel.setRelType(RelTypeProtos.RelType.resultOrganization);
478
        rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
479

    
480
        //Create a relation Result --> Organization
481
        rel.setSource(publication.getEntity().getId());
482
        rel.setTarget(organization.getEntity().getId());
483
        rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
484

    
485
        final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
486

    
487
        final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
488
        affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
489
                .setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
490
                .build());
491
        rel_instance.setAffiliation(affiliationRel.build());
492
        rel.setResultOrganization(rel_instance.build());
493

    
494
        rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
495
                .setValue(datasources.get(MAG.toLowerCase()).getKey())
496
                .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
497
                        .md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
498
                .build());
499

    
500
        rel.setChild(false);
501
        roaf.setRel(rel.build());
502

    
503
        result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
504
                organization.getEntity().getId(), roaf.build().toByteArray()));
505

    
506
        //Create a relation Organization --> Result
507
        rel.setTarget(publication.getEntity().getId());
508
        rel.setSource(organization.getEntity().getId());
509
        rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
510

    
511
        affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
512
                .setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
513
                .build());
514
        rel_instance.setAffiliation(affiliationRel.build());
515
        rel.setResultOrganization(rel_instance.build());
516
        roaf.setRel(rel.build());
517
        result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
518
                publication.getEntity().getId(), roaf.build().toByteArray()));
519

    
520
        return result;
521

    
522
    }
523

    
524
    private static boolean hasJSONArrayField(final JsonObject root, final String key) {
525
        return root.has(key) && root.get(key).isJsonArray();
526
    }
527

    
528
    private static void settingRelevantDate(JsonObject rootElement,
529
            ResultProtos.Result.Metadata.Builder metadata,
530
            final String jsonKey,
531
            final String dictionaryKey,
532
            final boolean addToDateOfAcceptance) {
533
        //Adding date
534
        String date = getStringValue(rootElement, jsonKey);
535
        if (date == null)
536
            return;
537
        if (date.length() == 4) {
538
            date += "-01-01";
539
        }
540
        if (isValidDate(date)) {
541
            if (addToDateOfAcceptance)
542
                metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
543
            metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
544
                    .setValue(date)
545
                    .setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
546
                    .build());
547
        }
548
    }
549

    
550
    public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
551
        FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
552
        if (StringUtils.contains(value, "orcid.org")) {
553
            return pid.setValue(value.replaceAll("https://orcid.org/", ""))
554
                    .setKey(ORCID).build();
555
        }
556
        if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
557
            return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
558
                    .setKey("MAG Identifier").build();
559
        }
560
        return pid.setValue(value)
561
                .setKey("URL").build();
562
    }
563

    
564
    public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
565
        final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
566
        final List<String> magId = new ArrayList<>();
567
        getArrayObjects(affiliation, "identifiers").forEach(it -> {
568
            if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
569
                affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
570
                magId.add(it.get("value").getAsString());
571
            } else
572
                affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
573
        });
574
        if (magId.size() > 0) {
575
            final String microsoftID = magId.get(0);
576
            OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
577
            oaf.setKind(KindProtos.Kind.entity);
578
            OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
579
            entity.setType(TypeProtos.Type.organization);
580
            entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
581
            final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
582
            final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
583
            if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
584
                final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
585
                        .setValue(name)
586
                        .setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
587
                        .build();
588
                entity.addCollectedfrom(collectedFrom);
589
            } else {
590
                return null;
591
            }
592
            entity.addOriginalId(microsoftID);
593

    
594
            affiliationIdentifiers.forEach((key, value) -> entity.addPid(
595
                    FieldTypeProtos.StructuredProperty.newBuilder()
596
                            .setQualifier(value)
597
                            .setValue(key)
598
                            .build()));
599

    
600
            final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
601
            organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
602
                    .setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
603
                    .setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
604
                    .build());
605

    
606
            entity.setOrganization(organization);
607
            oaf.setEntity(entity);
608
            oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
609
                    .setInvisible(false)
610
                    .setDeletedbyinference(false)
611
                    .setInferred(false)
612
                    .setTrust("0.9")
613
                    .setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
614
                    .build());
615
            return oaf.build();
616
        }
617
        return null;
618
    }
619

    
620
    public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
621

    
622
        final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
623

    
624
        List<JsonObject> authors = getArrayObjects(root, "authors");
625

    
626
        final AtomicInteger counter = new AtomicInteger(1);
627

    
628
        List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
629
            final String given = getStringValue(author, "given");
630
            final String family = getStringValue(author, "family");
631
            String fullname = getStringValue(author, "fullname");
632

    
633
            if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
634
                fullname = String.format("%s %s", given, family);
635
            }
636

    
637
            if (!isValidAuthorName(fullname, null)) {
638
                return null;
639
            }
640
            final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
641

    
642
            if (StringUtils.isNotBlank(given))
643
                abuilder.setName(given);
644
            if (StringUtils.isNotBlank(family))
645
                abuilder.setSurname(family);
646
            if (StringUtils.isNotBlank(fullname))
647
                abuilder.setFullname(fullname);
648

    
649
            final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
650
            final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
651

    
652
            authorAffiliation.forEach(it ->
653
            {
654
                OafProtos.Oaf org = createOrganizationFromJSON(it);
655
                if (org != null) {
656
                    affiliations.put(org.getEntity().getId(), org);
657
                    abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
658
                }
659
            });
660
            identifiers.stream().map(id -> {
661
                final String value = id.get("value").getAsString();
662
                return extractIdentifier(value);
663
            }).collect(
664
                    Collectors.toMap(
665
                            FieldTypeProtos.KeyValue::getKey,
666
                            Function.identity(),
667
                            (a, b) -> a
668
                    )).values().forEach(abuilder::addPid);
669
            abuilder.setRank(counter.getAndIncrement());
670

    
671
            return abuilder.build();
672

    
673
        }).filter(Objects::nonNull).collect(Collectors.toList());
674

    
675
        return new Pair<>(collect, affiliations.values());
676
    }
677

    
678
}
(4-4/16)