1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import java.io.ByteArrayOutputStream;
|
4
|
import java.io.IOException;
|
5
|
import java.io.InputStream;
|
6
|
import java.util.*;
|
7
|
import java.util.concurrent.atomic.AtomicInteger;
|
8
|
import java.util.function.Function;
|
9
|
import java.util.stream.Collectors;
|
10
|
import java.util.stream.Stream;
|
11
|
import java.util.zip.Inflater;
|
12
|
|
13
|
import com.google.gson.Gson;
|
14
|
import com.google.gson.JsonElement;
|
15
|
import com.google.gson.JsonObject;
|
16
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
17
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
18
|
import eu.dnetlib.actionmanager.common.Agent;
|
19
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
20
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
21
|
import eu.dnetlib.data.proto.*;
|
22
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
23
|
import eu.dnetlib.miscutils.collections.Pair;
|
24
|
import org.apache.commons.codec.binary.Base64;
|
25
|
import org.apache.commons.io.IOUtils;
|
26
|
import org.apache.commons.lang3.StringUtils;
|
27
|
|
28
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
|
29
|
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
|
30
|
|
31
|
public class DOIBoostToActions {
|
32
|
|
33
|
public static final String MAG = "MAG";
|
34
|
public static final String ORCID = "ORCID";
|
35
|
public static final String CROSSREF = "Crossref";
|
36
|
public static final String UNPAYWALL = "UnpayWall";
|
37
|
|
38
|
public static final String GRID_AC = "grid.ac";
|
39
|
public static final String WIKPEDIA = "wikpedia";
|
40
|
|
41
|
public final static String doiBoostNSPREFIX = "doiboost____";
|
42
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
43
|
|
44
|
public static final String SEPARATOR = "::";
|
45
|
|
46
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
|
47
|
put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
|
48
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
49
|
put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
|
50
|
put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
|
51
|
|
52
|
}};
|
53
|
|
54
|
private static String decompressAbstract(final String abstractCompressed) {
|
55
|
try {
|
56
|
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
|
57
|
final Inflater decompresser = new Inflater();
|
58
|
decompresser.setInput(byteArray);
|
59
|
final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
|
60
|
byte[] buffer = new byte[8192];
|
61
|
while (!decompresser.finished()) {
|
62
|
int size = decompresser.inflate(buffer);
|
63
|
bos.write(buffer, 0, size);
|
64
|
}
|
65
|
byte[] unzippeddata = bos.toByteArray();
|
66
|
decompresser.end();
|
67
|
return new String(unzippeddata);
|
68
|
} catch (Throwable e) {
|
69
|
System.out.println("Wrong abstract:" + abstractCompressed);
|
70
|
throw new RuntimeException(e);
|
71
|
}
|
72
|
}
|
73
|
|
74
|
public static final String PID_TYPES = "dnet:pid_types";
|
75
|
private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
|
76
|
put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
|
77
|
.setSchemeid(PID_TYPES).build());
|
78
|
put(GRID_AC, getQualifier("grid", PID_TYPES));
|
79
|
put(WIKPEDIA, getQualifier("urn", PID_TYPES));
|
80
|
}};
|
81
|
|
82
|
static Map<String, Map<String, String>> typologiesMapping;
|
83
|
|
84
|
static {
|
85
|
try {
|
86
|
final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
|
87
|
final String tt = IOUtils.toString(is);
|
88
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
89
|
} catch (IOException e) {
|
90
|
e.printStackTrace();
|
91
|
}
|
92
|
}
|
93
|
|
94
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
95
|
|
96
|
final String doi = getStringValue(rootElement, "doi");
|
97
|
if (doi == null) {
|
98
|
context.incrementCounter("filtered", "no_doi", 1);
|
99
|
return false;
|
100
|
}
|
101
|
final String type = getStringValue(rootElement, "type");
|
102
|
if (!typologiesMapping.containsKey(type)) {
|
103
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
104
|
return false;
|
105
|
}
|
106
|
// fixes #4360 (test publisher)
|
107
|
final String publisher = getStringValue(rootElement, "publisher");
|
108
|
if (StringUtils.isNotBlank(publisher) && publisher.equalsIgnoreCase("Test accounts")) {
|
109
|
context.incrementCounter("filtered", "test_publisher", 1);
|
110
|
return false;
|
111
|
}
|
112
|
|
113
|
List<JsonObject> authors = getArrayObjects(rootElement, "authors");
|
114
|
boolean hasAuthors = false;
|
115
|
for (JsonObject author : authors) {
|
116
|
final String given = getStringValue(author, "given");
|
117
|
final String family = getStringValue(author, "family");
|
118
|
String fullname = getStringValue(author, "fullname");
|
119
|
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
120
|
fullname = String.format("%s %s", given, family);
|
121
|
}
|
122
|
// fixes #4368
|
123
|
if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
|
124
|
context.incrementCounter("invalid_author", "addiejackson", 1);
|
125
|
context.incrementCounter("filtered", "invalid_authors", 1);
|
126
|
return false;
|
127
|
}
|
128
|
if (isValidAuthorName(fullname, context)) hasAuthors = true;
|
129
|
}
|
130
|
|
131
|
if (!hasAuthors) {
|
132
|
context.incrementCounter("filtered", "invalid_authors", 1);
|
133
|
return false;
|
134
|
}
|
135
|
// fixes #4360
|
136
|
if (getCleanedTitles(rootElement).isEmpty()) {
|
137
|
context.incrementCounter("filtered", "invalid_title", 1);
|
138
|
return false;
|
139
|
}
|
140
|
|
141
|
return true;
|
142
|
}
|
143
|
|
144
|
private static List<String> getCleanedTitles(final JsonObject rootElement) {
|
145
|
List<String> titles = getArrayValues(rootElement, "title");
|
146
|
return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
|
147
|
}
|
148
|
|
149
|
private static boolean isValidAuthorName(final String fullName, final Reporter context) {
|
150
|
if (StringUtils.isBlank(fullName)) {
|
151
|
if(context != null) context.incrementCounter("invalid_author", "blank", 1);
|
152
|
return false;
|
153
|
}
|
154
|
// fixes #4391 and subtasks related to DOIBoost
|
155
|
switch (fullName) {
|
156
|
case ",":
|
157
|
case "none none":
|
158
|
case "none &na;":
|
159
|
case "(:null)":
|
160
|
case "&na; &na;": {
|
161
|
if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
|
162
|
return false;
|
163
|
}
|
164
|
}
|
165
|
return true;
|
166
|
}
|
167
|
|
168
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
169
|
final ActionFactory factory,
|
170
|
final String setName,
|
171
|
final Agent agent,
|
172
|
boolean invisible,
|
173
|
final boolean onlyOrganization,
|
174
|
final Reporter context) {
|
175
|
|
176
|
if (!isValid(rootElement, context)) return null;
|
177
|
|
178
|
//Create OAF Proto
|
179
|
|
180
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
181
|
//Add Data Info
|
182
|
oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
183
|
.setInvisible(invisible)
|
184
|
.setDeletedbyinference(false)
|
185
|
.setInferred(false)
|
186
|
.setTrust("0.9")
|
187
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
188
|
.build());
|
189
|
|
190
|
//Adding Kind
|
191
|
oaf.setKind(KindProtos.Kind.entity);
|
192
|
|
193
|
//creating Result Proto
|
194
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
195
|
|
196
|
entity.setDateofcollection("2019-02-15");
|
197
|
|
198
|
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
|
199
|
StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
|
200
|
.map(JsonElement::getAsString)
|
201
|
.forEach(cf -> {
|
202
|
final String id = datasources.get(cf.toLowerCase()).getValue();
|
203
|
final String name = datasources.get(cf.toLowerCase()).getKey();
|
204
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
205
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
206
|
.setValue(name)
|
207
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
208
|
.build();
|
209
|
entity.addCollectedfrom(collectedFrom);
|
210
|
}
|
211
|
}
|
212
|
);
|
213
|
}
|
214
|
//Adding identifier
|
215
|
final String doi = getStringValue(rootElement, "doi");
|
216
|
entity.addOriginalId(doi);
|
217
|
|
218
|
final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
|
219
|
entity.setId(sourceId);
|
220
|
|
221
|
entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
|
222
|
.setValue(doi)
|
223
|
.setQualifier(getQualifier("doi", PID_TYPES))
|
224
|
.build());
|
225
|
|
226
|
//Create Result Field
|
227
|
ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
228
|
|
229
|
final String type = getStringValue(rootElement, "type");
|
230
|
|
231
|
//Adding Instances
|
232
|
final String typeValue = typologiesMapping.get(type).get("value");
|
233
|
final String cobjValue = typologiesMapping.get(type).get("cobj");
|
234
|
|
235
|
// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
|
236
|
List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
|
237
|
String provS = getStringValue(prov, "provenance");
|
238
|
if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
|
239
|
else return false;
|
240
|
}).collect(Collectors.toList());
|
241
|
|
242
|
Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
|
243
|
{
|
244
|
ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
245
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
246
|
.setClassid(cobjValue)
|
247
|
.setClassname(typeValue)
|
248
|
.setSchemeid("dnet:publication_resource")
|
249
|
.setSchemename("dnet:publication_resource")
|
250
|
.build());
|
251
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
252
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
253
|
.setValue("Unknown Repository")
|
254
|
.build());
|
255
|
|
256
|
final String acc_class_id = it.get("access-rights").getAsString();
|
257
|
String acc_class_value;
|
258
|
switch (acc_class_id) {
|
259
|
case "OPEN": {
|
260
|
acc_class_value = "open access";
|
261
|
break;
|
262
|
}
|
263
|
case "CLOSED": {
|
264
|
acc_class_value = "closed access";
|
265
|
break;
|
266
|
}
|
267
|
default: {
|
268
|
acc_class_value = "not available";
|
269
|
}
|
270
|
|
271
|
}
|
272
|
|
273
|
instance.addUrl(it.get("url").getAsString());
|
274
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
275
|
.setClassid(acc_class_id)
|
276
|
.setClassname(acc_class_value)
|
277
|
.setSchemeid("dnet:access_modes")
|
278
|
.setSchemename("dnet:access_modes")
|
279
|
.build());
|
280
|
|
281
|
final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
|
282
|
final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
|
283
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
284
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
285
|
.setValue(name)
|
286
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
287
|
.build();
|
288
|
|
289
|
instance.setCollectedfrom(collectedFrom);
|
290
|
}
|
291
|
|
292
|
return instance.build();
|
293
|
}).forEach(result::addInstance);
|
294
|
|
295
|
//Adding DOI URL as Instance
|
296
|
final String doiURL = getStringValue(rootElement, "doi-url");
|
297
|
JsonObject hostedByOpenAire = null;
|
298
|
if (rootElement.has("hostedByOpenAire")) {
|
299
|
hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
|
300
|
}
|
301
|
|
302
|
if (StringUtils.isNotBlank(doiURL)) {
|
303
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
304
|
instance.addUrl(doiURL);
|
305
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
306
|
.setClassid(cobjValue)
|
307
|
.setClassname(typeValue)
|
308
|
.setSchemeid("dnet:publication_resource")
|
309
|
.setSchemename("dnet:publication_resource")
|
310
|
.build());
|
311
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
312
|
.setClassid("CLOSED")
|
313
|
.setClassname("Closed Access")
|
314
|
.setSchemeid("dnet:access_modes")
|
315
|
.setSchemename("dnet:access_modes")
|
316
|
.build());
|
317
|
instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
318
|
.setValue(CROSSREF)
|
319
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
|
320
|
.build());
|
321
|
|
322
|
if (hostedByOpenAire == null)
|
323
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
324
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
325
|
.setValue("Unknown Repository")
|
326
|
.build());
|
327
|
else {
|
328
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
329
|
.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
|
330
|
.setValue(hostedByOpenAire.get("name").getAsString())
|
331
|
.build());
|
332
|
}
|
333
|
|
334
|
result.addInstance(instance);
|
335
|
}
|
336
|
|
337
|
//Create Metadata Proto
|
338
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
339
|
|
340
|
Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
|
341
|
|
342
|
if (authorsOrganizations.getKey().size() > 0) {
|
343
|
metadata.addAllAuthor(authorsOrganizations.getKey());
|
344
|
} else {
|
345
|
//Should never enter here becasue of the isValid method at the beginning.
|
346
|
context.incrementCounter("filtered", "unexpected_no_authors", 1);
|
347
|
return null;
|
348
|
}
|
349
|
//adding Language
|
350
|
metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
|
351
|
.setClassid("und")
|
352
|
.setClassname("Undetermined")
|
353
|
.setSchemeid("dent:languages")
|
354
|
.setSchemename("dent:languages")
|
355
|
.build());
|
356
|
|
357
|
//Adding subjects
|
358
|
List<String> subjects = getArrayValues(rootElement, "subject");
|
359
|
|
360
|
subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
|
361
|
.setValue(s)
|
362
|
.setQualifier(getQualifier("keyword", "dnet:subject"))
|
363
|
.build()));
|
364
|
|
365
|
List<String> titles = getCleanedTitles(rootElement);
|
366
|
titles.forEach(t ->
|
367
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
368
|
.setValue(t)
|
369
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
370
|
.build()));
|
371
|
|
372
|
settingRelevantDate(rootElement, metadata, "issued", "issued", true);
|
373
|
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
|
374
|
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
|
375
|
settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
|
376
|
|
377
|
getArrayObjects(rootElement, "abstract").forEach(d ->
|
378
|
{
|
379
|
if (MAG.equals(d.get("provenance").getAsString()))
|
380
|
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
|
381
|
else
|
382
|
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
|
383
|
}
|
384
|
);
|
385
|
|
386
|
//Adding Journal
|
387
|
final String publisher = getStringValue(rootElement, "publisher");
|
388
|
if (StringUtils.isNotBlank(publisher)) {
|
389
|
|
390
|
final ResultProtos.Result.Journal.Builder journal = ResultProtos.Result.Journal.newBuilder().setName(publisher);
|
391
|
|
392
|
if (hasJSONArrayField(rootElement, "issn")) {
|
393
|
StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
|
394
|
.map(JsonElement::getAsJsonObject)
|
395
|
.forEach(it -> {
|
396
|
final String issntype = getStringValue(it, "type");
|
397
|
final String value = getStringValue(it, "value");
|
398
|
if ("electronic".equals(issntype)) {
|
399
|
journal.setIssnOnline(value);
|
400
|
}
|
401
|
if ("print".equals(issntype))
|
402
|
journal.setIssnPrinted(value);
|
403
|
});
|
404
|
}
|
405
|
metadata.setJournal(journal.build());
|
406
|
}
|
407
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
408
|
result.setMetadata(metadata.build());
|
409
|
entity.setResult(result.build());
|
410
|
oaf.setEntity(entity.build());
|
411
|
|
412
|
//System.out.println(JsonFormat.printToString(oaf.build()));
|
413
|
|
414
|
final List<AtomicAction> actionList = new ArrayList<>();
|
415
|
|
416
|
if (!onlyOrganization)
|
417
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
418
|
|
419
|
if (!authorsOrganizations.getValue().isEmpty()) {
|
420
|
|
421
|
authorsOrganizations.getValue().forEach(o ->
|
422
|
{
|
423
|
|
424
|
actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
|
425
|
if (!onlyOrganization)
|
426
|
actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
|
427
|
final String gridOrganization = getSimilarGridOrganization(o.getEntity());
|
428
|
if (gridOrganization != null) {
|
429
|
actionList.add(factory
|
430
|
.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
|
431
|
"".getBytes()));
|
432
|
actionList.add(factory
|
433
|
.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
|
434
|
"".getBytes()));
|
435
|
}
|
436
|
});
|
437
|
}
|
438
|
return actionList;
|
439
|
|
440
|
}
|
441
|
|
442
|
private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
|
443
|
|
444
|
final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
|
445
|
if (pidList != null) {
|
446
|
for (FieldTypeProtos.StructuredProperty p : pidList) {
|
447
|
if (p.getQualifier().getClassname().equals("grid")) {
|
448
|
return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
|
449
|
}
|
450
|
}
|
451
|
}
|
452
|
return null;
|
453
|
|
454
|
}
|
455
|
|
456
|
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
|
457
|
final OafProtos.Oaf organization,
|
458
|
final ActionFactory factory,
|
459
|
final String setName,
|
460
|
final Agent agent) {
|
461
|
|
462
|
List<AtomicAction> result = new ArrayList<>();
|
463
|
|
464
|
final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
|
465
|
roaf.setKind(KindProtos.Kind.relation);
|
466
|
|
467
|
roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
468
|
.setInvisible(false)
|
469
|
.setDeletedbyinference(false)
|
470
|
.setInferred(false)
|
471
|
.setTrust("0.9")
|
472
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
473
|
.build());
|
474
|
|
475
|
final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
|
476
|
|
477
|
rel.setRelType(RelTypeProtos.RelType.resultOrganization);
|
478
|
rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
|
479
|
|
480
|
//Create a relation Result --> Organization
|
481
|
rel.setSource(publication.getEntity().getId());
|
482
|
rel.setTarget(organization.getEntity().getId());
|
483
|
rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
|
484
|
|
485
|
final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
|
486
|
|
487
|
final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
|
488
|
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
489
|
.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
|
490
|
.build());
|
491
|
rel_instance.setAffiliation(affiliationRel.build());
|
492
|
rel.setResultOrganization(rel_instance.build());
|
493
|
|
494
|
rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
495
|
.setValue(datasources.get(MAG.toLowerCase()).getKey())
|
496
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
|
497
|
.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
|
498
|
.build());
|
499
|
|
500
|
rel.setChild(false);
|
501
|
roaf.setRel(rel.build());
|
502
|
|
503
|
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
|
504
|
organization.getEntity().getId(), roaf.build().toByteArray()));
|
505
|
|
506
|
//Create a relation Organization --> Result
|
507
|
rel.setTarget(publication.getEntity().getId());
|
508
|
rel.setSource(organization.getEntity().getId());
|
509
|
rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
|
510
|
|
511
|
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
512
|
.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
|
513
|
.build());
|
514
|
rel_instance.setAffiliation(affiliationRel.build());
|
515
|
rel.setResultOrganization(rel_instance.build());
|
516
|
roaf.setRel(rel.build());
|
517
|
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
|
518
|
publication.getEntity().getId(), roaf.build().toByteArray()));
|
519
|
|
520
|
return result;
|
521
|
|
522
|
}
|
523
|
|
524
|
private static boolean hasJSONArrayField(final JsonObject root, final String key) {
|
525
|
return root.has(key) && root.get(key).isJsonArray();
|
526
|
}
|
527
|
|
528
|
private static void settingRelevantDate(JsonObject rootElement,
|
529
|
ResultProtos.Result.Metadata.Builder metadata,
|
530
|
final String jsonKey,
|
531
|
final String dictionaryKey,
|
532
|
final boolean addToDateOfAcceptance) {
|
533
|
//Adding date
|
534
|
String date = getStringValue(rootElement, jsonKey);
|
535
|
if (date == null)
|
536
|
return;
|
537
|
if (date.length() == 4) {
|
538
|
date += "-01-01";
|
539
|
}
|
540
|
if (isValidDate(date)) {
|
541
|
if (addToDateOfAcceptance)
|
542
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
|
543
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
544
|
.setValue(date)
|
545
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
546
|
.build());
|
547
|
}
|
548
|
}
|
549
|
|
550
|
public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
|
551
|
FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
|
552
|
if (StringUtils.contains(value, "orcid.org")) {
|
553
|
return pid.setValue(value.replaceAll("https://orcid.org/", ""))
|
554
|
.setKey(ORCID).build();
|
555
|
}
|
556
|
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
|
557
|
return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
|
558
|
.setKey("MAG Identifier").build();
|
559
|
}
|
560
|
return pid.setValue(value)
|
561
|
.setKey("URL").build();
|
562
|
}
|
563
|
|
564
|
public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
|
565
|
final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
|
566
|
final List<String> magId = new ArrayList<>();
|
567
|
getArrayObjects(affiliation, "identifiers").forEach(it -> {
|
568
|
if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
|
569
|
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
|
570
|
magId.add(it.get("value").getAsString());
|
571
|
} else
|
572
|
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
|
573
|
});
|
574
|
if (magId.size() > 0) {
|
575
|
final String microsoftID = magId.get(0);
|
576
|
OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
577
|
oaf.setKind(KindProtos.Kind.entity);
|
578
|
OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
|
579
|
entity.setType(TypeProtos.Type.organization);
|
580
|
entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
|
581
|
final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
|
582
|
final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
|
583
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
584
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
585
|
.setValue(name)
|
586
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
587
|
.build();
|
588
|
entity.addCollectedfrom(collectedFrom);
|
589
|
} else {
|
590
|
return null;
|
591
|
}
|
592
|
entity.addOriginalId(microsoftID);
|
593
|
|
594
|
affiliationIdentifiers.forEach((key, value) -> entity.addPid(
|
595
|
FieldTypeProtos.StructuredProperty.newBuilder()
|
596
|
.setQualifier(value)
|
597
|
.setValue(key)
|
598
|
.build()));
|
599
|
|
600
|
final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
|
601
|
organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
|
602
|
.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
|
603
|
.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
|
604
|
.build());
|
605
|
|
606
|
entity.setOrganization(organization);
|
607
|
oaf.setEntity(entity);
|
608
|
oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
609
|
.setInvisible(false)
|
610
|
.setDeletedbyinference(false)
|
611
|
.setInferred(false)
|
612
|
.setTrust("0.9")
|
613
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
614
|
.build());
|
615
|
return oaf.build();
|
616
|
}
|
617
|
return null;
|
618
|
}
|
619
|
|
620
|
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
|
621
|
|
622
|
final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
|
623
|
|
624
|
List<JsonObject> authors = getArrayObjects(root, "authors");
|
625
|
|
626
|
final AtomicInteger counter = new AtomicInteger(1);
|
627
|
|
628
|
List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
|
629
|
final String given = getStringValue(author, "given");
|
630
|
final String family = getStringValue(author, "family");
|
631
|
String fullname = getStringValue(author, "fullname");
|
632
|
|
633
|
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
634
|
fullname = String.format("%s %s", given, family);
|
635
|
}
|
636
|
|
637
|
if (!isValidAuthorName(fullname, null)) {
|
638
|
return null;
|
639
|
}
|
640
|
final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
|
641
|
|
642
|
if (StringUtils.isNotBlank(given))
|
643
|
abuilder.setName(given);
|
644
|
if (StringUtils.isNotBlank(family))
|
645
|
abuilder.setSurname(family);
|
646
|
if (StringUtils.isNotBlank(fullname))
|
647
|
abuilder.setFullname(fullname);
|
648
|
|
649
|
final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
|
650
|
final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
|
651
|
|
652
|
authorAffiliation.forEach(it ->
|
653
|
{
|
654
|
OafProtos.Oaf org = createOrganizationFromJSON(it);
|
655
|
if (org != null) {
|
656
|
affiliations.put(org.getEntity().getId(), org);
|
657
|
abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
|
658
|
}
|
659
|
});
|
660
|
identifiers.stream().map(id -> {
|
661
|
final String value = id.get("value").getAsString();
|
662
|
return extractIdentifier(value);
|
663
|
}).collect(
|
664
|
Collectors.toMap(
|
665
|
FieldTypeProtos.KeyValue::getKey,
|
666
|
Function.identity(),
|
667
|
(a, b) -> a
|
668
|
)).values().forEach(abuilder::addPid);
|
669
|
abuilder.setRank(counter.getAndIncrement());
|
670
|
|
671
|
return abuilder.build();
|
672
|
|
673
|
}).filter(Objects::nonNull).collect(Collectors.toList());
|
674
|
|
675
|
return new Pair<>(collect, affiliations.values());
|
676
|
}
|
677
|
|
678
|
}
|