1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import java.io.ByteArrayOutputStream;
|
4
|
import java.io.IOException;
|
5
|
import java.io.InputStream;
|
6
|
import java.util.*;
|
7
|
import java.util.concurrent.atomic.AtomicInteger;
|
8
|
import java.util.function.Function;
|
9
|
import java.util.stream.Collectors;
|
10
|
import java.util.stream.Stream;
|
11
|
import java.util.zip.Inflater;
|
12
|
|
13
|
import com.google.gson.Gson;
|
14
|
import com.google.gson.JsonElement;
|
15
|
import com.google.gson.JsonObject;
|
16
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
17
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
18
|
import eu.dnetlib.actionmanager.common.Agent;
|
19
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
20
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
21
|
import eu.dnetlib.data.proto.*;
|
22
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
23
|
import eu.dnetlib.miscutils.collections.Pair;
|
24
|
import org.apache.commons.codec.binary.Base64;
|
25
|
import org.apache.commons.io.IOUtils;
|
26
|
import org.apache.commons.lang3.StringUtils;
|
27
|
|
28
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
|
29
|
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
|
30
|
|
31
|
public class DOIBoostToActions {
|
32
|
|
33
|
public static final String MAG = "MAG";
|
34
|
public static final String ORCID = "ORCID";
|
35
|
public static final String CROSSREF = "Crossref";
|
36
|
public static final String UNPAYWALL = "UnpayWall";
|
37
|
|
38
|
public static final String GRID_AC = "grid.ac";
|
39
|
public static final String WIKPEDIA = "wikpedia";
|
40
|
|
41
|
public final static String doiBoostNSPREFIX = "doiboost____";
|
42
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
43
|
|
44
|
public static final String SEPARATOR = "::";
|
45
|
public static final String DNET_LANGUAGES = "dnet:languages";
|
46
|
|
47
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
|
48
|
put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
|
49
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
50
|
put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
|
51
|
put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
|
52
|
|
53
|
}};
|
54
|
|
55
|
private static String decompressAbstract(final String abstractCompressed) {
|
56
|
try {
|
57
|
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
|
58
|
final Inflater decompresser = new Inflater();
|
59
|
decompresser.setInput(byteArray);
|
60
|
final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
|
61
|
byte[] buffer = new byte[8192];
|
62
|
while (!decompresser.finished()) {
|
63
|
int size = decompresser.inflate(buffer);
|
64
|
bos.write(buffer, 0, size);
|
65
|
}
|
66
|
byte[] unzippeddata = bos.toByteArray();
|
67
|
decompresser.end();
|
68
|
return new String(unzippeddata);
|
69
|
} catch (Throwable e) {
|
70
|
System.out.println("Wrong abstract:" + abstractCompressed);
|
71
|
throw new RuntimeException(e);
|
72
|
}
|
73
|
}
|
74
|
|
75
|
public static final String PID_TYPES = "dnet:pid_types";
|
76
|
private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
|
77
|
put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
|
78
|
.setSchemeid(PID_TYPES).build());
|
79
|
put(GRID_AC, getQualifier("grid", PID_TYPES));
|
80
|
put(WIKPEDIA, getQualifier("urn", PID_TYPES));
|
81
|
}};
|
82
|
|
83
|
static Map<String, Map<String, String>> typologiesMapping;
|
84
|
|
85
|
static {
|
86
|
try {
|
87
|
final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
|
88
|
final String tt = IOUtils.toString(is);
|
89
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
90
|
} catch (IOException e) {
|
91
|
e.printStackTrace();
|
92
|
}
|
93
|
}
|
94
|
|
95
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
96
|
|
97
|
final String doi = getStringValue(rootElement, "doi");
|
98
|
if (doi == null) {
|
99
|
context.incrementCounter("filtered", "no_doi", 1);
|
100
|
return false;
|
101
|
}
|
102
|
final String type = getStringValue(rootElement, "type");
|
103
|
if (!typologiesMapping.containsKey(type)) {
|
104
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
105
|
return false;
|
106
|
}
|
107
|
// fixes #4360 (test publisher)
|
108
|
final String publisher = getStringValue(rootElement, "publisher");
|
109
|
if (StringUtils.isNotBlank(publisher) && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
|
110
|
context.incrementCounter("filtered", "test_publisher", 1);
|
111
|
return false;
|
112
|
}
|
113
|
|
114
|
List<JsonObject> authors = getArrayObjects(rootElement, "authors");
|
115
|
boolean hasAuthors = false;
|
116
|
for (JsonObject author : authors) {
|
117
|
final String given = getStringValue(author, "given");
|
118
|
final String family = getStringValue(author, "family");
|
119
|
String fullname = getStringValue(author, "fullname");
|
120
|
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
121
|
fullname = String.format("%s %s", given, family);
|
122
|
}
|
123
|
// fixes #4368
|
124
|
if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
|
125
|
context.incrementCounter("invalid_author", "addiejackson", 1);
|
126
|
context.incrementCounter("filtered", "invalid_authors", 1);
|
127
|
return false;
|
128
|
}
|
129
|
if (isValidAuthorName(fullname, context)) hasAuthors = true;
|
130
|
}
|
131
|
|
132
|
if (!hasAuthors) {
|
133
|
context.incrementCounter("filtered", "invalid_authors", 1);
|
134
|
return false;
|
135
|
}
|
136
|
// fixes #4360
|
137
|
if (getCleanedTitles(rootElement).isEmpty()) {
|
138
|
context.incrementCounter("filtered", "invalid_title", 1);
|
139
|
return false;
|
140
|
}
|
141
|
|
142
|
return true;
|
143
|
}
|
144
|
|
145
|
private static List<String> getCleanedTitles(final JsonObject rootElement) {
|
146
|
List<String> titles = getArrayValues(rootElement, "title");
|
147
|
return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
|
148
|
}
|
149
|
|
150
|
private static boolean isValidAuthorName(final String fullName, final Reporter context) {
|
151
|
if (StringUtils.isBlank(fullName)) {
|
152
|
if(context != null) context.incrementCounter("invalid_author", "blank", 1);
|
153
|
return false;
|
154
|
}
|
155
|
// fixes #4391 and subtasks related to DOIBoost
|
156
|
switch (StringUtils.lowerCase(fullName)) {
|
157
|
case ",":
|
158
|
case "none none":
|
159
|
case "none, none":
|
160
|
case "none &na;":
|
161
|
case "(:null)":
|
162
|
case "test test test":
|
163
|
case "test test":
|
164
|
case "test":
|
165
|
case "&na; &na;": {
|
166
|
if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
|
167
|
return false;
|
168
|
}
|
169
|
}
|
170
|
return true;
|
171
|
}
|
172
|
|
173
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
174
|
final ActionFactory factory,
|
175
|
final String setName,
|
176
|
final Agent agent,
|
177
|
boolean invisible,
|
178
|
final boolean onlyOrganization,
|
179
|
final Reporter context) {
|
180
|
|
181
|
if (!isValid(rootElement, context)) return null;
|
182
|
|
183
|
//Create OAF Proto
|
184
|
|
185
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
186
|
//Add Data Info
|
187
|
oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
188
|
.setInvisible(invisible)
|
189
|
.setDeletedbyinference(false)
|
190
|
.setInferred(false)
|
191
|
.setTrust("0.9")
|
192
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
193
|
.build());
|
194
|
|
195
|
//Adding Kind
|
196
|
oaf.setKind(KindProtos.Kind.entity);
|
197
|
|
198
|
//creating Result Proto
|
199
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
200
|
|
201
|
entity.setDateofcollection("2019-02-15");
|
202
|
|
203
|
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
|
204
|
StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
|
205
|
.map(JsonElement::getAsString)
|
206
|
.forEach(cf -> {
|
207
|
final String id = datasources.get(cf.toLowerCase()).getValue();
|
208
|
final String name = datasources.get(cf.toLowerCase()).getKey();
|
209
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
210
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
211
|
.setValue(name)
|
212
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
213
|
.build();
|
214
|
entity.addCollectedfrom(collectedFrom);
|
215
|
}
|
216
|
}
|
217
|
);
|
218
|
}
|
219
|
//Adding identifier
|
220
|
final String doi = getStringValue(rootElement, "doi");
|
221
|
entity.addOriginalId(doi);
|
222
|
|
223
|
final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
|
224
|
entity.setId(sourceId);
|
225
|
|
226
|
entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
|
227
|
.setValue(doi)
|
228
|
.setQualifier(getQualifier("doi", PID_TYPES))
|
229
|
.build());
|
230
|
|
231
|
//Create Result Field
|
232
|
ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
233
|
|
234
|
final String type = getStringValue(rootElement, "type");
|
235
|
|
236
|
//Adding Instances
|
237
|
final String typeValue = typologiesMapping.get(type).get("value");
|
238
|
final String cobjValue = typologiesMapping.get(type).get("cobj");
|
239
|
|
240
|
// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
|
241
|
List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
|
242
|
String provS = getStringValue(prov, "provenance");
|
243
|
if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
|
244
|
else return false;
|
245
|
}).collect(Collectors.toList());
|
246
|
|
247
|
Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
|
248
|
{
|
249
|
ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
250
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
251
|
.setClassid(cobjValue)
|
252
|
.setClassname(typeValue)
|
253
|
.setSchemeid("dnet:publication_resource")
|
254
|
.setSchemename("dnet:publication_resource")
|
255
|
.build());
|
256
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
257
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
258
|
.setValue("Unknown Repository")
|
259
|
.build());
|
260
|
|
261
|
String acc_class_id = it.get("access-rights").getAsString();
|
262
|
String acc_class_value;
|
263
|
switch (acc_class_id) {
|
264
|
case "OPEN": {
|
265
|
acc_class_value = "Open Access";
|
266
|
break;
|
267
|
}
|
268
|
case "CLOSED":
|
269
|
case "RESTRICTED": {
|
270
|
//acc_class_value = "Closed Access";
|
271
|
//4362#note-3
|
272
|
acc_class_id = "RESTRICTED";
|
273
|
acc_class_value = "Restricted";
|
274
|
break;
|
275
|
}
|
276
|
case "EMBARGO":
|
277
|
acc_class_value = "Embargo";
|
278
|
break;
|
279
|
default: {
|
280
|
acc_class_value = "not available";
|
281
|
acc_class_id = "UNKNOWN";
|
282
|
}
|
283
|
|
284
|
}
|
285
|
|
286
|
instance.addUrl(it.get("url").getAsString());
|
287
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
288
|
.setClassid(acc_class_id)
|
289
|
.setClassname(acc_class_value)
|
290
|
.setSchemeid("dnet:access_modes")
|
291
|
.setSchemename("dnet:access_modes")
|
292
|
.build());
|
293
|
|
294
|
final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
|
295
|
final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
|
296
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
297
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
298
|
.setValue(name)
|
299
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
300
|
.build();
|
301
|
|
302
|
instance.setCollectedfrom(collectedFrom);
|
303
|
}
|
304
|
|
305
|
return instance.build();
|
306
|
}).forEach(result::addInstance);
|
307
|
|
308
|
//Adding DOI URL as Instance
|
309
|
final String doiURL = getStringValue(rootElement, "doi-url");
|
310
|
JsonObject hostedByOpenAire = null;
|
311
|
if (rootElement.has("hostedByOpenAire")) {
|
312
|
hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
|
313
|
}
|
314
|
final String publisher = getStringValue(rootElement, "publisher");
|
315
|
if (StringUtils.isNotBlank(doiURL)) {
|
316
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
317
|
instance.addUrl(doiURL);
|
318
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
319
|
.setClassid(cobjValue)
|
320
|
.setClassname(typeValue)
|
321
|
.setSchemeid("dnet:publication_resource")
|
322
|
.setSchemename("dnet:publication_resource")
|
323
|
.build());
|
324
|
|
325
|
//#4362: if the publisher is Scielo, then the result is OPEN
|
326
|
|
327
|
String accessModeId = "RESTRICTED";
|
328
|
String accessModeName = "Restricted";
|
329
|
if(publisher.equalsIgnoreCase("FapUNIFESP (SciELO)")){
|
330
|
accessModeId = "OPEN";
|
331
|
accessModeName = "Open Access";
|
332
|
}
|
333
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
334
|
.setClassid(accessModeId)
|
335
|
.setClassname(accessModeName)
|
336
|
.setSchemeid("dnet:access_modes")
|
337
|
.setSchemename("dnet:access_modes")
|
338
|
.build());
|
339
|
instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
340
|
.setValue(CROSSREF)
|
341
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
|
342
|
.build());
|
343
|
|
344
|
if (hostedByOpenAire == null)
|
345
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
346
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
347
|
.setValue("Unknown Repository")
|
348
|
.build());
|
349
|
else {
|
350
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
351
|
.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
|
352
|
.setValue(hostedByOpenAire.get("name").getAsString())
|
353
|
.build());
|
354
|
}
|
355
|
|
356
|
result.addInstance(instance);
|
357
|
}
|
358
|
|
359
|
//Create Metadata Proto
|
360
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
361
|
|
362
|
Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
|
363
|
|
364
|
if (authorsOrganizations.getKey().size() > 0) {
|
365
|
metadata.addAllAuthor(authorsOrganizations.getKey());
|
366
|
} else {
|
367
|
//Should never enter here becasue of the isValid method at the beginning.
|
368
|
context.incrementCounter("filtered", "unexpected_no_authors", 1);
|
369
|
return null;
|
370
|
}
|
371
|
//adding Language
|
372
|
metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
|
373
|
.setClassid("und")
|
374
|
.setClassname("Undetermined")
|
375
|
.setSchemeid(DNET_LANGUAGES)
|
376
|
.setSchemename(DNET_LANGUAGES)
|
377
|
.build());
|
378
|
|
379
|
//Adding subjects
|
380
|
List<String> subjects = getArrayValues(rootElement, "subject");
|
381
|
|
382
|
subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
|
383
|
.setValue(s)
|
384
|
.setQualifier(getQualifier("keyword", "dnet:subject"))
|
385
|
.build()));
|
386
|
|
387
|
List<String> titles = getCleanedTitles(rootElement);
|
388
|
titles.forEach(t ->
|
389
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
390
|
.setValue(t)
|
391
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
392
|
.build()));
|
393
|
|
394
|
settingRelevantDate(rootElement, metadata, "issued", "issued", true);
|
395
|
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
|
396
|
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
|
397
|
settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
|
398
|
|
399
|
getArrayObjects(rootElement, "abstract").forEach(d ->
|
400
|
{
|
401
|
if (MAG.equals(d.get("provenance").getAsString()))
|
402
|
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
|
403
|
else
|
404
|
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
|
405
|
}
|
406
|
);
|
407
|
|
408
|
//Adding Journal
|
409
|
|
410
|
if (StringUtils.isNotBlank(publisher)) {
|
411
|
|
412
|
final FieldTypeProtos.Journal.Builder journal = FieldTypeProtos.Journal.newBuilder().setName(publisher);
|
413
|
|
414
|
if (hasJSONArrayField(rootElement, "issn")) {
|
415
|
StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
|
416
|
.map(JsonElement::getAsJsonObject)
|
417
|
.forEach(it -> {
|
418
|
final String issntype = getStringValue(it, "type");
|
419
|
final String value = getStringValue(it, "value");
|
420
|
if ("electronic".equals(issntype)) {
|
421
|
journal.setIssnOnline(value);
|
422
|
}
|
423
|
if ("print".equals(issntype))
|
424
|
journal.setIssnPrinted(value);
|
425
|
});
|
426
|
}
|
427
|
metadata.setJournal(journal.build());
|
428
|
}
|
429
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
430
|
result.setMetadata(metadata.build());
|
431
|
entity.setResult(result.build());
|
432
|
oaf.setEntity(entity.build());
|
433
|
|
434
|
//System.out.println(JsonFormat.printToString(oaf.build()));
|
435
|
|
436
|
final List<AtomicAction> actionList = new ArrayList<>();
|
437
|
|
438
|
if (!onlyOrganization)
|
439
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
440
|
|
441
|
if (!authorsOrganizations.getValue().isEmpty()) {
|
442
|
|
443
|
authorsOrganizations.getValue().forEach(o ->
|
444
|
{
|
445
|
|
446
|
actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
|
447
|
if (!onlyOrganization)
|
448
|
actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
|
449
|
final String gridOrganization = getSimilarGridOrganization(o.getEntity());
|
450
|
if (gridOrganization != null) {
|
451
|
actionList.add(factory
|
452
|
.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
|
453
|
"".getBytes()));
|
454
|
actionList.add(factory
|
455
|
.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
|
456
|
"".getBytes()));
|
457
|
}
|
458
|
});
|
459
|
}
|
460
|
return actionList;
|
461
|
|
462
|
}
|
463
|
|
464
|
private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
|
465
|
|
466
|
final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
|
467
|
if (pidList != null) {
|
468
|
for (FieldTypeProtos.StructuredProperty p : pidList) {
|
469
|
if (p.getQualifier().getClassname().equals("grid")) {
|
470
|
return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
|
471
|
}
|
472
|
}
|
473
|
}
|
474
|
return null;
|
475
|
|
476
|
}
|
477
|
|
478
|
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
|
479
|
final OafProtos.Oaf organization,
|
480
|
final ActionFactory factory,
|
481
|
final String setName,
|
482
|
final Agent agent) {
|
483
|
|
484
|
List<AtomicAction> result = new ArrayList<>();
|
485
|
|
486
|
final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
|
487
|
roaf.setKind(KindProtos.Kind.relation);
|
488
|
|
489
|
roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
490
|
.setInvisible(false)
|
491
|
.setDeletedbyinference(false)
|
492
|
.setInferred(false)
|
493
|
.setTrust("0.9")
|
494
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
495
|
.build());
|
496
|
|
497
|
final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
|
498
|
|
499
|
rel.setRelType(RelTypeProtos.RelType.resultOrganization);
|
500
|
rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
|
501
|
|
502
|
//Create a relation Result --> Organization
|
503
|
rel.setSource(publication.getEntity().getId());
|
504
|
rel.setTarget(organization.getEntity().getId());
|
505
|
rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
|
506
|
|
507
|
final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
|
508
|
|
509
|
final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
|
510
|
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
511
|
.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
|
512
|
.build());
|
513
|
rel_instance.setAffiliation(affiliationRel.build());
|
514
|
rel.setResultOrganization(rel_instance.build());
|
515
|
|
516
|
rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
517
|
.setValue(datasources.get(MAG.toLowerCase()).getKey())
|
518
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
|
519
|
.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
|
520
|
.build());
|
521
|
|
522
|
rel.setChild(false);
|
523
|
roaf.setRel(rel.build());
|
524
|
|
525
|
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
|
526
|
organization.getEntity().getId(), roaf.build().toByteArray()));
|
527
|
|
528
|
//Create a relation Organization --> Result
|
529
|
rel.setTarget(publication.getEntity().getId());
|
530
|
rel.setSource(organization.getEntity().getId());
|
531
|
rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
|
532
|
|
533
|
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
534
|
.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
|
535
|
.build());
|
536
|
rel_instance.setAffiliation(affiliationRel.build());
|
537
|
rel.setResultOrganization(rel_instance.build());
|
538
|
roaf.setRel(rel.build());
|
539
|
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
|
540
|
publication.getEntity().getId(), roaf.build().toByteArray()));
|
541
|
|
542
|
return result;
|
543
|
|
544
|
}
|
545
|
|
546
|
private static boolean hasJSONArrayField(final JsonObject root, final String key) {
|
547
|
return root.has(key) && root.get(key).isJsonArray();
|
548
|
}
|
549
|
|
550
|
private static void settingRelevantDate(JsonObject rootElement,
|
551
|
ResultProtos.Result.Metadata.Builder metadata,
|
552
|
final String jsonKey,
|
553
|
final String dictionaryKey,
|
554
|
final boolean addToDateOfAcceptance) {
|
555
|
//Adding date
|
556
|
String date = getStringValue(rootElement, jsonKey);
|
557
|
if (date == null)
|
558
|
return;
|
559
|
if (date.length() == 4) {
|
560
|
date += "-01-01";
|
561
|
}
|
562
|
if (isValidDate(date)) {
|
563
|
if (addToDateOfAcceptance)
|
564
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
|
565
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
566
|
.setValue(date)
|
567
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
568
|
.build());
|
569
|
}
|
570
|
}
|
571
|
|
572
|
public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
|
573
|
FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
|
574
|
if (StringUtils.contains(value, "orcid.org")) {
|
575
|
return pid.setValue(value.replaceAll("https://orcid.org/", "").replaceAll("http://orcid.org/",""))
|
576
|
.setKey(ORCID).build();
|
577
|
}
|
578
|
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
|
579
|
return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
|
580
|
.setKey("MAG Identifier").build();
|
581
|
}
|
582
|
return pid.setValue(value)
|
583
|
.setKey("URL").build();
|
584
|
}
|
585
|
|
586
|
public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
|
587
|
final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
|
588
|
final List<String> magId = new ArrayList<>();
|
589
|
getArrayObjects(affiliation, "identifiers").forEach(it -> {
|
590
|
if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
|
591
|
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
|
592
|
magId.add(it.get("value").getAsString());
|
593
|
} else
|
594
|
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
|
595
|
});
|
596
|
if (magId.size() > 0) {
|
597
|
final String microsoftID = magId.get(0);
|
598
|
OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
599
|
oaf.setKind(KindProtos.Kind.entity);
|
600
|
OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
|
601
|
entity.setType(TypeProtos.Type.organization);
|
602
|
entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
|
603
|
final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
|
604
|
final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
|
605
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
606
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
607
|
.setValue(name)
|
608
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
609
|
.build();
|
610
|
entity.addCollectedfrom(collectedFrom);
|
611
|
} else {
|
612
|
return null;
|
613
|
}
|
614
|
entity.addOriginalId(microsoftID);
|
615
|
|
616
|
affiliationIdentifiers.forEach((key, value) -> entity.addPid(
|
617
|
FieldTypeProtos.StructuredProperty.newBuilder()
|
618
|
.setQualifier(value)
|
619
|
.setValue(key)
|
620
|
.build()));
|
621
|
|
622
|
final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
|
623
|
organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
|
624
|
.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
|
625
|
.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
|
626
|
.build());
|
627
|
|
628
|
entity.setOrganization(organization);
|
629
|
oaf.setEntity(entity);
|
630
|
oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
631
|
.setInvisible(false)
|
632
|
.setDeletedbyinference(false)
|
633
|
.setInferred(false)
|
634
|
.setTrust("0.9")
|
635
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
636
|
.build());
|
637
|
return oaf.build();
|
638
|
}
|
639
|
return null;
|
640
|
}
|
641
|
|
642
|
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
|
643
|
|
644
|
final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
|
645
|
|
646
|
List<JsonObject> authors = getArrayObjects(root, "authors");
|
647
|
|
648
|
final AtomicInteger counter = new AtomicInteger(1);
|
649
|
|
650
|
List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
|
651
|
final String given = getStringValue(author, "given");
|
652
|
final String family = getStringValue(author, "family");
|
653
|
String fullname = getStringValue(author, "fullname");
|
654
|
|
655
|
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
656
|
fullname = String.format("%s %s", given, family);
|
657
|
}
|
658
|
|
659
|
if (!isValidAuthorName(fullname, null)) {
|
660
|
return null;
|
661
|
}
|
662
|
final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
|
663
|
|
664
|
if (StringUtils.isNotBlank(given))
|
665
|
abuilder.setName(given);
|
666
|
if (StringUtils.isNotBlank(family))
|
667
|
abuilder.setSurname(family);
|
668
|
if (StringUtils.isNotBlank(fullname))
|
669
|
abuilder.setFullname(fullname);
|
670
|
|
671
|
final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
|
672
|
final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
|
673
|
|
674
|
authorAffiliation.forEach(it ->
|
675
|
{
|
676
|
OafProtos.Oaf org = createOrganizationFromJSON(it);
|
677
|
if (org != null) {
|
678
|
affiliations.put(org.getEntity().getId(), org);
|
679
|
abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
|
680
|
}
|
681
|
});
|
682
|
identifiers.stream().map(id -> {
|
683
|
final String value = id.get("value").getAsString();
|
684
|
return extractIdentifier(value);
|
685
|
}).collect(
|
686
|
Collectors.toMap(
|
687
|
FieldTypeProtos.KeyValue::getKey,
|
688
|
Function.identity(),
|
689
|
(a, b) -> a
|
690
|
)).values().forEach(abuilder::addPid);
|
691
|
abuilder.setRank(counter.getAndIncrement());
|
692
|
|
693
|
return abuilder.build();
|
694
|
|
695
|
}).filter(Objects::nonNull).collect(Collectors.toList());
|
696
|
|
697
|
return new Pair<>(collect, affiliations.values());
|
698
|
}
|
699
|
|
700
|
}
|