1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import java.io.ByteArrayOutputStream;
|
4
|
import java.io.IOException;
|
5
|
import java.io.InputStream;
|
6
|
import java.util.*;
|
7
|
import java.util.concurrent.atomic.AtomicInteger;
|
8
|
import java.util.function.Function;
|
9
|
import java.util.stream.Collectors;
|
10
|
import java.util.stream.Stream;
|
11
|
import java.util.zip.Inflater;
|
12
|
|
13
|
import com.google.common.collect.Lists;
|
14
|
import com.google.gson.Gson;
|
15
|
import com.google.gson.JsonElement;
|
16
|
import com.google.gson.JsonObject;
|
17
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
18
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
19
|
import eu.dnetlib.actionmanager.common.Agent;
|
20
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
21
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
22
|
import eu.dnetlib.data.proto.*;
|
23
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
24
|
import eu.dnetlib.miscutils.collections.Pair;
|
25
|
import org.apache.commons.codec.binary.Base64;
|
26
|
import org.apache.commons.io.IOUtils;
|
27
|
import org.apache.commons.lang3.StringUtils;
|
28
|
|
29
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
|
30
|
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
|
31
|
|
32
|
public class DOIBoostToActions {
|
33
|
|
34
|
public static final String MAG = "MAG";
|
35
|
public static final String ORCID = "ORCID";
|
36
|
public static final String CROSSREF = "Crossref";
|
37
|
public static final String UNPAYWALL = "UnpayWall";
|
38
|
|
39
|
public static final String GRID_AC = "grid.ac";
|
40
|
public static final String WIKPEDIA = "wikpedia";
|
41
|
|
42
|
public final static String doiBoostNSPREFIX = "doiboost____";
|
43
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
44
|
|
45
|
public static final String SEPARATOR = "::";
|
46
|
public static final String DNET_LANGUAGES = "dnet:languages";
|
47
|
|
48
|
private static final List<String> DATE_TYPES = Lists.newArrayList("issued", "accepted", "published-online", "published-print");
|
49
|
|
50
|
|
51
|
|
52
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
|
53
|
put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
|
54
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
55
|
put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
|
56
|
put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
|
57
|
|
58
|
}};
|
59
|
|
60
|
private static String decompressAbstract(final String abstractCompressed) {
|
61
|
try {
|
62
|
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
|
63
|
final Inflater decompresser = new Inflater();
|
64
|
decompresser.setInput(byteArray);
|
65
|
final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
|
66
|
byte[] buffer = new byte[8192];
|
67
|
while (!decompresser.finished()) {
|
68
|
int size = decompresser.inflate(buffer);
|
69
|
bos.write(buffer, 0, size);
|
70
|
}
|
71
|
byte[] unzippeddata = bos.toByteArray();
|
72
|
decompresser.end();
|
73
|
return new String(unzippeddata);
|
74
|
} catch (Throwable e) {
|
75
|
System.out.println("Wrong abstract:" + abstractCompressed);
|
76
|
throw new RuntimeException(e);
|
77
|
}
|
78
|
}
|
79
|
|
80
|
/** Scheme of the persistent identifier qualifiers. */
public static final String PID_TYPES = "dnet:pid_types";

/**
 * PID qualifiers for organization identifiers, keyed by identifier schema
 * ({@link #MAG}, {@link #GRID_AC}, {@link #WIKPEDIA}). Read-only after class initialization.
 */
private static final Map<String, FieldTypeProtos.Qualifier> affiliationPIDType;

static {
    // Plain HashMap instead of double-brace initialization, which creates an anonymous subclass.
    final Map<String, FieldTypeProtos.Qualifier> bySchema = new HashMap<>();
    bySchema.put(MAG, FieldTypeProtos.Qualifier.newBuilder()
            .setClassid("mag_id")
            .setClassname("Microsoft Academic Graph Identifier")
            .setSchemename(PID_TYPES)
            .setSchemeid(PID_TYPES)
            .build());
    bySchema.put(GRID_AC, getQualifier("grid", PID_TYPES));
    bySchema.put(WIKPEDIA, getQualifier("urn", PID_TYPES));
    affiliationPIDType = Collections.unmodifiableMap(bySchema);
}
|
87
|
|
88
|
static Map<String, Map<String, String>> typologiesMapping;
|
89
|
|
90
|
static {
|
91
|
try {
|
92
|
final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
|
93
|
final String tt = IOUtils.toString(is);
|
94
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
95
|
} catch (IOException e) {
|
96
|
e.printStackTrace();
|
97
|
}
|
98
|
}
|
99
|
|
100
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
101
|
|
102
|
final String doi = getStringValue(rootElement, "doi");
|
103
|
if (doi == null) {
|
104
|
context.incrementCounter("filtered", "no_doi", 1);
|
105
|
return false;
|
106
|
}
|
107
|
final String type = getStringValue(rootElement, "type");
|
108
|
if (!typologiesMapping.containsKey(type)) {
|
109
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
110
|
return false;
|
111
|
}
|
112
|
// fixes #4360 (test publisher)
|
113
|
final String publisher = getStringValue(rootElement, "publisher");
|
114
|
if (StringUtils.isNotBlank(publisher) && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
|
115
|
context.incrementCounter("filtered", "test_publisher", 1);
|
116
|
return false;
|
117
|
}
|
118
|
|
119
|
List<JsonObject> authors = getArrayObjects(rootElement, "authors");
|
120
|
boolean hasAuthors = false;
|
121
|
for (JsonObject author : authors) {
|
122
|
final String given = getStringValue(author, "given");
|
123
|
final String family = getStringValue(author, "family");
|
124
|
String fullname = getStringValue(author, "fullname");
|
125
|
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
126
|
fullname = String.format("%s %s", given, family);
|
127
|
}
|
128
|
// fixes #4368
|
129
|
if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
|
130
|
context.incrementCounter("invalid_author", "addiejackson", 1);
|
131
|
context.incrementCounter("filtered", "invalid_authors", 1);
|
132
|
return false;
|
133
|
}
|
134
|
if (isValidAuthorName(fullname, context)) hasAuthors = true;
|
135
|
}
|
136
|
|
137
|
if (!hasAuthors) {
|
138
|
context.incrementCounter("filtered", "invalid_authors", 1);
|
139
|
return false;
|
140
|
}
|
141
|
// fixes #4360
|
142
|
if (getCleanedTitles(rootElement).isEmpty()) {
|
143
|
context.incrementCounter("filtered", "invalid_title", 1);
|
144
|
return false;
|
145
|
}
|
146
|
|
147
|
return true;
|
148
|
}
|
149
|
|
150
|
private static List<String> getCleanedTitles(final JsonObject rootElement) {
|
151
|
List<String> titles = getArrayValues(rootElement, "title");
|
152
|
return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
|
153
|
}
|
154
|
|
155
|
private static boolean isValidAuthorName(final String fullName, final Reporter context) {
|
156
|
if (StringUtils.isBlank(fullName)) {
|
157
|
if(context != null) context.incrementCounter("invalid_author", "blank", 1);
|
158
|
return false;
|
159
|
}
|
160
|
// fixes #4391 and subtasks related to DOIBoost
|
161
|
switch (StringUtils.lowerCase(fullName)) {
|
162
|
case ",":
|
163
|
case "none none":
|
164
|
case "none, none":
|
165
|
case "none &na;":
|
166
|
case "(:null)":
|
167
|
case "test test test":
|
168
|
case "test test":
|
169
|
case "test":
|
170
|
case "&na; &na;": {
|
171
|
if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
|
172
|
return false;
|
173
|
}
|
174
|
}
|
175
|
return true;
|
176
|
}
|
177
|
|
178
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
179
|
final ActionFactory factory,
|
180
|
final String setName,
|
181
|
final Agent agent,
|
182
|
boolean invisible,
|
183
|
final boolean onlyOrganization,
|
184
|
final Reporter context) {
|
185
|
|
186
|
if (!isValid(rootElement, context)) return null;
|
187
|
|
188
|
//Create OAF Proto
|
189
|
|
190
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
191
|
//Add Data Info
|
192
|
oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
193
|
.setInvisible(invisible)
|
194
|
.setDeletedbyinference(false)
|
195
|
.setInferred(false)
|
196
|
.setTrust("0.9")
|
197
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
198
|
.build());
|
199
|
|
200
|
//Adding Kind
|
201
|
oaf.setKind(KindProtos.Kind.entity);
|
202
|
|
203
|
//creating Result Proto
|
204
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
205
|
|
206
|
entity.setDateofcollection("2019-02-15");
|
207
|
|
208
|
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
|
209
|
StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
|
210
|
.map(JsonElement::getAsString)
|
211
|
.forEach(cf -> {
|
212
|
final String id = datasources.get(cf.toLowerCase()).getValue();
|
213
|
final String name = datasources.get(cf.toLowerCase()).getKey();
|
214
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
215
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
216
|
.setValue(name)
|
217
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
218
|
.build();
|
219
|
entity.addCollectedfrom(collectedFrom);
|
220
|
}
|
221
|
}
|
222
|
);
|
223
|
}
|
224
|
//Adding identifier
|
225
|
final String doi = getStringValue(rootElement, "doi");
|
226
|
entity.addOriginalId(doi);
|
227
|
|
228
|
final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
|
229
|
entity.setId(sourceId);
|
230
|
|
231
|
entity.addPid(FieldTypeProtos.StructuredProperty.newBuilder()
|
232
|
.setValue(doi)
|
233
|
.setQualifier(getQualifier("doi", PID_TYPES))
|
234
|
.build());
|
235
|
|
236
|
//Create Result Field
|
237
|
ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
238
|
|
239
|
final String type = getStringValue(rootElement, "type");
|
240
|
|
241
|
//Adding Instances
|
242
|
final String typeValue = typologiesMapping.get(type).get("value");
|
243
|
final String cobjValue = typologiesMapping.get(type).get("cobj");
|
244
|
|
245
|
// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
|
246
|
List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
|
247
|
String provS = getStringValue(prov, "provenance");
|
248
|
if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
|
249
|
else return false;
|
250
|
}).collect(Collectors.toList());
|
251
|
|
252
|
Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
|
253
|
{
|
254
|
ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
255
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
256
|
.setClassid(cobjValue)
|
257
|
.setClassname(typeValue)
|
258
|
.setSchemeid("dnet:publication_resource")
|
259
|
.setSchemename("dnet:publication_resource")
|
260
|
.build());
|
261
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
262
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
263
|
.setValue("Unknown Repository")
|
264
|
.build());
|
265
|
|
266
|
String acc_class_id = it.get("access-rights").getAsString();
|
267
|
String acc_class_value;
|
268
|
switch (acc_class_id) {
|
269
|
case "OPEN": {
|
270
|
acc_class_value = "Open Access";
|
271
|
break;
|
272
|
}
|
273
|
case "CLOSED":
|
274
|
case "RESTRICTED": {
|
275
|
//acc_class_value = "Closed Access";
|
276
|
//4362#note-3
|
277
|
acc_class_id = "RESTRICTED";
|
278
|
acc_class_value = "Restricted";
|
279
|
break;
|
280
|
}
|
281
|
case "EMBARGO":
|
282
|
acc_class_value = "Embargo";
|
283
|
break;
|
284
|
default: {
|
285
|
acc_class_value = "not available";
|
286
|
acc_class_id = "UNKNOWN";
|
287
|
}
|
288
|
|
289
|
}
|
290
|
|
291
|
instance.addUrl(it.get("url").getAsString());
|
292
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
293
|
.setClassid(acc_class_id)
|
294
|
.setClassname(acc_class_value)
|
295
|
.setSchemeid("dnet:access_modes")
|
296
|
.setSchemename("dnet:access_modes")
|
297
|
.build());
|
298
|
|
299
|
final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
|
300
|
final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
|
301
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
302
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
303
|
.setValue(name)
|
304
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
305
|
.build();
|
306
|
|
307
|
instance.setCollectedfrom(collectedFrom);
|
308
|
}
|
309
|
|
310
|
return instance.build();
|
311
|
}).forEach(result::addInstance);
|
312
|
|
313
|
//Adding DOI URL as Instance
|
314
|
final String doiURL = getStringValue(rootElement, "doi-url");
|
315
|
JsonObject hostedByOpenAire = null;
|
316
|
if (rootElement.has("hostedByOpenAire")) {
|
317
|
hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
|
318
|
}
|
319
|
final String publisher = getStringValue(rootElement, "publisher");
|
320
|
if (StringUtils.isNotBlank(doiURL)) {
|
321
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
322
|
instance.addUrl(doiURL);
|
323
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
324
|
.setClassid(cobjValue)
|
325
|
.setClassname(typeValue)
|
326
|
.setSchemeid("dnet:publication_resource")
|
327
|
.setSchemename("dnet:publication_resource")
|
328
|
.build());
|
329
|
|
330
|
//#4362: if the publisher is Scielo, then the result is OPEN
|
331
|
|
332
|
String accessModeId = "RESTRICTED";
|
333
|
String accessModeName = "Restricted";
|
334
|
if(publisher != null && publisher.equalsIgnoreCase("FapUNIFESP (SciELO)")){
|
335
|
accessModeId = "OPEN";
|
336
|
accessModeName = "Open Access";
|
337
|
}
|
338
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
339
|
.setClassid(accessModeId)
|
340
|
.setClassname(accessModeName)
|
341
|
.setSchemeid("dnet:access_modes")
|
342
|
.setSchemename("dnet:access_modes")
|
343
|
.build());
|
344
|
instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
345
|
.setValue(CROSSREF)
|
346
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
|
347
|
.build());
|
348
|
|
349
|
if (hostedByOpenAire == null)
|
350
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
351
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
352
|
.setValue("Unknown Repository")
|
353
|
.build());
|
354
|
else {
|
355
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
356
|
.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
|
357
|
.setValue(hostedByOpenAire.get("name").getAsString())
|
358
|
.build());
|
359
|
}
|
360
|
|
361
|
result.addInstance(instance);
|
362
|
}
|
363
|
|
364
|
//Create Metadata Proto
|
365
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
366
|
|
367
|
Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
|
368
|
|
369
|
if (authorsOrganizations.getKey().size() > 0) {
|
370
|
metadata.addAllAuthor(authorsOrganizations.getKey());
|
371
|
} else {
|
372
|
//Should never enter here becasue of the isValid method at the beginning.
|
373
|
context.incrementCounter("filtered", "unexpected_no_authors", 1);
|
374
|
return null;
|
375
|
}
|
376
|
//adding Language
|
377
|
metadata.setLanguage(FieldTypeProtos.Qualifier.newBuilder()
|
378
|
.setClassid("und")
|
379
|
.setClassname("Undetermined")
|
380
|
.setSchemeid(DNET_LANGUAGES)
|
381
|
.setSchemename(DNET_LANGUAGES)
|
382
|
.build());
|
383
|
|
384
|
//Adding subjects
|
385
|
List<String> subjects = getArrayValues(rootElement, "subject");
|
386
|
|
387
|
subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
|
388
|
.setValue(s)
|
389
|
.setQualifier(getQualifier("keyword", "dnet:subject"))
|
390
|
.build()));
|
391
|
|
392
|
List<String> titles = getCleanedTitles(rootElement);
|
393
|
titles.forEach(t ->
|
394
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
395
|
.setValue(t)
|
396
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
397
|
.build()));
|
398
|
|
399
|
|
400
|
final String firstValidDate = getFirstValidDate(rootElement);
|
401
|
if (StringUtils.isNotBlank(firstValidDate)) {
|
402
|
setDate(metadata, "issued", firstValidDate, true);
|
403
|
} else {
|
404
|
context.incrementCounter("filtered", "missing_date", 1);
|
405
|
return null;
|
406
|
}
|
407
|
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
|
408
|
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
|
409
|
settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
|
410
|
|
411
|
getArrayObjects(rootElement, "abstract").forEach(d ->
|
412
|
{
|
413
|
if (MAG.equals(d.get("provenance").getAsString()) && d.get("value")!= null && !d.get("value").isJsonNull())
|
414
|
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(decompressAbstract(d.get("value").getAsString())).build());
|
415
|
else if (d.get("value")!= null && !d.get("value").isJsonNull())
|
416
|
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
|
417
|
}
|
418
|
);
|
419
|
|
420
|
//Adding Journal
|
421
|
|
422
|
if (StringUtils.isNotBlank(publisher)) {
|
423
|
|
424
|
final FieldTypeProtos.Journal.Builder journal = FieldTypeProtos.Journal.newBuilder().setName(publisher);
|
425
|
|
426
|
if (hasJSONArrayField(rootElement, "issn")) {
|
427
|
StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
|
428
|
.map(JsonElement::getAsJsonObject)
|
429
|
.forEach(it -> {
|
430
|
final String issntype = getStringValue(it, "type");
|
431
|
final String value = getStringValue(it, "value");
|
432
|
if ("electronic".equals(issntype)) {
|
433
|
journal.setIssnOnline(value);
|
434
|
}
|
435
|
if ("print".equals(issntype))
|
436
|
journal.setIssnPrinted(value);
|
437
|
});
|
438
|
}
|
439
|
metadata.setJournal(journal.build());
|
440
|
}
|
441
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
442
|
result.setMetadata(metadata.build());
|
443
|
entity.setResult(result.build());
|
444
|
oaf.setEntity(entity.build());
|
445
|
|
446
|
//System.out.println(JsonFormat.printToString(oaf.build()));
|
447
|
|
448
|
final List<AtomicAction> actionList = new ArrayList<>();
|
449
|
|
450
|
if (!onlyOrganization)
|
451
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
452
|
|
453
|
if (!authorsOrganizations.getValue().isEmpty()) {
|
454
|
|
455
|
authorsOrganizations.getValue().forEach(o ->
|
456
|
{
|
457
|
|
458
|
actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organization", "body", o.toByteArray()));
|
459
|
if (!onlyOrganization)
|
460
|
actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
|
461
|
final String gridOrganization = getSimilarGridOrganization(o.getEntity());
|
462
|
if (gridOrganization != null) {
|
463
|
actionList.add(factory
|
464
|
.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
|
465
|
"".getBytes()));
|
466
|
actionList.add(factory
|
467
|
.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
|
468
|
"".getBytes()));
|
469
|
}
|
470
|
});
|
471
|
}
|
472
|
return actionList;
|
473
|
|
474
|
}
|
475
|
|
476
|
private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
|
477
|
|
478
|
final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
|
479
|
if (pidList != null) {
|
480
|
for (FieldTypeProtos.StructuredProperty p : pidList) {
|
481
|
if (p.getQualifier().getClassname().equals("grid")) {
|
482
|
return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
|
483
|
}
|
484
|
}
|
485
|
}
|
486
|
return null;
|
487
|
|
488
|
}
|
489
|
|
490
|
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
|
491
|
final OafProtos.Oaf organization,
|
492
|
final ActionFactory factory,
|
493
|
final String setName,
|
494
|
final Agent agent) {
|
495
|
|
496
|
List<AtomicAction> result = new ArrayList<>();
|
497
|
|
498
|
final OafProtos.Oaf.Builder roaf = OafProtos.Oaf.newBuilder();
|
499
|
roaf.setKind(KindProtos.Kind.relation);
|
500
|
|
501
|
roaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
502
|
.setInvisible(false)
|
503
|
.setDeletedbyinference(false)
|
504
|
.setInferred(false)
|
505
|
.setTrust("0.9")
|
506
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
507
|
.build());
|
508
|
|
509
|
final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
|
510
|
|
511
|
rel.setRelType(RelTypeProtos.RelType.resultOrganization);
|
512
|
rel.setSubRelType(RelTypeProtos.SubRelType.affiliation);
|
513
|
|
514
|
//Create a relation Result --> Organization
|
515
|
rel.setSource(publication.getEntity().getId());
|
516
|
rel.setTarget(organization.getEntity().getId());
|
517
|
rel.setRelClass(ResultOrganization.Affiliation.RelName.hasAuthorInstitution.toString());
|
518
|
|
519
|
final ResultOrganization.Builder rel_instance = ResultOrganization.newBuilder();
|
520
|
|
521
|
final ResultOrganization.Affiliation.Builder affiliationRel = ResultOrganization.Affiliation.newBuilder();
|
522
|
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
523
|
.setSemantics(getQualifier("hasAuthorInstitution", "dnet:result_organization_relations"))
|
524
|
.build());
|
525
|
rel_instance.setAffiliation(affiliationRel.build());
|
526
|
rel.setResultOrganization(rel_instance.build());
|
527
|
|
528
|
rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
529
|
.setValue(datasources.get(MAG.toLowerCase()).getKey())
|
530
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
|
531
|
.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
|
532
|
.build());
|
533
|
|
534
|
rel.setChild(false);
|
535
|
roaf.setRel(rel.build());
|
536
|
|
537
|
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
|
538
|
organization.getEntity().getId(), roaf.build().toByteArray()));
|
539
|
|
540
|
//Create a relation Organization --> Result
|
541
|
rel.setTarget(publication.getEntity().getId());
|
542
|
rel.setSource(organization.getEntity().getId());
|
543
|
rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
|
544
|
|
545
|
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
546
|
.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
|
547
|
.build());
|
548
|
rel_instance.setAffiliation(affiliationRel.build());
|
549
|
rel.setResultOrganization(rel_instance.build());
|
550
|
roaf.setRel(rel.build());
|
551
|
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
|
552
|
publication.getEntity().getId(), roaf.build().toByteArray()));
|
553
|
|
554
|
return result;
|
555
|
|
556
|
}
|
557
|
|
558
|
private static boolean hasJSONArrayField(final JsonObject root, final String key) {
|
559
|
return root.has(key) && root.get(key).isJsonArray();
|
560
|
}
|
561
|
|
562
|
private static String getFirstValidDate(final JsonObject root) {
|
563
|
return DATE_TYPES.stream()
|
564
|
.map(type -> getStringValue(root, type))
|
565
|
.filter(Objects::nonNull)
|
566
|
.filter(DumpToActionsUtility::isValidDate)
|
567
|
.findFirst()
|
568
|
.orElse("");
|
569
|
}
|
570
|
|
571
|
private static void setDate(ResultProtos.Result.Metadata.Builder metadata,
|
572
|
final String dictionaryKey,
|
573
|
final String date,
|
574
|
final boolean addToDateOfAcceptance) {
|
575
|
if (date == null)
|
576
|
return;
|
577
|
if (addToDateOfAcceptance) {
|
578
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
|
579
|
} else {
|
580
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
581
|
.setValue(date)
|
582
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
583
|
.build());
|
584
|
}
|
585
|
}
|
586
|
|
587
|
private static void settingRelevantDate(JsonObject rootElement,
|
588
|
ResultProtos.Result.Metadata.Builder metadata,
|
589
|
final String jsonKey,
|
590
|
final String dictionaryKey,
|
591
|
final boolean addToDateOfAcceptance) {
|
592
|
//Adding date
|
593
|
String date = getStringValue(rootElement, jsonKey);
|
594
|
if (date == null)
|
595
|
return;
|
596
|
if (date.length() == 4) {
|
597
|
date += "-01-01";
|
598
|
}
|
599
|
if (isValidDate(date)) {
|
600
|
if (addToDateOfAcceptance)
|
601
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
|
602
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
603
|
.setValue(date)
|
604
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
605
|
.build());
|
606
|
}
|
607
|
}
|
608
|
|
609
|
public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
|
610
|
FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
|
611
|
if (StringUtils.contains(value, "orcid.org")) {
|
612
|
return pid.setValue(value.replaceAll("https://orcid.org/", "").replaceAll("http://orcid.org/",""))
|
613
|
.setKey(ORCID).build();
|
614
|
}
|
615
|
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
|
616
|
return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
|
617
|
.setKey("MAG Identifier").build();
|
618
|
}
|
619
|
return pid.setValue(value)
|
620
|
.setKey("URL").build();
|
621
|
}
|
622
|
|
623
|
public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
|
624
|
final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
|
625
|
final List<String> magId = new ArrayList<>();
|
626
|
getArrayObjects(affiliation, "identifiers").forEach(it -> {
|
627
|
if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
|
628
|
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
|
629
|
magId.add(it.get("value").getAsString());
|
630
|
} else
|
631
|
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
|
632
|
});
|
633
|
if (magId.size() > 0) {
|
634
|
final String microsoftID = magId.get(0);
|
635
|
OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
636
|
oaf.setKind(KindProtos.Kind.entity);
|
637
|
OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
|
638
|
entity.setType(TypeProtos.Type.organization);
|
639
|
entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
|
640
|
final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
|
641
|
final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
|
642
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
643
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
644
|
.setValue(name)
|
645
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, SEPARATOR)))
|
646
|
.build();
|
647
|
entity.addCollectedfrom(collectedFrom);
|
648
|
} else {
|
649
|
return null;
|
650
|
}
|
651
|
entity.addOriginalId(microsoftID);
|
652
|
|
653
|
affiliationIdentifiers.forEach((key, value) -> entity.addPid(
|
654
|
FieldTypeProtos.StructuredProperty.newBuilder()
|
655
|
.setQualifier(value)
|
656
|
.setValue(key)
|
657
|
.build()));
|
658
|
|
659
|
final OrganizationProtos.Organization.Builder organization = OrganizationProtos.Organization.newBuilder();
|
660
|
organization.setMetadata(OrganizationProtos.Organization.Metadata.newBuilder()
|
661
|
.setWebsiteurl(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("official-page").getAsString()).build())
|
662
|
.setLegalname(FieldTypeProtos.StringField.newBuilder().setValue(affiliation.get("value").getAsString()).build())
|
663
|
.build());
|
664
|
|
665
|
entity.setOrganization(organization);
|
666
|
oaf.setEntity(entity);
|
667
|
oaf.setDataInfo(FieldTypeProtos.DataInfo.newBuilder()
|
668
|
.setInvisible(false)
|
669
|
.setDeletedbyinference(false)
|
670
|
.setInferred(false)
|
671
|
.setTrust("0.9")
|
672
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
673
|
.build());
|
674
|
return oaf.build();
|
675
|
}
|
676
|
return null;
|
677
|
}
|
678
|
|
679
|
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
|
680
|
|
681
|
final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
|
682
|
|
683
|
List<JsonObject> authors = getArrayObjects(root, "authors");
|
684
|
|
685
|
final AtomicInteger counter = new AtomicInteger(1);
|
686
|
|
687
|
List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
|
688
|
final String given = getStringValue(author, "given");
|
689
|
final String family = getStringValue(author, "family");
|
690
|
String fullname = getStringValue(author, "fullname");
|
691
|
|
692
|
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
693
|
fullname = String.format("%s %s", given, family);
|
694
|
}
|
695
|
|
696
|
if (!isValidAuthorName(fullname, null)) {
|
697
|
return null;
|
698
|
}
|
699
|
final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
|
700
|
|
701
|
if (StringUtils.isNotBlank(given))
|
702
|
abuilder.setName(given);
|
703
|
if (StringUtils.isNotBlank(family))
|
704
|
abuilder.setSurname(family);
|
705
|
if (StringUtils.isNotBlank(fullname))
|
706
|
abuilder.setFullname(fullname);
|
707
|
|
708
|
final List<JsonObject> identifiers = getArrayObjects(author, "identifiers");
|
709
|
final List<JsonObject> authorAffiliation = getArrayObjects(author, "affiliations");
|
710
|
|
711
|
authorAffiliation.forEach(it ->
|
712
|
{
|
713
|
OafProtos.Oaf org = createOrganizationFromJSON(it);
|
714
|
if (org != null) {
|
715
|
affiliations.put(org.getEntity().getId(), org);
|
716
|
abuilder.addAffiliation(org.getEntity().getOrganization().getMetadata().getLegalname());
|
717
|
}
|
718
|
});
|
719
|
identifiers.stream().map(id -> {
|
720
|
final String value = id.get("value").getAsString();
|
721
|
return extractIdentifier(value);
|
722
|
}).collect(
|
723
|
Collectors.toMap(
|
724
|
FieldTypeProtos.KeyValue::getKey,
|
725
|
Function.identity(),
|
726
|
(a, b) -> a
|
727
|
)).values().forEach(abuilder::addPid);
|
728
|
abuilder.setRank(counter.getAndIncrement());
|
729
|
|
730
|
return abuilder.build();
|
731
|
|
732
|
}).filter(Objects::nonNull).collect(Collectors.toList());
|
733
|
|
734
|
return new Pair<>(collect, affiliations.values());
|
735
|
}
|
736
|
|
737
|
}
|