1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
|
4
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
|
5
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
|
6
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
|
7
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
|
8
|
|
9
|
import java.io.IOException;
|
10
|
import java.io.InputStream;
|
11
|
import java.util.ArrayList;
|
12
|
import java.util.HashMap;
|
13
|
import java.util.List;
|
14
|
import java.util.Map;
|
15
|
|
16
|
import org.apache.commons.io.IOUtils;
|
17
|
import org.apache.commons.lang3.StringUtils;
|
18
|
|
19
|
import com.google.gson.Gson;
|
20
|
import com.google.gson.JsonArray;
|
21
|
import com.google.gson.JsonElement;
|
22
|
import com.google.gson.JsonObject;
|
23
|
import com.googlecode.protobuf.format.JsonFormat;
|
24
|
|
25
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
26
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
27
|
import eu.dnetlib.actionmanager.common.Agent;
|
28
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
29
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
30
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
31
|
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
32
|
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
|
33
|
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
|
34
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
35
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
36
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
37
|
import eu.dnetlib.data.proto.KindProtos;
|
38
|
import eu.dnetlib.data.proto.OafProtos;
|
39
|
import eu.dnetlib.data.proto.ResultProtos;
|
40
|
import eu.dnetlib.data.proto.TypeProtos;
|
41
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
42
|
import eu.dnetlib.miscutils.collections.Pair;
|
43
|
import eu.dnetlib.miscutils.datetime.DateUtils;
|
44
|
import eu.dnetlib.pace.model.Person;
|
45
|
|
46
|
public class OrcidToActions {
|
47
|
|
48
|
public static final String ORCID = "ORCID";
|
49
|
public final static String orcidPREFIX = "orcid_______";
|
50
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
51
|
public static final String SEPARATOR = "::";
|
52
|
|
53
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
54
|
|
55
|
{
|
56
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
57
|
|
58
|
}
|
59
|
};
|
60
|
|
61
|
// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
|
62
|
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
|
63
|
|
64
|
{
|
65
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
66
|
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
67
|
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
68
|
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
69
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
|
70
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
71
|
}
|
72
|
};
|
73
|
|
74
|
static Map<String, Map<String, String>> typologiesMapping;
|
75
|
|
76
|
static {
|
77
|
try {
|
78
|
final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
|
79
|
final String tt = IOUtils.toString(is);
|
80
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
81
|
} catch (final IOException e) {
|
82
|
e.printStackTrace();
|
83
|
}
|
84
|
}
|
85
|
|
86
|
public static final String PID_TYPES = "dnet:pid_types";
|
87
|
|
88
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
89
|
final ActionFactory factory,
|
90
|
final String setName,
|
91
|
final Agent agent,
|
92
|
final Reporter context) {
|
93
|
|
94
|
if (!isValid(rootElement, context)) { return null; }
|
95
|
|
96
|
// Create OAF proto
|
97
|
|
98
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
99
|
|
100
|
oaf.setDataInfo(
|
101
|
DataInfo.newBuilder()
|
102
|
.setDeletedbyinference(false)
|
103
|
.setInferred(false)
|
104
|
.setTrust("0.9")
|
105
|
.setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
|
106
|
.build());
|
107
|
|
108
|
// Adding kind
|
109
|
oaf.setKind(KindProtos.Kind.entity);
|
110
|
|
111
|
oaf.setLastupdatetimestamp(DateUtils.now());
|
112
|
|
113
|
// creating result proto
|
114
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
115
|
|
116
|
entity.setDateofcollection("2018-10-22");
|
117
|
entity.setDateoftransformation(DateUtils.now_ISO8601());
|
118
|
|
119
|
// Adding external ids
|
120
|
StreamUtils.toStream(externalIds.keySet().iterator())
|
121
|
.forEach(jsonExtId -> {
|
122
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
123
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
124
|
final String extId = getStringValue(rootElement, jsonExtId);
|
125
|
if (StringUtils.isNotBlank(extId)) {
|
126
|
entity.addPid(StructuredProperty.newBuilder()
|
127
|
.setValue(extId)
|
128
|
.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
|
129
|
.setSchemename("dnet:pid_types").build())
|
130
|
.build());
|
131
|
}
|
132
|
});
|
133
|
|
134
|
// Create result field
|
135
|
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
136
|
|
137
|
// Create metadata proto
|
138
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
139
|
|
140
|
// Adding source
|
141
|
final String source = getStringValue(rootElement, "source");
|
142
|
if (StringUtils.isNotBlank(source)) {
|
143
|
metadata.addSource(StringField.newBuilder().setValue(source).build());
|
144
|
}
|
145
|
|
146
|
// Adding title
|
147
|
final String title = createRepeatedField(rootElement, "titles");
|
148
|
if (StringUtils.isBlank(title)) {
|
149
|
context.incrementCounter("filtered", "title_not_found", 1);
|
150
|
return null;
|
151
|
}
|
152
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
153
|
.setValue(title)
|
154
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
155
|
.build());
|
156
|
|
157
|
// Adding identifier
|
158
|
final String id = getStringValue(rootElement, "id");
|
159
|
String sourceId = null;
|
160
|
if (id != null) {
|
161
|
entity.addOriginalId(id);
|
162
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
|
163
|
} else {
|
164
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
|
165
|
}
|
166
|
entity.setId(sourceId);
|
167
|
|
168
|
// Adding relevant date
|
169
|
settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
|
170
|
|
171
|
// Adding collectedfrom
|
172
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
173
|
.setValue(ORCID)
|
174
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
|
175
|
.build();
|
176
|
entity.addCollectedfrom(collectedFrom);
|
177
|
|
178
|
// Adding type
|
179
|
final String type = getStringValue(rootElement, "type");
|
180
|
String cobjValue = "";
|
181
|
if (StringUtils.isNotBlank(type)) {
|
182
|
|
183
|
metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
|
184
|
.setClassid(type)
|
185
|
.setClassname(type)
|
186
|
.setSchemeid("dnet:dataCite_resource")
|
187
|
.setSchemename("dnet:dataCite_resource")
|
188
|
.build());
|
189
|
|
190
|
final String typeValue = typologiesMapping.get(type).get("value");
|
191
|
cobjValue = typologiesMapping.get(type).get("cobj");
|
192
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
193
|
|
194
|
// Adding hostedby
|
195
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
196
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
197
|
.setValue("Unknown Repository")
|
198
|
.build());
|
199
|
|
200
|
// Adding url
|
201
|
final String url = createRepeatedField(rootElement, "urls");
|
202
|
if (StringUtils.isNotBlank(url)) {
|
203
|
instance.addUrl(url);
|
204
|
}
|
205
|
|
206
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
207
|
if (StringUtils.isNotBlank(pubDate)) {
|
208
|
instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
209
|
}
|
210
|
|
211
|
instance.setCollectedfrom(collectedFrom);
|
212
|
|
213
|
// Adding accessright
|
214
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
215
|
.setClassid("UNKNOWN")
|
216
|
.setClassname("UNKNOWN")
|
217
|
.setSchemeid("dnet:access_modes")
|
218
|
.setSchemename("dnet:access_modes")
|
219
|
.build());
|
220
|
|
221
|
// Adding type
|
222
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
223
|
.setClassid(cobjValue)
|
224
|
.setClassname(typeValue)
|
225
|
.setSchemeid("dnet:publication_resource")
|
226
|
.setSchemename("dnet:publication_resource")
|
227
|
.build());
|
228
|
|
229
|
result.addInstance(instance);
|
230
|
} else {
|
231
|
context.incrementCounter("filtered", "type_not_found", 1);
|
232
|
return null;
|
233
|
}
|
234
|
|
235
|
// Adding authors
|
236
|
final List<Author> authors = createAuthors(rootElement);
|
237
|
if (authors != null && authors.size() > 0) {
|
238
|
metadata.addAllAuthor(authors);
|
239
|
} else {
|
240
|
context.incrementCounter("filtered", "author_not_found", 1);
|
241
|
return null;
|
242
|
}
|
243
|
|
244
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
245
|
result.setMetadata(metadata.build());
|
246
|
entity.setResult(result.build());
|
247
|
oaf.setEntity(entity.build());
|
248
|
|
249
|
final List<AtomicAction> actionList = new ArrayList<>();
|
250
|
|
251
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
252
|
|
253
|
// System.out.println(JsonFormat.printToString(oaf.build()));
|
254
|
return actionList;
|
255
|
|
256
|
}
|
257
|
|
258
|
public static List<Author> createAuthors(final JsonObject root) {
|
259
|
|
260
|
final String authorsJSONFieldName = "authors";
|
261
|
|
262
|
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
|
263
|
|
264
|
final List<Author> authors = new ArrayList<>();
|
265
|
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
|
266
|
int firstCounter = 0;
|
267
|
int defaultCounter = 0;
|
268
|
int rank = 1;
|
269
|
int currentRank = 0;
|
270
|
|
271
|
for (final JsonElement item : jsonAuthors) {
|
272
|
final JsonObject author = item.getAsJsonObject();
|
273
|
final Author.Builder result = Author.newBuilder();
|
274
|
if (item.isJsonObject()) {
|
275
|
final String surname = getStringValue(author, "surname");
|
276
|
final String name = getStringValue(author, "name");
|
277
|
final String oid = getStringValue(author, "oid");
|
278
|
final String seq = getStringValue(author, "seq");
|
279
|
if (StringUtils.isNotBlank(seq)) {
|
280
|
if (seq.equals("first")) {
|
281
|
firstCounter += 1;
|
282
|
rank = firstCounter;
|
283
|
|
284
|
} else if (seq.equals("additional")) {
|
285
|
rank = currentRank + 1;
|
286
|
} else {
|
287
|
defaultCounter += 1;
|
288
|
rank = defaultCounter;
|
289
|
}
|
290
|
}
|
291
|
|
292
|
if (StringUtils.isNotBlank(oid)) {
|
293
|
result.addPid(KeyValue.newBuilder()
|
294
|
.setValue(oid)
|
295
|
.setKey("ORCID")
|
296
|
.build());
|
297
|
result.setFullname(name + " " + surname);
|
298
|
if (StringUtils.isNotBlank(name)) {
|
299
|
result.setName(name);
|
300
|
}
|
301
|
if (StringUtils.isNotBlank(surname)) {
|
302
|
result.setSurname(surname);
|
303
|
}
|
304
|
} else {
|
305
|
String fullname = "";
|
306
|
if (StringUtils.isNotBlank(name)) {
|
307
|
fullname = name;
|
308
|
} else {
|
309
|
if (StringUtils.isNotBlank(surname)) {
|
310
|
fullname = surname;
|
311
|
}
|
312
|
}
|
313
|
Person p = new Person(fullname, false);
|
314
|
if (p.isAccurate()) {
|
315
|
result.setName(p.getNormalisedFirstName());
|
316
|
result.setSurname(p.getNormalisedSurname());
|
317
|
result.setFullname(p.getNormalisedFullname());
|
318
|
}
|
319
|
else {
|
320
|
result.setFullname(fullname);
|
321
|
}
|
322
|
}
|
323
|
}
|
324
|
result.setRank(rank);
|
325
|
authors.add(result.build());
|
326
|
currentRank = rank;
|
327
|
}
|
328
|
return authors;
|
329
|
|
330
|
}
|
331
|
return null;
|
332
|
}
|
333
|
|
334
|
private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
|
335
|
String field = "";
|
336
|
if (!rootElement.has(fieldName)) { return null; }
|
337
|
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
|
338
|
if (rootElement.get(fieldName).isJsonArray()) {
|
339
|
if (!isValidJsonArray(rootElement, fieldName)) { return null; }
|
340
|
final StringBuilder ttl = new StringBuilder();
|
341
|
getArrayValues(rootElement, fieldName).forEach(ttl::append);
|
342
|
field = ttl.toString();
|
343
|
} else {
|
344
|
field = getStringValue(rootElement, fieldName);
|
345
|
}
|
346
|
|
347
|
if (StringUtils.isNotBlank(field) && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
|
348
|
field = StringUtils.strip(field, "\"");
|
349
|
}
|
350
|
return field;
|
351
|
}
|
352
|
|
353
|
private static void settingRelevantDate(final JsonObject rootElement,
|
354
|
final ResultProtos.Result.Metadata.Builder metadata,
|
355
|
final String jsonKey,
|
356
|
final String dictionaryKey,
|
357
|
final boolean addToDateOfAcceptance) {
|
358
|
|
359
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
360
|
if (StringUtils.isNotBlank(pubDate)) {
|
361
|
if (addToDateOfAcceptance) {
|
362
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
363
|
}
|
364
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
365
|
.setValue(pubDate)
|
366
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
367
|
.build());
|
368
|
}
|
369
|
}
|
370
|
|
371
|
private static String getPublicationDate(final JsonObject rootElement,
|
372
|
final String jsonKey) {
|
373
|
|
374
|
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
375
|
if (pubDateJson == null) { return null; }
|
376
|
final String year = getStringValue(pubDateJson, "year");
|
377
|
final String month = getStringValue(pubDateJson, "month");
|
378
|
final String day = getStringValue(pubDateJson, "day");
|
379
|
|
380
|
if (StringUtils.isBlank(year)) { return null; }
|
381
|
String pubDate = "".concat(year);
|
382
|
if (StringUtils.isNotBlank(month)) {
|
383
|
pubDate = pubDate.concat("-" + month);
|
384
|
if (StringUtils.isNotBlank(day)) {
|
385
|
pubDate = pubDate.concat("-" + day);
|
386
|
} else {
|
387
|
pubDate += "-01";
|
388
|
}
|
389
|
} else {
|
390
|
pubDate += "-01-01";
|
391
|
}
|
392
|
if (isValidDate(pubDate)) { return pubDate; }
|
393
|
return null;
|
394
|
}
|
395
|
|
396
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
397
|
|
398
|
final String type = getStringValue(rootElement, "type");
|
399
|
if (!typologiesMapping.containsKey(type)) {
|
400
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
401
|
return false;
|
402
|
}
|
403
|
|
404
|
if (!isValidJsonArray(rootElement, "titles")) {
|
405
|
context.incrementCounter("filtered", "invalid_title", 1);
|
406
|
return false;
|
407
|
}
|
408
|
return true;
|
409
|
}
|
410
|
|
411
|
private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
|
412
|
if (!rootElement.has(fieldName)) { return false; }
|
413
|
final JsonElement jsonElement = rootElement.get(fieldName);
|
414
|
if (jsonElement.isJsonNull()) { return false; }
|
415
|
if (jsonElement.isJsonArray()) {
|
416
|
final JsonArray jsonArray = jsonElement.getAsJsonArray();
|
417
|
if (jsonArray.isJsonNull()) { return false; }
|
418
|
if (jsonArray.get(0).isJsonNull()) { return false; }
|
419
|
}
|
420
|
return true;
|
421
|
}
|
422
|
}
|