1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
|
4
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
|
5
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
|
6
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
|
7
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
|
8
|
|
9
|
import java.io.IOException;
|
10
|
import java.io.InputStream;
|
11
|
import java.util.ArrayList;
|
12
|
import java.util.HashMap;
|
13
|
import java.util.List;
|
14
|
import java.util.Map;
|
15
|
|
16
|
import org.apache.commons.io.IOUtils;
|
17
|
import org.apache.commons.lang3.StringUtils;
|
18
|
|
19
|
import com.google.gson.Gson;
|
20
|
import com.google.gson.JsonArray;
|
21
|
import com.google.gson.JsonElement;
|
22
|
import com.google.gson.JsonObject;
|
23
|
|
24
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
25
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
26
|
import eu.dnetlib.actionmanager.common.Agent;
|
27
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
28
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
29
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
30
|
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
31
|
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
|
32
|
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
|
33
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
34
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
35
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
36
|
import eu.dnetlib.data.proto.KindProtos;
|
37
|
import eu.dnetlib.data.proto.OafProtos;
|
38
|
import eu.dnetlib.data.proto.ResultProtos;
|
39
|
import eu.dnetlib.data.proto.TypeProtos;
|
40
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
41
|
import eu.dnetlib.miscutils.collections.Pair;
|
42
|
import eu.dnetlib.miscutils.datetime.DateUtils;
|
43
|
|
44
|
public class OrcidToActions {
|
45
|
|
46
|
public static final String ORCID = "ORCID";
|
47
|
public final static String orcidPREFIX = "orcid_______";
|
48
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
49
|
public static final String SEPARATOR = "::";
|
50
|
|
51
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
52
|
|
53
|
{
|
54
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
55
|
|
56
|
}
|
57
|
};
|
58
|
|
59
|
// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
|
60
|
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
|
61
|
|
62
|
{
|
63
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
64
|
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
65
|
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
66
|
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
67
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
|
68
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
69
|
}
|
70
|
};
|
71
|
|
72
|
static Map<String, Map<String, String>> typologiesMapping;
|
73
|
|
74
|
static {
|
75
|
try {
|
76
|
final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
|
77
|
final String tt = IOUtils.toString(is);
|
78
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
79
|
} catch (final IOException e) {
|
80
|
e.printStackTrace();
|
81
|
}
|
82
|
}
|
83
|
|
84
|
public static final String PID_TYPES = "dnet:pid_types";
|
85
|
|
86
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
87
|
final ActionFactory factory,
|
88
|
final String setName,
|
89
|
final Agent agent,
|
90
|
final Reporter context) {
|
91
|
|
92
|
if (!isValid(rootElement, context)) { return null; }
|
93
|
|
94
|
// Create OAF proto
|
95
|
|
96
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
97
|
|
98
|
oaf.setDataInfo(
|
99
|
DataInfo.newBuilder()
|
100
|
.setDeletedbyinference(false)
|
101
|
.setInferred(false)
|
102
|
.setTrust("0.9")
|
103
|
.setProvenanceaction(getQualifier("sysimport:actionset:orcidworks-no-doi", "dnet:provenanceActions"))
|
104
|
.build());
|
105
|
|
106
|
// Adding kind
|
107
|
oaf.setKind(KindProtos.Kind.entity);
|
108
|
|
109
|
oaf.setLastupdatetimestamp(DateUtils.now());
|
110
|
|
111
|
// creating result proto
|
112
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
113
|
|
114
|
entity.setDateofcollection("2018-10-22");
|
115
|
entity.setDateoftransformation(DateUtils.now_ISO8601());
|
116
|
|
117
|
// Adding external ids
|
118
|
StreamUtils.toStream(externalIds.keySet().iterator())
|
119
|
.forEach(jsonExtId -> {
|
120
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
121
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
122
|
final String extId = getStringValue(rootElement, jsonExtId);
|
123
|
if (StringUtils.isNotBlank(extId)) {
|
124
|
entity.addPid(StructuredProperty.newBuilder()
|
125
|
.setValue(extId)
|
126
|
.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
|
127
|
.setSchemename("dnet:pid_types").build())
|
128
|
.build());
|
129
|
}
|
130
|
});
|
131
|
|
132
|
// Create result field
|
133
|
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
134
|
|
135
|
// Create metadata proto
|
136
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
137
|
|
138
|
// Adding source
|
139
|
final String source = getStringValue(rootElement, "source");
|
140
|
if (StringUtils.isNotBlank(source)) {
|
141
|
metadata.addSource(StringField.newBuilder().setValue(source).build());
|
142
|
}
|
143
|
|
144
|
// Adding title
|
145
|
final String title = createRepeatedField(rootElement, "titles");
|
146
|
if (StringUtils.isBlank(title)) {
|
147
|
context.incrementCounter("filtered", "title_not_found", 1);
|
148
|
return null;
|
149
|
}
|
150
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
151
|
.setValue(title)
|
152
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
153
|
.build());
|
154
|
|
155
|
// Adding identifier
|
156
|
final String id = getStringValue(rootElement, "id");
|
157
|
String sourceId = null;
|
158
|
if (id != null) {
|
159
|
entity.addOriginalId(id);
|
160
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
|
161
|
} else {
|
162
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
|
163
|
}
|
164
|
entity.setId(sourceId);
|
165
|
|
166
|
// Adding relevant date
|
167
|
settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
|
168
|
|
169
|
// Adding collectedfrom
|
170
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
171
|
.setValue(ORCID)
|
172
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
|
173
|
.build();
|
174
|
entity.addCollectedfrom(collectedFrom);
|
175
|
|
176
|
// Adding type
|
177
|
final String type = getStringValue(rootElement, "type");
|
178
|
String cobjValue = "";
|
179
|
if (StringUtils.isNotBlank(type)) {
|
180
|
|
181
|
metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
|
182
|
.setClassid(type)
|
183
|
.setClassname(type)
|
184
|
.setSchemeid("dnet:dataCite_resource")
|
185
|
.setSchemename("dnet:dataCite_resource")
|
186
|
.build());
|
187
|
|
188
|
final String typeValue = typologiesMapping.get(type).get("value");
|
189
|
cobjValue = typologiesMapping.get(type).get("cobj");
|
190
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
191
|
|
192
|
// Adding hostedby
|
193
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
194
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
195
|
.setValue("Unknown Repository")
|
196
|
.build());
|
197
|
|
198
|
// Adding url
|
199
|
final String url = createRepeatedField(rootElement, "urls");
|
200
|
if (StringUtils.isNotBlank(url)) {
|
201
|
instance.addUrl(url);
|
202
|
}
|
203
|
|
204
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
205
|
if (StringUtils.isNotBlank(pubDate)) {
|
206
|
instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
207
|
}
|
208
|
|
209
|
instance.setCollectedfrom(collectedFrom);
|
210
|
|
211
|
// Adding accessright
|
212
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
213
|
.setClassid("UNKNOWN")
|
214
|
.setClassname("UNKNOWN")
|
215
|
.setSchemeid("dnet:access_modes")
|
216
|
.setSchemename("dnet:access_modes")
|
217
|
.build());
|
218
|
|
219
|
// Adding type
|
220
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
221
|
.setClassid(cobjValue)
|
222
|
.setClassname(typeValue)
|
223
|
.setSchemeid("dnet:publication_resource")
|
224
|
.setSchemename("dnet:publication_resource")
|
225
|
.build());
|
226
|
|
227
|
result.addInstance(instance);
|
228
|
} else {
|
229
|
context.incrementCounter("filtered", "type_not_found", 1);
|
230
|
return null;
|
231
|
}
|
232
|
|
233
|
// Adding authors
|
234
|
final List<Author> authors = createAuthors(rootElement);
|
235
|
if (authors != null && authors.size() > 0) {
|
236
|
metadata.addAllAuthor(authors);
|
237
|
} else {
|
238
|
context.incrementCounter("filtered", "author_not_found", 1);
|
239
|
return null;
|
240
|
}
|
241
|
|
242
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
243
|
result.setMetadata(metadata.build());
|
244
|
entity.setResult(result.build());
|
245
|
oaf.setEntity(entity.build());
|
246
|
|
247
|
final List<AtomicAction> actionList = new ArrayList<>();
|
248
|
|
249
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
250
|
|
251
|
// System.out.println(JsonFormat.printToString(oaf.build()));
|
252
|
return actionList;
|
253
|
|
254
|
}
|
255
|
|
256
|
public static List<Author> createAuthors(final JsonObject root) {
|
257
|
|
258
|
final String authorsJSONFieldName = "authors";
|
259
|
|
260
|
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
|
261
|
|
262
|
final List<Author> authors = new ArrayList<>();
|
263
|
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
|
264
|
int firstCounter = 0;
|
265
|
int defaultCounter = 0;
|
266
|
int rank = 1;
|
267
|
int currentRank = 0;
|
268
|
|
269
|
for (final JsonElement item : jsonAuthors) {
|
270
|
final JsonObject author = item.getAsJsonObject();
|
271
|
final Author.Builder result = Author.newBuilder();
|
272
|
if (item.isJsonObject()) {
|
273
|
final String surname = getStringValue(author, "surname");
|
274
|
final String name = getStringValue(author, "name");
|
275
|
final String oid = getStringValue(author, "oid");
|
276
|
final String seq = getStringValue(author, "seq");
|
277
|
if (StringUtils.isNotBlank(seq)) {
|
278
|
if (seq.equals("first")) {
|
279
|
firstCounter += 1;
|
280
|
rank = firstCounter;
|
281
|
|
282
|
} else if (seq.equals("additional")) {
|
283
|
rank = currentRank + 1;
|
284
|
} else {
|
285
|
defaultCounter += 1;
|
286
|
rank = defaultCounter;
|
287
|
}
|
288
|
}
|
289
|
|
290
|
if (StringUtils.isNotBlank(oid)) {
|
291
|
result.addPid(KeyValue.newBuilder()
|
292
|
.setValue(oid)
|
293
|
.setKey("ORCID")
|
294
|
.build());
|
295
|
result.setFullname(name + " " + surname);
|
296
|
if (StringUtils.isNotBlank(name)) {
|
297
|
result.setName(name);
|
298
|
}
|
299
|
if (StringUtils.isNotBlank(surname)) {
|
300
|
result.setSurname(surname);
|
301
|
}
|
302
|
} else {
|
303
|
if (StringUtils.isNotBlank(name)) {
|
304
|
result.setFullname(name);
|
305
|
} else {
|
306
|
if (StringUtils.isNotBlank(surname)) {
|
307
|
result.setFullname(surname);
|
308
|
}
|
309
|
}
|
310
|
}
|
311
|
}
|
312
|
result.setRank(rank);
|
313
|
authors.add(result.build());
|
314
|
currentRank = rank;
|
315
|
}
|
316
|
return authors;
|
317
|
|
318
|
}
|
319
|
return null;
|
320
|
}
|
321
|
|
322
|
private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
|
323
|
String field = "";
|
324
|
if (!rootElement.has(fieldName)) { return null; }
|
325
|
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
|
326
|
if (rootElement.get(fieldName).isJsonArray()) {
|
327
|
if (!isValidJsonArray(rootElement, fieldName)) { return null; }
|
328
|
final StringBuilder ttl = new StringBuilder();
|
329
|
getArrayValues(rootElement, fieldName).forEach(ttl::append);
|
330
|
field = ttl.toString();
|
331
|
} else {
|
332
|
field = getStringValue(rootElement, fieldName);
|
333
|
}
|
334
|
|
335
|
if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
|
336
|
field = field.substring(1, field.length() - 1);
|
337
|
}
|
338
|
return field;
|
339
|
}
|
340
|
|
341
|
private static void settingRelevantDate(final JsonObject rootElement,
|
342
|
final ResultProtos.Result.Metadata.Builder metadata,
|
343
|
final String jsonKey,
|
344
|
final String dictionaryKey,
|
345
|
final boolean addToDateOfAcceptance) {
|
346
|
|
347
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
348
|
if (StringUtils.isNotBlank(pubDate)) {
|
349
|
if (addToDateOfAcceptance) {
|
350
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
351
|
}
|
352
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
353
|
.setValue(pubDate)
|
354
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
355
|
.build());
|
356
|
}
|
357
|
}
|
358
|
|
359
|
private static String getPublicationDate(final JsonObject rootElement,
|
360
|
final String jsonKey) {
|
361
|
|
362
|
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
363
|
if (pubDateJson == null) { return null; }
|
364
|
final String year = getStringValue(pubDateJson, "year");
|
365
|
final String month = getStringValue(pubDateJson, "month");
|
366
|
final String day = getStringValue(pubDateJson, "day");
|
367
|
|
368
|
if (StringUtils.isBlank(year)) { return null; }
|
369
|
String pubDate = "".concat(year);
|
370
|
if (StringUtils.isNotBlank(month)) {
|
371
|
pubDate = pubDate.concat("-" + month);
|
372
|
if (StringUtils.isNotBlank(day)) {
|
373
|
pubDate = pubDate.concat("-" + day);
|
374
|
} else {
|
375
|
pubDate += "-01";
|
376
|
}
|
377
|
} else {
|
378
|
pubDate += "-01-01";
|
379
|
}
|
380
|
if (isValidDate(pubDate)) { return pubDate; }
|
381
|
return null;
|
382
|
}
|
383
|
|
384
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
385
|
|
386
|
final String type = getStringValue(rootElement, "type");
|
387
|
if (!typologiesMapping.containsKey(type)) {
|
388
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
389
|
return false;
|
390
|
}
|
391
|
|
392
|
if (!isValidJsonArray(rootElement, "titles")) {
|
393
|
context.incrementCounter("filtered", "invalid_title", 1);
|
394
|
return false;
|
395
|
}
|
396
|
return true;
|
397
|
}
|
398
|
|
399
|
private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
|
400
|
if (!rootElement.has(fieldName)) { return false; }
|
401
|
final JsonElement jsonElement = rootElement.get(fieldName);
|
402
|
if (jsonElement.isJsonNull()) { return false; }
|
403
|
if (jsonElement.isJsonArray()) {
|
404
|
final JsonArray jsonArray = jsonElement.getAsJsonArray();
|
405
|
if (jsonArray.isJsonNull()) { return false; }
|
406
|
if (jsonArray.get(0).isJsonNull()) { return false; }
|
407
|
}
|
408
|
return true;
|
409
|
}
|
410
|
}
|