1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
|
4
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getDefaultResulttype;
|
5
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
|
6
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
|
7
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
|
8
|
|
9
|
import java.io.IOException;
|
10
|
import java.io.InputStream;
|
11
|
import java.util.ArrayList;
|
12
|
import java.util.HashMap;
|
13
|
import java.util.List;
|
14
|
import java.util.Map;
|
15
|
|
16
|
import org.apache.commons.io.IOUtils;
|
17
|
import org.apache.commons.lang3.StringUtils;
|
18
|
|
19
|
import com.google.gson.Gson;
|
20
|
import com.google.gson.JsonArray;
|
21
|
import com.google.gson.JsonElement;
|
22
|
import com.google.gson.JsonObject;
|
23
|
|
24
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
25
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
26
|
import eu.dnetlib.actionmanager.common.Agent;
|
27
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
28
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
29
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
30
|
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
31
|
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
|
32
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
33
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
34
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
35
|
import eu.dnetlib.data.proto.KindProtos;
|
36
|
import eu.dnetlib.data.proto.OafProtos;
|
37
|
import eu.dnetlib.data.proto.ResultProtos;
|
38
|
import eu.dnetlib.data.proto.TypeProtos;
|
39
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
40
|
import eu.dnetlib.miscutils.collections.Pair;
|
41
|
|
42
|
public class OrcidToActions {
|
43
|
|
44
|
/** Name of the ORCID data source; used in this class as the "collected from" value and as a pid key. */
public static final String ORCID = "ORCID";
|
45
|
/** Namespace prefix embedded in the identifier of every result generated from an ORCID work. */
public final static String orcidPREFIX = "orcid____";
|
46
|
/** Namespace prefix embedded in the datasource identifiers (hostedby, collectedfrom) built by this class. */
public static final String OPENAIRE_PREFIX = "openaire____";
|
47
|
/** Separator placed between a namespace prefix and the local part of an identifier. */
public static final String SEPARATOR = "::";
|
48
|
|
49
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
50
|
|
51
|
{
|
52
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
53
|
|
54
|
}
|
55
|
};
|
56
|
|
57
|
// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
|
58
|
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
|
59
|
|
60
|
{
|
61
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
62
|
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
63
|
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
64
|
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
65
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
|
66
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
67
|
}
|
68
|
};
|
69
|
|
70
|
static Map<String, Map<String, String>> typologiesMapping;
|
71
|
|
72
|
static {
|
73
|
try {
|
74
|
final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
|
75
|
final String tt = IOUtils.toString(is);
|
76
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
77
|
} catch (final IOException e) {
|
78
|
e.printStackTrace();
|
79
|
}
|
80
|
}
|
81
|
|
82
|
/** Name of the pid-type scheme; the same text is used as scheme id and scheme name of the pid qualifiers built in this class. */
public static final String PID_TYPES = "dnet:pid_types";
|
83
|
|
84
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
85
|
final ActionFactory factory,
|
86
|
final String setName,
|
87
|
final Agent agent,
|
88
|
final Reporter context) {
|
89
|
|
90
|
if (!isValid(rootElement, context)) { return null; }
|
91
|
|
92
|
// Create OAF proto
|
93
|
|
94
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
95
|
|
96
|
// Adding kind
|
97
|
oaf.setKind(KindProtos.Kind.entity);
|
98
|
|
99
|
// creating result proto
|
100
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
101
|
|
102
|
entity.setDateofcollection("2018-10-22");
|
103
|
|
104
|
// Adding external ids
|
105
|
StreamUtils.toStream(externalIds.keySet().iterator())
|
106
|
.forEach(jsonExtId -> {
|
107
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
108
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
109
|
final String extId = getStringValue(rootElement, jsonExtId);
|
110
|
if (StringUtils.isNotBlank(extId)) {
|
111
|
entity.addPid(StructuredProperty.newBuilder()
|
112
|
.setValue(extId)
|
113
|
.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
|
114
|
.setSchemename("dnet:pid_types").build())
|
115
|
.build());
|
116
|
}
|
117
|
});
|
118
|
|
119
|
// Create result field
|
120
|
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
121
|
|
122
|
// Create metadata proto
|
123
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
124
|
|
125
|
// Adding source
|
126
|
final String source = getStringValue(rootElement, "source");
|
127
|
if (StringUtils.isNotBlank(source)) {
|
128
|
metadata.addSource(StringField.newBuilder().setValue(source).build());
|
129
|
}
|
130
|
|
131
|
// Adding title
|
132
|
final String title = createRepeatedField(rootElement, "titles");
|
133
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
134
|
.setValue(title)
|
135
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
136
|
.build());
|
137
|
|
138
|
// Adding identifier
|
139
|
final String id = getStringValue(rootElement, "id");
|
140
|
String sourceId = null;
|
141
|
if (id != null) {
|
142
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
|
143
|
} else {
|
144
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
|
145
|
}
|
146
|
entity.setId(sourceId);
|
147
|
|
148
|
// Adding relevant date
|
149
|
settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
|
150
|
|
151
|
// Adding type
|
152
|
final String type = getStringValue(rootElement, "type");
|
153
|
String cobjValue = "";
|
154
|
if (StringUtils.isNotBlank(type)) {
|
155
|
|
156
|
metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
|
157
|
.setClassid(type)
|
158
|
.setClassname(type)
|
159
|
.setSchemeid("dnet:dataCite_resource")
|
160
|
.setSchemename("dnet:dataCite_resource")
|
161
|
.build());
|
162
|
|
163
|
final String typeValue = typologiesMapping.get(type).get("value");
|
164
|
cobjValue = typologiesMapping.get(type).get("cobj");
|
165
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
166
|
|
167
|
// Adding hostedby
|
168
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
169
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
170
|
.setValue("Unknown Repository")
|
171
|
.build());
|
172
|
|
173
|
// Adding url
|
174
|
final String url = createRepeatedField(rootElement, "urls");
|
175
|
if (StringUtils.isNotBlank(url)) {
|
176
|
instance.addUrl(url);
|
177
|
}
|
178
|
|
179
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
180
|
if (StringUtils.isNotBlank(pubDate)) {
|
181
|
instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
182
|
}
|
183
|
|
184
|
// Adding collectedfrom
|
185
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
186
|
.setValue(ORCID)
|
187
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
|
188
|
.build();
|
189
|
instance.setCollectedfrom(collectedFrom);
|
190
|
|
191
|
// Adding accessright
|
192
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
193
|
.setClassid("UNKNOWN")
|
194
|
.setClassname("UNKNOWN")
|
195
|
.setSchemeid("dnet:access_modes")
|
196
|
.setSchemename("dnet:access_modes")
|
197
|
.build());
|
198
|
|
199
|
// Adding type
|
200
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
201
|
.setClassid(cobjValue)
|
202
|
.setClassname(typeValue)
|
203
|
.setSchemeid("dnet:publication_resource")
|
204
|
.setSchemename("dnet:publication_resource")
|
205
|
.build());
|
206
|
|
207
|
result.addInstance(instance);
|
208
|
}
|
209
|
|
210
|
// Adding authors
|
211
|
final List<Author> authors = createAuthors(rootElement);
|
212
|
if (authors != null) {
|
213
|
metadata.addAllAuthor(authors);
|
214
|
}
|
215
|
|
216
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
217
|
result.setMetadata(metadata.build());
|
218
|
entity.setResult(result.build());
|
219
|
oaf.setEntity(entity.build());
|
220
|
|
221
|
final List<AtomicAction> actionList = new ArrayList<>();
|
222
|
|
223
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
224
|
|
225
|
return actionList;
|
226
|
|
227
|
}
|
228
|
|
229
|
public static List<Author> createAuthors(final JsonObject root) {
|
230
|
|
231
|
final String authorsJSONFieldName = "authors";
|
232
|
|
233
|
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
|
234
|
|
235
|
final List<Author> authors = new ArrayList<>();
|
236
|
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
|
237
|
int firstCounter = 0;
|
238
|
int defaultCounter = 0;
|
239
|
int rank = 1;
|
240
|
int currentRank = 0;
|
241
|
|
242
|
for (final JsonElement item : jsonAuthors) {
|
243
|
final JsonObject author = item.getAsJsonObject();
|
244
|
final Author.Builder result = Author.newBuilder();
|
245
|
if (item.isJsonObject()) {
|
246
|
final String surname = getStringValue(author, "surname");
|
247
|
final String name = getStringValue(author, "name");
|
248
|
final String oid = getStringValue(author, "oid");
|
249
|
final String seq = getStringValue(author, "seq");
|
250
|
if (StringUtils.isNotBlank(seq)) {
|
251
|
if (seq.equals("first")) {
|
252
|
firstCounter += 1;
|
253
|
rank = firstCounter;
|
254
|
|
255
|
} else if (seq.equals("additional")) {
|
256
|
rank = currentRank + 1;
|
257
|
} else {
|
258
|
defaultCounter += 1;
|
259
|
rank = defaultCounter;
|
260
|
}
|
261
|
}
|
262
|
|
263
|
if (StringUtils.isNotBlank(oid)) {
|
264
|
result.addPid(KeyValue.newBuilder()
|
265
|
.setValue(oid)
|
266
|
.setKey("ORCID")
|
267
|
.build());
|
268
|
result.setFullname(name + " " + surname);
|
269
|
if (StringUtils.isNotBlank(name)) {
|
270
|
result.setName(name);
|
271
|
}
|
272
|
if (StringUtils.isNotBlank(surname)) {
|
273
|
result.setSurname(surname);
|
274
|
}
|
275
|
} else {
|
276
|
if (StringUtils.isNotBlank(name)) {
|
277
|
result.setFullname(name);
|
278
|
} else {
|
279
|
if (StringUtils.isNotBlank(surname)) {
|
280
|
result.setFullname(surname);
|
281
|
}
|
282
|
}
|
283
|
}
|
284
|
}
|
285
|
result.setRank(rank);
|
286
|
authors.add(result.build());
|
287
|
currentRank = rank;
|
288
|
}
|
289
|
return authors;
|
290
|
|
291
|
}
|
292
|
return null;
|
293
|
}
|
294
|
|
295
|
private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
|
296
|
String field = "";
|
297
|
if (!rootElement.has(fieldName)) { return null; }
|
298
|
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
|
299
|
if (rootElement.get(fieldName).isJsonArray()) {
|
300
|
if (!isValidJsonArray(rootElement, fieldName)) { return null; }
|
301
|
final StringBuilder ttl = new StringBuilder();
|
302
|
getArrayValues(rootElement, fieldName).forEach(ttl::append);
|
303
|
field = ttl.toString();
|
304
|
} else {
|
305
|
field = getStringValue(rootElement, fieldName);
|
306
|
}
|
307
|
|
308
|
if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
|
309
|
field = field.substring(1, field.length() - 1);
|
310
|
}
|
311
|
return field;
|
312
|
}
|
313
|
|
314
|
private static void settingRelevantDate(final JsonObject rootElement,
|
315
|
final ResultProtos.Result.Metadata.Builder metadata,
|
316
|
final String jsonKey,
|
317
|
final String dictionaryKey,
|
318
|
final boolean addToDateOfAcceptance) {
|
319
|
|
320
|
// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
321
|
// if (pubDateJson == null) { return; }
|
322
|
// final String year = getStringValue(pubDateJson, "year");
|
323
|
// final String month = getStringValue(pubDateJson, "month");
|
324
|
// final String day = getStringValue(pubDateJson, "day");
|
325
|
//
|
326
|
// if (StringUtils.isBlank(year)) { return; }
|
327
|
// String pubDate = "".concat(year);
|
328
|
// if (StringUtils.isNotBlank(month)) {
|
329
|
// pubDate = pubDate.concat("-" + month);
|
330
|
// if (StringUtils.isNotBlank(day)) {
|
331
|
// pubDate = pubDate.concat("-" + day);
|
332
|
// } else {
|
333
|
// pubDate += "-01";
|
334
|
// }
|
335
|
// } else {
|
336
|
// pubDate += "-01-01";
|
337
|
// }
|
338
|
|
339
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
340
|
if (StringUtils.isNotBlank(pubDate)) {
|
341
|
// if (addToDateOfAcceptance) {
|
342
|
// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
343
|
// }
|
344
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
345
|
.setValue(pubDate)
|
346
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
347
|
.build());
|
348
|
}
|
349
|
}
|
350
|
|
351
|
private static String getPublicationDate(final JsonObject rootElement,
|
352
|
final String jsonKey) {
|
353
|
|
354
|
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
355
|
if (pubDateJson == null) { return null; }
|
356
|
final String year = getStringValue(pubDateJson, "year");
|
357
|
final String month = getStringValue(pubDateJson, "month");
|
358
|
final String day = getStringValue(pubDateJson, "day");
|
359
|
|
360
|
if (StringUtils.isBlank(year)) { return null; }
|
361
|
String pubDate = "".concat(year);
|
362
|
if (StringUtils.isNotBlank(month)) {
|
363
|
pubDate = pubDate.concat("-" + month);
|
364
|
if (StringUtils.isNotBlank(day)) {
|
365
|
pubDate = pubDate.concat("-" + day);
|
366
|
} else {
|
367
|
pubDate += "-01";
|
368
|
}
|
369
|
} else {
|
370
|
pubDate += "-01-01";
|
371
|
}
|
372
|
if (isValidDate(pubDate)) { return pubDate; }
|
373
|
return null;
|
374
|
}
|
375
|
|
376
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
377
|
|
378
|
final String type = getStringValue(rootElement, "type");
|
379
|
if (!typologiesMapping.containsKey(type)) {
|
380
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
381
|
return false;
|
382
|
}
|
383
|
|
384
|
if (!isValidJsonArray(rootElement, "titles")) {
|
385
|
context.incrementCounter("filtered", "invalid_title", 1);
|
386
|
return false;
|
387
|
}
|
388
|
return true;
|
389
|
}
|
390
|
|
391
|
private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
|
392
|
if (!rootElement.has(fieldName)) { return false; }
|
393
|
final JsonElement jsonElement = rootElement.get(fieldName);
|
394
|
if (jsonElement.isJsonNull()) { return false; }
|
395
|
if (jsonElement.isJsonArray()) {
|
396
|
final JsonArray jsonArray = jsonElement.getAsJsonArray();
|
397
|
if (jsonArray.isJsonNull()) { return false; }
|
398
|
if (jsonArray.get(0).isJsonNull()) { return false; }
|
399
|
}
|
400
|
return true;
|
401
|
}
|
402
|
}
|