1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getArrayValues;
|
4
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getQualifier;
|
5
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.getStringValue;
|
6
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.isValidDate;
|
7
|
|
8
|
import java.io.IOException;
|
9
|
import java.io.InputStream;
|
10
|
import java.util.ArrayList;
|
11
|
import java.util.HashMap;
|
12
|
import java.util.List;
|
13
|
import java.util.Map;
|
14
|
|
15
|
import org.apache.commons.io.IOUtils;
|
16
|
import org.apache.commons.lang3.StringUtils;
|
17
|
|
18
|
import com.google.gson.Gson;
|
19
|
import com.google.gson.JsonArray;
|
20
|
import com.google.gson.JsonElement;
|
21
|
import com.google.gson.JsonObject;
|
22
|
|
23
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
24
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
25
|
import eu.dnetlib.actionmanager.common.Agent;
|
26
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
27
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
28
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
29
|
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
30
|
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
|
31
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
32
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
33
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
34
|
import eu.dnetlib.data.proto.KindProtos;
|
35
|
import eu.dnetlib.data.proto.OafProtos;
|
36
|
import eu.dnetlib.data.proto.ResultProtos;
|
37
|
import eu.dnetlib.data.proto.TypeProtos;
|
38
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
39
|
import eu.dnetlib.miscutils.collections.Pair;
|
40
|
|
41
|
public class OrcidToActions {
|
42
|
|
43
|
public static final String ORCID = "ORCID";
|
44
|
public final static String orcidPREFIX = "orcid____";
|
45
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
46
|
public static final String SEPARATOR = "::";
|
47
|
|
48
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
49
|
|
50
|
{
|
51
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
52
|
|
53
|
}
|
54
|
};
|
55
|
|
56
|
// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
|
57
|
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
|
58
|
|
59
|
{
|
60
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
61
|
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
62
|
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
63
|
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
64
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
|
65
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
66
|
}
|
67
|
};
|
68
|
|
69
|
static Map<String, Map<String, String>> typologiesMapping;
|
70
|
|
71
|
static {
|
72
|
try {
|
73
|
final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
|
74
|
final String tt = IOUtils.toString(is);
|
75
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
76
|
} catch (final IOException e) {
|
77
|
e.printStackTrace();
|
78
|
}
|
79
|
}
|
80
|
|
81
|
public static final String PID_TYPES = "dnet:pid_types";
|
82
|
|
83
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
84
|
final ActionFactory factory,
|
85
|
final String setName,
|
86
|
final Agent agent,
|
87
|
final Reporter context) {
|
88
|
|
89
|
if (!isValid(rootElement, context)) { return null; }
|
90
|
|
91
|
// Create OAF proto
|
92
|
|
93
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
94
|
|
95
|
// Adding kind
|
96
|
oaf.setKind(KindProtos.Kind.entity);
|
97
|
|
98
|
// creating result proto
|
99
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
100
|
|
101
|
entity.setDateofcollection("2018-10-22");
|
102
|
|
103
|
// Adding external ids
|
104
|
StreamUtils.toStream(externalIds.keySet().iterator())
|
105
|
.forEach(jsonExtId -> {
|
106
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
107
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
108
|
final String extId = getStringValue(rootElement, jsonExtId);
|
109
|
if (StringUtils.isNotBlank(extId)) {
|
110
|
entity.addPid(StructuredProperty.newBuilder()
|
111
|
.setValue(extId)
|
112
|
.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
|
113
|
.setSchemename("dnet:pid_types").build())
|
114
|
.build());
|
115
|
}
|
116
|
});
|
117
|
|
118
|
// Create result field
|
119
|
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
120
|
|
121
|
// Create metadata proto
|
122
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
123
|
|
124
|
// Adding source
|
125
|
final String source = getStringValue(rootElement, "source");
|
126
|
if (StringUtils.isNotBlank(source)) {
|
127
|
metadata.addSource(StringField.newBuilder().setValue(source).build());
|
128
|
}
|
129
|
|
130
|
// Adding title
|
131
|
final String title = createRepeatedField(rootElement, "titles");
|
132
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
133
|
.setValue(title)
|
134
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
135
|
.build());
|
136
|
|
137
|
// Adding identifier
|
138
|
final String id = getStringValue(rootElement, "id");
|
139
|
String sourceId = null;
|
140
|
if (id != null) {
|
141
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
|
142
|
} else {
|
143
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
|
144
|
}
|
145
|
entity.setId(sourceId);
|
146
|
|
147
|
// Adding relevant date
|
148
|
settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
|
149
|
|
150
|
// Adding type
|
151
|
final String type = getStringValue(rootElement, "type");
|
152
|
if (StringUtils.isNotBlank(type)) {
|
153
|
final String typeValue = typologiesMapping.get(type).get("value");
|
154
|
final String cobjValue = typologiesMapping.get(type).get("cobj");
|
155
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
156
|
|
157
|
// Adding hostedby
|
158
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
159
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
160
|
.setValue("Unknown Repository")
|
161
|
.build());
|
162
|
|
163
|
// Adding url
|
164
|
final String url = createRepeatedField(rootElement, "urls");
|
165
|
if (StringUtils.isNotBlank(url)) {
|
166
|
instance.addUrl(url);
|
167
|
}
|
168
|
|
169
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
170
|
if (StringUtils.isNotBlank(pubDate)) {
|
171
|
instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
172
|
}
|
173
|
|
174
|
// Adding collectedfrom
|
175
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
176
|
.setValue(ORCID)
|
177
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
|
178
|
.build();
|
179
|
instance.setCollectedfrom(collectedFrom);
|
180
|
|
181
|
// Adding accessright
|
182
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
183
|
.setClassid("UNKNOWN")
|
184
|
.setClassname("UNKNOWN")
|
185
|
.setSchemeid("dnet:access_modes")
|
186
|
.setSchemename("dnet:access_modes")
|
187
|
.build());
|
188
|
|
189
|
// Adding type
|
190
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
191
|
.setClassid(cobjValue)
|
192
|
.setClassname(typeValue)
|
193
|
.setSchemeid("dnet:publication_resource")
|
194
|
.setSchemename("dnet:publication_resource")
|
195
|
.build());
|
196
|
|
197
|
result.addInstance(instance);
|
198
|
}
|
199
|
|
200
|
// Adding authors
|
201
|
final List<Author> authors = createAuthors(rootElement);
|
202
|
if (authors != null) {
|
203
|
metadata.addAllAuthor(authors);
|
204
|
}
|
205
|
|
206
|
result.setMetadata(metadata.build());
|
207
|
|
208
|
entity.setResult(result.build());
|
209
|
|
210
|
oaf.setEntity(entity.build());
|
211
|
|
212
|
// System.out.println("Proto dump: " + com.googlecode.protobuf.format.JsonFormat.printToString(oaf.build()));
|
213
|
|
214
|
final List<AtomicAction> actionList = new ArrayList<>();
|
215
|
|
216
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
217
|
|
218
|
return actionList;
|
219
|
|
220
|
}
|
221
|
|
222
|
public static List<Author> createAuthors(final JsonObject root) {
|
223
|
|
224
|
final String authorsJSONFieldName = "authors";
|
225
|
|
226
|
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
|
227
|
|
228
|
final List<Author> authors = new ArrayList<>();
|
229
|
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
|
230
|
int firstCounter = 0;
|
231
|
int defaultCounter = 0;
|
232
|
int rank = 1;
|
233
|
int currentRank = 0;
|
234
|
|
235
|
for (final JsonElement item : jsonAuthors) {
|
236
|
final JsonObject author = item.getAsJsonObject();
|
237
|
final Author.Builder result = Author.newBuilder();
|
238
|
if (item.isJsonObject()) {
|
239
|
final String surname = getStringValue(author, "surname");
|
240
|
final String name = getStringValue(author, "name");
|
241
|
final String oid = getStringValue(author, "oid");
|
242
|
final String seq = getStringValue(author, "seq");
|
243
|
if (StringUtils.isNotBlank(seq)) {
|
244
|
if (seq.equals("first")) {
|
245
|
firstCounter += 1;
|
246
|
rank = firstCounter;
|
247
|
|
248
|
} else if (seq.equals("additional")) {
|
249
|
rank = currentRank + 1;
|
250
|
} else {
|
251
|
defaultCounter += 1;
|
252
|
rank = defaultCounter;
|
253
|
}
|
254
|
}
|
255
|
|
256
|
if (StringUtils.isNotBlank(oid)) {
|
257
|
result.addPid(KeyValue.newBuilder()
|
258
|
.setValue(oid)
|
259
|
.setKey("ORCID")
|
260
|
.build());
|
261
|
result.setFullname(name + " " + surname);
|
262
|
if (StringUtils.isNotBlank(name)) {
|
263
|
result.setName(name);
|
264
|
}
|
265
|
if (StringUtils.isNotBlank(surname)) {
|
266
|
result.setSurname(surname);
|
267
|
}
|
268
|
} else {
|
269
|
if (StringUtils.isNotBlank(name)) {
|
270
|
result.setFullname(name);
|
271
|
} else {
|
272
|
if (StringUtils.isNotBlank(surname)) {
|
273
|
result.setFullname(surname);
|
274
|
}
|
275
|
}
|
276
|
}
|
277
|
}
|
278
|
result.setRank(rank);
|
279
|
authors.add(result.build());
|
280
|
currentRank = rank;
|
281
|
}
|
282
|
return authors;
|
283
|
|
284
|
}
|
285
|
return null;
|
286
|
}
|
287
|
|
288
|
private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
|
289
|
String field = "";
|
290
|
if (!rootElement.has(fieldName)) { return null; }
|
291
|
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
|
292
|
if (rootElement.get(fieldName).isJsonArray()) {
|
293
|
if (!isValidJsonArray(rootElement, fieldName)) { return null; }
|
294
|
final StringBuilder ttl = new StringBuilder();
|
295
|
getArrayValues(rootElement, fieldName).forEach(ttl::append);
|
296
|
field = ttl.toString();
|
297
|
} else {
|
298
|
field = getStringValue(rootElement, fieldName);
|
299
|
}
|
300
|
|
301
|
if (field != null && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
|
302
|
field = field.substring(1, field.length() - 1);
|
303
|
}
|
304
|
return field;
|
305
|
}
|
306
|
|
307
|
private static void settingRelevantDate(final JsonObject rootElement,
|
308
|
final ResultProtos.Result.Metadata.Builder metadata,
|
309
|
final String jsonKey,
|
310
|
final String dictionaryKey,
|
311
|
final boolean addToDateOfAcceptance) {
|
312
|
|
313
|
// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
314
|
// if (pubDateJson == null) { return; }
|
315
|
// final String year = getStringValue(pubDateJson, "year");
|
316
|
// final String month = getStringValue(pubDateJson, "month");
|
317
|
// final String day = getStringValue(pubDateJson, "day");
|
318
|
//
|
319
|
// if (StringUtils.isBlank(year)) { return; }
|
320
|
// String pubDate = "".concat(year);
|
321
|
// if (StringUtils.isNotBlank(month)) {
|
322
|
// pubDate = pubDate.concat("-" + month);
|
323
|
// if (StringUtils.isNotBlank(day)) {
|
324
|
// pubDate = pubDate.concat("-" + day);
|
325
|
// } else {
|
326
|
// pubDate += "-01";
|
327
|
// }
|
328
|
// } else {
|
329
|
// pubDate += "-01-01";
|
330
|
// }
|
331
|
|
332
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
333
|
if (StringUtils.isNotBlank(pubDate)) {
|
334
|
// if (addToDateOfAcceptance) {
|
335
|
// metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
336
|
// }
|
337
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
338
|
.setValue(pubDate)
|
339
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
340
|
.build());
|
341
|
}
|
342
|
}
|
343
|
|
344
|
private static String getPublicationDate(final JsonObject rootElement,
|
345
|
final String jsonKey) {
|
346
|
|
347
|
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
348
|
if (pubDateJson == null) { return null; }
|
349
|
final String year = getStringValue(pubDateJson, "year");
|
350
|
final String month = getStringValue(pubDateJson, "month");
|
351
|
final String day = getStringValue(pubDateJson, "day");
|
352
|
|
353
|
if (StringUtils.isBlank(year)) { return null; }
|
354
|
String pubDate = "".concat(year);
|
355
|
if (StringUtils.isNotBlank(month)) {
|
356
|
pubDate = pubDate.concat("-" + month);
|
357
|
if (StringUtils.isNotBlank(day)) {
|
358
|
pubDate = pubDate.concat("-" + day);
|
359
|
} else {
|
360
|
pubDate += "-01";
|
361
|
}
|
362
|
} else {
|
363
|
pubDate += "-01-01";
|
364
|
}
|
365
|
if (isValidDate(pubDate)) { return pubDate; }
|
366
|
return null;
|
367
|
}
|
368
|
|
369
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
370
|
|
371
|
final String type = getStringValue(rootElement, "type");
|
372
|
if (!typologiesMapping.containsKey(type)) {
|
373
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
374
|
return false;
|
375
|
}
|
376
|
|
377
|
if (!isValidJsonArray(rootElement, "titles")) {
|
378
|
context.incrementCounter("filtered", "invalid_title", 1);
|
379
|
return false;
|
380
|
}
|
381
|
return true;
|
382
|
}
|
383
|
|
384
|
private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
|
385
|
if (!rootElement.has(fieldName)) { return false; }
|
386
|
final JsonElement jsonElement = rootElement.get(fieldName);
|
387
|
if (jsonElement.isJsonNull()) { return false; }
|
388
|
if (jsonElement.isJsonArray()) {
|
389
|
final JsonArray jsonArray = jsonElement.getAsJsonArray();
|
390
|
if (jsonArray.isJsonNull()) { return false; }
|
391
|
if (jsonArray.get(0).isJsonNull()) { return false; }
|
392
|
}
|
393
|
return true;
|
394
|
}
|
395
|
}
|