1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.util.ArrayList;
|
6
|
import java.util.HashMap;
|
7
|
import java.util.List;
|
8
|
import java.util.Map;
|
9
|
|
10
|
import com.google.gson.Gson;
|
11
|
import com.google.gson.JsonArray;
|
12
|
import com.google.gson.JsonElement;
|
13
|
import com.google.gson.JsonObject;
|
14
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
15
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
16
|
import eu.dnetlib.actionmanager.common.Agent;
|
17
|
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
18
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
19
|
import eu.dnetlib.data.proto.*;
|
20
|
import eu.dnetlib.data.proto.FieldTypeProtos.*;
|
21
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
22
|
import eu.dnetlib.miscutils.collections.Pair;
|
23
|
import org.apache.commons.io.IOUtils;
|
24
|
import org.apache.commons.lang3.StringUtils;
|
25
|
|
26
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
|
27
|
|
28
|
public class OrcidToActions {
|
29
|
|
30
|
public static final String ORCID = "ORCID";
|
31
|
public final static String orcidPREFIX = "orcid____";
|
32
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
33
|
public static final String SEPARATOR = "::";
|
34
|
|
35
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
36
|
|
37
|
{
|
38
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
39
|
|
40
|
}
|
41
|
};
|
42
|
|
43
|
// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
|
44
|
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
|
45
|
|
46
|
{
|
47
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
48
|
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
49
|
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
50
|
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
51
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
|
52
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
53
|
}
|
54
|
};
|
55
|
|
56
|
static Map<String, Map<String, String>> typologiesMapping;
|
57
|
|
58
|
static {
|
59
|
try {
|
60
|
final InputStream is = OrcidToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies_orcid.json");
|
61
|
final String tt = IOUtils.toString(is);
|
62
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
63
|
} catch (final IOException e) {
|
64
|
e.printStackTrace();
|
65
|
}
|
66
|
}
|
67
|
|
68
|
/**
 * Name of the pid types scheme; the same literal is used as scheme id and scheme name of the pid
 * qualifiers built in generatePublicationActionsFromDump.
 */
public static final String PID_TYPES = "dnet:pid_types";
|
69
|
|
70
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
71
|
final ActionFactory factory,
|
72
|
final String setName,
|
73
|
final Agent agent,
|
74
|
final Reporter context) {
|
75
|
|
76
|
if (!isValid(rootElement, context)) { return null; }
|
77
|
|
78
|
// Create OAF proto
|
79
|
|
80
|
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
81
|
|
82
|
// Adding kind
|
83
|
oaf.setKind(KindProtos.Kind.entity);
|
84
|
|
85
|
// creating result proto
|
86
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
87
|
|
88
|
entity.setDateofcollection("2018-10-22");
|
89
|
|
90
|
// Adding external ids
|
91
|
StreamUtils.toStream(externalIds.keySet().iterator())
|
92
|
.forEach(jsonExtId -> {
|
93
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
94
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
95
|
final String extId = getStringValue(rootElement, jsonExtId);
|
96
|
if (StringUtils.isNotBlank(extId)) {
|
97
|
entity.addPid(StructuredProperty.newBuilder()
|
98
|
.setValue(extId)
|
99
|
.setQualifier(Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid("dnet:pid_types")
|
100
|
.setSchemename("dnet:pid_types").build())
|
101
|
.build());
|
102
|
}
|
103
|
});
|
104
|
|
105
|
// Create result field
|
106
|
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
107
|
|
108
|
// Create metadata proto
|
109
|
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
110
|
|
111
|
// Adding source
|
112
|
final String source = getStringValue(rootElement, "source");
|
113
|
if (StringUtils.isNotBlank(source)) {
|
114
|
metadata.addSource(StringField.newBuilder().setValue(source).build());
|
115
|
}
|
116
|
|
117
|
// Adding title
|
118
|
final String title = createRepeatedField(rootElement, "titles");
|
119
|
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
120
|
.setValue(title)
|
121
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
122
|
.build());
|
123
|
|
124
|
// Adding identifier
|
125
|
final String id = getStringValue(rootElement, "id");
|
126
|
String sourceId = null;
|
127
|
if (id != null) {
|
128
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(id));
|
129
|
} else {
|
130
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, AbstractDNetXsltFunctions.md5(title));
|
131
|
}
|
132
|
entity.setId(sourceId);
|
133
|
|
134
|
// Adding relevant date
|
135
|
settingRelevantDate(rootElement, metadata, "publication_date", "issued", true);
|
136
|
|
137
|
// Adding type
|
138
|
final String type = getStringValue(rootElement, "type");
|
139
|
String cobjValue = "";
|
140
|
if (StringUtils.isNotBlank(type)) {
|
141
|
|
142
|
metadata.setResourcetype(FieldTypeProtos.Qualifier.newBuilder()
|
143
|
.setClassid(type)
|
144
|
.setClassname(type)
|
145
|
.setSchemeid("dnet:dataCite_resource")
|
146
|
.setSchemename("dnet:dataCite_resource")
|
147
|
.build());
|
148
|
|
149
|
final String typeValue = typologiesMapping.get(type).get("value");
|
150
|
cobjValue = typologiesMapping.get(type).get("cobj");
|
151
|
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
152
|
|
153
|
// Adding hostedby
|
154
|
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
155
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
156
|
.setValue("Unknown Repository")
|
157
|
.build());
|
158
|
|
159
|
// Adding url
|
160
|
final String url = createRepeatedField(rootElement, "urls");
|
161
|
if (StringUtils.isNotBlank(url)) {
|
162
|
instance.addUrl(url);
|
163
|
}
|
164
|
|
165
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
166
|
if (StringUtils.isNotBlank(pubDate)) {
|
167
|
instance.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
168
|
}
|
169
|
|
170
|
// Adding collectedfrom
|
171
|
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
172
|
.setValue(ORCID)
|
173
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a")
|
174
|
.build();
|
175
|
instance.setCollectedfrom(collectedFrom);
|
176
|
|
177
|
// Adding accessright
|
178
|
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
179
|
.setClassid("UNKNOWN")
|
180
|
.setClassname("UNKNOWN")
|
181
|
.setSchemeid("dnet:access_modes")
|
182
|
.setSchemename("dnet:access_modes")
|
183
|
.build());
|
184
|
|
185
|
// Adding type
|
186
|
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
187
|
.setClassid(cobjValue)
|
188
|
.setClassname(typeValue)
|
189
|
.setSchemeid("dnet:publication_resource")
|
190
|
.setSchemename("dnet:publication_resource")
|
191
|
.build());
|
192
|
|
193
|
result.addInstance(instance);
|
194
|
}
|
195
|
|
196
|
// Adding authors
|
197
|
final List<Author> authors = createAuthors(rootElement);
|
198
|
if (authors != null) {
|
199
|
metadata.addAllAuthor(authors);
|
200
|
}
|
201
|
|
202
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
203
|
result.setMetadata(metadata.build());
|
204
|
entity.setResult(result.build());
|
205
|
oaf.setEntity(entity.build());
|
206
|
|
207
|
final List<AtomicAction> actionList = new ArrayList<>();
|
208
|
|
209
|
actionList.add(factory.createAtomicAction(setName, agent, oaf.getEntity().getId(), "result", "body", oaf.build().toByteArray()));
|
210
|
|
211
|
//System.out.println(JsonFormat.printToString(oaf.build()));
|
212
|
return actionList;
|
213
|
|
214
|
}
|
215
|
|
216
|
public static List<Author> createAuthors(final JsonObject root) {
|
217
|
|
218
|
final String authorsJSONFieldName = "authors";
|
219
|
|
220
|
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
|
221
|
|
222
|
final List<Author> authors = new ArrayList<>();
|
223
|
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
|
224
|
int firstCounter = 0;
|
225
|
int defaultCounter = 0;
|
226
|
int rank = 1;
|
227
|
int currentRank = 0;
|
228
|
|
229
|
for (final JsonElement item : jsonAuthors) {
|
230
|
final JsonObject author = item.getAsJsonObject();
|
231
|
final Author.Builder result = Author.newBuilder();
|
232
|
if (item.isJsonObject()) {
|
233
|
final String surname = getStringValue(author, "surname");
|
234
|
final String name = getStringValue(author, "name");
|
235
|
final String oid = getStringValue(author, "oid");
|
236
|
final String seq = getStringValue(author, "seq");
|
237
|
if (StringUtils.isNotBlank(seq)) {
|
238
|
if (seq.equals("first")) {
|
239
|
firstCounter += 1;
|
240
|
rank = firstCounter;
|
241
|
|
242
|
} else if (seq.equals("additional")) {
|
243
|
rank = currentRank + 1;
|
244
|
} else {
|
245
|
defaultCounter += 1;
|
246
|
rank = defaultCounter;
|
247
|
}
|
248
|
}
|
249
|
|
250
|
if (StringUtils.isNotBlank(oid)) {
|
251
|
result.addPid(KeyValue.newBuilder()
|
252
|
.setValue(oid)
|
253
|
.setKey("ORCID")
|
254
|
.build());
|
255
|
result.setFullname(name + " " + surname);
|
256
|
if (StringUtils.isNotBlank(name)) {
|
257
|
result.setName(name);
|
258
|
}
|
259
|
if (StringUtils.isNotBlank(surname)) {
|
260
|
result.setSurname(surname);
|
261
|
}
|
262
|
} else {
|
263
|
if (StringUtils.isNotBlank(name)) {
|
264
|
result.setFullname(name);
|
265
|
} else {
|
266
|
if (StringUtils.isNotBlank(surname)) {
|
267
|
result.setFullname(surname);
|
268
|
}
|
269
|
}
|
270
|
}
|
271
|
}
|
272
|
result.setRank(rank);
|
273
|
authors.add(result.build());
|
274
|
currentRank = rank;
|
275
|
}
|
276
|
return authors;
|
277
|
|
278
|
}
|
279
|
return null;
|
280
|
}
|
281
|
|
282
|
private static String createRepeatedField(final JsonObject rootElement, final String fieldName) {
|
283
|
String field = "";
|
284
|
if (!rootElement.has(fieldName)) { return null; }
|
285
|
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) { return null; }
|
286
|
if (rootElement.get(fieldName).isJsonArray()) {
|
287
|
if (!isValidJsonArray(rootElement, fieldName)) { return null; }
|
288
|
final StringBuilder ttl = new StringBuilder();
|
289
|
getArrayValues(rootElement, fieldName).forEach(ttl::append);
|
290
|
field = ttl.toString();
|
291
|
} else {
|
292
|
field = getStringValue(rootElement, fieldName);
|
293
|
}
|
294
|
|
295
|
if (field != null && !field.isEmpty() && field.charAt(0) == '"' && field.charAt(field.length() - 1) == '"') {
|
296
|
field = field.substring(1, field.length() - 1);
|
297
|
}
|
298
|
return field;
|
299
|
}
|
300
|
|
301
|
private static void settingRelevantDate(final JsonObject rootElement,
|
302
|
final ResultProtos.Result.Metadata.Builder metadata,
|
303
|
final String jsonKey,
|
304
|
final String dictionaryKey,
|
305
|
final boolean addToDateOfAcceptance) {
|
306
|
|
307
|
// final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
308
|
// if (pubDateJson == null) { return; }
|
309
|
// final String year = getStringValue(pubDateJson, "year");
|
310
|
// final String month = getStringValue(pubDateJson, "month");
|
311
|
// final String day = getStringValue(pubDateJson, "day");
|
312
|
//
|
313
|
// if (StringUtils.isBlank(year)) { return; }
|
314
|
// String pubDate = "".concat(year);
|
315
|
// if (StringUtils.isNotBlank(month)) {
|
316
|
// pubDate = pubDate.concat("-" + month);
|
317
|
// if (StringUtils.isNotBlank(day)) {
|
318
|
// pubDate = pubDate.concat("-" + day);
|
319
|
// } else {
|
320
|
// pubDate += "-01";
|
321
|
// }
|
322
|
// } else {
|
323
|
// pubDate += "-01-01";
|
324
|
// }
|
325
|
|
326
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
327
|
if (StringUtils.isNotBlank(pubDate)) {
|
328
|
if (addToDateOfAcceptance) {
|
329
|
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(pubDate).build());
|
330
|
}
|
331
|
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
332
|
.setValue(pubDate)
|
333
|
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
334
|
.build());
|
335
|
}
|
336
|
}
|
337
|
|
338
|
private static String getPublicationDate(final JsonObject rootElement,
|
339
|
final String jsonKey) {
|
340
|
|
341
|
final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
342
|
if (pubDateJson == null) { return null; }
|
343
|
final String year = getStringValue(pubDateJson, "year");
|
344
|
final String month = getStringValue(pubDateJson, "month");
|
345
|
final String day = getStringValue(pubDateJson, "day");
|
346
|
|
347
|
if (StringUtils.isBlank(year)) { return null; }
|
348
|
String pubDate = "".concat(year);
|
349
|
if (StringUtils.isNotBlank(month)) {
|
350
|
pubDate = pubDate.concat("-" + month);
|
351
|
if (StringUtils.isNotBlank(day)) {
|
352
|
pubDate = pubDate.concat("-" + day);
|
353
|
} else {
|
354
|
pubDate += "-01";
|
355
|
}
|
356
|
} else {
|
357
|
pubDate += "-01-01";
|
358
|
}
|
359
|
if (isValidDate(pubDate)) { return pubDate; }
|
360
|
return null;
|
361
|
}
|
362
|
|
363
|
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
364
|
|
365
|
final String type = getStringValue(rootElement, "type");
|
366
|
if (!typologiesMapping.containsKey(type)) {
|
367
|
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
368
|
return false;
|
369
|
}
|
370
|
|
371
|
if (!isValidJsonArray(rootElement, "titles")) {
|
372
|
context.incrementCounter("filtered", "invalid_title", 1);
|
373
|
return false;
|
374
|
}
|
375
|
return true;
|
376
|
}
|
377
|
|
378
|
private static boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
|
379
|
if (!rootElement.has(fieldName)) { return false; }
|
380
|
final JsonElement jsonElement = rootElement.get(fieldName);
|
381
|
if (jsonElement.isJsonNull()) { return false; }
|
382
|
if (jsonElement.isJsonArray()) {
|
383
|
final JsonArray jsonArray = jsonElement.getAsJsonArray();
|
384
|
if (jsonArray.isJsonNull()) { return false; }
|
385
|
if (jsonArray.get(0).isJsonNull()) { return false; }
|
386
|
}
|
387
|
return true;
|
388
|
}
|
389
|
}
|