1
|
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2
|
|
3
|
import com.google.gson.JsonArray;
|
4
|
import com.google.gson.JsonElement;
|
5
|
import com.google.gson.JsonObject;
|
6
|
import com.googlecode.protobuf.format.JsonFormat;
|
7
|
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
8
|
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
9
|
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
10
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
11
|
import org.apache.avro.data.Json;
|
12
|
import org.apache.commons.lang3.StringUtils;
|
13
|
import org.apache.solr.common.util.StrUtils;
|
14
|
|
15
|
import static eu.dnetlib.data.proto.FieldTypeProtos.*;
|
16
|
import static eu.dnetlib.data.proto.KindProtos.Kind;
|
17
|
import static eu.dnetlib.data.proto.OafProtos.*;
|
18
|
import static eu.dnetlib.data.proto.ResultProtos.Result;
|
19
|
import static eu.dnetlib.data.proto.ResultProtos.Result.*;
|
20
|
import static eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
|
21
|
import static eu.dnetlib.data.proto.TypeProtos.Type;
|
22
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.ScholixToActions.getQualifier;
|
23
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.ScholixToActions.getStringValue;
|
24
|
import static eu.dnetlib.data.mapreduce.hbase.dataimport.ScholixToActions.getArrayValues;
|
25
|
|
26
|
import java.util.ArrayList;
|
27
|
import java.util.List;
|
28
|
import java.util.Objects;
|
29
|
import java.util.stream.Collectors;
|
30
|
|
31
|
public class CrossRefToActions {
|
32
|
|
33
|
public static List<AtomicAction> generateActionsFromDump(final JsonObject rootElement,ActionFactory factory) {
|
34
|
final List<AtomicAction> actions = new ArrayList<>();
|
35
|
|
36
|
|
37
|
//Create OAF Proto
|
38
|
final Oaf.Builder oaf = Oaf.newBuilder();
|
39
|
//Add Data Info
|
40
|
oaf.setDataInfo(DataInfo.newBuilder()
|
41
|
.setInvisible(false)
|
42
|
.setDeletedbyinference(false)
|
43
|
.setInferred(false)
|
44
|
.setTrust("0.9")
|
45
|
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
46
|
.build());
|
47
|
|
48
|
//Adding Kind
|
49
|
oaf.setKind(Kind.entity);
|
50
|
|
51
|
//creating Result Proto
|
52
|
final OafEntity.Builder entity = OafEntity.newBuilder().setType(Type.result);
|
53
|
|
54
|
//Adding Collected From
|
55
|
entity.setDateofcollection(Objects.requireNonNull(getStringValue(rootElement, "dateOfCollection")));
|
56
|
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()){
|
57
|
StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
|
58
|
.map(JsonElement::getAsJsonObject)
|
59
|
.forEach(cf ->
|
60
|
{
|
61
|
final String id =getStringValue(cf,"id");
|
62
|
final String name =getStringValue(cf,"name");
|
63
|
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
64
|
final KeyValue collectedFrom = KeyValue.newBuilder()
|
65
|
.setValue(name)
|
66
|
.setKey("10|openaire____::" + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(id, "::")))
|
67
|
.build();
|
68
|
entity.addCollectedfrom(collectedFrom);
|
69
|
}
|
70
|
}
|
71
|
);
|
72
|
}
|
73
|
|
74
|
|
75
|
//Adding identifier
|
76
|
final String objIdentifier = getStringValue(rootElement, "objIdentifier");
|
77
|
final String nsPrefix = getStringValue(rootElement,"datasourcePrefix");
|
78
|
if (StringUtils.isBlank(objIdentifier)) return null;
|
79
|
final String sourceId = String.format("50|%s::%s", nsPrefix, objIdentifier);
|
80
|
entity.setId(sourceId);
|
81
|
final String doi = getStringValue(rootElement, "doi");
|
82
|
//ADDING PID
|
83
|
if (doi == null)
|
84
|
return null;
|
85
|
entity.addPid(StructuredProperty.newBuilder()
|
86
|
.setValue(doi)
|
87
|
.setQualifier(getQualifier("doi", "dnet:pid_types"))
|
88
|
.build());
|
89
|
|
90
|
|
91
|
//Create Result Field
|
92
|
Result.Builder result = Result.newBuilder();
|
93
|
|
94
|
|
95
|
//Adding Instance
|
96
|
final String typeValue = getStringValue(rootElement.getAsJsonObject("type"),"value");
|
97
|
final String cobjValue = getStringValue(rootElement.getAsJsonObject("type"),"cobj");
|
98
|
|
99
|
//Add UnpayWall instance
|
100
|
final String best_oa_location_url = getStringValue(rootElement, "best_oa_location_url");
|
101
|
Instance.Builder instance= Instance.newBuilder();
|
102
|
instance.setInstancetype(Qualifier.newBuilder()
|
103
|
.setClassid(cobjValue)
|
104
|
.setClassname(typeValue)
|
105
|
.setSchemeid("dnet:publication_resource")
|
106
|
.setSchemename("dnet:publication_resource")
|
107
|
.build());
|
108
|
instance.setHostedby(KeyValue.newBuilder()
|
109
|
.setKey("10|openaire____::55045bd2a65019fd8e6741a755395c8c")
|
110
|
.setValue("Unknown Repository")
|
111
|
.build());
|
112
|
if (StringUtils.isNotBlank(best_oa_location_url)){
|
113
|
|
114
|
instance.addUrl(best_oa_location_url);
|
115
|
instance.setAccessright(Qualifier.newBuilder()
|
116
|
.setClassid("OPEN")
|
117
|
.setClassname("open access")
|
118
|
.setSchemeid("dnet:access_modes")
|
119
|
.setSchemename("dnet:access_modes")
|
120
|
.build());
|
121
|
instance.setCollectedfrom(KeyValue.newBuilder()
|
122
|
.setValue("UnpayWall")
|
123
|
.setKey("10|openaire____::" + AbstractDNetXsltFunctions.md5("unpaywall"))
|
124
|
.build());
|
125
|
} else {
|
126
|
instance = Instance.newBuilder();
|
127
|
instance.addUrl(String.format("http://dx.doi.org/%s", doi));
|
128
|
instance.setAccessright(Qualifier.newBuilder()
|
129
|
.setClassid("CLOSED")
|
130
|
.setClassname("Closed Access")
|
131
|
.setSchemeid("dnet:access_modes")
|
132
|
.setSchemename("dnet:access_modes")
|
133
|
.build());
|
134
|
instance.setCollectedfrom(KeyValue.newBuilder()
|
135
|
.setValue("CrossRef")
|
136
|
.setKey("10|openaire____::" + AbstractDNetXsltFunctions.md5("crossref"))
|
137
|
.build());
|
138
|
}
|
139
|
result.addInstance(instance.build());
|
140
|
|
141
|
|
142
|
|
143
|
|
144
|
//Create Metadata Proto
|
145
|
Metadata.Builder metadata = Metadata.newBuilder();
|
146
|
|
147
|
//Adding Authors
|
148
|
final List<Author> authors = createAuthors(rootElement);
|
149
|
if (authors!= null)
|
150
|
metadata.addAllAuthor(authors);
|
151
|
//adding Language
|
152
|
metadata.setLanguage(Qualifier.newBuilder()
|
153
|
.setClassid("und")
|
154
|
.setClassname("Undetermined")
|
155
|
.setSchemeid("dent:languages")
|
156
|
.setSchemename("dent:languages")
|
157
|
.build());
|
158
|
|
159
|
//Adding subjects
|
160
|
List<String> subjects =getArrayValues(rootElement, "subject");
|
161
|
|
162
|
subjects.forEach(s-> metadata.addSubject(StructuredProperty.newBuilder()
|
163
|
.setValue(s)
|
164
|
.setQualifier(getQualifier("keyword", "dnet:subject"))
|
165
|
.build()));
|
166
|
|
167
|
//Adding titles
|
168
|
List<String>titles =getArrayValues(rootElement, "title");
|
169
|
titles.forEach(t->
|
170
|
metadata.addTitle(StructuredProperty.newBuilder()
|
171
|
.setValue(t)
|
172
|
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
173
|
.build()));
|
174
|
|
175
|
//Adding date
|
176
|
String date = getStringValue(rootElement,"issued");
|
177
|
if (date.length()==4)
|
178
|
date +="-01-01";
|
179
|
|
180
|
metadata.setDateofacceptance(StringField.newBuilder().setValue(date).build());
|
181
|
|
182
|
//Adding description
|
183
|
String description=null;
|
184
|
if (rootElement.has("abstract") && rootElement.get("abstract").isJsonArray())
|
185
|
description =String.join(" ",getArrayValues(rootElement,"abstract"));
|
186
|
else if (rootElement.has("abstract") )
|
187
|
description = rootElement.get("abstract").getAsString();
|
188
|
|
189
|
if(StringUtils.isNotBlank(description))
|
190
|
metadata.addDescription(StringField.newBuilder().setValue(description).build());
|
191
|
|
192
|
//Adding Journal
|
193
|
final String publisher = getStringValue(rootElement,"publisher");
|
194
|
if (StringUtils.isNotBlank(publisher)){
|
195
|
|
196
|
final Journal.Builder journal = Journal.newBuilder().setName(publisher);
|
197
|
|
198
|
if (hasJSONArrayField(rootElement,"issn" )){
|
199
|
StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
|
200
|
.map(JsonElement::getAsJsonObject)
|
201
|
.forEach(it -> {
|
202
|
final String type = getStringValue(it, "type");
|
203
|
final String value = getStringValue(it, "value");
|
204
|
if("electronic".equals(type)){
|
205
|
journal.setIssnOnline(value);
|
206
|
}
|
207
|
if ("print".equals(type))
|
208
|
journal.setIssnPrinted(value);
|
209
|
});
|
210
|
}
|
211
|
metadata.setJournal(journal.build());
|
212
|
}
|
213
|
|
214
|
metadata.setResulttype(getQualifier(getDefaultResulttype(cobjValue), "dnet:result_typologies"));
|
215
|
result.setMetadata(metadata.build());
|
216
|
entity.setResult(result.build());
|
217
|
oaf.setEntity(entity.build());
|
218
|
|
219
|
|
220
|
System.out.println(JsonFormat.printToString(oaf.build()));
|
221
|
|
222
|
return actions;
|
223
|
}
|
224
|
|
225
|
|
226
|
private static boolean hasJSONArrayField(final JsonObject root, final String key) {
|
227
|
return root.has(key) && root.get(key).isJsonArray();
|
228
|
}
|
229
|
|
230
|
|
231
|
public static List<Author> createAuthors(final JsonObject root) {
|
232
|
|
233
|
if (root.has("author") && root.get("author").isJsonArray()) {
|
234
|
|
235
|
final List<Author> authors = new ArrayList<>();
|
236
|
final JsonArray jsonAuthors = root.getAsJsonArray("author");
|
237
|
int i = 0;
|
238
|
for (JsonElement item: jsonAuthors) {
|
239
|
final JsonObject author = item.getAsJsonObject();
|
240
|
final Author.Builder result =Author.newBuilder();
|
241
|
final String given = getStringValue(author, "given");
|
242
|
final String family = getStringValue(author, "family");
|
243
|
final String orchid = getStringValue(author, "ORCID");
|
244
|
if (StringUtils.isBlank(given) && StringUtils.isBlank(family))
|
245
|
continue;
|
246
|
result.setFullname(given+" "+ family);
|
247
|
if (StringUtils.isNotBlank(given))
|
248
|
result.setName(given);
|
249
|
if (StringUtils.isNotBlank(family))
|
250
|
result.setSurname(family);
|
251
|
if (StringUtils.isNotBlank(orchid))
|
252
|
{
|
253
|
result.addPid(KeyValue.newBuilder()
|
254
|
.setValue(orchid)
|
255
|
.setKey("ORCID")
|
256
|
.build());
|
257
|
}
|
258
|
result.setRank(i++);
|
259
|
authors.add(result.build());
|
260
|
}
|
261
|
return authors;
|
262
|
|
263
|
}
|
264
|
return null;
|
265
|
|
266
|
|
267
|
}
|
268
|
|
269
|
private static String getDefaultResulttype(final String cobjcategory) {
|
270
|
switch (cobjcategory) {
|
271
|
case "0029":
|
272
|
return "software";
|
273
|
case "0021":
|
274
|
case "0024":
|
275
|
case "0025":
|
276
|
case "0030":
|
277
|
return "dataset";
|
278
|
case "0000":
|
279
|
case "0010":
|
280
|
case "0018":
|
281
|
case "0020":
|
282
|
case "0022":
|
283
|
case "0023":
|
284
|
case "0026":
|
285
|
case "0027":
|
286
|
case "0028":
|
287
|
case "0037":
|
288
|
return "other";
|
289
|
case "0001":
|
290
|
case "0002":
|
291
|
case "0004":
|
292
|
case "0005":
|
293
|
case "0006":
|
294
|
case "0007":
|
295
|
case "0008":
|
296
|
case "0009":
|
297
|
case "0011":
|
298
|
case "0012":
|
299
|
case "0013":
|
300
|
case "0014":
|
301
|
case "0015":
|
302
|
case "0016":
|
303
|
case "0017":
|
304
|
case "0019":
|
305
|
case "0031":
|
306
|
case "0032":
|
307
|
return "publication";
|
308
|
default:
|
309
|
return "publication";
|
310
|
}
|
311
|
}
|
312
|
|
313
|
|
314
|
|
315
|
}
|