1 |
1 |
package eu.dnetlib.data.mapreduce.hbase.dataimport;
|
2 |
2 |
|
|
3 |
import java.io.ByteArrayOutputStream;
|
|
4 |
import java.io.IOException;
|
|
5 |
import java.io.InputStream;
|
|
6 |
import java.util.*;
|
|
7 |
import java.util.concurrent.atomic.AtomicInteger;
|
|
8 |
import java.util.function.Function;
|
|
9 |
import java.util.stream.Collectors;
|
|
10 |
import java.util.stream.Stream;
|
|
11 |
import java.util.zip.Inflater;
|
|
12 |
|
3 |
13 |
import com.google.gson.Gson;
|
4 |
14 |
import com.google.gson.JsonElement;
|
5 |
15 |
import com.google.gson.JsonObject;
|
6 |
16 |
import eu.dnetlib.actionmanager.actions.ActionFactory;
|
7 |
17 |
import eu.dnetlib.actionmanager.actions.AtomicAction;
|
8 |
18 |
import eu.dnetlib.actionmanager.common.Agent;
|
|
19 |
import eu.dnetlib.data.mapreduce.hbase.Reporter;
|
9 |
20 |
import eu.dnetlib.data.mapreduce.util.StreamUtils;
|
10 |
21 |
import eu.dnetlib.data.proto.*;
|
11 |
22 |
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
... | ... | |
14 |
25 |
import org.apache.commons.io.IOUtils;
|
15 |
26 |
import org.apache.commons.lang3.StringUtils;
|
16 |
27 |
|
17 |
|
import java.io.ByteArrayOutputStream;
|
18 |
|
import java.io.IOException;
|
19 |
|
import java.io.InputStream;
|
20 |
|
import java.util.*;
|
21 |
|
import java.util.concurrent.atomic.AtomicInteger;
|
22 |
|
import java.util.function.Function;
|
23 |
|
import java.util.stream.Collectors;
|
24 |
|
import java.util.zip.Inflater;
|
25 |
|
|
26 |
28 |
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
|
27 |
29 |
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
|
28 |
30 |
|
... | ... | |
36 |
38 |
public static final String GRID_AC = "grid.ac";
|
37 |
39 |
public static final String WIKPEDIA = "wikpedia";
|
38 |
40 |
|
39 |
|
public final static String doiBoostNSPREFIX ="doiboost____";
|
|
41 |
public final static String doiBoostNSPREFIX = "doiboost____";
|
40 |
42 |
public static final String OPENAIRE_PREFIX = "openaire____";
|
41 |
43 |
|
42 |
44 |
public static final String SEPARATOR = "::";
|
43 |
45 |
|
44 |
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
|
45 |
|
put(MAG, new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
|
46 |
|
put(ORCID, new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
47 |
|
put(CROSSREF, new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
|
48 |
|
put(UNPAYWALL, new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
|
|
46 |
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
|
|
47 |
put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
|
|
48 |
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
|
49 |
put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
|
|
50 |
put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
|
49 |
51 |
|
50 |
52 |
}};
|
51 |
53 |
|
52 |
|
private static String decompressAbstract(final String abstractCompressed) {
|
|
54 |
private static String decompressAbstract(final String abstractCompressed) {
|
53 |
55 |
try {
|
54 |
56 |
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
|
55 |
57 |
final Inflater decompresser = new Inflater();
|
... | ... | |
64 |
66 |
decompresser.end();
|
65 |
67 |
return new String(unzippeddata);
|
66 |
68 |
} catch (Throwable e) {
|
67 |
|
System.out.println("Wrong abstract:"+ abstractCompressed);
|
68 |
|
throw new RuntimeException(e);
|
|
69 |
System.out.println("Wrong abstract:" + abstractCompressed);
|
|
70 |
throw new RuntimeException(e);
|
69 |
71 |
}
|
70 |
72 |
}
|
71 |
73 |
|
72 |
74 |
public static final String PID_TYPES = "dnet:pid_types";
|
73 |
|
private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
|
74 |
|
put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id" ).setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES).setSchemeid(PID_TYPES).build());
|
|
75 |
private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{
|
|
76 |
put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES)
|
|
77 |
.setSchemeid(PID_TYPES).build());
|
75 |
78 |
put(GRID_AC, getQualifier("grid", PID_TYPES));
|
76 |
79 |
put(WIKPEDIA, getQualifier("urn", PID_TYPES));
|
77 |
80 |
}};
|
... | ... | |
81 |
84 |
static {
|
82 |
85 |
try {
|
83 |
86 |
final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json");
|
84 |
|
final String tt =IOUtils.toString(is);
|
|
87 |
final String tt = IOUtils.toString(is);
|
85 |
88 |
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
86 |
89 |
} catch (IOException e) {
|
87 |
90 |
e.printStackTrace();
|
88 |
91 |
}
|
89 |
92 |
}
|
90 |
93 |
|
|
94 |
protected static boolean isValid(final JsonObject rootElement, final Reporter context) {
|
91 |
95 |
|
|
96 |
final String doi = getStringValue(rootElement, "doi");
|
|
97 |
if (doi == null) {
|
|
98 |
context.incrementCounter("filtered", "no_doi", 1);
|
|
99 |
return false;
|
|
100 |
}
|
|
101 |
final String type = getStringValue(rootElement, "type");
|
|
102 |
if (!typologiesMapping.containsKey(type)) {
|
|
103 |
context.incrementCounter("filtered", "unknowntype_" + type, 1);
|
|
104 |
return false;
|
|
105 |
}
|
|
106 |
// fixes #4360 (test publisher)
|
|
107 |
final String publisher = getStringValue(rootElement, "publisher");
|
|
108 |
if (StringUtils.isNotBlank(publisher) && publisher.equalsIgnoreCase("Test accounts")) {
|
|
109 |
context.incrementCounter("filtered", "test_publisher", 1);
|
|
110 |
return false;
|
|
111 |
}
|
92 |
112 |
|
|
113 |
List<JsonObject> authors = getArrayObjects(rootElement, "authors");
|
|
114 |
boolean hasAuthors = false;
|
|
115 |
for (JsonObject author : authors) {
|
|
116 |
final String given = getStringValue(author, "given");
|
|
117 |
final String family = getStringValue(author, "family");
|
|
118 |
String fullname = getStringValue(author, "fullname");
|
|
119 |
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) {
|
|
120 |
fullname = String.format("%s %s", given, family);
|
|
121 |
}
|
|
122 |
// fixes #4368
|
|
123 |
if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) {
|
|
124 |
context.incrementCounter("invalid_author", "addiejackson", 1);
|
|
125 |
context.incrementCounter("filtered", "invalid_authors", 1);
|
|
126 |
return false;
|
|
127 |
}
|
|
128 |
if (isValidAuthorName(fullname, context)) hasAuthors = true;
|
|
129 |
}
|
93 |
130 |
|
94 |
|
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement, final ActionFactory factory, final String setName, final Agent agent, boolean invisible,
|
95 |
|
final boolean onlyOrganization) {
|
|
131 |
if (!hasAuthors) {
|
|
132 |
context.incrementCounter("filtered", "invalid_authors", 1);
|
|
133 |
return false;
|
|
134 |
}
|
|
135 |
// fixes #4360
|
|
136 |
if (getCleanedTitles(rootElement).isEmpty()) {
|
|
137 |
context.incrementCounter("filtered", "invalid_title", 1);
|
|
138 |
return false;
|
|
139 |
}
|
96 |
140 |
|
|
141 |
return true;
|
|
142 |
}
|
|
143 |
|
|
144 |
private static List<String> getCleanedTitles(final JsonObject rootElement) {
|
|
145 |
List<String> titles = getArrayValues(rootElement, "title");
|
|
146 |
return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
|
|
147 |
}
|
|
148 |
|
|
149 |
private static boolean isValidAuthorName(final String fullName, final Reporter context) {
|
|
150 |
if (StringUtils.isBlank(fullName)) {
|
|
151 |
if(context != null) context.incrementCounter("invalid_author", "blank", 1);
|
|
152 |
return false;
|
|
153 |
}
|
|
154 |
// fixes #4391 and subtasks related to DOIBoost
|
|
155 |
switch (fullName) {
|
|
156 |
case ",":
|
|
157 |
case "none none":
|
|
158 |
case "none &na;":
|
|
159 |
case "(:null)":
|
|
160 |
case "&na; &na;": {
|
|
161 |
if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1);
|
|
162 |
return false;
|
|
163 |
}
|
|
164 |
}
|
|
165 |
return true;
|
|
166 |
}
|
|
167 |
|
|
168 |
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement,
|
|
169 |
final ActionFactory factory,
|
|
170 |
final String setName,
|
|
171 |
final Agent agent,
|
|
172 |
boolean invisible,
|
|
173 |
final boolean onlyOrganization,
|
|
174 |
final Reporter context) {
|
|
175 |
|
|
176 |
if (!isValid(rootElement, context)) return null;
|
|
177 |
|
97 |
178 |
//Create OAF Proto
|
98 |
179 |
|
99 |
180 |
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
|
... | ... | |
112 |
193 |
//creating Result Proto
|
113 |
194 |
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result);
|
114 |
195 |
|
115 |
|
entity.setDateofcollection("2018-10-10");
|
|
196 |
entity.setDateofcollection("2019-02-15");
|
116 |
197 |
|
117 |
|
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()){
|
|
198 |
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
|
118 |
199 |
StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator())
|
119 |
200 |
.map(JsonElement::getAsString)
|
120 |
201 |
.forEach(cf -> {
|
121 |
|
final String id = datasources.get(cf).getValue();
|
122 |
|
final String name = datasources.get(cf).getKey();
|
|
202 |
final String id = datasources.get(cf.toLowerCase()).getValue();
|
|
203 |
final String name = datasources.get(cf.toLowerCase()).getKey();
|
123 |
204 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
124 |
205 |
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
125 |
206 |
.setValue(name)
|
... | ... | |
132 |
213 |
}
|
133 |
214 |
//Adding identifier
|
134 |
215 |
final String doi = getStringValue(rootElement, "doi");
|
135 |
|
if (doi == null)
|
136 |
|
return null;
|
|
216 |
entity.addOriginalId(doi);
|
|
217 |
|
137 |
218 |
final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
|
138 |
219 |
entity.setId(sourceId);
|
139 |
220 |
|
... | ... | |
142 |
223 |
.setQualifier(getQualifier("doi", PID_TYPES))
|
143 |
224 |
.build());
|
144 |
225 |
|
145 |
|
|
146 |
226 |
//Create Result Field
|
147 |
227 |
ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder();
|
148 |
228 |
|
149 |
|
final String type = getStringValue(rootElement,"type");
|
|
229 |
final String type = getStringValue(rootElement, "type");
|
150 |
230 |
|
151 |
|
if (!typologiesMapping.containsKey(type))
|
152 |
|
return null;
|
153 |
|
|
154 |
231 |
//Adding Instances
|
155 |
232 |
final String typeValue = typologiesMapping.get(type).get("value");
|
156 |
233 |
final String cobjValue = typologiesMapping.get(type).get("cobj");
|
157 |
234 |
|
|
235 |
// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped
|
|
236 |
List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> {
|
|
237 |
String provS = getStringValue(prov, "provenance");
|
|
238 |
if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true;
|
|
239 |
else return false;
|
|
240 |
}).collect(Collectors.toList());
|
158 |
241 |
|
159 |
|
getArrayObjects(rootElement, "instances").stream().map(it ->
|
|
242 |
Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
|
160 |
243 |
{
|
161 |
|
ResultProtos.Result.Instance.Builder instance= ResultProtos.Result.Instance.newBuilder();
|
|
244 |
ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
162 |
245 |
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
163 |
246 |
.setClassid(cobjValue)
|
164 |
247 |
.setClassname(typeValue)
|
... | ... | |
170 |
253 |
.setValue("Unknown Repository")
|
171 |
254 |
.build());
|
172 |
255 |
|
173 |
|
final String acc_class_id =it.get("access-rights").getAsString();
|
|
256 |
final String acc_class_id = it.get("access-rights").getAsString();
|
174 |
257 |
String acc_class_value;
|
175 |
|
switch (acc_class_id){
|
176 |
|
case "OPEN": {
|
177 |
|
acc_class_value = "open access";
|
178 |
|
break;
|
179 |
|
}
|
180 |
|
case "CLOSED": {
|
181 |
|
acc_class_value = "closed access";
|
182 |
|
break;
|
183 |
|
}
|
|
258 |
switch (acc_class_id) {
|
|
259 |
case "OPEN": {
|
|
260 |
acc_class_value = "open access";
|
|
261 |
break;
|
|
262 |
}
|
|
263 |
case "CLOSED": {
|
|
264 |
acc_class_value = "closed access";
|
|
265 |
break;
|
|
266 |
}
|
|
267 |
default: {
|
|
268 |
acc_class_value = "not available";
|
|
269 |
}
|
184 |
270 |
|
185 |
|
default: {
|
186 |
|
acc_class_value = "not available";
|
187 |
|
}
|
188 |
|
|
189 |
271 |
}
|
190 |
272 |
|
191 |
273 |
instance.addUrl(it.get("url").getAsString());
|
... | ... | |
196 |
278 |
.setSchemename("dnet:access_modes")
|
197 |
279 |
.build());
|
198 |
280 |
|
199 |
|
final String id =datasources.get(it.get("provenance").getAsString()).getValue();
|
200 |
|
final String name =datasources.get(it.get("provenance").getAsString()).getKey();
|
|
281 |
final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
|
|
282 |
final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
|
201 |
283 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
202 |
284 |
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
203 |
285 |
.setValue(name)
|
... | ... | |
207 |
289 |
instance.setCollectedfrom(collectedFrom);
|
208 |
290 |
}
|
209 |
291 |
|
210 |
|
return instance.build();
|
|
292 |
return instance.build();
|
211 |
293 |
}).forEach(result::addInstance);
|
212 |
294 |
|
213 |
295 |
//Adding DOI URL as Instance
|
214 |
296 |
final String doiURL = getStringValue(rootElement, "doi-url");
|
|
297 |
JsonObject hostedByOpenAire = null;
|
|
298 |
if (rootElement.has("hostedByOpenAire")) {
|
|
299 |
hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire");
|
|
300 |
}
|
|
301 |
|
215 |
302 |
if (StringUtils.isNotBlank(doiURL)) {
|
216 |
|
|
217 |
|
|
218 |
303 |
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
219 |
304 |
instance.addUrl(doiURL);
|
|
305 |
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
|
|
306 |
.setClassid(cobjValue)
|
|
307 |
.setClassname(typeValue)
|
|
308 |
.setSchemeid("dnet:publication_resource")
|
|
309 |
.setSchemename("dnet:publication_resource")
|
|
310 |
.build());
|
220 |
311 |
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder()
|
221 |
312 |
.setClassid("CLOSED")
|
222 |
313 |
.setClassname("Closed Access")
|
... | ... | |
224 |
315 |
.setSchemename("dnet:access_modes")
|
225 |
316 |
.build());
|
226 |
317 |
instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
227 |
|
.setValue("CrossRef")
|
|
318 |
.setValue(CROSSREF)
|
228 |
319 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref"))
|
229 |
320 |
.build());
|
|
321 |
|
|
322 |
if (hostedByOpenAire == null)
|
|
323 |
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
|
324 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c")
|
|
325 |
.setValue("Unknown Repository")
|
|
326 |
.build());
|
|
327 |
else {
|
|
328 |
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder()
|
|
329 |
.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString()))
|
|
330 |
.setValue(hostedByOpenAire.get("name").getAsString())
|
|
331 |
.build());
|
|
332 |
}
|
|
333 |
|
230 |
334 |
result.addInstance(instance);
|
231 |
335 |
}
|
232 |
336 |
|
233 |
337 |
//Create Metadata Proto
|
234 |
338 |
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder();
|
235 |
339 |
|
236 |
|
|
237 |
340 |
Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
|
238 |
341 |
|
239 |
342 |
if (authorsOrganizations.getKey().size() > 0) {
|
240 |
343 |
metadata.addAllAuthor(authorsOrganizations.getKey());
|
241 |
|
}
|
242 |
|
else {
|
|
344 |
} else {
|
|
345 |
//Should never enter here becasue of the isValid method at the beginning.
|
|
346 |
context.incrementCounter("filtered", "unexpected_no_authors", 1);
|
243 |
347 |
return null;
|
244 |
348 |
}
|
245 |
349 |
//adding Language
|
... | ... | |
251 |
355 |
.build());
|
252 |
356 |
|
253 |
357 |
//Adding subjects
|
254 |
|
List<String> subjects =getArrayValues(rootElement, "subject");
|
|
358 |
List<String> subjects = getArrayValues(rootElement, "subject");
|
255 |
359 |
|
256 |
|
subjects.forEach(s-> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
|
|
360 |
subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
|
257 |
361 |
.setValue(s)
|
258 |
362 |
.setQualifier(getQualifier("keyword", "dnet:subject"))
|
259 |
363 |
.build()));
|
260 |
364 |
|
261 |
|
List<String>titles =getArrayValues(rootElement, "title");
|
262 |
|
titles.forEach(t->
|
|
365 |
List<String> titles = getCleanedTitles(rootElement);
|
|
366 |
titles.forEach(t ->
|
263 |
367 |
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
|
264 |
368 |
.setValue(t)
|
265 |
369 |
.setQualifier(getQualifier("main title", "dnet:dataCite_title"))
|
266 |
370 |
.build()));
|
|
371 |
|
267 |
372 |
settingRelevantDate(rootElement, metadata, "issued", "issued", true);
|
268 |
373 |
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
|
269 |
374 |
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
|
270 |
375 |
settingRelevantDate(rootElement, metadata, "published-print", "published-print", false);
|
271 |
376 |
|
272 |
|
|
273 |
377 |
getArrayObjects(rootElement, "abstract").forEach(d ->
|
274 |
378 |
{
|
275 |
379 |
if (MAG.equals(d.get("provenance").getAsString()))
|
... | ... | |
277 |
381 |
else
|
278 |
382 |
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build());
|
279 |
383 |
}
|
280 |
|
);
|
|
384 |
);
|
281 |
385 |
|
282 |
|
|
283 |
|
|
284 |
386 |
//Adding Journal
|
285 |
|
final String publisher = getStringValue(rootElement,"publisher");
|
286 |
|
if (StringUtils.isNotBlank(publisher)){
|
|
387 |
final String publisher = getStringValue(rootElement, "publisher");
|
|
388 |
if (StringUtils.isNotBlank(publisher)) {
|
287 |
389 |
|
288 |
390 |
final ResultProtos.Result.Journal.Builder journal = ResultProtos.Result.Journal.newBuilder().setName(publisher);
|
289 |
391 |
|
290 |
|
if (hasJSONArrayField(rootElement,"issn" )){
|
|
392 |
if (hasJSONArrayField(rootElement, "issn")) {
|
291 |
393 |
StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator())
|
292 |
394 |
.map(JsonElement::getAsJsonObject)
|
293 |
395 |
.forEach(it -> {
|
294 |
396 |
final String issntype = getStringValue(it, "type");
|
295 |
397 |
final String value = getStringValue(it, "value");
|
296 |
|
if("electronic".equals(issntype)){
|
|
398 |
if ("electronic".equals(issntype)) {
|
297 |
399 |
journal.setIssnOnline(value);
|
298 |
400 |
}
|
299 |
401 |
if ("print".equals(issntype))
|
... | ... | |
306 |
408 |
result.setMetadata(metadata.build());
|
307 |
409 |
entity.setResult(result.build());
|
308 |
410 |
oaf.setEntity(entity.build());
|
|
411 |
|
|
412 |
//System.out.println(JsonFormat.printToString(oaf.build()));
|
|
413 |
|
309 |
414 |
final List<AtomicAction> actionList = new ArrayList<>();
|
310 |
415 |
|
311 |
416 |
if (!onlyOrganization)
|
... | ... | |
320 |
425 |
if (!onlyOrganization)
|
321 |
426 |
actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent));
|
322 |
427 |
final String gridOrganization = getSimilarGridOrganization(o.getEntity());
|
323 |
|
if (gridOrganization!= null) {
|
324 |
|
actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization, "".getBytes()));
|
325 |
|
actionList.add(factory.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(), "".getBytes()));
|
|
428 |
if (gridOrganization != null) {
|
|
429 |
actionList.add(factory
|
|
430 |
.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization,
|
|
431 |
"".getBytes()));
|
|
432 |
actionList.add(factory
|
|
433 |
.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(),
|
|
434 |
"".getBytes()));
|
326 |
435 |
}
|
327 |
436 |
});
|
328 |
437 |
}
|
... | ... | |
330 |
439 |
|
331 |
440 |
}
|
332 |
441 |
|
333 |
|
|
334 |
442 |
private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) {
|
335 |
443 |
|
336 |
444 |
final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList();
|
337 |
|
if (pidList!= null ) {
|
338 |
|
for (FieldTypeProtos.StructuredProperty p: pidList) {
|
339 |
|
if (p.getQualifier().getClassname().equals("grid")){
|
340 |
|
return "20|grid________" + SEPARATOR +AbstractDNetXsltFunctions.md5(p.getValue());
|
|
445 |
if (pidList != null) {
|
|
446 |
for (FieldTypeProtos.StructuredProperty p : pidList) {
|
|
447 |
if (p.getQualifier().getClassname().equals("grid")) {
|
|
448 |
return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
|
341 |
449 |
}
|
342 |
450 |
}
|
343 |
451 |
}
|
... | ... | |
345 |
453 |
|
346 |
454 |
}
|
347 |
455 |
|
348 |
|
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication, final OafProtos.Oaf organization, final ActionFactory factory, final String setName, final Agent agent) {
|
|
456 |
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication,
|
|
457 |
final OafProtos.Oaf organization,
|
|
458 |
final ActionFactory factory,
|
|
459 |
final String setName,
|
|
460 |
final Agent agent) {
|
349 |
461 |
|
350 |
462 |
List<AtomicAction> result = new ArrayList<>();
|
351 |
463 |
|
... | ... | |
360 |
472 |
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
361 |
473 |
.build());
|
362 |
474 |
|
363 |
|
|
364 |
475 |
final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder();
|
365 |
476 |
|
366 |
477 |
rel.setRelType(RelTypeProtos.RelType.resultOrganization);
|
... | ... | |
381 |
492 |
rel.setResultOrganization(rel_instance.build());
|
382 |
493 |
|
383 |
494 |
rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder()
|
384 |
|
.setValue(datasources.get(MAG).getKey())
|
385 |
|
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(datasources.get(MAG).getValue(), SEPARATOR)))
|
|
495 |
.setValue(datasources.get(MAG.toLowerCase()).getKey())
|
|
496 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions
|
|
497 |
.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR)))
|
386 |
498 |
.build());
|
387 |
499 |
|
388 |
|
|
389 |
|
|
390 |
500 |
rel.setChild(false);
|
391 |
501 |
roaf.setRel(rel.build());
|
392 |
502 |
|
393 |
|
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution", organization.getEntity().getId(), roaf.build().toByteArray() ));
|
|
503 |
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution",
|
|
504 |
organization.getEntity().getId(), roaf.build().toByteArray()));
|
394 |
505 |
|
395 |
|
|
396 |
506 |
//Create a relation Organization --> Result
|
397 |
507 |
rel.setTarget(publication.getEntity().getId());
|
398 |
508 |
rel.setSource(organization.getEntity().getId());
|
399 |
509 |
rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString());
|
400 |
510 |
|
401 |
|
|
402 |
511 |
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder()
|
403 |
512 |
.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations"))
|
404 |
513 |
.build());
|
405 |
514 |
rel_instance.setAffiliation(affiliationRel.build());
|
406 |
515 |
rel.setResultOrganization(rel_instance.build());
|
407 |
516 |
roaf.setRel(rel.build());
|
408 |
|
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf", publication.getEntity().getId(), roaf.build().toByteArray()));
|
|
517 |
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf",
|
|
518 |
publication.getEntity().getId(), roaf.build().toByteArray()));
|
409 |
519 |
|
410 |
520 |
return result;
|
411 |
521 |
|
... | ... | |
415 |
525 |
return root.has(key) && root.get(key).isJsonArray();
|
416 |
526 |
}
|
417 |
527 |
|
418 |
|
private static void settingRelevantDate(JsonObject rootElement, ResultProtos.Result.Metadata.Builder metadata , final String jsonKey, final String dictionaryKey, final boolean addToDateOfAcceptance) {
|
|
528 |
private static void settingRelevantDate(JsonObject rootElement,
|
|
529 |
ResultProtos.Result.Metadata.Builder metadata,
|
|
530 |
final String jsonKey,
|
|
531 |
final String dictionaryKey,
|
|
532 |
final boolean addToDateOfAcceptance) {
|
419 |
533 |
//Adding date
|
420 |
|
String date = getStringValue(rootElement,jsonKey);
|
|
534 |
String date = getStringValue(rootElement, jsonKey);
|
421 |
535 |
if (date == null)
|
422 |
536 |
return;
|
423 |
537 |
if (date.length() == 4) {
|
... | ... | |
428 |
542 |
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build());
|
429 |
543 |
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder()
|
430 |
544 |
.setValue(date)
|
431 |
|
.setQualifier(getQualifier(dictionaryKey,"dnet:dataCite_date"))
|
|
545 |
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
432 |
546 |
.build());
|
433 |
547 |
}
|
434 |
548 |
}
|
435 |
549 |
|
436 |
|
|
437 |
550 |
public static FieldTypeProtos.KeyValue extractIdentifier(final String value) {
|
438 |
551 |
FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder();
|
439 |
|
if (StringUtils.contains(value, "orcid.org")){
|
440 |
|
return pid.setValue(value)
|
|
552 |
if (StringUtils.contains(value, "orcid.org")) {
|
|
553 |
return pid.setValue(value.replaceAll("https://orcid.org/", ""))
|
441 |
554 |
.setKey(ORCID).build();
|
442 |
555 |
}
|
443 |
|
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")){
|
444 |
|
return pid.setValue(value)
|
|
556 |
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
|
|
557 |
return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
|
445 |
558 |
.setKey("MAG Identifier").build();
|
446 |
559 |
}
|
447 |
560 |
return pid.setValue(value)
|
448 |
561 |
.setKey("URL").build();
|
449 |
562 |
}
|
450 |
563 |
|
451 |
|
|
452 |
564 |
public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) {
|
453 |
565 |
final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>();
|
454 |
566 |
final List<String> magId = new ArrayList<>();
|
... | ... | |
456 |
568 |
if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) {
|
457 |
569 |
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG));
|
458 |
570 |
magId.add(it.get("value").getAsString());
|
459 |
|
}
|
460 |
|
else
|
461 |
|
affiliationIdentifiers.put( it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
|
|
571 |
} else
|
|
572 |
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString()));
|
462 |
573 |
});
|
463 |
574 |
if (magId.size() > 0) {
|
464 |
575 |
final String microsoftID = magId.get(0);
|
... | ... | |
466 |
577 |
oaf.setKind(KindProtos.Kind.entity);
|
467 |
578 |
OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder();
|
468 |
579 |
entity.setType(TypeProtos.Type.organization);
|
469 |
|
entity.setId("20|microsoft___" + SEPARATOR +AbstractDNetXsltFunctions.md5(microsoftID));
|
470 |
|
final String id =datasources.get(affiliation.get("provenance").getAsString()).getValue();
|
471 |
|
final String name =datasources.get(affiliation.get("provenance").getAsString()).getKey();
|
|
580 |
entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
|
|
581 |
final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
|
|
582 |
final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
|
472 |
583 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) {
|
473 |
584 |
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder()
|
474 |
585 |
.setValue(name)
|
... | ... | |
503 |
614 |
.build());
|
504 |
615 |
return oaf.build();
|
505 |
616 |
}
|
506 |
|
return null;
|
|
617 |
return null;
|
507 |
618 |
}
|
508 |
619 |
|
509 |
|
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
|
|
620 |
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
|
510 |
621 |
|
511 |
622 |
final Map<String, OafProtos.Oaf> affiliations = new HashMap<>();
|
512 |
623 |
|
513 |
624 |
List<JsonObject> authors = getArrayObjects(root, "authors");
|
514 |
625 |
|
515 |
|
final AtomicInteger counter = new AtomicInteger();
|
|
626 |
final AtomicInteger counter = new AtomicInteger(1);
|
516 |
627 |
|
517 |
628 |
List<FieldTypeProtos.Author> collect = authors.stream().map(author -> {
|
518 |
629 |
final String given = getStringValue(author, "given");
|
... | ... | |
523 |
634 |
fullname = String.format("%s %s", given, family);
|
524 |
635 |
}
|
525 |
636 |
|
526 |
|
if (StringUtils.isBlank(fullname)){
|
|
637 |
if (!isValidAuthorName(fullname, null)) {
|
527 |
638 |
return null;
|
528 |
|
|
529 |
639 |
}
|
530 |
640 |
final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder();
|
531 |
641 |
|
... | ... | |
554 |
664 |
Collectors.toMap(
|
555 |
665 |
FieldTypeProtos.KeyValue::getKey,
|
556 |
666 |
Function.identity(),
|
557 |
|
(a,b) -> a
|
|
667 |
(a, b) -> a
|
558 |
668 |
)).values().forEach(abuilder::addPid);
|
559 |
669 |
abuilder.setRank(counter.getAndIncrement());
|
560 |
670 |
|
... | ... | |
562 |
672 |
|
563 |
673 |
}).filter(Objects::nonNull).collect(Collectors.toList());
|
564 |
674 |
|
565 |
|
return new Pair<> ( collect,affiliations.values() );
|
|
675 |
return new Pair<>(collect, affiliations.values());
|
566 |
676 |
}
|
567 |
677 |
|
568 |
|
|
569 |
|
|
570 |
|
|
571 |
|
|
572 |
|
|
573 |
678 |
}
|
Updated classes for DOIBoost based on the trunk version