Revision 55436
Added by Alessia Bardi almost 5 years ago
DOIBoostToActions.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.dataimport; |
2 | 2 |
|
3 |
import java.io.ByteArrayOutputStream; |
|
4 |
import java.io.IOException; |
|
5 |
import java.io.InputStream; |
|
6 |
import java.util.*; |
|
7 |
import java.util.concurrent.atomic.AtomicInteger; |
|
8 |
import java.util.function.Function; |
|
9 |
import java.util.stream.Collectors; |
|
10 |
import java.util.stream.Stream; |
|
11 |
import java.util.zip.Inflater; |
|
12 |
|
|
3 | 13 |
import com.google.gson.Gson; |
4 | 14 |
import com.google.gson.JsonElement; |
5 | 15 |
import com.google.gson.JsonObject; |
6 | 16 |
import eu.dnetlib.actionmanager.actions.ActionFactory; |
7 | 17 |
import eu.dnetlib.actionmanager.actions.AtomicAction; |
8 | 18 |
import eu.dnetlib.actionmanager.common.Agent; |
19 |
import eu.dnetlib.data.mapreduce.hbase.Reporter; |
|
9 | 20 |
import eu.dnetlib.data.mapreduce.util.StreamUtils; |
10 | 21 |
import eu.dnetlib.data.proto.*; |
11 | 22 |
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions; |
... | ... | |
14 | 25 |
import org.apache.commons.io.IOUtils; |
15 | 26 |
import org.apache.commons.lang3.StringUtils; |
16 | 27 |
|
17 |
import java.io.ByteArrayOutputStream; |
|
18 |
import java.io.IOException; |
|
19 |
import java.io.InputStream; |
|
20 |
import java.util.*; |
|
21 |
import java.util.concurrent.atomic.AtomicInteger; |
|
22 |
import java.util.function.Function; |
|
23 |
import java.util.stream.Collectors; |
|
24 |
import java.util.zip.Inflater; |
|
25 |
|
|
26 | 28 |
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*; |
27 | 29 |
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization; |
28 | 30 |
|
... | ... | |
36 | 38 |
public static final String GRID_AC = "grid.ac"; |
37 | 39 |
public static final String WIKPEDIA = "wikpedia"; |
38 | 40 |
|
39 |
public final static String doiBoostNSPREFIX ="doiboost____"; |
|
41 |
public final static String doiBoostNSPREFIX = "doiboost____";
|
|
40 | 42 |
public static final String OPENAIRE_PREFIX = "openaire____"; |
41 | 43 |
|
42 | 44 |
public static final String SEPARATOR = "::"; |
43 | 45 |
|
44 |
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{
|
|
45 |
put(MAG, new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft")); |
|
46 |
put(ORCID, new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid")); |
|
47 |
put(CROSSREF, new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref")); |
|
48 |
put(UNPAYWALL, new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall")); |
|
46 |
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {{ |
|
47 |
put(MAG.toLowerCase(), new Pair<>("Microsoft Academic Graph", OPENAIRE_PREFIX + SEPARATOR + "microsoft"));
|
|
48 |
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
|
49 |
put(CROSSREF.toLowerCase(), new Pair<>(CROSSREF, OPENAIRE_PREFIX + SEPARATOR + "crossref"));
|
|
50 |
put(UNPAYWALL.toLowerCase(), new Pair<>(UNPAYWALL, OPENAIRE_PREFIX + SEPARATOR + "unpaywall"));
|
|
49 | 51 |
|
50 | 52 |
}}; |
51 | 53 |
|
52 |
private static String decompressAbstract(final String abstractCompressed) {
|
|
54 |
private static String decompressAbstract(final String abstractCompressed) { |
|
53 | 55 |
try { |
54 | 56 |
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); |
55 | 57 |
final Inflater decompresser = new Inflater(); |
... | ... | |
64 | 66 |
decompresser.end(); |
65 | 67 |
return new String(unzippeddata); |
66 | 68 |
} catch (Throwable e) { |
67 |
System.out.println("Wrong abstract:"+ abstractCompressed); |
|
68 |
throw new RuntimeException(e);
|
|
69 |
System.out.println("Wrong abstract:" + abstractCompressed);
|
|
70 |
throw new RuntimeException(e); |
|
69 | 71 |
} |
70 | 72 |
} |
71 | 73 |
|
72 | 74 |
public static final String PID_TYPES = "dnet:pid_types"; |
73 |
private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{ |
|
74 |
put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id" ).setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES).setSchemeid(PID_TYPES).build()); |
|
75 |
private static Map<String, FieldTypeProtos.Qualifier> affiliationPIDType = new HashMap<String, FieldTypeProtos.Qualifier>() {{ |
|
76 |
put(MAG, FieldTypeProtos.Qualifier.newBuilder().setClassid("mag_id").setClassname("Microsoft Academic Graph Identifier").setSchemename(PID_TYPES) |
|
77 |
.setSchemeid(PID_TYPES).build()); |
|
75 | 78 |
put(GRID_AC, getQualifier("grid", PID_TYPES)); |
76 | 79 |
put(WIKPEDIA, getQualifier("urn", PID_TYPES)); |
77 | 80 |
}}; |
... | ... | |
81 | 84 |
static { |
82 | 85 |
try { |
83 | 86 |
final InputStream is = DOIBoostToActions.class.getResourceAsStream("/eu/dnetlib/data/mapreduce/hbase/dataimport/mapping_typologies.json"); |
84 |
final String tt =IOUtils.toString(is); |
|
87 |
final String tt = IOUtils.toString(is);
|
|
85 | 88 |
typologiesMapping = new Gson().fromJson(tt, Map.class); |
86 | 89 |
} catch (IOException e) { |
87 | 90 |
e.printStackTrace(); |
88 | 91 |
} |
89 | 92 |
} |
90 | 93 |
|
94 |
protected static boolean isValid(final JsonObject rootElement, final Reporter context) { |
|
91 | 95 |
|
96 |
final String doi = getStringValue(rootElement, "doi"); |
|
97 |
if (doi == null) { |
|
98 |
context.incrementCounter("filtered", "no_doi", 1); |
|
99 |
return false; |
|
100 |
} |
|
101 |
final String type = getStringValue(rootElement, "type"); |
|
102 |
if (!typologiesMapping.containsKey(type)) { |
|
103 |
context.incrementCounter("filtered", "unknowntype_" + type, 1); |
|
104 |
return false; |
|
105 |
} |
|
106 |
// fixes #4360 (test publisher) |
|
107 |
final String publisher = getStringValue(rootElement, "publisher"); |
|
108 |
if (StringUtils.isNotBlank(publisher) && publisher.equalsIgnoreCase("Test accounts")) { |
|
109 |
context.incrementCounter("filtered", "test_publisher", 1); |
|
110 |
return false; |
|
111 |
} |
|
92 | 112 |
|
113 |
List<JsonObject> authors = getArrayObjects(rootElement, "authors"); |
|
114 |
boolean hasAuthors = false; |
|
115 |
for (JsonObject author : authors) { |
|
116 |
final String given = getStringValue(author, "given"); |
|
117 |
final String family = getStringValue(author, "family"); |
|
118 |
String fullname = getStringValue(author, "fullname"); |
|
119 |
if (StringUtils.isBlank(fullname) && StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family)) { |
|
120 |
fullname = String.format("%s %s", given, family); |
|
121 |
} |
|
122 |
// fixes #4368 |
|
123 |
if (fullname.equalsIgnoreCase("Addie Jackson") && publisher.equalsIgnoreCase("Elsevier BV")) { |
|
124 |
context.incrementCounter("invalid_author", "addiejackson", 1); |
|
125 |
context.incrementCounter("filtered", "invalid_authors", 1); |
|
126 |
return false; |
|
127 |
} |
|
128 |
if (isValidAuthorName(fullname, context)) hasAuthors = true; |
|
129 |
} |
|
93 | 130 |
|
94 |
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement, final ActionFactory factory, final String setName, final Agent agent, boolean invisible, |
|
95 |
final boolean onlyOrganization) { |
|
131 |
if (!hasAuthors) { |
|
132 |
context.incrementCounter("filtered", "invalid_authors", 1); |
|
133 |
return false; |
|
134 |
} |
|
135 |
// fixes #4360 |
|
136 |
if (getCleanedTitles(rootElement).isEmpty()) { |
|
137 |
context.incrementCounter("filtered", "invalid_title", 1); |
|
138 |
return false; |
|
139 |
} |
|
96 | 140 |
|
141 |
return true; |
|
142 |
} |
|
143 |
|
|
144 |
private static List<String> getCleanedTitles(final JsonObject rootElement) { |
|
145 |
List<String> titles = getArrayValues(rootElement, "title"); |
|
146 |
return titles.stream().filter(t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList()); |
|
147 |
} |
|
148 |
|
|
149 |
private static boolean isValidAuthorName(final String fullName, final Reporter context) { |
|
150 |
if (StringUtils.isBlank(fullName)) { |
|
151 |
if(context != null) context.incrementCounter("invalid_author", "blank", 1); |
|
152 |
return false; |
|
153 |
} |
|
154 |
// fixes #4391 and subtasks related to DOIBoost |
|
155 |
switch (fullName) { |
|
156 |
case ",": |
|
157 |
case "none none": |
|
158 |
case "none &na;": |
|
159 |
case "(:null)": |
|
160 |
case "&na; &na;": { |
|
161 |
if(context != null) context.incrementCounter("invalid_author", "value_" + fullName, 1); |
|
162 |
return false; |
|
163 |
} |
|
164 |
} |
|
165 |
return true; |
|
166 |
} |
|
167 |
|
|
168 |
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement, |
|
169 |
final ActionFactory factory, |
|
170 |
final String setName, |
|
171 |
final Agent agent, |
|
172 |
boolean invisible, |
|
173 |
final boolean onlyOrganization, |
|
174 |
final Reporter context) { |
|
175 |
|
|
176 |
if (!isValid(rootElement, context)) return null; |
|
177 |
|
|
97 | 178 |
//Create OAF Proto |
98 | 179 |
|
99 | 180 |
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder(); |
... | ... | |
112 | 193 |
//creating Result Proto |
113 | 194 |
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder().setType(TypeProtos.Type.result); |
114 | 195 |
|
115 |
entity.setDateofcollection("2018-10-10");
|
|
196 |
entity.setDateofcollection("2019-02-15");
|
|
116 | 197 |
|
117 |
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()){ |
|
198 |
if (rootElement.has("collectedFrom") && rootElement.get("collectedFrom").isJsonArray()) {
|
|
118 | 199 |
StreamUtils.toStream(rootElement.getAsJsonArray("collectedFrom").iterator()) |
119 | 200 |
.map(JsonElement::getAsString) |
120 | 201 |
.forEach(cf -> { |
121 |
final String id = datasources.get(cf).getValue(); |
|
122 |
final String name = datasources.get(cf).getKey(); |
|
202 |
final String id = datasources.get(cf.toLowerCase()).getValue();
|
|
203 |
final String name = datasources.get(cf.toLowerCase()).getKey();
|
|
123 | 204 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) { |
124 | 205 |
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder() |
125 | 206 |
.setValue(name) |
... | ... | |
132 | 213 |
} |
133 | 214 |
//Adding identifier |
134 | 215 |
final String doi = getStringValue(rootElement, "doi"); |
135 |
if (doi == null)
|
|
136 |
return null; |
|
216 |
entity.addOriginalId(doi);
|
|
217 |
|
|
137 | 218 |
final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi)); |
138 | 219 |
entity.setId(sourceId); |
139 | 220 |
|
... | ... | |
142 | 223 |
.setQualifier(getQualifier("doi", PID_TYPES)) |
143 | 224 |
.build()); |
144 | 225 |
|
145 |
|
|
146 | 226 |
//Create Result Field |
147 | 227 |
ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder(); |
148 | 228 |
|
149 |
final String type = getStringValue(rootElement,"type"); |
|
229 |
final String type = getStringValue(rootElement, "type");
|
|
150 | 230 |
|
151 |
if (!typologiesMapping.containsKey(type)) |
|
152 |
return null; |
|
153 |
|
|
154 | 231 |
//Adding Instances |
155 | 232 |
final String typeValue = typologiesMapping.get(type).get("value"); |
156 | 233 |
final String cobjValue = typologiesMapping.get(type).get("cobj"); |
157 | 234 |
|
235 |
// TODO: workaround for #4362: remove it when UnpayWall is correctly mapped |
|
236 |
List<JsonObject> unpaywallLicenses = getArrayObjects(rootElement, "license").stream().filter(prov -> { |
|
237 |
String provS = getStringValue(prov, "provenance"); |
|
238 |
if (StringUtils.isNotBlank(provS) && provS.equalsIgnoreCase(UNPAYWALL)) return true; |
|
239 |
else return false; |
|
240 |
}).collect(Collectors.toList()); |
|
158 | 241 |
|
159 |
getArrayObjects(rootElement, "instances").stream().map(it ->
|
|
242 |
Stream.concat(unpaywallLicenses.stream(), getArrayObjects(rootElement, "instances").stream()).map(it ->
|
|
160 | 243 |
{ |
161 |
ResultProtos.Result.Instance.Builder instance= ResultProtos.Result.Instance.newBuilder(); |
|
244 |
ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
|
|
162 | 245 |
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder() |
163 | 246 |
.setClassid(cobjValue) |
164 | 247 |
.setClassname(typeValue) |
... | ... | |
170 | 253 |
.setValue("Unknown Repository") |
171 | 254 |
.build()); |
172 | 255 |
|
173 |
final String acc_class_id =it.get("access-rights").getAsString(); |
|
256 |
final String acc_class_id = it.get("access-rights").getAsString();
|
|
174 | 257 |
String acc_class_value; |
175 |
switch (acc_class_id){ |
|
176 |
case "OPEN": { |
|
177 |
acc_class_value = "open access"; |
|
178 |
break; |
|
179 |
} |
|
180 |
case "CLOSED": { |
|
181 |
acc_class_value = "closed access"; |
|
182 |
break; |
|
183 |
} |
|
258 |
switch (acc_class_id) { |
|
259 |
case "OPEN": { |
|
260 |
acc_class_value = "open access"; |
|
261 |
break; |
|
262 |
} |
|
263 |
case "CLOSED": { |
|
264 |
acc_class_value = "closed access"; |
|
265 |
break; |
|
266 |
} |
|
267 |
default: { |
|
268 |
acc_class_value = "not available"; |
|
269 |
} |
|
184 | 270 |
|
185 |
default: { |
|
186 |
acc_class_value = "not available"; |
|
187 |
} |
|
188 |
|
|
189 | 271 |
} |
190 | 272 |
|
191 | 273 |
instance.addUrl(it.get("url").getAsString()); |
... | ... | |
196 | 278 |
.setSchemename("dnet:access_modes") |
197 | 279 |
.build()); |
198 | 280 |
|
199 |
final String id =datasources.get(it.get("provenance").getAsString()).getValue();
|
|
200 |
final String name =datasources.get(it.get("provenance").getAsString()).getKey();
|
|
281 |
final String id = datasources.get(it.get("provenance").getAsString().toLowerCase()).getValue();
|
|
282 |
final String name = datasources.get(it.get("provenance").getAsString().toLowerCase()).getKey();
|
|
201 | 283 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) { |
202 | 284 |
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder() |
203 | 285 |
.setValue(name) |
... | ... | |
207 | 289 |
instance.setCollectedfrom(collectedFrom); |
208 | 290 |
} |
209 | 291 |
|
210 |
return instance.build();
|
|
292 |
return instance.build(); |
|
211 | 293 |
}).forEach(result::addInstance); |
212 | 294 |
|
213 | 295 |
//Adding DOI URL as Instance |
214 | 296 |
final String doiURL = getStringValue(rootElement, "doi-url"); |
297 |
JsonObject hostedByOpenAire = null; |
|
298 |
if (rootElement.has("hostedByOpenAire")) { |
|
299 |
hostedByOpenAire = rootElement.getAsJsonObject("hostedByOpenAire"); |
|
300 |
} |
|
301 |
|
|
215 | 302 |
if (StringUtils.isNotBlank(doiURL)) { |
216 |
|
|
217 |
|
|
218 | 303 |
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder(); |
219 | 304 |
instance.addUrl(doiURL); |
305 |
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder() |
|
306 |
.setClassid(cobjValue) |
|
307 |
.setClassname(typeValue) |
|
308 |
.setSchemeid("dnet:publication_resource") |
|
309 |
.setSchemename("dnet:publication_resource") |
|
310 |
.build()); |
|
220 | 311 |
instance.setAccessright(FieldTypeProtos.Qualifier.newBuilder() |
221 | 312 |
.setClassid("CLOSED") |
222 | 313 |
.setClassname("Closed Access") |
... | ... | |
224 | 315 |
.setSchemename("dnet:access_modes") |
225 | 316 |
.build()); |
226 | 317 |
instance.setCollectedfrom(FieldTypeProtos.KeyValue.newBuilder() |
227 |
.setValue("CrossRef")
|
|
318 |
.setValue(CROSSREF)
|
|
228 | 319 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5("crossref")) |
229 | 320 |
.build()); |
321 |
|
|
322 |
if (hostedByOpenAire == null) |
|
323 |
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder() |
|
324 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c") |
|
325 |
.setValue("Unknown Repository") |
|
326 |
.build()); |
|
327 |
else { |
|
328 |
instance.setHostedby(FieldTypeProtos.KeyValue.newBuilder() |
|
329 |
.setKey(AbstractDNetXsltFunctions.oafSplitId("datasource", hostedByOpenAire.get("id").getAsString())) |
|
330 |
.setValue(hostedByOpenAire.get("name").getAsString()) |
|
331 |
.build()); |
|
332 |
} |
|
333 |
|
|
230 | 334 |
result.addInstance(instance); |
231 | 335 |
} |
232 | 336 |
|
233 | 337 |
//Create Metadata Proto |
234 | 338 |
final ResultProtos.Result.Metadata.Builder metadata = ResultProtos.Result.Metadata.newBuilder(); |
235 | 339 |
|
236 |
|
|
237 | 340 |
Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement); |
238 | 341 |
|
239 | 342 |
if (authorsOrganizations.getKey().size() > 0) { |
240 | 343 |
metadata.addAllAuthor(authorsOrganizations.getKey()); |
241 |
} |
|
242 |
else { |
|
344 |
} else { |
|
345 |
//Should never enter here becasue of the isValid method at the beginning. |
|
346 |
context.incrementCounter("filtered", "unexpected_no_authors", 1); |
|
243 | 347 |
return null; |
244 | 348 |
} |
245 | 349 |
//adding Language |
... | ... | |
251 | 355 |
.build()); |
252 | 356 |
|
253 | 357 |
//Adding subjects |
254 |
List<String> subjects =getArrayValues(rootElement, "subject"); |
|
358 |
List<String> subjects = getArrayValues(rootElement, "subject");
|
|
255 | 359 |
|
256 |
subjects.forEach(s-> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder() |
|
360 |
subjects.forEach(s -> metadata.addSubject(FieldTypeProtos.StructuredProperty.newBuilder()
|
|
257 | 361 |
.setValue(s) |
258 | 362 |
.setQualifier(getQualifier("keyword", "dnet:subject")) |
259 | 363 |
.build())); |
260 | 364 |
|
261 |
List<String>titles =getArrayValues(rootElement, "title");
|
|
262 |
titles.forEach(t-> |
|
365 |
List<String> titles = getCleanedTitles(rootElement);
|
|
366 |
titles.forEach(t ->
|
|
263 | 367 |
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder() |
264 | 368 |
.setValue(t) |
265 | 369 |
.setQualifier(getQualifier("main title", "dnet:dataCite_title")) |
266 | 370 |
.build())); |
371 |
|
|
267 | 372 |
settingRelevantDate(rootElement, metadata, "issued", "issued", true); |
268 | 373 |
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false); |
269 | 374 |
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false); |
270 | 375 |
settingRelevantDate(rootElement, metadata, "published-print", "published-print", false); |
271 | 376 |
|
272 |
|
|
273 | 377 |
getArrayObjects(rootElement, "abstract").forEach(d -> |
274 | 378 |
{ |
275 | 379 |
if (MAG.equals(d.get("provenance").getAsString())) |
... | ... | |
277 | 381 |
else |
278 | 382 |
metadata.addDescription(FieldTypeProtos.StringField.newBuilder().setValue(d.get("value").getAsString()).build()); |
279 | 383 |
} |
280 |
);
|
|
384 |
); |
|
281 | 385 |
|
282 |
|
|
283 |
|
|
284 | 386 |
//Adding Journal |
285 |
final String publisher = getStringValue(rootElement,"publisher"); |
|
286 |
if (StringUtils.isNotBlank(publisher)){ |
|
387 |
final String publisher = getStringValue(rootElement, "publisher");
|
|
388 |
if (StringUtils.isNotBlank(publisher)) {
|
|
287 | 389 |
|
288 | 390 |
final ResultProtos.Result.Journal.Builder journal = ResultProtos.Result.Journal.newBuilder().setName(publisher); |
289 | 391 |
|
290 |
if (hasJSONArrayField(rootElement,"issn" )){
|
|
392 |
if (hasJSONArrayField(rootElement, "issn")) {
|
|
291 | 393 |
StreamUtils.toStream(rootElement.getAsJsonArray("issn").iterator()) |
292 | 394 |
.map(JsonElement::getAsJsonObject) |
293 | 395 |
.forEach(it -> { |
294 | 396 |
final String issntype = getStringValue(it, "type"); |
295 | 397 |
final String value = getStringValue(it, "value"); |
296 |
if("electronic".equals(issntype)){
|
|
398 |
if ("electronic".equals(issntype)) {
|
|
297 | 399 |
journal.setIssnOnline(value); |
298 | 400 |
} |
299 | 401 |
if ("print".equals(issntype)) |
... | ... | |
306 | 408 |
result.setMetadata(metadata.build()); |
307 | 409 |
entity.setResult(result.build()); |
308 | 410 |
oaf.setEntity(entity.build()); |
411 |
|
|
412 |
//System.out.println(JsonFormat.printToString(oaf.build())); |
|
413 |
|
|
309 | 414 |
final List<AtomicAction> actionList = new ArrayList<>(); |
310 | 415 |
|
311 | 416 |
if (!onlyOrganization) |
... | ... | |
320 | 425 |
if (!onlyOrganization) |
321 | 426 |
actionList.addAll(createPublicationOrganizationRelation(oaf.build(), o, factory, setName, agent)); |
322 | 427 |
final String gridOrganization = getSimilarGridOrganization(o.getEntity()); |
323 |
if (gridOrganization!= null) { |
|
324 |
actionList.add(factory.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization, "".getBytes())); |
|
325 |
actionList.add(factory.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(), "".getBytes())); |
|
428 |
if (gridOrganization != null) { |
|
429 |
actionList.add(factory |
|
430 |
.createAtomicAction(setName, agent, o.getEntity().getId(), "organizationOrganization_dedupSimilarity_isSimilarTo", gridOrganization, |
|
431 |
"".getBytes())); |
|
432 |
actionList.add(factory |
|
433 |
.createAtomicAction(setName, agent, gridOrganization, "organizationOrganization_dedupSimilarity_isSimilarTo", o.getEntity().getId(), |
|
434 |
"".getBytes())); |
|
326 | 435 |
} |
327 | 436 |
}); |
328 | 437 |
} |
... | ... | |
330 | 439 |
|
331 | 440 |
} |
332 | 441 |
|
333 |
|
|
334 | 442 |
private static String getSimilarGridOrganization(final OafProtos.OafEntity organization) { |
335 | 443 |
|
336 | 444 |
final List<FieldTypeProtos.StructuredProperty> pidList = organization.getPidList(); |
337 |
if (pidList!= null ) {
|
|
338 |
for (FieldTypeProtos.StructuredProperty p: pidList) { |
|
339 |
if (p.getQualifier().getClassname().equals("grid")){ |
|
340 |
return "20|grid________" + SEPARATOR +AbstractDNetXsltFunctions.md5(p.getValue()); |
|
445 |
if (pidList != null) {
|
|
446 |
for (FieldTypeProtos.StructuredProperty p : pidList) {
|
|
447 |
if (p.getQualifier().getClassname().equals("grid")) {
|
|
448 |
return "20|grid________" + SEPARATOR + AbstractDNetXsltFunctions.md5(p.getValue());
|
|
341 | 449 |
} |
342 | 450 |
} |
343 | 451 |
} |
... | ... | |
345 | 453 |
|
346 | 454 |
} |
347 | 455 |
|
348 |
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication, final OafProtos.Oaf organization, final ActionFactory factory, final String setName, final Agent agent) { |
|
456 |
private static List<AtomicAction> createPublicationOrganizationRelation(final OafProtos.Oaf publication, |
|
457 |
final OafProtos.Oaf organization, |
|
458 |
final ActionFactory factory, |
|
459 |
final String setName, |
|
460 |
final Agent agent) { |
|
349 | 461 |
|
350 | 462 |
List<AtomicAction> result = new ArrayList<>(); |
351 | 463 |
|
... | ... | |
360 | 472 |
.setProvenanceaction(getQualifier("sysimport:actionset", "dnet:provenanceActions")) |
361 | 473 |
.build()); |
362 | 474 |
|
363 |
|
|
364 | 475 |
final OafProtos.OafRel.Builder rel = OafProtos.OafRel.newBuilder(); |
365 | 476 |
|
366 | 477 |
rel.setRelType(RelTypeProtos.RelType.resultOrganization); |
... | ... | |
381 | 492 |
rel.setResultOrganization(rel_instance.build()); |
382 | 493 |
|
383 | 494 |
rel.addCollectedfrom(FieldTypeProtos.KeyValue.newBuilder() |
384 |
.setValue(datasources.get(MAG).getKey()) |
|
385 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions.md5(StringUtils.substringAfter(datasources.get(MAG).getValue(), SEPARATOR))) |
|
495 |
.setValue(datasources.get(MAG.toLowerCase()).getKey()) |
|
496 |
.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + AbstractDNetXsltFunctions |
|
497 |
.md5(StringUtils.substringAfter(datasources.get(MAG.toLowerCase()).getValue(), SEPARATOR))) |
|
386 | 498 |
.build()); |
387 | 499 |
|
388 |
|
|
389 |
|
|
390 | 500 |
rel.setChild(false); |
391 | 501 |
roaf.setRel(rel.build()); |
392 | 502 |
|
393 |
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution", organization.getEntity().getId(), roaf.build().toByteArray() )); |
|
503 |
result.add(factory.createAtomicAction(setName, agent, publication.getEntity().getId(), "resultOrganization_affiliation_hasAuthorInstitution", |
|
504 |
organization.getEntity().getId(), roaf.build().toByteArray())); |
|
394 | 505 |
|
395 |
|
|
396 | 506 |
//Create a relation Organization --> Result |
397 | 507 |
rel.setTarget(publication.getEntity().getId()); |
398 | 508 |
rel.setSource(organization.getEntity().getId()); |
399 | 509 |
rel.setRelClass(ResultOrganization.Affiliation.RelName.isAuthorInstitutionOf.toString()); |
400 | 510 |
|
401 |
|
|
402 | 511 |
affiliationRel.setRelMetadata(RelMetadataProtos.RelMetadata.newBuilder() |
403 | 512 |
.setSemantics(getQualifier("isAuthorInstitutionOf", "dnet:result_organization_relations")) |
404 | 513 |
.build()); |
405 | 514 |
rel_instance.setAffiliation(affiliationRel.build()); |
406 | 515 |
rel.setResultOrganization(rel_instance.build()); |
407 | 516 |
roaf.setRel(rel.build()); |
408 |
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf", publication.getEntity().getId(), roaf.build().toByteArray())); |
|
517 |
result.add(factory.createAtomicAction(setName, agent, organization.getEntity().getId(), "resultOrganization_affiliation_isAuthorInstitutionOf", |
|
518 |
publication.getEntity().getId(), roaf.build().toByteArray())); |
|
409 | 519 |
|
410 | 520 |
return result; |
411 | 521 |
|
... | ... | |
415 | 525 |
return root.has(key) && root.get(key).isJsonArray(); |
416 | 526 |
} |
417 | 527 |
|
418 |
private static void settingRelevantDate(JsonObject rootElement, ResultProtos.Result.Metadata.Builder metadata , final String jsonKey, final String dictionaryKey, final boolean addToDateOfAcceptance) { |
|
528 |
private static void settingRelevantDate(JsonObject rootElement, |
|
529 |
ResultProtos.Result.Metadata.Builder metadata, |
|
530 |
final String jsonKey, |
|
531 |
final String dictionaryKey, |
|
532 |
final boolean addToDateOfAcceptance) { |
|
419 | 533 |
//Adding date |
420 |
String date = getStringValue(rootElement,jsonKey); |
|
534 |
String date = getStringValue(rootElement, jsonKey);
|
|
421 | 535 |
if (date == null) |
422 | 536 |
return; |
423 | 537 |
if (date.length() == 4) { |
... | ... | |
428 | 542 |
metadata.setDateofacceptance(FieldTypeProtos.StringField.newBuilder().setValue(date).build()); |
429 | 543 |
metadata.addRelevantdate(FieldTypeProtos.StructuredProperty.newBuilder() |
430 | 544 |
.setValue(date) |
431 |
.setQualifier(getQualifier(dictionaryKey,"dnet:dataCite_date")) |
|
545 |
.setQualifier(getQualifier(dictionaryKey, "dnet:dataCite_date"))
|
|
432 | 546 |
.build()); |
433 | 547 |
} |
434 | 548 |
} |
435 | 549 |
|
436 |
|
|
437 | 550 |
public static FieldTypeProtos.KeyValue extractIdentifier(final String value) { |
438 | 551 |
FieldTypeProtos.KeyValue.Builder pid = FieldTypeProtos.KeyValue.newBuilder(); |
439 |
if (StringUtils.contains(value, "orcid.org")){ |
|
440 |
return pid.setValue(value) |
|
552 |
if (StringUtils.contains(value, "orcid.org")) {
|
|
553 |
return pid.setValue(value.replaceAll("https://orcid.org/", ""))
|
|
441 | 554 |
.setKey(ORCID).build(); |
442 | 555 |
} |
443 |
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")){ |
|
444 |
return pid.setValue(value) |
|
556 |
if (StringUtils.contains(value, "academic.microsoft.com/#/detail")) {
|
|
557 |
return pid.setValue(value.replaceAll("https://academic.microsoft.com/#/detail/", ""))
|
|
445 | 558 |
.setKey("MAG Identifier").build(); |
446 | 559 |
} |
447 | 560 |
return pid.setValue(value) |
448 | 561 |
.setKey("URL").build(); |
449 | 562 |
} |
450 | 563 |
|
451 |
|
|
452 | 564 |
public static OafProtos.Oaf createOrganizationFromJSON(final JsonObject affiliation) { |
453 | 565 |
final Map<String, FieldTypeProtos.Qualifier> affiliationIdentifiers = new HashMap<>(); |
454 | 566 |
final List<String> magId = new ArrayList<>(); |
... | ... | |
456 | 568 |
if (StringUtils.contains(it.get("value").getAsString(), "academic.microsoft.com")) { |
457 | 569 |
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(MAG)); |
458 | 570 |
magId.add(it.get("value").getAsString()); |
459 |
} |
|
460 |
else |
|
461 |
affiliationIdentifiers.put( it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString())); |
|
571 |
} else |
|
572 |
affiliationIdentifiers.put(it.get("value").getAsString(), affiliationPIDType.get(it.get("schema").getAsString())); |
|
462 | 573 |
}); |
463 | 574 |
if (magId.size() > 0) { |
464 | 575 |
final String microsoftID = magId.get(0); |
... | ... | |
466 | 577 |
oaf.setKind(KindProtos.Kind.entity); |
467 | 578 |
OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder(); |
468 | 579 |
entity.setType(TypeProtos.Type.organization); |
469 |
entity.setId("20|microsoft___" + SEPARATOR +AbstractDNetXsltFunctions.md5(microsoftID)); |
|
470 |
final String id =datasources.get(affiliation.get("provenance").getAsString()).getValue();
|
|
471 |
final String name =datasources.get(affiliation.get("provenance").getAsString()).getKey();
|
|
580 |
entity.setId("20|microsoft___" + SEPARATOR + AbstractDNetXsltFunctions.md5(microsoftID));
|
|
581 |
final String id = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getValue();
|
|
582 |
final String name = datasources.get(affiliation.get("provenance").getAsString().toLowerCase()).getKey();
|
|
472 | 583 |
if (StringUtils.isNotBlank(id) && StringUtils.isNotBlank(name)) { |
473 | 584 |
final FieldTypeProtos.KeyValue collectedFrom = FieldTypeProtos.KeyValue.newBuilder() |
474 | 585 |
.setValue(name) |
... | ... | |
503 | 614 |
.build()); |
504 | 615 |
return oaf.build(); |
505 | 616 |
} |
506 |
return null;
|
|
617 |
return null; |
|
507 | 618 |
} |
508 | 619 |
|
509 |
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) {
|
|
620 |
public static Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> createAuthorsOrganization(final JsonObject root) { |
|
510 | 621 |
|
511 | 622 |
final Map<String, OafProtos.Oaf> affiliations = new HashMap<>(); |
512 | 623 |
|
513 | 624 |
List<JsonObject> authors = getArrayObjects(root, "authors"); |
514 | 625 |
|
515 |
final AtomicInteger counter = new AtomicInteger(); |
|
626 |
final AtomicInteger counter = new AtomicInteger(1);
|
|
516 | 627 |
|
517 | 628 |
List<FieldTypeProtos.Author> collect = authors.stream().map(author -> { |
518 | 629 |
final String given = getStringValue(author, "given"); |
... | ... | |
523 | 634 |
fullname = String.format("%s %s", given, family); |
524 | 635 |
} |
525 | 636 |
|
526 |
if (StringUtils.isBlank(fullname)){
|
|
637 |
if (!isValidAuthorName(fullname, null)) {
|
|
527 | 638 |
return null; |
528 |
|
|
529 | 639 |
} |
530 | 640 |
final FieldTypeProtos.Author.Builder abuilder = FieldTypeProtos.Author.newBuilder(); |
531 | 641 |
|
... | ... | |
554 | 664 |
Collectors.toMap( |
555 | 665 |
FieldTypeProtos.KeyValue::getKey, |
556 | 666 |
Function.identity(), |
557 |
(a,b) -> a |
|
667 |
(a, b) -> a
|
|
558 | 668 |
)).values().forEach(abuilder::addPid); |
559 | 669 |
abuilder.setRank(counter.getAndIncrement()); |
560 | 670 |
|
... | ... | |
562 | 672 |
|
563 | 673 |
}).filter(Objects::nonNull).collect(Collectors.toList()); |
564 | 674 |
|
565 |
return new Pair<> ( collect,affiliations.values() );
|
|
675 |
return new Pair<>(collect, affiliations.values());
|
|
566 | 676 |
} |
567 | 677 |
|
568 |
|
|
569 |
|
|
570 |
|
|
571 |
|
|
572 |
|
|
573 | 678 |
} |
Also available in: Unified diff
Updated classes for DOIBoost based on the trunk version