1
|
package eu.dnetlib.data.claims.parser;
|
2
|
|
3
|
import com.google.gson.Gson;
import eu.dnetlib.data.claims.entity.Result;
import eu.dnetlib.data.claims.utils.ClaimUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
|
31
|
|
32
|
/**
|
33
|
* Created by kiatrop on 5/2/2016.
|
34
|
*/
|
35
|
|
36
|
public class ExternalRecordParser {
|
37
|
|
38
|
/**
|
39
|
* Gets the json response from crossref API
|
40
|
* and returns a Result object
|
41
|
* *
|
42
|
* @param json
|
43
|
* @return Result or null
|
44
|
*/
|
45
|
private static final Logger logger = Logger.getLogger(ExternalRecordParser.class);
|
46
|
|
47
|
public static Result crossref2Result(String json) {
|
48
|
Result result = null;
|
49
|
if (json == null){
|
50
|
return result;
|
51
|
}
|
52
|
|
53
|
BufferedReader br = new BufferedReader(new StringReader(json));
|
54
|
//convert the json string back to object
|
55
|
Gson gson = new Gson();
|
56
|
CrossrefResponse obj = gson.fromJson(br, CrossrefResponse.class);
|
57
|
|
58
|
if(obj!=null && obj.getMessage().getItems().size()>0){
|
59
|
result= new Result();
|
60
|
result.setMetadataRecord(json);
|
61
|
result.setRecordFormat(ClaimUtils.FORMAT_JSON);
|
62
|
// result.setFound(true);
|
63
|
result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_CROSSREF);
|
64
|
CrossrefResponse.Message.Item item = obj.getMessage().getItems().get(0);
|
65
|
if (item.getTitle()!=null && item.getTitle().size()>0){
|
66
|
result.setTitle(item.getTitle().get(0));
|
67
|
if(item.getAuthor() != null) {
|
68
|
for (int i = 0; i < item.getAuthor().size(); i++) {
|
69
|
result.getAuthors().put(i + "", item.getAuthor().get(i).getGiven() + " " + item.getAuthor().get(i).getFamily());
|
70
|
}
|
71
|
}
|
72
|
}
|
73
|
// System.out.println(item.getDOI());
|
74
|
result.setDoi(item.getDOI());
|
75
|
result.setOpenaireId(createOpenaireId(item.getDOI()));
|
76
|
result.setExternalUrl(item.getUrl());
|
77
|
if(result.getDoi()!=null&& result.getExternalUrl()==null){
|
78
|
result.setExternalUrl(ClaimUtils.PREFIX_URL_FOR_DOI + result.getDoi());
|
79
|
}
|
80
|
result.setResultType(ClaimUtils.PUBLICATION);
|
81
|
|
82
|
}
|
83
|
|
84
|
return result;
|
85
|
}
|
86
|
|
87
|
public static Result dataciteJson2Result(String json) {
|
88
|
Result result = null;
|
89
|
if (json == null){
|
90
|
return result;
|
91
|
}
|
92
|
|
93
|
BufferedReader br = new BufferedReader(new StringReader(json));
|
94
|
//convert the json string back to object
|
95
|
Gson gson = new Gson();
|
96
|
|
97
|
DataciteResponse obj = gson.fromJson(br, DataciteResponse.class);
|
98
|
System.out.println(json);
|
99
|
if(obj!=null && obj.getData() != null ){
|
100
|
result= new Result();
|
101
|
result.setMetadataRecord(json);
|
102
|
result.setRecordFormat(ClaimUtils.FORMAT_JSON);
|
103
|
// result.setFound(true);
|
104
|
result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_DATACITE);
|
105
|
DataciteResponse.Data.Attributes item = obj.getData().getAttributes();
|
106
|
if (item.getTitle()!=null){
|
107
|
result.setTitle(item.getTitle());
|
108
|
if(item.getAuthor() != null) {
|
109
|
for (int i = 0; i < item.getAuthor().size(); i++) {
|
110
|
result.getAuthors().put(i + "", (item.getAuthor().get(i).getGiven() != null)?(item.getAuthor().get(i).getGiven() + " " + item.getAuthor().get(i).getFamily()):item.getAuthor().get(i).getLiteral());
|
111
|
}
|
112
|
}
|
113
|
}
|
114
|
System.out.println(item.getDoi());
|
115
|
result.setDoi(item.getDoi());
|
116
|
result.setOpenaireId(createOpenaireId(item.getDoi()));
|
117
|
if(result.getDoi()!=null&& result.getExternalUrl()==null){
|
118
|
result.setExternalUrl(ClaimUtils.PREFIX_URL_FOR_DOI + result.getDoi());
|
119
|
}
|
120
|
result.setResultType(ClaimUtils.DATASET);
|
121
|
|
122
|
}
|
123
|
|
124
|
return result;
|
125
|
}
|
126
|
|
127
|
/**
|
128
|
*
|
129
|
* @param xml
|
130
|
* @param orcidworkid The id from DMF identifier[@identifierType='orcidworkid'] {orcid + work-id}
|
131
|
* @return Result or null
|
132
|
*/
|
133
|
public static Result orcid2Result(String xml, String orcidworkid) {
|
134
|
Result result = null;
|
135
|
if (xml == null || orcidworkid == null){
|
136
|
return result;
|
137
|
}
|
138
|
String orcidwork = orcidworkid.substring(20, orcidworkid.length());
|
139
|
System.out.println();
|
140
|
try {
|
141
|
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
|
142
|
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
|
143
|
InputSource is = new InputSource(new StringReader(xml));
|
144
|
Document document = dBuilder.parse(is);
|
145
|
XPathFactory xPathfactory= XPathFactory.newInstance();
|
146
|
XPath xpath = xPathfactory.newXPath();
|
147
|
|
148
|
NodeList nl;
|
149
|
System.out.println(orcidwork);
|
150
|
NodeList worknl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']").evaluate(document,XPathConstants.NODESET);
|
151
|
for( int n = 0; n<worknl.getLength(); n++){
|
152
|
String code = (String) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']/@put-code").evaluate(worknl.item(n), XPathConstants.STRING);
|
153
|
result = new Result();
|
154
|
result.setResultType(ClaimUtils.PUBLICATION);
|
155
|
nl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']/*[name()='work:title']/*[name()='common:title']/text()").evaluate(worknl.item(n), XPathConstants.NODESET);
|
156
|
if (nl.getLength() > 0) {
|
157
|
result.setTitle(nl.item(0).getNodeValue());
|
158
|
}
|
159
|
|
160
|
nl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:external-id']").evaluate(worknl.item(n), XPathConstants.NODESET);
|
161
|
for (int i = 0; i < nl.getLength(); i++) {
|
162
|
NodeList identifiersNl;
|
163
|
String type=null;
|
164
|
String id=null;
|
165
|
identifiersNl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:external-id-type']/text()").evaluate(nl.item(i), XPathConstants.NODESET);
|
166
|
if (identifiersNl.getLength() > 0) {
|
167
|
type=identifiersNl.item(0).getNodeValue();
|
168
|
}
|
169
|
identifiersNl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:external-id-value']/text()").evaluate(nl.item(i), XPathConstants.NODESET);
|
170
|
if (identifiersNl.getLength() > 0) {
|
171
|
id=identifiersNl.item(0).getNodeValue();
|
172
|
}
|
173
|
if(type != null && type.equals("doi") && id != null){
|
174
|
result.setDoi(id);
|
175
|
result.setExternalUrl(ClaimUtils.PREFIX_URL_FOR_DOI + id);
|
176
|
}
|
177
|
//more types (isbn)
|
178
|
|
179
|
}
|
180
|
if(result.getExternalUrl()==null) {
|
181
|
nl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:source']/*[name()='common:source-client-id']/*[name()='common:uri']/text()").evaluate(document, XPathConstants.NODESET);
|
182
|
if (nl.getLength() > 0) {
|
183
|
result.setExternalUrl(nl.item(0).getNodeValue());
|
184
|
}
|
185
|
}
|
186
|
|
187
|
nl = (NodeList) xpath.compile("//*[name()='work:work-summary'][@put-code='"+orcidwork+"']//*[name()='work:work-summary']").evaluate(document, XPathConstants.NODESET);
|
188
|
for (int i = 0; i < nl.getLength(); i++) {
|
189
|
if(!worknl.item(0).isEqualNode(nl.item(i))) {
|
190
|
nl.item(i).getParentNode().removeChild(nl.item(i));
|
191
|
}
|
192
|
}
|
193
|
|
194
|
DOMSource domSource = new DOMSource(document);
|
195
|
StringWriter writer = new StringWriter();
|
196
|
StreamResult streamResult = new StreamResult(writer);
|
197
|
TransformerFactory tf = TransformerFactory.newInstance();
|
198
|
Transformer transformer = tf.newTransformer();
|
199
|
transformer.transform(domSource, streamResult);
|
200
|
result.setOrcidworkid(orcidworkid);
|
201
|
result.setOpenaireId(createOpenaireId(orcidworkid));
|
202
|
result.setMetadataRecord(writer.toString());
|
203
|
result.setRecordFormat(ClaimUtils.FORMAT_XML);
|
204
|
result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_ORCID);
|
205
|
// result.setFound(true);
|
206
|
}
|
207
|
|
208
|
} catch (Exception e) {
|
209
|
logger.error("Error parsing Orcid result\n"+xml,e);
|
210
|
}
|
211
|
|
212
|
return result;
|
213
|
}
|
214
|
|
215
|
|
216
|
/**
|
217
|
*
|
218
|
* @param xml
|
219
|
* @return Result object or null
|
220
|
*/
|
221
|
public static Result datacite2Result(String xml) {
|
222
|
logger.debug("Datacite xml response:\n"+xml);
|
223
|
Result result = null;
|
224
|
if (xml != null ) {
|
225
|
|
226
|
try {
|
227
|
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
|
228
|
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
|
229
|
InputSource is = new InputSource(new StringReader(xml));
|
230
|
Document document = dBuilder.parse(is);
|
231
|
XPathFactory xPathfactory = XPathFactory.newInstance();
|
232
|
XPath xpath = xPathfactory.newXPath();
|
233
|
NodeList nl;
|
234
|
nl = (NodeList) xpath.compile("//*[local-name()='identifier']/text()").evaluate(document, XPathConstants.NODESET);
|
235
|
if (nl.getLength() > 0) {
|
236
|
result = new Result();
|
237
|
result.setResultType(ClaimUtils.DATASET);
|
238
|
|
239
|
result.setDoi(nl.item(0).getNodeValue());
|
240
|
result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
|
241
|
|
242
|
result.setOpenaireId(createOpenaireId(result.getDoi()));
|
243
|
nl = (NodeList) xpath.compile("//*[local-name()='title']/text()").evaluate(document, XPathConstants.NODESET);
|
244
|
if (nl.getLength() > 0) {
|
245
|
result.setTitle(nl.item(0).getNodeValue());
|
246
|
}
|
247
|
|
248
|
nl = (NodeList) xpath.compile("//*[local-name()='creator']/text()").evaluate(document, XPathConstants.NODESET);
|
249
|
if (nl.getLength() > 0) {
|
250
|
for (int i = 0; i < nl.getLength(); i++) {
|
251
|
result.getAuthors().put(i + "", nl.item(i).getNodeValue());
|
252
|
}
|
253
|
}
|
254
|
|
255
|
result.setMetadataRecord(xml);
|
256
|
result.setRecordFormat(ClaimUtils.FORMAT_XML);
|
257
|
result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_DATACITE);
|
258
|
// result.setFound(true);
|
259
|
logger.debug("Datacite result" + result.toString());
|
260
|
|
261
|
} else {
|
262
|
|
263
|
logger.error("Couldn't parse Datacite result\n" + xml);
|
264
|
|
265
|
}
|
266
|
|
267
|
} catch(ParserConfigurationException | IOException | XPathExpressionException |SAXException e){
|
268
|
logger.error("Error parsing Orcid result\n"+xml,e);
|
269
|
}
|
270
|
|
271
|
}
|
272
|
return result;
|
273
|
}
|
274
|
public static String createOpenaireId(String id){
|
275
|
// System.out.println("createOpenaireId from id:" +id);
|
276
|
if(id==null){
|
277
|
return null;
|
278
|
}
|
279
|
String openaireId=id;
|
280
|
MessageDigest m= null;
|
281
|
try {
|
282
|
m = MessageDigest.getInstance("MD5");
|
283
|
m.update(id.getBytes(),0,id.length());
|
284
|
openaireId = new BigInteger(1,m.digest()).toString(16);
|
285
|
while(openaireId.length() < 32 ){
|
286
|
openaireId = "0"+openaireId;
|
287
|
}
|
288
|
} catch (NoSuchAlgorithmException e) {
|
289
|
logger.error("Couldn't instatiate md5 algorithm",e);
|
290
|
}
|
291
|
openaireId ="userclaim___::"+openaireId;
|
292
|
return openaireId;
|
293
|
}
|
294
|
|
295
|
}
|
296
|
|
297
|
/**
 * The following classes mirror the structure of the Crossref API response.
 * They are used as Gson targets for parsing the json CrossrefResponse, so every
 * field name has to match the corresponding JSON key exactly.
 *
 * The nested classes are static: Gson documents that it cannot create pure
 * (non-static) inner classes through a no-args constructor.
 */
class CrossrefResponse {
    String status;
    Message message;

    /** @return the value of the "status" key, or null when absent */
    public String getStatus() {
        return status;
    }

    /** @return the "message" object, or null when absent */
    public Message getMessage() {
        return message;
    }

    /** The "message" object of the response. */
    static class Message {
        List<Item> items;

        /** @return the "items" array, or null when absent */
        public List<Item> getItems() {
            return items;
        }

        /** One entry of the "items" array. */
        static class Item {
            private String DOI;
            private List<String> title;
            private List<Author> author;
            // Named after the upper-case "URL" key used by Crossref; the former field
            // name "Url" never matched under Gson's default (identity) field naming.
            private String URL;
            private String type;
            private String source;

            public String getDOI() {
                return DOI;
            }

            public List<String> getTitle() {
                return title;
            }

            public List<Author> getAuthor() {
                return author;
            }

            public String getUrl() {
                return URL;
            }

            public String getType() {
                return type;
            }

            public String getSource() {
                return source;
            }

            /** One entry of an item's "author" array. */
            static class Author {
                String given;
                String family;

                public String getGiven() {
                    return given;
                }

                public String getFamily() {
                    return family;
                }

            }
        }
    }
}
|
366
|
|
367
|
/**
 * Mirrors the structure of the Datacite API json response. Used as a Gson target,
 * so every field name has to match the corresponding JSON key exactly.
 *
 * The nested classes are static: Gson documents that it cannot create pure
 * (non-static) inner classes through a no-args constructor.
 */
class DataciteResponse {

    Data data;

    /** @return the "data" object, or null when absent */
    public Data getData() {
        return data;
    }

    /** The "data" object of the response. */
    static class Data {

        private Attributes attributes;

        /** @return the "attributes" object, or null when absent */
        public Attributes getAttributes() {
            return attributes;
        }

        /** The "attributes" object holding the record's metadata fields. */
        static class Attributes {
            private String doi;
            private String containerTitle;
            private String published;
            private String title;
            private List<Author> author;

            public String getDoi() {
                return doi;
            }

            public String getContainerTitle() {
                return containerTitle;
            }

            public String getPublished() {
                return published;
            }

            public String getTitle() {
                return title;
            }

            public List<Author> getAuthor() {
                return author;
            }
        }

        /**
         * One entry of the "author" array. Either given/family or a single
         * literal name may be populated.
         */
        static class Author {
            String given;
            String family;
            String literal;

            public String getGiven() {
                return given;
            }

            public String getFamily() {
                return family;
            }

            public String getLiteral() {
                return literal;
            }
        }
    }

}
|