Project

General

Profile

1
package eu.dnetlib.data.claims.parser;
2

    
3
import com.google.gson.Gson;
4
import eu.dnetlib.data.claims.entity.Result;
5
import eu.dnetlib.data.claims.utils.ClaimUtils;
6
import org.apache.logging.log4j.LogManager;
7
import org.apache.logging.log4j.Logger;
8
import org.w3c.dom.Document;
9
import org.w3c.dom.NodeList;
10
import org.xml.sax.InputSource;
11
import org.xml.sax.SAXException;
12

    
13
import javax.xml.parsers.DocumentBuilder;
14
import javax.xml.parsers.DocumentBuilderFactory;
15
import javax.xml.parsers.ParserConfigurationException;
16
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerFactory;
18
import javax.xml.transform.dom.DOMSource;
19
import javax.xml.transform.stream.StreamResult;
20
import javax.xml.xpath.XPath;
21
import javax.xml.xpath.XPathConstants;
22
import javax.xml.xpath.XPathExpressionException;
23
import javax.xml.xpath.XPathFactory;
24
import java.io.BufferedReader;
25
import java.io.IOException;
26
import java.io.StringReader;
27
import java.io.StringWriter;
28
import java.math.BigInteger;
29
import java.security.MessageDigest;
30
import java.security.NoSuchAlgorithmException;
31
import java.util.List;
32

    
33
/**
34
 * Created by kiatrop on 5/2/2016.
35
 */
36

    
37
public class ExternalRecordParser {
38

    
39
    /**
40
     * Gets the json response from crossref API
41
     * and returns a Result object
42
     * *
43
     * @param json
44
     * @return Result or null
45
     */
46
    private static final Logger logger = LogManager.getLogger(ExternalRecordParser.class);
47

    
48
    public static Result crossref2Result(String json) {
49
        Result result = null;
50
        if (json == null){
51
            return result;
52
        }
53

    
54
        BufferedReader br = new BufferedReader(new StringReader(json));
55
        //convert the json string back to object
56
        Gson gson = new Gson();
57
        CrossrefResponse obj = gson.fromJson(br, CrossrefResponse.class);
58

    
59
        if(obj!=null && obj.getMessage().getItems().size()>0){
60
            result= new Result();
61
            result.setMetadataRecord(json);
62
            result.setRecordFormat(ClaimUtils.FORMAT_JSON);
63
//            result.setFound(true);
64
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_CROSSREF);
65
            CrossrefResponse.Message.Item item = obj.getMessage().getItems().get(0);
66
            if (item.getTitle()!=null && item.getTitle().size()>0){
67
                result.setTitle(item.getTitle().get(0));
68
                if(item.getAuthor() != null) {
69
                    for (int i = 0; i < item.getAuthor().size(); i++) {
70
                        result.getAuthors().put(i + "", item.getAuthor().get(i).getGiven() + " " + item.getAuthor().get(i).getFamily());
71
                    }
72
                }
73
            }
74
//            System.out.println(item.getDOI());
75
            result.setDoi(item.getDOI());
76
            result.setOpenaireId(createOpenaireId(item.getDOI()));
77
            result.setExternalUrl(item.getUrl());
78
            if(result.getDoi()!=null&& result.getExternalUrl()==null){
79
                result.setExternalUrl(ClaimUtils.PREFIX_URL_FOR_DOI + result.getDoi());
80
            }
81
            result.setResultType(ClaimUtils.PUBLICATION);
82

    
83
        }
84

    
85
        return result;
86
    }
87

    
88
    public static Result dataciteJson2Result(String json) {
89
        Result result = null;
90
        if (json == null){
91
            return result;
92
        }
93

    
94
        BufferedReader br = new BufferedReader(new StringReader(json));
95
        //convert the json string back to object
96
        Gson gson = new Gson();
97

    
98
        DataciteResponse obj = gson.fromJson(br, DataciteResponse.class);
99
        System.out.println(json);
100
        if(obj!=null && obj.getData() != null ){
101
            result= new Result();
102
            result.setMetadataRecord(json);
103
            result.setRecordFormat(ClaimUtils.FORMAT_JSON);
104
//            result.setFound(true);
105
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_DATACITE);
106
            DataciteResponse.Data.Attributes item = obj.getData().getAttributes();
107
            if (item.getTitle()!=null){
108
                result.setTitle(item.getTitle());
109
                if(item.getAuthor() != null) {
110
                    for (int i = 0; i < item.getAuthor().size(); i++) {
111
                        result.getAuthors().put(i + "", (item.getAuthor().get(i).getGiven() != null)?(item.getAuthor().get(i).getGiven() + " " + item.getAuthor().get(i).getFamily()):item.getAuthor().get(i).getLiteral());
112
                    }
113
                }
114
            }
115
            System.out.println(item.getDoi());
116
            result.setDoi(item.getDoi());
117
            result.setOpenaireId(createOpenaireId(item.getDoi()));
118
            if(result.getDoi()!=null&& result.getExternalUrl()==null){
119
                result.setExternalUrl(ClaimUtils.PREFIX_URL_FOR_DOI + result.getDoi());
120
            }
121
            result.setResultType(ClaimUtils.DATASET);
122

    
123
        }
124

    
125
        return result;
126
    }
127

    
128
    /**
129
     *
130
     * @param xml
131
     * @param orcidworkid The id from DMF identifier[@identifierType='orcidworkid'] {orcid + work-id}
132
     * @return Result or null
133
     */
134
    public static Result orcid2Result(String xml, String orcidworkid) {
135
         Result result = null;
136
        if (xml == null || orcidworkid == null){
137
            return result;
138
        }
139
        String orcidwork = orcidworkid.substring(20, orcidworkid.length());
140
        System.out.println();
141
        try {
142
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
143
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
144
            InputSource is = new InputSource(new StringReader(xml));
145
            Document document = dBuilder.parse(is);
146
            XPathFactory xPathfactory= XPathFactory.newInstance();
147
            XPath xpath = xPathfactory.newXPath();
148

    
149
            NodeList nl;
150
            System.out.println(orcidwork);
151
            NodeList worknl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']").evaluate(document,XPathConstants.NODESET);
152
            for( int n = 0; n<worknl.getLength(); n++){
153
                String code = (String) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']/@put-code").evaluate(worknl.item(n), XPathConstants.STRING);
154
                result = new Result();
155
                result.setResultType(ClaimUtils.PUBLICATION);
156
                nl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']/*[name()='work:title']/*[name()='common:title']/text()").evaluate(worknl.item(n), XPathConstants.NODESET);
157
                if (nl.getLength() > 0) {
158
                    result.setTitle(nl.item(0).getNodeValue());
159
                }
160

    
161
                nl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:external-id']").evaluate(worknl.item(n), XPathConstants.NODESET);
162
                for (int i = 0; i < nl.getLength(); i++) {
163
                    NodeList identifiersNl;
164
                    String type=null;
165
                    String id=null;
166
                    identifiersNl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:external-id-type']/text()").evaluate(nl.item(i), XPathConstants.NODESET);
167
                    if (identifiersNl.getLength() > 0) {
168
                        type=identifiersNl.item(0).getNodeValue();
169
                    }
170
                    identifiersNl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:external-id-value']/text()").evaluate(nl.item(i), XPathConstants.NODESET);
171
                    if (identifiersNl.getLength() > 0) {
172
                        id=identifiersNl.item(0).getNodeValue();
173
                    }
174
                    if(type != null && type.equals("doi") && id != null){
175
                        result.setDoi(id);
176
                        result.setExternalUrl(ClaimUtils.PREFIX_URL_FOR_DOI + id);
177
                    }
178
                    //more types (isbn)
179

    
180
                }
181
                if(result.getExternalUrl()==null) {
182
                    nl = (NodeList) xpath.compile("//*[name()='work:work'][@put-code='"+orcidwork+"']//*[name()='common:source']/*[name()='common:source-client-id']/*[name()='common:uri']/text()").evaluate(document, XPathConstants.NODESET);
183
                    if (nl.getLength() > 0) {
184
                        result.setExternalUrl(nl.item(0).getNodeValue());
185
                    }
186
                }
187

    
188
                nl = (NodeList) xpath.compile("//*[name()='work:work-summary'][@put-code='"+orcidwork+"']//*[name()='work:work-summary']").evaluate(document, XPathConstants.NODESET);
189
                for (int i = 0; i < nl.getLength(); i++) {
190
                    if(!worknl.item(0).isEqualNode(nl.item(i))) {
191
                        nl.item(i).getParentNode().removeChild(nl.item(i));
192
                    }
193
                }
194

    
195
                DOMSource domSource = new DOMSource(document);
196
                StringWriter writer = new StringWriter();
197
                StreamResult streamResult = new StreamResult(writer);
198
                TransformerFactory tf = TransformerFactory.newInstance();
199
                Transformer transformer = tf.newTransformer();
200
                transformer.transform(domSource, streamResult);
201
                result.setOrcidworkid(orcidworkid);
202
                result.setOpenaireId(createOpenaireId(orcidworkid));
203
                result.setMetadataRecord(writer.toString());
204
                result.setRecordFormat(ClaimUtils.FORMAT_XML);
205
                result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_ORCID);
206
//                result.setFound(true);
207
            }
208

    
209
        } catch (Exception e) {
210
            logger.error("Error parsing Orcid result\n"+xml,e);
211
        }
212

    
213
        return result;
214
    }
215

    
216

    
217
    /**
218
     *
219
     * @param xml
220
     * @return Result object or null
221
     */
222
    public static Result datacite2Result(String xml) {
223
        logger.debug("Datacite xml response:\n"+xml);
224
        Result result = null;
225
        if (xml != null ) {
226

    
227
            try {
228
                DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
229
                DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
230
                InputSource is = new InputSource(new StringReader(xml));
231
                Document document = dBuilder.parse(is);
232
                XPathFactory xPathfactory = XPathFactory.newInstance();
233
                XPath xpath = xPathfactory.newXPath();
234
                NodeList nl;
235
                nl = (NodeList) xpath.compile("//*[local-name()='identifier']/text()").evaluate(document, XPathConstants.NODESET);
236
                if (nl.getLength() > 0) {
237
                    result = new Result();
238
                    result.setResultType(ClaimUtils.DATASET);
239

    
240
                    result.setDoi(nl.item(0).getNodeValue());
241
                    result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
242

    
243
                    result.setOpenaireId(createOpenaireId(result.getDoi()));
244
                    nl = (NodeList) xpath.compile("//*[local-name()='title']/text()").evaluate(document, XPathConstants.NODESET);
245
                    if (nl.getLength() > 0) {
246
                        result.setTitle(nl.item(0).getNodeValue());
247
                    }
248

    
249
                    nl = (NodeList) xpath.compile("//*[local-name()='creator']/text()").evaluate(document, XPathConstants.NODESET);
250
                    if (nl.getLength() > 0) {
251
                        for (int i = 0; i < nl.getLength(); i++) {
252
                            result.getAuthors().put(i + "", nl.item(i).getNodeValue());
253
                        }
254
                    }
255

    
256
                    result.setMetadataRecord(xml);
257
                    result.setRecordFormat(ClaimUtils.FORMAT_XML);
258
                    result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_DATACITE);
259
//                result.setFound(true);
260
                    logger.debug("Datacite result" + result.toString());
261

    
262
                } else {
263

    
264
                  logger.error("Couldn't parse Datacite result\n" + xml);
265

    
266
                }
267

    
268
                } catch(ParserConfigurationException | IOException | XPathExpressionException |SAXException e){
269
                    logger.error("Error parsing Orcid result\n"+xml,e);
270
                }
271

    
272
        }
273
            return result;
274
    }
275
    public static String createOpenaireId(String id){
276
//        System.out.println("createOpenaireId from id:" +id);
277
        if(id==null){
278
            return null;
279
        }
280
        String openaireId=id;
281
        MessageDigest m= null;
282
        try {
283
            m = MessageDigest.getInstance("MD5");
284
            m.update(id.getBytes(),0,id.length());
285
            openaireId = new BigInteger(1,m.digest()).toString(16);
286
            while(openaireId.length() < 32 ){
287
                openaireId = "0"+openaireId;
288
            }
289
        } catch (NoSuchAlgorithmException e) {
290
            logger.error("Couldn't instatiate md5 algorithm",e);
291
        }
292
        openaireId ="userclaim___::"+openaireId;
293
        return openaireId;
294
    }
295

    
296
}
297

    
298
/**
 * The following classes mirror the structure of the Crossref API response.
 * They are used for parsing the json into a CrossrefResponse.
 */
302

    
303
/**
 * Gson binding for a Crossref API response: a status plus a message holding the items.
 * <p>
 * The nested types are static so that they can be instantiated without an enclosing
 * instance, which is what the Gson user guide asks for when deserializing nested classes.
 * Field names must match the json keys exactly, because no custom naming is configured
 * on the fields.
 */
class CrossrefResponse {
    String status;
    Message message;

    /** @return the value of the "status" key, or null when absent */
    public String getStatus() {
        return status;
    }

    /** @return the "message" object, or null when absent */
    public Message getMessage() {
        return message;
    }

    /** The "message" object of the response. */
    static class Message {
        List<Item> items;

        /** @return the "items" list, or null when absent */
        public List<Item> getItems() {
            return items;
        }

        /** One entry of the "items" list. */
        static class Item {
            private String DOI;
            private List<String> title;
            private List<Author> author;
            // Crossref names this key "URL". The field was previously called "Url", which
            // never matches that key under Gson's case-sensitive default field naming, so
            // getUrl() always returned null.
            private String URL;
            private String type;
            private String source;

            /** @return the value of the "DOI" key, or null when absent */
            public String getDOI() {
                return DOI;
            }

            /** @return the "title" list, or null when absent */
            public List<String> getTitle() {
                return title;
            }

            /** @return the "author" list, or null when absent */
            public List<Author> getAuthor() {
                return author;
            }

            /** @return the value of the "URL" key, or null when absent */
            public String getUrl() {
                return URL;
            }

            /** @return the value of the "type" key, or null when absent */
            public String getType() {
                return type;
            }

            /** @return the value of the "source" key, or null when absent */
            public String getSource() {
                return source;
            }

            /** One entry of the "author" list. */
            static class Author {
                String given;
                String family;

                /** @return the "given" name, or null when absent */
                public String getGiven() {
                    return given;
                }

                /** @return the "family" name, or null when absent */
                public String getFamily() {
                    return family;
                }
            }
        }
    }
}
367

    
368
/**
 * Gson binding for a Datacite json response: a "data" object holding the "attributes".
 * <p>
 * The nested types are static so that they can be instantiated without an enclosing
 * instance, which is what the Gson user guide asks for when deserializing nested classes.
 */
class DataciteResponse {

    Data data;

    /** @return the "data" object, or null when absent */
    public Data getData() {
        return data;
    }

    /** The "data" object of the response. */
    static class Data {

        private Attributes attributes;

        /** @return the "attributes" object, or null when absent */
        public Attributes getAttributes() {
            return attributes;
        }

        /** The "attributes" object describing the record. */
        static class Attributes {
            private String doi;
            private String containerTitle;
            private String published;
            private String title;
            private List<Author> author;

            /** @return the value of the "doi" key, or null when absent */
            public String getDoi() {
                return doi;
            }

            /** @return the value bound to the containerTitle field, or null when absent */
            public String getContainerTitle() {
                return containerTitle;
            }

            /** @return the value of the "published" key, or null when absent */
            public String getPublished() {
                return published;
            }

            /** @return the value of the "title" key, or null when absent */
            public String getTitle() {
                return title;
            }

            /** @return the "author" list, or null when absent */
            public List<Author> getAuthor() {
                return author;
            }
        }

        /** One entry of the "author" list. */
        static class Author {
            String given;
            String family;
            String literal;

            /** @return the "given" name, or null when absent */
            public String getGiven() {
                return given;
            }

            /** @return the "family" name, or null when absent */
            public String getFamily() {
                return family;
            }

            /** @return the "literal" name, or null when absent */
            public String getLiteral() {
                return literal;
            }
        }
    }
}
(1-1/2)