Project

General

Profile

1 57029 argiro.kok
package eu.dnetlib.data.claims.parser;
2 41200 katerina.i
3 57029 argiro.kok
import eu.dnetlib.data.claims.entity.Project;
4
import eu.dnetlib.data.claims.entity.Result;
5
import eu.dnetlib.data.claims.utils.ClaimUtils;
6 47219 argiro.kok
import org.apache.log4j.Logger;
7 41200 katerina.i
import org.w3c.dom.Document;
8 46755 katerina.i
import org.w3c.dom.Element;
9 41200 katerina.i
import org.w3c.dom.NodeList;
10
import org.xml.sax.InputSource;
11
import org.xml.sax.SAXException;
12
13
import javax.xml.parsers.DocumentBuilder;
14
import javax.xml.parsers.DocumentBuilderFactory;
15
import javax.xml.parsers.ParserConfigurationException;
16 41408 argiro.kok
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerException;
18
import javax.xml.transform.TransformerFactory;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21 41200 katerina.i
import javax.xml.xpath.XPath;
22
import javax.xml.xpath.XPathConstants;
23
import javax.xml.xpath.XPathExpressionException;
24
import javax.xml.xpath.XPathFactory;
25
import java.io.IOException;
26
import java.io.StringReader;
27 41408 argiro.kok
import java.io.StringWriter;
28 46884 konstantin
import java.util.ArrayList;
29 41200 katerina.i
30
/**
31
 * Created by kiatrop on 5/2/2016.
32
 */
33
public class OafParser {
34 41411 argiro.kok
    /**
35
     * For external url: returns http://dx.doi.org/ +doi else the first url from websource
36
     *
37
     * @param oaf
38
     * @return
39
     * @throws ParserConfigurationException
40
     * @throws IOException
41
     * @throws SAXException
42
     * @throws XPathExpressionException
43
     * @throws TransformerException
44
     */
45 47219 argiro.kok
46
    private static final Logger logger = Logger.getLogger(OafParser.class);
47
48 41408 argiro.kok
    public static Result oaf2Result(String oaf) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException, TransformerException {
49 41200 katerina.i
        Result result = null;
50
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
51
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
52
        InputSource is = new InputSource(new StringReader(oaf));
53
        Document document= dBuilder.parse(is);
54
        XPathFactory xPathfactory= XPathFactory.newInstance();
55
        XPath xpath = xPathfactory.newXPath();
56
57
        String size = null;
58
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
59
        if (nl.getLength() > 0) {
60
            size= nl.item(0).getNodeValue();
61 49865 argiro.kok
        }else{
62
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
63
            if (nl.getLength() > 0) {
64
//                result.setOpenaireId(nl.item(0).getNodeValue());
65
                size="1";
66
            }
67 41200 katerina.i
        }
68
        if(size!=null && Integer.parseInt(size)>0){
69
            result = new Result();
70 41350 argiro.kok
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
71
            if (nl.getLength() > 0) {
72
                result.setOpenaireId(nl.item(0).getNodeValue());
73
            }
74 41200 katerina.i
            nl = (NodeList) xpath.compile("//resulttype/@classid").evaluate(document, XPathConstants.NODESET);
75
            if (nl.getLength() > 0) {
76
                result.setResultType(nl.item(0).getNodeValue());
77
            }
78
            nl = (NodeList) xpath.compile("//bestlicense/@classid").evaluate(document, XPathConstants.NODESET);
79
            if (nl.getLength() > 0) {
80
                result.setBestLicense(nl.item(0).getNodeValue());
81
            }
82 49865 argiro.kok
            nl = (NodeList) xpath.compile("//bestaccessright/@classid").evaluate(document, XPathConstants.NODESET);
83
            if (nl.getLength() > 0) {
84
                result.setBestLicense(nl.item(0).getNodeValue());
85
            }
86 41411 argiro.kok
            nl = (NodeList) xpath.compile("//embargoenddate/text()").evaluate(document, XPathConstants.NODESET);
87
            if (nl.getLength() > 0) {
88
                result.setEmbargoEndDate(nl.item(0).getNodeValue());
89
            }
90 41200 katerina.i
            nl = (NodeList) xpath.compile("//title/text()").evaluate(document, XPathConstants.NODESET);
91
            if (nl.getLength() > 0) {
92
                result.setTitle(nl.item(0).getNodeValue());
93
            }
94
            nl = (NodeList) xpath.compile("//pid[@classid='doi']/text()").evaluate(document, XPathConstants.NODESET);
95
            if (nl.getLength() > 0) {
96
                result.setDoi(nl.item(0).getNodeValue());
97 41350 argiro.kok
                result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
98 41200 katerina.i
            }
99 41411 argiro.kok
            if(result.getExternalUrl()==null){
100 41621 argiro.kok
//                nl = (NodeList) xpath.compile("//webresource/url/text()").evaluate(document, XPathConstants.NODESET);
101
//                if (nl.getLength() > 0) {
102
//                    result.setExternalUrl(nl.item(0).getNodeValue());
103
//                }
104
                nl = (NodeList) xpath.compile("//children/instance/licence[@classid='OPEN']").evaluate(document, XPathConstants.NODESET);
105 41411 argiro.kok
                if (nl.getLength() > 0) {
106 41621 argiro.kok
                    NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
107
                    if (list.getLength() > 0) {
108
                        result.setExternalUrl(list.item(0).getNodeValue());
109
                    }
110
                }else{
111
                    nl = (NodeList) xpath.compile("//children/instance/licence[@classid='EMBARGO']").evaluate(document, XPathConstants.NODESET);
112
                    if (nl.getLength() > 0) {
113
                        NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
114
                        if (list.getLength() > 0) {
115
                            result.setExternalUrl(list.item(0).getNodeValue());
116
                        }
117
                    }else{
118
                        nl = (NodeList) xpath.compile("//children/instance/licence[@classid='CLOSED']").evaluate(document, XPathConstants.NODESET);
119
                        if (nl.getLength() > 0) {
120
                            NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
121
                            if (list.getLength() > 0) {
122
                                result.setExternalUrl(list.item(0).getNodeValue());
123
                            }
124
                        }
125
                    }
126 41411 argiro.kok
                }
127
            }
128 41200 katerina.i
            nl = (NodeList) xpath.compile("//pid[@classid='pmc']/text()").evaluate(document, XPathConstants.NODESET);
129
            if (nl.getLength() > 0) {
130
                result.setPmcid(nl.item(0).getNodeValue());
131
132
            }
133 41411 argiro.kok
            nl = (NodeList) xpath.compile("//pid[@classid='orcidworkid']/text()").evaluate(document, XPathConstants.NODESET);
134
            if (nl.getLength() > 0) {
135
                result.setOrcidworkid(nl.item(0).getNodeValue());
136
137
            }
138 41350 argiro.kok
            nl = (NodeList) xpath.compile("//pid[@classid='oai']/text()").evaluate(document, XPathConstants.NODESET);
139
            if (nl.getLength() > 0) {
140
                result.setOai(nl.item(0).getNodeValue());
141
142
            }
143
            nl = (NodeList) xpath.compile("//provenanceaction[@classid='user:claim:datacite']/@classid").evaluate(document, XPathConstants.NODESET);
144
            if (nl.getLength() > 0) {
145
                result.setProvenanceaction(nl.item(0).getNodeValue());
146
147
            }
148 41408 argiro.kok
            nl = (NodeList) xpath.compile("//response/header").evaluate(document, XPathConstants.NODESET);
149
            if(nl.getLength()>0){
150
                    nl.item(0).getParentNode().removeChild(nl.item(0));
151
            }
152 46755 katerina.i
153 49865 argiro.kok
            nl = (NodeList) xpath.compile("//rel[@class='hasAuthor']").evaluate(document, XPathConstants.NODESET);
154
            System.out.println("Author length"+nl.getLength());
155 46755 katerina.i
            if (nl.getLength() > 0) {
156
                for (int i = 0; i < nl.getLength(); i++) {
157
                    String ranking = ((Element)(nl.item(i).getParentNode())).getElementsByTagName("ranking").item(0).getTextContent();
158
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getElementsByTagName("fullname").item(0).getTextContent().replace(",", " "));
159
                    result.getAuthors().put(ranking,fullname);
160
                }
161
162
            }
163 49865 argiro.kok
            nl = (NodeList) xpath.compile("//creator/text()").evaluate(document, XPathConstants.NODESET);
164
            System.out.println("Creator length"+nl.getLength());
165
            if (nl.getLength() > 0) {
166
                for (int i = 0; i < nl.getLength(); i++) {
167
                    String ranking = ((Element)(nl.item(i).getParentNode())).getAttribute("rank");
168
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getTextContent().replace(",", " "));
169
                    result.getAuthors().put(ranking,fullname);
170
                }
171 46755 katerina.i
172 49865 argiro.kok
            }
173
174 41408 argiro.kok
            DOMSource domSource = new DOMSource(document);
175
            StringWriter writer = new StringWriter();
176
            StreamResult streamResult = new StreamResult(writer);
177
            TransformerFactory tf = TransformerFactory.newInstance();
178
            Transformer transformer = tf.newTransformer();
179
            transformer.transform(domSource, streamResult);
180
             result.setMetadataRecord(writer.toString());
181 41200 katerina.i
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_OPENAIRE);
182 41408 argiro.kok
            result.setRecordFormat(ClaimUtils.FORMAT_XML);
183 41791 argiro.kok
//            result.setFound(true);
184 41200 katerina.i
185
        }
186
        return result;
187
    }
188
189
190 41213 katerina.i
    public static Project oaf2Project(String oaf) throws Exception {
191 41200 katerina.i
        Project project = null;
192 45958 argiro.kok
        if(oaf == null ){
193
            return null;
194 41200 katerina.i
195 45958 argiro.kok
        }
196 41200 katerina.i
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
197 46977 argiro.kok
        DocumentBuilder dBuilder = null;
198
        try {
199
            dBuilder = dbFactory.newDocumentBuilder();
200 45958 argiro.kok
201 46977 argiro.kok
202 41200 katerina.i
        InputSource is = new InputSource(new StringReader(oaf));
203 47219 argiro.kok
        Document document = dBuilder.parse(is);
204 41200 katerina.i
        XPathFactory xPathfactory= XPathFactory.newInstance();
205
        XPath xpath = xPathfactory.newXPath();
206
207
        String size = null;
208
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
209
        if (nl.getLength() > 0) {
210
            size= nl.item(0).getNodeValue();
211
        }
212
        if(size!=null && Integer.parseInt(size)>0){
213
            project = new Project();
214 41350 argiro.kok
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
215
            if (nl.getLength() > 0) {
216
                project.setOpenaireId(nl.item(0).getNodeValue());
217
            }
218 41200 katerina.i
            nl = (NodeList) xpath.compile("//title/text()").evaluate(document, XPathConstants.NODESET);
219
            if (nl.getLength() > 0) {
220
                project.setName(nl.item(0).getNodeValue());
221
            }
222
            nl = (NodeList) xpath.compile("//acronym/text()").evaluate(document, XPathConstants.NODESET);
223
            if (nl.getLength() > 0) {
224
                project.setAcronym(nl.item(0).getNodeValue());
225
            }
226
            nl = (NodeList) xpath.compile("//funder/name/text()").evaluate(document, XPathConstants.NODESET);
227
            if (nl.getLength() > 0) {
228
                project.setFunderName(nl.item(0).getNodeValue());
229
            }
230 41411 argiro.kok
            nl = (NodeList) xpath.compile("//funder/shortname/text()").evaluate(document, XPathConstants.NODESET);
231
            if (nl.getLength() > 0) {
232
                project.setFunderShortName(nl.item(0).getNodeValue());
233
            }
234 41200 katerina.i
            nl = (NodeList) xpath.compile("//funder/id/text()").evaluate(document, XPathConstants.NODESET);
235
            if (nl.getLength() > 0) {
236
                project.setFunderId(nl.item(0).getNodeValue());
237
            }
238
239 46755 katerina.i
            nl = (NodeList) xpath.compile("//funding_level_0/text()").evaluate(document, XPathConstants.NODESET);
240
            if (nl.getLength() > 0) {
241
                project.setFundingStreamLevel0(nl.item(0).getNodeValue());
242
            }
243
244
            nl = (NodeList) xpath.compile("//code/text()").evaluate(document, XPathConstants.NODESET);
245
            if (nl.getLength() > 0) {
246
                project.setCode(nl.item(0).getNodeValue());
247
            }
248
249
            nl = (NodeList) xpath.compile("//rels/rel/to[@class='hasContact']").evaluate(document, XPathConstants.NODESET);
250
            if (nl.getLength() > 0) {
251 46884 konstantin
                project.setContactEmails(new ArrayList<String>());
252 46755 katerina.i
                for (int i = 0; i < nl.getLength(); i++) {
253 46977 argiro.kok
                    if(((Element)(nl.item(i).getParentNode())).getElementsByTagName("email").getLength() > 0) {
254
                        String email = ((Element) (nl.item(i).getParentNode())).getElementsByTagName("email").item(0).getTextContent();
255
                        project.getContactEmails().add(email);
256
                    }
257 46755 katerina.i
                }
258
            }
259
260 41791 argiro.kok
//            project.setFound(true);
261 41200 katerina.i
262
        }
263 46977 argiro.kok
        } catch (Exception e) {
264 47219 argiro.kok
           logger.error("Error while parsing project",e);
265
            throw new Exception(e);
266 46977 argiro.kok
        }
267 41200 katerina.i
268
        return project;
269
270
    }
271 49865 argiro.kok
    public static Result oaf2Software(String oaf) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException, TransformerException {
272
        Result result = null;
273
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
274
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
275
        InputSource is = new InputSource(new StringReader(oaf));
276
        Document document= dBuilder.parse(is);
277
        XPathFactory xPathfactory= XPathFactory.newInstance();
278
        XPath xpath = xPathfactory.newXPath();
279 41200 katerina.i
280 49865 argiro.kok
        String size = null;
281
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
282
        if (nl.getLength() > 0) {
283
            size= nl.item(0).getNodeValue();
284
        }
285
        System.out.println("size:"+size);
286
        if(size!=null && Integer.parseInt(size)>0){
287
            result = new Result();
288
            nl = (NodeList) xpath.compile("//field[@name = \"resultId\"]/@value").evaluate(document, XPathConstants.NODESET);
289
            if (nl.getLength() > 0) {
290
                result.setOpenaireId(nl.item(0).getNodeValue());
291
            }
292
            nl = (NodeList) xpath.compile("//field[@name = \"resulttypeid\"]/@value").evaluate(document, XPathConstants.NODESET);
293
            if (nl.getLength() > 0) {
294
                result.setResultType(nl.item(0).getNodeValue());
295
            }
296
            nl = (NodeList) xpath.compile("//field[@name = \"bestaccessright\"]/@value").evaluate(document, XPathConstants.NODESET);
297
            if (nl.getLength() > 0) {
298
                result.setBestLicense(nl.item(0).getNodeValue());
299
            }
300
            nl = (NodeList) xpath.compile("//field[@name = \"embargoenddate\"]/@value").evaluate(document, XPathConstants.NODESET);
301
            if (nl.getLength() > 0) {
302
                result.setEmbargoEndDate(nl.item(0).getNodeValue());
303
            }
304
            nl = (NodeList) xpath.compile("//field[@name = \"title\"]/@value").evaluate(document, XPathConstants.NODESET);
305
            if (nl.getLength() > 0) {
306
                result.setTitle(nl.item(0).getNodeValue());
307
            }
308
            //
309
            nl = (NodeList) xpath.compile("//field[@name = \"pid\"]").evaluate(document, XPathConstants.NODESET);
310
            System.out.println("size: pid"+nl.getLength());
311
            if (nl.getLength() > 0) {
312
                for (int i = 0; i < nl.getLength(); i++) {
313
                    NodeList list = (NodeList) xpath.compile("./field[@name = \"classid\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
314
                    System.out.println("size: classid"+list.getLength());
315
316
                    if (list.getLength() > 0 && list.item(0).getNodeValue() !=null ) {
317
318
                        if (list.item(0).getNodeValue().equals("doi")) {
319
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
320
                            result.setDoi(list.item(0).getNodeValue());
321
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
322
323
324
                        } else if (list.item(0).getNodeValue().equals("pmc")) {
325
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
326
                            result.setDoi(list.item(0).getNodeValue());
327
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
328
329
330
                        }else if (list.item(0).getNodeValue().equals("orcidworkid")) {
331
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
332
                            result.setDoi(list.item(0).getNodeValue());
333
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
334
335
336
                        }else if (list.item(0).getNodeValue().equals("oai")) {
337
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
338
                            result.setDoi(list.item(0).getNodeValue());
339
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
340
341
342
                        }
343
                    }
344
                }
345
            }
346
            nl = (NodeList) xpath.compile("//rels/rel/to[@class='hasAuthor']").evaluate(document, XPathConstants.NODESET);
347
            if (nl.getLength() > 0) {
348
                for (int i = 0; i < nl.getLength(); i++) {
349
                    String ranking = ((Element)(nl.item(i).getParentNode())).getElementsByTagName("ranking").item(0).getTextContent();
350
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getElementsByTagName("fullname").item(0).getTextContent().replace(",", " "));
351
                    result.getAuthors().put(ranking,fullname);
352
                }
353
354
            }
355
            if(result.getExternalUrl()==null){
356
//                nl = (NodeList) xpath.compile("//webresource/url/text()").evaluate(document, XPathConstants.NODESET);
357
//                if (nl.getLength() > 0) {
358
//                    result.setExternalUrl(nl.item(0).getNodeValue());
359
//                }
360
                nl = (NodeList) xpath.compile("//children/instance/licence[@classid='OPEN']").evaluate(document, XPathConstants.NODESET);
361
                if (nl.getLength() > 0) {
362
                    NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
363
                    if (list.getLength() > 0) {
364
                        result.setExternalUrl(list.item(0).getNodeValue());
365
                    }
366
                }else{
367
                    nl = (NodeList) xpath.compile("//children/instance/licence[@classid='EMBARGO']").evaluate(document, XPathConstants.NODESET);
368
                    if (nl.getLength() > 0) {
369
                        NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
370
                        if (list.getLength() > 0) {
371
                            result.setExternalUrl(list.item(0).getNodeValue());
372
                        }
373
                    }else{
374
                        nl = (NodeList) xpath.compile("//children/instance/licence[@classid='CLOSED']").evaluate(document, XPathConstants.NODESET);
375
                        if (nl.getLength() > 0) {
376
                            NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
377
                            if (list.getLength() > 0) {
378
                                result.setExternalUrl(list.item(0).getNodeValue());
379
                            }
380
                        }
381
                    }
382
                }
383
            }
384
385
386
            nl = (NodeList) xpath.compile("//response/header").evaluate(document, XPathConstants.NODESET);
387
            if(nl.getLength()>0){
388
                nl.item(0).getParentNode().removeChild(nl.item(0));
389
            }
390
391
392
393
            DOMSource domSource = new DOMSource(document);
394
            StringWriter writer = new StringWriter();
395
            StreamResult streamResult = new StreamResult(writer);
396
            TransformerFactory tf = TransformerFactory.newInstance();
397
            Transformer transformer = tf.newTransformer();
398
            transformer.transform(domSource, streamResult);
399
            result.setMetadataRecord(writer.toString());
400
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_OPENAIRE);
401
            result.setRecordFormat(ClaimUtils.FORMAT_XML);
402
//            result.setFound(true);
403
404
        }
405
        return result;
406
    }
407
408
409 41200 katerina.i
}