Project

General

Profile

1
package eu.dnetlib.data.claims.parser;
2

    
3
import eu.dnetlib.data.claims.entity.Project;
4
import eu.dnetlib.data.claims.entity.Result;
5
import eu.dnetlib.data.claims.utils.ClaimUtils;
6
import org.apache.logging.log4j.LogManager;
7
import org.apache.logging.log4j.Logger;
8
import org.w3c.dom.Document;
9
import org.w3c.dom.Element;
10
import org.w3c.dom.NodeList;
11
import org.xml.sax.InputSource;
12
import org.xml.sax.SAXException;
13

    
14
import javax.xml.parsers.DocumentBuilder;
15
import javax.xml.parsers.DocumentBuilderFactory;
16
import javax.xml.parsers.ParserConfigurationException;
17
import javax.xml.transform.Transformer;
18
import javax.xml.transform.TransformerException;
19
import javax.xml.transform.TransformerFactory;
20
import javax.xml.transform.dom.DOMSource;
21
import javax.xml.transform.stream.StreamResult;
22
import javax.xml.xpath.XPath;
23
import javax.xml.xpath.XPathConstants;
24
import javax.xml.xpath.XPathExpressionException;
25
import javax.xml.xpath.XPathFactory;
26
import java.io.IOException;
27
import java.io.StringReader;
28
import java.io.StringWriter;
29
import java.util.ArrayList;
30

    
31
/**
32
 * Created by kiatrop on 5/2/2016.
33
 */
34
public class OafParser {
35
    /**
36
     * For external url: returns http://dx.doi.org/ +doi else the first url from websource
37
     *
38
     * @param oaf
39
     * @return
40
     * @throws ParserConfigurationException
41
     * @throws IOException
42
     * @throws SAXException
43
     * @throws XPathExpressionException
44
     * @throws TransformerException
45
     */
46

    
47
    private static final Logger logger = LogManager.getLogger(OafParser.class);
48

    
49
    public static Result oaf2Result(String oaf) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException, TransformerException {
50
        Result result = null;
51
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
52
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
53
        InputSource is = new InputSource(new StringReader(oaf));
54
        Document document= dBuilder.parse(is);
55
        XPathFactory xPathfactory= XPathFactory.newInstance();
56
        XPath xpath = xPathfactory.newXPath();
57

    
58
        String size = null;
59
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
60
        if (nl.getLength() > 0) {
61
            size= nl.item(0).getNodeValue();
62
        }else{
63
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
64
            if (nl.getLength() > 0) {
65
//                result.setOpenaireId(nl.item(0).getNodeValue());
66
                size="1";
67
            }
68
        }
69
        if(size!=null && Integer.parseInt(size)>0){
70
            result = new Result();
71
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
72
            if (nl.getLength() > 0) {
73
                result.setOpenaireId(nl.item(0).getNodeValue());
74
            }
75
            nl = (NodeList) xpath.compile("//resulttype/@classid").evaluate(document, XPathConstants.NODESET);
76
            if (nl.getLength() > 0) {
77
                result.setResultType(nl.item(0).getNodeValue());
78
            }
79
            nl = (NodeList) xpath.compile("//bestlicense/@classid").evaluate(document, XPathConstants.NODESET);
80
            if (nl.getLength() > 0) {
81
                result.setBestLicense(nl.item(0).getNodeValue());
82
            }
83
            nl = (NodeList) xpath.compile("//bestaccessright/@classid").evaluate(document, XPathConstants.NODESET);
84
            if (nl.getLength() > 0) {
85
                result.setBestLicense(nl.item(0).getNodeValue());
86
            }
87
            nl = (NodeList) xpath.compile("//embargoenddate/text()").evaluate(document, XPathConstants.NODESET);
88
            if (nl.getLength() > 0) {
89
                result.setEmbargoEndDate(nl.item(0).getNodeValue());
90
            }
91
            nl = (NodeList) xpath.compile("//title/text()").evaluate(document, XPathConstants.NODESET);
92
            if (nl.getLength() > 0) {
93
                result.setTitle(nl.item(0).getNodeValue());
94
            }
95
            nl = (NodeList) xpath.compile("//pid[@classid='doi']/text()").evaluate(document, XPathConstants.NODESET);
96
            if (nl.getLength() > 0) {
97
                result.setDoi(nl.item(0).getNodeValue());
98
                result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
99
            }
100
            if(result.getExternalUrl()==null){
101
//                nl = (NodeList) xpath.compile("//webresource/url/text()").evaluate(document, XPathConstants.NODESET);
102
//                if (nl.getLength() > 0) {
103
//                    result.setExternalUrl(nl.item(0).getNodeValue());
104
//                }
105
                nl = (NodeList) xpath.compile("//children/instance/licence[@classid='OPEN']").evaluate(document, XPathConstants.NODESET);
106
                if (nl.getLength() > 0) {
107
                    NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
108
                    if (list.getLength() > 0) {
109
                        result.setExternalUrl(list.item(0).getNodeValue());
110
                    }
111
                }else{
112
                    nl = (NodeList) xpath.compile("//children/instance/licence[@classid='EMBARGO']").evaluate(document, XPathConstants.NODESET);
113
                    if (nl.getLength() > 0) {
114
                        NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
115
                        if (list.getLength() > 0) {
116
                            result.setExternalUrl(list.item(0).getNodeValue());
117
                        }
118
                    }else{
119
                        nl = (NodeList) xpath.compile("//children/instance/licence[@classid='CLOSED']").evaluate(document, XPathConstants.NODESET);
120
                        if (nl.getLength() > 0) {
121
                            NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
122
                            if (list.getLength() > 0) {
123
                                result.setExternalUrl(list.item(0).getNodeValue());
124
                            }
125
                        }
126
                    }
127
                }
128
            }
129
            nl = (NodeList) xpath.compile("//pid[@classid='pmc']/text()").evaluate(document, XPathConstants.NODESET);
130
            if (nl.getLength() > 0) {
131
                result.setPmcid(nl.item(0).getNodeValue());
132

    
133
            }
134
            nl = (NodeList) xpath.compile("//pid[@classid='orcidworkid']/text()").evaluate(document, XPathConstants.NODESET);
135
            if (nl.getLength() > 0) {
136
                result.setOrcidworkid(nl.item(0).getNodeValue());
137

    
138
            }
139
            nl = (NodeList) xpath.compile("//pid[@classid='oai']/text()").evaluate(document, XPathConstants.NODESET);
140
            if (nl.getLength() > 0) {
141
                result.setOai(nl.item(0).getNodeValue());
142

    
143
            }
144
            nl = (NodeList) xpath.compile("//provenanceaction[@classid='user:claim:datacite']/@classid").evaluate(document, XPathConstants.NODESET);
145
            if (nl.getLength() > 0) {
146
                result.setProvenanceaction(nl.item(0).getNodeValue());
147

    
148
            }
149
            nl = (NodeList) xpath.compile("//response/header").evaluate(document, XPathConstants.NODESET);
150
            if(nl.getLength()>0){
151
                    nl.item(0).getParentNode().removeChild(nl.item(0));
152
            }
153

    
154
            nl = (NodeList) xpath.compile("//rel[@class='hasAuthor']").evaluate(document, XPathConstants.NODESET);
155
            System.out.println("Author length"+nl.getLength());
156
            if (nl.getLength() > 0) {
157
                for (int i = 0; i < nl.getLength(); i++) {
158
                    String ranking = ((Element)(nl.item(i).getParentNode())).getElementsByTagName("ranking").item(0).getTextContent();
159
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getElementsByTagName("fullname").item(0).getTextContent().replace(",", " "));
160
                    result.getAuthors().put(ranking,fullname);
161
                }
162

    
163
            }
164
            nl = (NodeList) xpath.compile("//creator/text()").evaluate(document, XPathConstants.NODESET);
165
            System.out.println("Creator length"+nl.getLength());
166
            if (nl.getLength() > 0) {
167
                for (int i = 0; i < nl.getLength(); i++) {
168
                    String ranking = ((Element)(nl.item(i).getParentNode())).getAttribute("rank");
169
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getTextContent().replace(",", " "));
170
                    result.getAuthors().put(ranking,fullname);
171
                }
172

    
173
            }
174

    
175
            DOMSource domSource = new DOMSource(document);
176
            StringWriter writer = new StringWriter();
177
            StreamResult streamResult = new StreamResult(writer);
178
            TransformerFactory tf = TransformerFactory.newInstance();
179
            Transformer transformer = tf.newTransformer();
180
            transformer.transform(domSource, streamResult);
181
             result.setMetadataRecord(writer.toString());
182
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_OPENAIRE);
183
            result.setRecordFormat(ClaimUtils.FORMAT_XML);
184
//            result.setFound(true);
185

    
186
        }
187
        return result;
188
    }
189

    
190

    
191
    public static Project oaf2Project(String oaf) throws Exception {
192
        Project project = null;
193
        if(oaf == null ){
194
            return null;
195

    
196
        }
197
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
198
        DocumentBuilder dBuilder = null;
199
        try {
200
            dBuilder = dbFactory.newDocumentBuilder();
201

    
202

    
203
        InputSource is = new InputSource(new StringReader(oaf));
204
        Document document = dBuilder.parse(is);
205
        XPathFactory xPathfactory= XPathFactory.newInstance();
206
        XPath xpath = xPathfactory.newXPath();
207

    
208
        String size = null;
209
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
210
        if (nl.getLength() > 0) {
211
            size= nl.item(0).getNodeValue();
212
        }
213
        if(size!=null && Integer.parseInt(size)>0){
214
            project = new Project();
215
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
216
            if (nl.getLength() > 0) {
217
                project.setOpenaireId(nl.item(0).getNodeValue());
218
            }
219
            nl = (NodeList) xpath.compile("//title/text()").evaluate(document, XPathConstants.NODESET);
220
            if (nl.getLength() > 0) {
221
                project.setName(nl.item(0).getNodeValue());
222
            }
223
            nl = (NodeList) xpath.compile("//acronym/text()").evaluate(document, XPathConstants.NODESET);
224
            if (nl.getLength() > 0) {
225
                project.setAcronym(nl.item(0).getNodeValue());
226
            }
227
            nl = (NodeList) xpath.compile("//funder/name/text()").evaluate(document, XPathConstants.NODESET);
228
            if (nl.getLength() > 0) {
229
                project.setFunderName(nl.item(0).getNodeValue());
230
            }
231
            nl = (NodeList) xpath.compile("//funder/shortname/text()").evaluate(document, XPathConstants.NODESET);
232
            if (nl.getLength() > 0) {
233
                project.setFunderShortName(nl.item(0).getNodeValue());
234
            }
235
            nl = (NodeList) xpath.compile("//funder/id/text()").evaluate(document, XPathConstants.NODESET);
236
            if (nl.getLength() > 0) {
237
                project.setFunderId(nl.item(0).getNodeValue());
238
            }
239

    
240
            nl = (NodeList) xpath.compile("//funding_level_0/text()").evaluate(document, XPathConstants.NODESET);
241
            if (nl.getLength() > 0) {
242
                project.setFundingStreamLevel0(nl.item(0).getNodeValue());
243
            }
244

    
245
            nl = (NodeList) xpath.compile("//code/text()").evaluate(document, XPathConstants.NODESET);
246
            if (nl.getLength() > 0) {
247
                project.setCode(nl.item(0).getNodeValue());
248
            }
249

    
250
            nl = (NodeList) xpath.compile("//rels/rel/to[@class='hasContact']").evaluate(document, XPathConstants.NODESET);
251
            if (nl.getLength() > 0) {
252
                project.setContactEmails(new ArrayList<String>());
253
                for (int i = 0; i < nl.getLength(); i++) {
254
                    if(((Element)(nl.item(i).getParentNode())).getElementsByTagName("email").getLength() > 0) {
255
                        String email = ((Element) (nl.item(i).getParentNode())).getElementsByTagName("email").item(0).getTextContent();
256
                        project.getContactEmails().add(email);
257
                    }
258
                }
259
            }
260

    
261
//            project.setFound(true);
262

    
263
        }
264
        } catch (Exception e) {
265
           logger.error("Error while parsing project",e);
266
            throw new Exception(e);
267
        }
268

    
269
        return project;
270

    
271
    }
272
    public static Result oaf2Software(String oaf) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException, TransformerException {
273
        Result result = null;
274
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
275
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
276
        InputSource is = new InputSource(new StringReader(oaf));
277
        Document document= dBuilder.parse(is);
278
        XPathFactory xPathfactory= XPathFactory.newInstance();
279
        XPath xpath = xPathfactory.newXPath();
280

    
281
        String size = null;
282
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
283
        if (nl.getLength() > 0) {
284
            size= nl.item(0).getNodeValue();
285
        }
286
        System.out.println("size:"+size);
287
        if(size!=null && Integer.parseInt(size)>0){
288
            result = new Result();
289
            nl = (NodeList) xpath.compile("//field[@name = \"resultId\"]/@value").evaluate(document, XPathConstants.NODESET);
290
            if (nl.getLength() > 0) {
291
                result.setOpenaireId(nl.item(0).getNodeValue());
292
            }
293
            nl = (NodeList) xpath.compile("//field[@name = \"resulttypeid\"]/@value").evaluate(document, XPathConstants.NODESET);
294
            if (nl.getLength() > 0) {
295
                result.setResultType(nl.item(0).getNodeValue());
296
            }
297
            nl = (NodeList) xpath.compile("//field[@name = \"bestaccessright\"]/@value").evaluate(document, XPathConstants.NODESET);
298
            if (nl.getLength() > 0) {
299
                result.setBestLicense(nl.item(0).getNodeValue());
300
            }
301
            nl = (NodeList) xpath.compile("//field[@name = \"embargoenddate\"]/@value").evaluate(document, XPathConstants.NODESET);
302
            if (nl.getLength() > 0) {
303
                result.setEmbargoEndDate(nl.item(0).getNodeValue());
304
            }
305
            nl = (NodeList) xpath.compile("//field[@name = \"title\"]/@value").evaluate(document, XPathConstants.NODESET);
306
            if (nl.getLength() > 0) {
307
                result.setTitle(nl.item(0).getNodeValue());
308
            }
309
            //
310
            nl = (NodeList) xpath.compile("//field[@name = \"pid\"]").evaluate(document, XPathConstants.NODESET);
311
            System.out.println("size: pid"+nl.getLength());
312
            if (nl.getLength() > 0) {
313
                for (int i = 0; i < nl.getLength(); i++) {
314
                    NodeList list = (NodeList) xpath.compile("./field[@name = \"classid\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
315
                    System.out.println("size: classid"+list.getLength());
316

    
317
                    if (list.getLength() > 0 && list.item(0).getNodeValue() !=null ) {
318

    
319
                        if (list.item(0).getNodeValue().equals("doi")) {
320
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
321
                            result.setDoi(list.item(0).getNodeValue());
322
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
323

    
324

    
325
                        } else if (list.item(0).getNodeValue().equals("pmc")) {
326
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
327
                            result.setDoi(list.item(0).getNodeValue());
328
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
329

    
330

    
331
                        }else if (list.item(0).getNodeValue().equals("orcidworkid")) {
332
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
333
                            result.setDoi(list.item(0).getNodeValue());
334
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
335

    
336

    
337
                        }else if (list.item(0).getNodeValue().equals("oai")) {
338
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
339
                            result.setDoi(list.item(0).getNodeValue());
340
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
341

    
342

    
343
                        }
344
                    }
345
                }
346
            }
347
            nl = (NodeList) xpath.compile("//rels/rel/to[@class='hasAuthor']").evaluate(document, XPathConstants.NODESET);
348
            if (nl.getLength() > 0) {
349
                for (int i = 0; i < nl.getLength(); i++) {
350
                    String ranking = ((Element)(nl.item(i).getParentNode())).getElementsByTagName("ranking").item(0).getTextContent();
351
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getElementsByTagName("fullname").item(0).getTextContent().replace(",", " "));
352
                    result.getAuthors().put(ranking,fullname);
353
                }
354

    
355
            }
356
            if(result.getExternalUrl()==null){
357
//                nl = (NodeList) xpath.compile("//webresource/url/text()").evaluate(document, XPathConstants.NODESET);
358
//                if (nl.getLength() > 0) {
359
//                    result.setExternalUrl(nl.item(0).getNodeValue());
360
//                }
361
                nl = (NodeList) xpath.compile("//children/instance/licence[@classid='OPEN']").evaluate(document, XPathConstants.NODESET);
362
                if (nl.getLength() > 0) {
363
                    NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
364
                    if (list.getLength() > 0) {
365
                        result.setExternalUrl(list.item(0).getNodeValue());
366
                    }
367
                }else{
368
                    nl = (NodeList) xpath.compile("//children/instance/licence[@classid='EMBARGO']").evaluate(document, XPathConstants.NODESET);
369
                    if (nl.getLength() > 0) {
370
                        NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
371
                        if (list.getLength() > 0) {
372
                            result.setExternalUrl(list.item(0).getNodeValue());
373
                        }
374
                    }else{
375
                        nl = (NodeList) xpath.compile("//children/instance/licence[@classid='CLOSED']").evaluate(document, XPathConstants.NODESET);
376
                        if (nl.getLength() > 0) {
377
                            NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
378
                            if (list.getLength() > 0) {
379
                                result.setExternalUrl(list.item(0).getNodeValue());
380
                            }
381
                        }
382
                    }
383
                }
384
            }
385

    
386

    
387
            nl = (NodeList) xpath.compile("//response/header").evaluate(document, XPathConstants.NODESET);
388
            if(nl.getLength()>0){
389
                nl.item(0).getParentNode().removeChild(nl.item(0));
390
            }
391

    
392

    
393

    
394
            DOMSource domSource = new DOMSource(document);
395
            StringWriter writer = new StringWriter();
396
            StreamResult streamResult = new StreamResult(writer);
397
            TransformerFactory tf = TransformerFactory.newInstance();
398
            Transformer transformer = tf.newTransformer();
399
            transformer.transform(domSource, streamResult);
400
            result.setMetadataRecord(writer.toString());
401
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_OPENAIRE);
402
            result.setRecordFormat(ClaimUtils.FORMAT_XML);
403
//            result.setFound(true);
404

    
405
        }
406
        return result;
407
    }
408

    
409

    
410
}
(2-2/2)