Project

General

Profile

1
package eu.dnetlib.data.claims.parser;
2

    
3
import eu.dnetlib.data.claims.entity.Project;
4
import eu.dnetlib.data.claims.entity.Result;
5
import eu.dnetlib.data.claims.utils.ClaimUtils;
6
import org.apache.log4j.Logger;
7
import org.w3c.dom.Document;
8
import org.w3c.dom.Element;
9
import org.w3c.dom.NodeList;
10
import org.xml.sax.InputSource;
11
import org.xml.sax.SAXException;
12

    
13
import javax.xml.parsers.DocumentBuilder;
14
import javax.xml.parsers.DocumentBuilderFactory;
15
import javax.xml.parsers.ParserConfigurationException;
16
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerException;
18
import javax.xml.transform.TransformerFactory;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21
import javax.xml.xpath.XPath;
22
import javax.xml.xpath.XPathConstants;
23
import javax.xml.xpath.XPathExpressionException;
24
import javax.xml.xpath.XPathFactory;
25
import java.io.IOException;
26
import java.io.StringReader;
27
import java.io.StringWriter;
28
import java.util.ArrayList;
29

    
30
/**
31
 * Created by kiatrop on 5/2/2016.
32
 */
33
public class OafParser {
34
    /**
35
     * For external url: returns http://dx.doi.org/ +doi else the first url from websource
36
     *
37
     * @param oaf
38
     * @return
39
     * @throws ParserConfigurationException
40
     * @throws IOException
41
     * @throws SAXException
42
     * @throws XPathExpressionException
43
     * @throws TransformerException
44
     */
45

    
46
    private static final Logger logger = Logger.getLogger(OafParser.class);
47

    
48
    public static Result oaf2Result(String oaf) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException, TransformerException {
49
        Result result = null;
50
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
51
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
52
        InputSource is = new InputSource(new StringReader(oaf));
53
        Document document= dBuilder.parse(is);
54
        XPathFactory xPathfactory= XPathFactory.newInstance();
55
        XPath xpath = xPathfactory.newXPath();
56

    
57
        String size = null;
58
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
59
        if (nl.getLength() > 0) {
60
            size= nl.item(0).getNodeValue();
61
        }else{
62
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
63
            if (nl.getLength() > 0) {
64
//                result.setOpenaireId(nl.item(0).getNodeValue());
65
                size="1";
66
            }
67
        }
68
        if(size!=null && Integer.parseInt(size)>0){
69
            result = new Result();
70
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
71
            if (nl.getLength() > 0) {
72
                result.setOpenaireId(nl.item(0).getNodeValue());
73
            }
74
            nl = (NodeList) xpath.compile("//resulttype/@classid").evaluate(document, XPathConstants.NODESET);
75
            if (nl.getLength() > 0) {
76
                result.setResultType(nl.item(0).getNodeValue());
77
            }
78
            nl = (NodeList) xpath.compile("//bestlicense/@classid").evaluate(document, XPathConstants.NODESET);
79
            if (nl.getLength() > 0) {
80
                result.setBestLicense(nl.item(0).getNodeValue());
81
            }
82
            nl = (NodeList) xpath.compile("//bestaccessright/@classid").evaluate(document, XPathConstants.NODESET);
83
            if (nl.getLength() > 0) {
84
                result.setBestLicense(nl.item(0).getNodeValue());
85
            }
86
            nl = (NodeList) xpath.compile("//embargoenddate/text()").evaluate(document, XPathConstants.NODESET);
87
            if (nl.getLength() > 0) {
88
                result.setEmbargoEndDate(nl.item(0).getNodeValue());
89
            }
90
            nl = (NodeList) xpath.compile("//title/text()").evaluate(document, XPathConstants.NODESET);
91
            if (nl.getLength() > 0) {
92
                result.setTitle(nl.item(0).getNodeValue());
93
            }
94
            nl = (NodeList) xpath.compile("//pid[@classid='doi']/text()").evaluate(document, XPathConstants.NODESET);
95
            if (nl.getLength() > 0) {
96
                result.setDoi(nl.item(0).getNodeValue());
97
                result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
98
            }
99
            if(result.getExternalUrl()==null){
100
//                nl = (NodeList) xpath.compile("//webresource/url/text()").evaluate(document, XPathConstants.NODESET);
101
//                if (nl.getLength() > 0) {
102
//                    result.setExternalUrl(nl.item(0).getNodeValue());
103
//                }
104
                nl = (NodeList) xpath.compile("//children/instance/licence[@classid='OPEN']").evaluate(document, XPathConstants.NODESET);
105
                if (nl.getLength() > 0) {
106
                    NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
107
                    if (list.getLength() > 0) {
108
                        result.setExternalUrl(list.item(0).getNodeValue());
109
                    }
110
                }else{
111
                    nl = (NodeList) xpath.compile("//children/instance/licence[@classid='EMBARGO']").evaluate(document, XPathConstants.NODESET);
112
                    if (nl.getLength() > 0) {
113
                        NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
114
                        if (list.getLength() > 0) {
115
                            result.setExternalUrl(list.item(0).getNodeValue());
116
                        }
117
                    }else{
118
                        nl = (NodeList) xpath.compile("//children/instance/licence[@classid='CLOSED']").evaluate(document, XPathConstants.NODESET);
119
                        if (nl.getLength() > 0) {
120
                            NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
121
                            if (list.getLength() > 0) {
122
                                result.setExternalUrl(list.item(0).getNodeValue());
123
                            }
124
                        }
125
                    }
126
                }
127
            }
128
            nl = (NodeList) xpath.compile("//pid[@classid='pmc']/text()").evaluate(document, XPathConstants.NODESET);
129
            if (nl.getLength() > 0) {
130
                result.setPmcid(nl.item(0).getNodeValue());
131

    
132
            }
133
            nl = (NodeList) xpath.compile("//pid[@classid='orcidworkid']/text()").evaluate(document, XPathConstants.NODESET);
134
            if (nl.getLength() > 0) {
135
                result.setOrcidworkid(nl.item(0).getNodeValue());
136

    
137
            }
138
            nl = (NodeList) xpath.compile("//pid[@classid='oai']/text()").evaluate(document, XPathConstants.NODESET);
139
            if (nl.getLength() > 0) {
140
                result.setOai(nl.item(0).getNodeValue());
141

    
142
            }
143
            nl = (NodeList) xpath.compile("//provenanceaction[@classid='user:claim:datacite']/@classid").evaluate(document, XPathConstants.NODESET);
144
            if (nl.getLength() > 0) {
145
                result.setProvenanceaction(nl.item(0).getNodeValue());
146

    
147
            }
148
            nl = (NodeList) xpath.compile("//response/header").evaluate(document, XPathConstants.NODESET);
149
            if(nl.getLength()>0){
150
                    nl.item(0).getParentNode().removeChild(nl.item(0));
151
            }
152

    
153
            nl = (NodeList) xpath.compile("//rel[@class='hasAuthor']").evaluate(document, XPathConstants.NODESET);
154
            System.out.println("Author length"+nl.getLength());
155
            if (nl.getLength() > 0) {
156
                for (int i = 0; i < nl.getLength(); i++) {
157
                    String ranking = ((Element)(nl.item(i).getParentNode())).getElementsByTagName("ranking").item(0).getTextContent();
158
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getElementsByTagName("fullname").item(0).getTextContent().replace(",", " "));
159
                    result.getAuthors().put(ranking,fullname);
160
                }
161

    
162
            }
163
            nl = (NodeList) xpath.compile("//creator/text()").evaluate(document, XPathConstants.NODESET);
164
            System.out.println("Creator length"+nl.getLength());
165
            if (nl.getLength() > 0) {
166
                for (int i = 0; i < nl.getLength(); i++) {
167
                    String ranking = ((Element)(nl.item(i).getParentNode())).getAttribute("rank");
168
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getTextContent().replace(",", " "));
169
                    result.getAuthors().put(ranking,fullname);
170
                }
171

    
172
            }
173

    
174
            DOMSource domSource = new DOMSource(document);
175
            StringWriter writer = new StringWriter();
176
            StreamResult streamResult = new StreamResult(writer);
177
            TransformerFactory tf = TransformerFactory.newInstance();
178
            Transformer transformer = tf.newTransformer();
179
            transformer.transform(domSource, streamResult);
180
             result.setMetadataRecord(writer.toString());
181
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_OPENAIRE);
182
            result.setRecordFormat(ClaimUtils.FORMAT_XML);
183
//            result.setFound(true);
184

    
185
        }
186
        return result;
187
    }
188

    
189

    
190
    public static Project oaf2Project(String oaf) throws Exception {
191
        Project project = null;
192
        if(oaf == null ){
193
            return null;
194

    
195
        }
196
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
197
        DocumentBuilder dBuilder = null;
198
        try {
199
            dBuilder = dbFactory.newDocumentBuilder();
200

    
201

    
202
        InputSource is = new InputSource(new StringReader(oaf));
203
        Document document = dBuilder.parse(is);
204
        XPathFactory xPathfactory= XPathFactory.newInstance();
205
        XPath xpath = xPathfactory.newXPath();
206

    
207
        String size = null;
208
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
209
        if (nl.getLength() > 0) {
210
            size= nl.item(0).getNodeValue();
211
        }
212
        if(size!=null && Integer.parseInt(size)>0){
213
            project = new Project();
214
            nl = (NodeList) xpath.compile("//objIdentifier/text()").evaluate(document, XPathConstants.NODESET);
215
            if (nl.getLength() > 0) {
216
                project.setOpenaireId(nl.item(0).getNodeValue());
217
            }
218
            nl = (NodeList) xpath.compile("//title/text()").evaluate(document, XPathConstants.NODESET);
219
            if (nl.getLength() > 0) {
220
                project.setName(nl.item(0).getNodeValue());
221
            }
222
            nl = (NodeList) xpath.compile("//acronym/text()").evaluate(document, XPathConstants.NODESET);
223
            if (nl.getLength() > 0) {
224
                project.setAcronym(nl.item(0).getNodeValue());
225
            }
226
            nl = (NodeList) xpath.compile("//funder/name/text()").evaluate(document, XPathConstants.NODESET);
227
            if (nl.getLength() > 0) {
228
                project.setFunderName(nl.item(0).getNodeValue());
229
            }
230
            nl = (NodeList) xpath.compile("//funder/shortname/text()").evaluate(document, XPathConstants.NODESET);
231
            if (nl.getLength() > 0) {
232
                project.setFunderShortName(nl.item(0).getNodeValue());
233
            }
234
            nl = (NodeList) xpath.compile("//funder/id/text()").evaluate(document, XPathConstants.NODESET);
235
            if (nl.getLength() > 0) {
236
                project.setFunderId(nl.item(0).getNodeValue());
237
            }
238

    
239
            nl = (NodeList) xpath.compile("//funding_level_0/text()").evaluate(document, XPathConstants.NODESET);
240
            if (nl.getLength() > 0) {
241
                project.setFundingStreamLevel0(nl.item(0).getNodeValue());
242
            }
243

    
244
            nl = (NodeList) xpath.compile("//code/text()").evaluate(document, XPathConstants.NODESET);
245
            if (nl.getLength() > 0) {
246
                project.setCode(nl.item(0).getNodeValue());
247
            }
248

    
249
            nl = (NodeList) xpath.compile("//rels/rel/to[@class='hasContact']").evaluate(document, XPathConstants.NODESET);
250
            if (nl.getLength() > 0) {
251
                project.setContactEmails(new ArrayList<String>());
252
                for (int i = 0; i < nl.getLength(); i++) {
253
                    if(((Element)(nl.item(i).getParentNode())).getElementsByTagName("email").getLength() > 0) {
254
                        String email = ((Element) (nl.item(i).getParentNode())).getElementsByTagName("email").item(0).getTextContent();
255
                        project.getContactEmails().add(email);
256
                    }
257
                }
258
            }
259

    
260
//            project.setFound(true);
261

    
262
        }
263
        } catch (Exception e) {
264
           logger.error("Error while parsing project",e);
265
            throw new Exception(e);
266
        }
267

    
268
        return project;
269

    
270
    }
271
    public static Result oaf2Software(String oaf) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException, TransformerException {
272
        Result result = null;
273
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
274
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
275
        InputSource is = new InputSource(new StringReader(oaf));
276
        Document document= dBuilder.parse(is);
277
        XPathFactory xPathfactory= XPathFactory.newInstance();
278
        XPath xpath = xPathfactory.newXPath();
279

    
280
        String size = null;
281
        NodeList nl = (NodeList) xpath.compile("/response/header/total/text()").evaluate(document, XPathConstants.NODESET);
282
        if (nl.getLength() > 0) {
283
            size= nl.item(0).getNodeValue();
284
        }
285
        System.out.println("size:"+size);
286
        if(size!=null && Integer.parseInt(size)>0){
287
            result = new Result();
288
            nl = (NodeList) xpath.compile("//field[@name = \"resultId\"]/@value").evaluate(document, XPathConstants.NODESET);
289
            if (nl.getLength() > 0) {
290
                result.setOpenaireId(nl.item(0).getNodeValue());
291
            }
292
            nl = (NodeList) xpath.compile("//field[@name = \"resulttypeid\"]/@value").evaluate(document, XPathConstants.NODESET);
293
            if (nl.getLength() > 0) {
294
                result.setResultType(nl.item(0).getNodeValue());
295
            }
296
            nl = (NodeList) xpath.compile("//field[@name = \"bestaccessright\"]/@value").evaluate(document, XPathConstants.NODESET);
297
            if (nl.getLength() > 0) {
298
                result.setBestLicense(nl.item(0).getNodeValue());
299
            }
300
            nl = (NodeList) xpath.compile("//field[@name = \"embargoenddate\"]/@value").evaluate(document, XPathConstants.NODESET);
301
            if (nl.getLength() > 0) {
302
                result.setEmbargoEndDate(nl.item(0).getNodeValue());
303
            }
304
            nl = (NodeList) xpath.compile("//field[@name = \"title\"]/@value").evaluate(document, XPathConstants.NODESET);
305
            if (nl.getLength() > 0) {
306
                result.setTitle(nl.item(0).getNodeValue());
307
            }
308
            //
309
            nl = (NodeList) xpath.compile("//field[@name = \"pid\"]").evaluate(document, XPathConstants.NODESET);
310
            System.out.println("size: pid"+nl.getLength());
311
            if (nl.getLength() > 0) {
312
                for (int i = 0; i < nl.getLength(); i++) {
313
                    NodeList list = (NodeList) xpath.compile("./field[@name = \"classid\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
314
                    System.out.println("size: classid"+list.getLength());
315

    
316
                    if (list.getLength() > 0 && list.item(0).getNodeValue() !=null ) {
317

    
318
                        if (list.item(0).getNodeValue().equals("doi")) {
319
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
320
                            result.setDoi(list.item(0).getNodeValue());
321
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
322

    
323

    
324
                        } else if (list.item(0).getNodeValue().equals("pmc")) {
325
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
326
                            result.setDoi(list.item(0).getNodeValue());
327
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
328

    
329

    
330
                        }else if (list.item(0).getNodeValue().equals("orcidworkid")) {
331
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
332
                            result.setDoi(list.item(0).getNodeValue());
333
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
334

    
335

    
336
                        }else if (list.item(0).getNodeValue().equals("oai")) {
337
                            list = (NodeList) xpath.compile("./field[@name = \"value\"]/@value").evaluate(nl.item(i), XPathConstants.NODESET);
338
                            result.setDoi(list.item(0).getNodeValue());
339
                            result.setExternalUrl("http://dx.doi.org/" + result.getDoi());
340

    
341

    
342
                        }
343
                    }
344
                }
345
            }
346
            nl = (NodeList) xpath.compile("//rels/rel/to[@class='hasAuthor']").evaluate(document, XPathConstants.NODESET);
347
            if (nl.getLength() > 0) {
348
                for (int i = 0; i < nl.getLength(); i++) {
349
                    String ranking = ((Element)(nl.item(i).getParentNode())).getElementsByTagName("ranking").item(0).getTextContent();
350
                    String fullname =  (((Element)(nl.item(i).getParentNode())).getElementsByTagName("fullname").item(0).getTextContent().replace(",", " "));
351
                    result.getAuthors().put(ranking,fullname);
352
                }
353

    
354
            }
355
            if(result.getExternalUrl()==null){
356
//                nl = (NodeList) xpath.compile("//webresource/url/text()").evaluate(document, XPathConstants.NODESET);
357
//                if (nl.getLength() > 0) {
358
//                    result.setExternalUrl(nl.item(0).getNodeValue());
359
//                }
360
                nl = (NodeList) xpath.compile("//children/instance/licence[@classid='OPEN']").evaluate(document, XPathConstants.NODESET);
361
                if (nl.getLength() > 0) {
362
                    NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
363
                    if (list.getLength() > 0) {
364
                        result.setExternalUrl(list.item(0).getNodeValue());
365
                    }
366
                }else{
367
                    nl = (NodeList) xpath.compile("//children/instance/licence[@classid='EMBARGO']").evaluate(document, XPathConstants.NODESET);
368
                    if (nl.getLength() > 0) {
369
                        NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
370
                        if (list.getLength() > 0) {
371
                            result.setExternalUrl(list.item(0).getNodeValue());
372
                        }
373
                    }else{
374
                        nl = (NodeList) xpath.compile("//children/instance/licence[@classid='CLOSED']").evaluate(document, XPathConstants.NODESET);
375
                        if (nl.getLength() > 0) {
376
                            NodeList list = (NodeList) xpath.compile("./webresource/url/text()").evaluate(nl.item(0).getParentNode(), XPathConstants.NODESET);
377
                            if (list.getLength() > 0) {
378
                                result.setExternalUrl(list.item(0).getNodeValue());
379
                            }
380
                        }
381
                    }
382
                }
383
            }
384

    
385

    
386
            nl = (NodeList) xpath.compile("//response/header").evaluate(document, XPathConstants.NODESET);
387
            if(nl.getLength()>0){
388
                nl.item(0).getParentNode().removeChild(nl.item(0));
389
            }
390

    
391

    
392

    
393
            DOMSource domSource = new DOMSource(document);
394
            StringWriter writer = new StringWriter();
395
            StreamResult streamResult = new StreamResult(writer);
396
            TransformerFactory tf = TransformerFactory.newInstance();
397
            Transformer transformer = tf.newTransformer();
398
            transformer.transform(domSource, streamResult);
399
            result.setMetadataRecord(writer.toString());
400
            result.setCollectedFrom(ClaimUtils.COLLECTED_FROM_OPENAIRE);
401
            result.setRecordFormat(ClaimUtils.FORMAT_XML);
402
//            result.setFound(true);
403

    
404
        }
405
        return result;
406
    }
407

    
408

    
409
}
(2-2/2)