Project

General

Profile

1
package eu.dnetlib.msro.workflows.nodes.download;
2

    
3
import java.io.ByteArrayInputStream;
4
import java.util.ArrayList;
5
import java.util.List;
6

    
7
import javax.xml.parsers.DocumentBuilder;
8
import javax.xml.parsers.DocumentBuilderFactory;
9
import javax.xml.xpath.XPath;
10
import javax.xml.xpath.XPathConstants;
11
import javax.xml.xpath.XPathExpression;
12
import javax.xml.xpath.XPathFactory;
13

    
14
import org.apache.commons.logging.Log;
15
import org.apache.commons.logging.LogFactory;
16
import org.w3c.dom.Document;
17
import org.w3c.dom.NodeList;
18

    
19
import com.google.common.base.Function;
20
import com.google.gson.Gson;
21

    
22
import eu.dnetlib.data.download.rmi.DownloadItem;
23

    
24
/**
25
 * The Class UrlExtractor.
26
 */
27
public class UrlExtractor implements Function<String, String> {
28

    
29
	private static final Log log = LogFactory.getLog(UrlExtractor.class);
30

    
31
	/** The xpath. */
32
	private String xpath;
33

    
34
	/** The xpath. */
35
	private String xpathMetadataID;
36

    
37
	public UrlExtractor(final String xpath, final String xpathMetadataID) {
38
		this.xpath = xpath;
39
		this.xpathMetadataID = xpathMetadataID;
40
	}
41

    
42
	/*
43
	 * (non-Javadoc)
44
	 * 
45
	 * @see com.google.common.base.Function#apply(java.lang.Object)
46
	 */
47
	@Override
48
	public String apply(final String input) {
49
		try {
50

    
51
			DownloadItem di = new DownloadItem();
52
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
53
			DocumentBuilder builder;
54
			builder = factory.newDocumentBuilder();
55
			Document doc = builder.parse(new ByteArrayInputStream(input.getBytes()));
56
			XPathFactory xPathFactory = XPathFactory.newInstance();
57
			XPath myXpath = xPathFactory.newXPath();
58
			XPathExpression expression = myXpath.compile(xpath);
59
			Object values = expression.evaluate(doc, XPathConstants.NODESET);
60
			di.setUrl(getNodes((NodeList) values));
61
			di.setOriginalUrl(getNodes((NodeList) values));
62
			expression = myXpath.compile(xpathMetadataID);
63
			String extracted_metadataId = expression.evaluate(doc);
64
			di.setIdItemMetadata(extracted_metadataId);
65
			// di.setFileName(extracted_metadataId);
66
			return di.toJSON();
67
		} catch (Exception e) {
68
			log.error("OPSSS... Something bad happen on evaluating ", e);
69
			return null;
70
		}
71

    
72
	}
73

    
74
	private String getNodes(final NodeList nodes) {
75
		List<String> extracted_Url = new ArrayList<String>();
76
		if (nodes != null) {
77
			for (int i = 0; i < nodes.getLength(); i++) {
78
				extracted_Url.add(nodes.item(i).getNodeValue());
79
			}
80
		}
81
		return new Gson().toJson(extracted_Url);
82
	}
83

    
84
	/**
85
	 * Gets the xpath.
86
	 * 
87
	 * @return the xpath
88
	 */
89
	public String getXpath() {
90
		return xpath;
91
	}
92

    
93
	/**
94
	 * Sets the xpath.
95
	 * 
96
	 * @param xpath
97
	 *            the xpath to set
98
	 */
99
	public void setXpath(final String xpath) {
100
		this.xpath = xpath;
101
	}
102

    
103
	/**
104
	 * @return the xpathMetadataID
105
	 */
106
	public String getXpathMetadataID() {
107
		return xpathMetadataID;
108
	}
109

    
110
	/**
111
	 * @param xpathMetadataID
112
	 *            the xpathMetadataID to set
113
	 */
114
	public void setXpathMetadataID(final String xpathMetadataID) {
115
		this.xpathMetadataID = xpathMetadataID;
116
	}
117

    
118
}
(2-2/2)