Project

General

Profile

1
package eu.dnetlib.msro.workflows.nodes.download;
2

    
3
import java.io.ByteArrayInputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import com.google.common.base.Function;
import com.google.gson.Gson;

import eu.dnetlib.data.download.rmi.DownloadItem;
27

    
28
// TODO: Auto-generated Javadoc
29
/**
30
 * The Class UrlExtractor.
31
 */
32
public class UrlExtractor implements Function<String, String> {
33

    
34
	/** The Constant log. */
35
	private static final Log log = LogFactory.getLog(UrlExtractor.class);
36

    
37
	/** The xpath url. */
38
	private String xpathURL;
39

    
40
	/** The xpath. */
41
	private String xpathMetadataID;
42

    
43
	/** The xpath open access. */
44
	private String xpathOpenAccess;
45

    
46
	/** The xpath embargo date. */
47
	private String xpathEmbargoDate;
48

    
49
	/**
50
	 * Instantiates a new url extractor.
51
	 *
52
	 * @param xpath
53
	 *            the xpath
54
	 * @param xpathMetadataID
55
	 *            the xpath metadata id
56
	 */
57
	public UrlExtractor(final String xpath, final String xpathMetadataID, final String xpathOpenAccess, final String xpathEmbargoDate) {
58
		this.xpathURL = xpath;
59
		this.xpathMetadataID = xpathMetadataID;
60
		this.xpathOpenAccess = xpathOpenAccess;
61
		this.xpathEmbargoDate = xpathEmbargoDate;
62
	}
63

    
64
	/*
65
	 * (non-Javadoc)
66
	 *
67
	 * @see com.google.common.base.Function#apply(java.lang.Object)
68
	 */
69
	@Override
70
	public String apply(final String input) {
71
		try {
72
			DownloadItem di = new DownloadItem();
73
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
74
			DocumentBuilder builder;
75
			builder = factory.newDocumentBuilder();
76
			if (input == null) {
77
				log.error("Metadata input is null");
78
				return null;
79
			}
80
			Document doc = builder.parse(new ByteArrayInputStream(input.getBytes()));
81
			XPathFactory xPathFactory = XPathFactory.newInstance();
82
			XPath myXpath = xPathFactory.newXPath();
83
			XPathExpression expression = myXpath.compile(xpathURL);
84
			Object values = expression.evaluate(doc, XPathConstants.NODESET);
85
			di.setUrl(getNodes((NodeList) values));
86
			di.setOriginalUrl(getNodes((NodeList) values));
87

    
88
			if (xpathOpenAccess != null) {
89
				expression = myXpath.compile(xpathOpenAccess);
90
				String openAccess = expression.evaluate(doc);
91
				di.setOpenAccess(openAccess);
92
			}
93
			if (xpathEmbargoDate != null) {
94
				expression = myXpath.compile(xpathEmbargoDate);
95
				String embargoDate = expression.evaluate(doc);
96
				if (!StringUtils.isEmpty(embargoDate)) {
97
					try {
98
						DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd");
99
						DateTime dt = fmt.parseDateTime(embargoDate);
100
						di.setEmbargoDate(dt.toDate());
101
					} catch (Exception pe) {}
102
				}
103
			}
104
			expression = myXpath.compile(xpathMetadataID);
105
			String extracted_metadataId = expression.evaluate(doc);
106
			di.setIdItemMetadata(extracted_metadataId);
107
			return di.toJSON();
108
		} catch (Exception e) {
109
			log.error("OPSSS... Something bad happen on evaluating ", e);
110
			return null;
111
		}
112

    
113
	}
114

    
115
	/**
116
	 * Gets the nodes.
117
	 *
118
	 * @param nodes
119
	 *            the nodes
120
	 * @return the nodes
121
	 */
122
	private String getNodes(final NodeList nodes) {
123
		List<String> extracted_Url = new ArrayList<String>();
124
		if (nodes != null) {
125
			for (int i = 0; i < nodes.getLength(); i++) {
126
				extracted_Url.add(nodes.item(i).getNodeValue());
127
			}
128
		}
129
		return new Gson().toJson(extracted_Url);
130
	}
131

    
132
	/**
133
	 * Gets the xpath metadata id.
134
	 *
135
	 * @return the xpathMetadataID
136
	 */
137
	public String getXpathMetadataID() {
138
		return xpathMetadataID;
139
	}
140

    
141
	/**
142
	 * Sets the xpath metadata id.
143
	 *
144
	 * @param xpathMetadataID
145
	 *            the xpathMetadataID to set
146
	 */
147
	public void setXpathMetadataID(final String xpathMetadataID) {
148
		this.xpathMetadataID = xpathMetadataID;
149
	}
150

    
151
	/**
152
	 * Gets the xpath url.
153
	 *
154
	 * @return the xpath url
155
	 */
156
	public String getXpathURL() {
157
		return xpathURL;
158
	}
159

    
160
	/**
161
	 * Sets the xpath url.
162
	 *
163
	 * @param xpathURL
164
	 *            the new xpath url
165
	 */
166
	public void setXpathURL(final String xpathURL) {
167
		this.xpathURL = xpathURL;
168
	}
169

    
170
	/**
171
	 * Gets the xpath open access.
172
	 *
173
	 * @return the xpath open access
174
	 */
175
	public String getXpathOpenAccess() {
176
		return xpathOpenAccess;
177
	}
178

    
179
	/**
180
	 * Sets the xpath open access.
181
	 *
182
	 * @param xpathOpenAccess
183
	 *            the new xpath open access
184
	 */
185
	public void setXpathOpenAccess(final String xpathOpenAccess) {
186
		this.xpathOpenAccess = xpathOpenAccess;
187
	}
188

    
189
	/**
190
	 * Gets the xpath embargo date.
191
	 *
192
	 * @return the xpath embargo date
193
	 */
194
	public String getXpathEmbargoDate() {
195
		return xpathEmbargoDate;
196
	}
197

    
198
	/**
199
	 * Sets the xpath embargo date.
200
	 *
201
	 * @param xpathEmbargoDate
202
	 *            the new xpath embargo date
203
	 */
204
	public void setXpathEmbargoDate(final String xpathEmbargoDate) {
205
		this.xpathEmbargoDate = xpathEmbargoDate;
206
	}
207

    
208
}
(2-2/2)