Project

General

Profile

1
package eu.dnetlib.msro.workflows.nodes.download;
2

    
3
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.function.Function;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

import com.google.gson.Gson;

import eu.dnetlib.rmi.data.DownloadItem;
26

    
27
// TODO: Auto-generated Javadoc
28
/**
29
 * The Class UrlExtractor.
30
 */
31
public class UrlExtractor implements Function<String, String> {
32

    
33
	/** The Constant log. */
34
	private static final Log log = LogFactory.getLog(UrlExtractor.class);
35

    
36
	/** The xpath url. */
37
	private String xpathURL;
38

    
39
	/** The xpath. */
40
	private String xpathMetadataID;
41

    
42
	/** The xpath open access. */
43
	private String xpathOpenAccess;
44

    
45
	/** The xpath embargo date. */
46
	private String xpathEmbargoDate;
47

    
48
	/**
49
	 * Instantiates a new url extractor.
50
	 *
51
	 * @param xpath
52
	 *            the xpath
53
	 * @param xpathMetadataID
54
	 *            the xpath metadata id
55
	 */
56
	public UrlExtractor(final String xpath, final String xpathMetadataID, final String xpathOpenAccess, final String xpathEmbargoDate) {
57
		this.xpathURL = xpath;
58
		this.xpathMetadataID = xpathMetadataID;
59
		this.xpathOpenAccess = xpathOpenAccess;
60
		this.xpathEmbargoDate = xpathEmbargoDate;
61
	}
62

    
63
	/*
64
	 * (non-Javadoc)
65
	 *
66
	 * @see com.google.common.base.Function#apply(java.lang.Object)
67
	 */
68
	@Override
69
	public String apply(final String input) {
70
		try {
71
			final DownloadItem di = new DownloadItem();
72
			final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
73
			DocumentBuilder builder;
74
			builder = factory.newDocumentBuilder();
75
			if (input == null) {
76
				log.error("Metadata input is null");
77
				return null;
78
			}
79
			final Document doc = builder.parse(new ByteArrayInputStream(input.getBytes()));
80
			final XPathFactory xPathFactory = XPathFactory.newInstance();
81
			final XPath myXpath = xPathFactory.newXPath();
82
			XPathExpression expression = myXpath.compile(this.xpathURL);
83
			final Object values = expression.evaluate(doc, XPathConstants.NODESET);
84
			di.setUrl(getNodes((NodeList) values));
85
			di.setOriginalUrl(getNodes((NodeList) values));
86

    
87
			if (this.xpathOpenAccess != null) {
88
				expression = myXpath.compile(this.xpathOpenAccess);
89
				final String openAccess = expression.evaluate(doc);
90
				di.setOpenAccess(openAccess);
91
			}
92
			if (this.xpathEmbargoDate != null) {
93
				expression = myXpath.compile(this.xpathEmbargoDate);
94
				final String embargoDate = expression.evaluate(doc);
95
				if (!StringUtils.isEmpty(embargoDate)) {
96
					try {
97
						final Date date = new SimpleDateFormat("yyyy-MM-dd").parse(embargoDate);
98
						di.setEmbargoDate(date);
99
					} catch (final Exception pe) {}
100
				}
101
			}
102
			expression = myXpath.compile(this.xpathMetadataID);
103
			final String extracted_metadataId = expression.evaluate(doc);
104
			di.setIdItemMetadata(extracted_metadataId);
105
			return di.toJSON();
106
		} catch (final Exception e) {
107
			log.error("OPSSS... Something bad happen on evaluating ", e);
108
			return null;
109
		}
110

    
111
	}
112

    
113
	/**
114
	 * Gets the nodes.
115
	 *
116
	 * @param nodes
117
	 *            the nodes
118
	 * @return the nodes
119
	 */
120
	private String getNodes(final NodeList nodes) {
121
		final List<String> extracted_Url = new ArrayList<String>();
122
		if (nodes != null) {
123
			for (int i = 0; i < nodes.getLength(); i++) {
124
				extracted_Url.add(nodes.item(i).getNodeValue());
125
			}
126
		}
127
		return new Gson().toJson(extracted_Url);
128
	}
129

    
130
	/**
131
	 * Gets the xpath metadata id.
132
	 *
133
	 * @return the xpathMetadataID
134
	 */
135
	public String getXpathMetadataID() {
136
		return this.xpathMetadataID;
137
	}
138

    
139
	/**
140
	 * Sets the xpath metadata id.
141
	 *
142
	 * @param xpathMetadataID
143
	 *            the xpathMetadataID to set
144
	 */
145
	public void setXpathMetadataID(final String xpathMetadataID) {
146
		this.xpathMetadataID = xpathMetadataID;
147
	}
148

    
149
	/**
150
	 * Gets the xpath url.
151
	 *
152
	 * @return the xpath url
153
	 */
154
	public String getXpathURL() {
155
		return this.xpathURL;
156
	}
157

    
158
	/**
159
	 * Sets the xpath url.
160
	 *
161
	 * @param xpathURL
162
	 *            the new xpath url
163
	 */
164
	public void setXpathURL(final String xpathURL) {
165
		this.xpathURL = xpathURL;
166
	}
167

    
168
	/**
169
	 * Gets the xpath open access.
170
	 *
171
	 * @return the xpath open access
172
	 */
173
	public String getXpathOpenAccess() {
174
		return this.xpathOpenAccess;
175
	}
176

    
177
	/**
178
	 * Sets the xpath open access.
179
	 *
180
	 * @param xpathOpenAccess
181
	 *            the new xpath open access
182
	 */
183
	public void setXpathOpenAccess(final String xpathOpenAccess) {
184
		this.xpathOpenAccess = xpathOpenAccess;
185
	}
186

    
187
	/**
188
	 * Gets the xpath embargo date.
189
	 *
190
	 * @return the xpath embargo date
191
	 */
192
	public String getXpathEmbargoDate() {
193
		return this.xpathEmbargoDate;
194
	}
195

    
196
	/**
197
	 * Sets the xpath embargo date.
198
	 *
199
	 * @param xpathEmbargoDate
200
	 *            the new xpath embargo date
201
	 */
202
	public void setXpathEmbargoDate(final String xpathEmbargoDate) {
203
		this.xpathEmbargoDate = xpathEmbargoDate;
204
	}
205

    
206
}
(2-2/2)