Project

General

Profile

1 50066 jochen.sch
/**
2
 *
3 52983 andreas.cz
 *
4
 * Note: in the application logs, log.debug(...) is equivalent to log.trace(...).
5 52997 andreas.cz
 *
6
 * Known bug: with resumptionType 'discover', collecting fails when (resultTotal % resultSizeValue) == 0; workaround: choose a different resultSizeValue.
7 50066 jochen.sch
 */
8
package eu.dnetlib.data.collector.plugins.rest;
9
10
import java.io.InputStream;
11 50582 jochen.sch
import java.io.StringWriter;
12 50066 jochen.sch
import java.net.URL;
13
import java.util.Iterator;
14
import java.util.Queue;
15 50582 jochen.sch
import java.util.concurrent.PriorityBlockingQueue;
16 50066 jochen.sch
17
import javax.xml.transform.OutputKeys;
18
import javax.xml.transform.Transformer;
19
import javax.xml.transform.TransformerConfigurationException;
20
import javax.xml.transform.TransformerFactory;
21 50582 jochen.sch
import javax.xml.transform.dom.DOMSource;
22
import javax.xml.transform.stream.StreamResult;
23 50066 jochen.sch
import javax.xml.xpath.XPath;
24
import javax.xml.xpath.XPathConstants;
25
import javax.xml.xpath.XPathExpression;
26
import javax.xml.xpath.XPathExpressionException;
27
import javax.xml.xpath.XPathFactory;
28
29
import org.apache.commons.io.IOUtils;
30 50662 alessia.ba
import org.apache.commons.lang3.StringUtils;
31 50582 jochen.sch
import org.apache.commons.logging.Log;
32
import org.apache.commons.logging.LogFactory;
33 50066 jochen.sch
import org.w3c.dom.Node;
34 50582 jochen.sch
import org.w3c.dom.NodeList;
35 50066 jochen.sch
import org.xml.sax.InputSource;
36
37 50582 jochen.sch
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
38 53116 andreas.cz
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
39 50582 jochen.sch
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
40
41 50066 jochen.sch
/**
42 52970 andreas.cz
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
43 52997 andreas.cz
 * @date 2018-09-03
44 50066 jochen.sch
 *
45
 */
46
public class RestIterator implements Iterator<String> {
47
48 52971 andreas.cz
    // TODO: clean up the comments of replaced source code
49 50582 jochen.sch
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
50
51 50066 jochen.sch
	private static final String wrapName = "recordWrap";
52
	private String baseUrl;
53
	private String resumptionType;
54
	private String resumptionParam;
55
	private String resultFormatValue;
56
	private String queryParams;
57 52970 andreas.cz
	private int resultSizeValue;
58 50066 jochen.sch
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
59
	private int resultTotal = -1;
60
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
61
	private InputStream resultStream;
62
	private Transformer transformer;
63
	private XPath xpath;
64 50582 jochen.sch
	private String query;
65 50066 jochen.sch
	private XPathExpression xprResultTotalPath;
66
	private XPathExpression xprResumptionPath;
67 50582 jochen.sch
	private XPathExpression xprEntity;
68 50066 jochen.sch
	private String queryFormat;
69
	private String querySize;
70 50582 jochen.sch
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
71 52983 andreas.cz
        private int discoverResultSize = 0;
72 53183 andreas.cz
        private int pagination = 1;
73 52983 andreas.cz
74 50066 jochen.sch
	/**
	 * Creates an iterator over the records of a REST endpoint and prepares the first request URL.
	 * No request is sent here; pages are downloaded lazily by next().
	 *
	 * @param baseUrl            part of the request URL that precedes the '?'
	 * @param resumptionType     resumption strategy ("scan", "count", "discover", "pagination" or "page")
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption token in a response; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the URL parameter selecting the result format; when blank no format is sent
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the URL parameter selecting the page size; when blank no size is sent
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        fixed query string placed directly after the '?'
	 * @param entityXpath        XPath selecting the record nodes of a response
	 * @throws NumberFormatException if resultSizeValueStr is not a parsable integer
	 * @throws IllegalStateException if the transformer or one of the XPath expressions cannot be set up
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
			) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt avoids the needless boxing of Integer.valueOf; both reject the same inputs.
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);

		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// Keep the original exception as cause so the failing XPath/transformer setup stays traceable.
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
104
105 50582 jochen.sch
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
106 50066 jochen.sch
		transformer = TransformerFactory.newInstance().newTransformer();
107 52970 andreas.cz
                transformer.setOutputProperty(OutputKeys.INDENT,"yes");
108
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
109
		xpath              = XPathFactory.newInstance().newXPath();
110 50066 jochen.sch
		xprResultTotalPath = xpath.compile(resultTotalXpath);
111 52970 andreas.cz
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
112
		xprEntity          = xpath.compile(entityXpath);
113 50066 jochen.sch
	}
114
115
	private void initQueue() {
116 52997 andreas.cz
//		query = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
117
                //  change at 2018-08-30 and remove resumptionParam&-Type from "query" URL because first request starts mostly at the beginning.
118
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
119
120 50066 jochen.sch
	}
121
122
	private void disconnect() {
123
		// TODO close inputstream
124
	}
125
126
	/* (non-Javadoc)
127
	 * @see java.util.Iterator#hasNext()
128
	 */
129
	@Override
130
	public boolean hasNext() {
131 50582 jochen.sch
		if (recordQueue.isEmpty() && query.isEmpty()) {
132 50066 jochen.sch
			disconnect();
133
			return false;
134
		} else {
135
			return true;
136
		}
137
	}
138
139
	/* (non-Javadoc)
140
	 * @see java.util.Iterator#next()
141
	 */
142
	@Override
143
	public String next() {
144 50582 jochen.sch
		synchronized (recordQueue) {
145
			while (recordQueue.isEmpty() && !query.isEmpty() ) {
146
				try {
147 52983 andreas.cz
                                        log.info("get Query: " + query);
148 50582 jochen.sch
					query = downloadPage(query);
149 53116 andreas.cz
                                        log.debug("next queryURL from downloadPage(): " + query);
150 50584 claudio.at
				} catch(CollectorServiceException e) {
151 52983 andreas.cz
                                        log.debug("CollectorPlugin.next()-Exception: " + e);
152 50582 jochen.sch
					throw new RuntimeException(e);
153
				}
154
			}
155
			return recordQueue.poll();
156
		}
157
	}
158
159
160
	/*
161
	 * download page and return nextQuery
162
	 */
163
	private String downloadPage(String query) throws CollectorServiceException{
164 50066 jochen.sch
		String resultJson;
165 53116 andreas.cz
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
166 50582 jochen.sch
		String nextQuery = "";
167 53116 andreas.cz
                String emptyXml = resultXml + "<"+wrapName+"></"+wrapName+">";
168 52979 andreas.cz
                Node resultNode = null;
169 52982 andreas.cz
                NodeList nodeList = null;
170 52983 andreas.cz
                String qUrlArgument = "";
171
                int urlOldResumptionSize = 0;
172 52979 andreas.cz
173 50066 jochen.sch
		try {
174 52983 andreas.cz
                        URL qUrl = new URL(query);
175
176
                        resultStream = qUrl.openStream();
177 53116 andreas.cz
			if("json".equals(resultFormatValue.toLowerCase())){
178
179 50066 jochen.sch
				resultJson = IOUtils.toString(resultStream,"UTF-8");
180 52970 andreas.cz
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
181 50066 jochen.sch
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
182 53116 andreas.cz
                                resultXml += org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
183
				log.trace("before inputStream: " + resultXml);
184 53123 andreas.cz
                                resultXml = XmlCleaner.cleanAllEntities(resultXml);
185
                                log.trace("after cleaning: " + resultXml);
186 50066 jochen.sch
				resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
187
			}
188
189 52979 andreas.cz
                        if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
190
                            resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
191
                            nodeList   = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
192 52982 andreas.cz
                            log.debug("nodeList.length: " + nodeList.getLength());
193 52979 andreas.cz
                            for (int i = 0; i < nodeList.getLength(); i++) {
194
                                StringWriter sw = new StringWriter();
195
                                transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
196
                                recordQueue.add(sw.toString());
197
                            }
198 52982 andreas.cz
                        } else { log.info("resultXml is equal with emptyXml"); }
199
200 50066 jochen.sch
			resumptionInt += resultSizeValue;
201 52970 andreas.cz
202 52971 andreas.cz
/*	replaced by switch statement as follow
203
                        if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
204
			if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); }
205
*/
206 52970 andreas.cz
                        switch(resumptionType.toLowerCase()) {
207 52997 andreas.cz
                            case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
208 52970 andreas.cz
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
209
                                            break;
210 52979 andreas.cz
211 52997 andreas.cz
                            case "count":   // begin at one step for all records, iterate over items
212 52970 andreas.cz
                                            resumptionStr = Integer.toString(resumptionInt);
213
                                            break;
214 52979 andreas.cz
215 53116 andreas.cz
                            case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
216 52997 andreas.cz
                                            if (resultSizeValue < 2 ) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
217 52983 andreas.cz
                                            qUrlArgument = qUrl.getQuery();
218
                                            String[] arrayQUrlArgument = qUrlArgument.split("&");
219
                                            for(String arrayUrlArgStr : arrayQUrlArgument ) {
220
                                                if(arrayUrlArgStr.startsWith(resumptionParam)) {
221
                                                    String[] resumptionKeyValue = arrayUrlArgStr.split("=");
222
                                                    urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
223
                                                    log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
224
                                                }
225
                                            }
226
227
                                            //
228
                                            if(   ( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) )
229
                                               || ( (nodeList != null) && (nodeList.getLength() < resultSizeValue) )
230
                                              ) {
231 52982 andreas.cz
                                                // resumptionStr = "";
232 52983 andreas.cz
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
233 52982 andreas.cz
                                                resultTotal   = discoverResultSize;
234 52970 andreas.cz
                                            } else {
235
                                                resumptionStr = Integer.toString(resumptionInt);
236
                                                resultTotal   = resumptionInt+1;
237 52982 andreas.cz
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
238 52970 andreas.cz
                                            }
239 52982 andreas.cz
                                            log.info("discoverResultSize:  " + discoverResultSize);
240 52970 andreas.cz
                                            break;
241 52979 andreas.cz
242 53183 andreas.cz
                            case "pagination":    // pagination, iterate over pages
243
                            case "page":
244
                                            pagination += 1;
245
                                            if (nodeList != null) {
246
                                                discoverResultSize += nodeList.getLength();
247
                                            } else {
248
                                                resultTotal = discoverResultSize;
249
                                                pagination  = discoverResultSize;
250
                                            }
251
                                            resumptionInt = pagination;
252
                                            resumptionStr = Integer.toString(resumptionInt);
253 52997 andreas.cz
                                            break;
254
255 52979 andreas.cz
                            default:        // otherwise: abort
256 52982 andreas.cz
                                            // resultTotal = resumptionInt;
257 52971 andreas.cz
                                            break;
258 52970 andreas.cz
                        }
259 52971 andreas.cz
260 50066 jochen.sch
			if (resultTotal == -1) {
261
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
262 52979 andreas.cz
				log.info("resultTotal was -1 is now: " + resultTotal);
263 50066 jochen.sch
			}
264 50584 claudio.at
			log.info("resultTotal: " + resultTotal);
265
			log.info("resInt: " + resumptionInt);
266 50066 jochen.sch
			if (resumptionInt < resultTotal) {
267 50582 jochen.sch
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
268 50584 claudio.at
			} else
269 50582 jochen.sch
				nextQuery = "";
270 52971 andreas.cz
271 52979 andreas.cz
                        log.debug("nextQueryUrl: " + nextQuery);
272 50582 jochen.sch
			return nextQuery;
273 50066 jochen.sch
274 50584 claudio.at
		} catch(Exception e) {
275
			log.error(e);
276
			throw new IllegalStateException("collection failed: " + e.getMessage());
277 50066 jochen.sch
		}
278
	}
279 52970 andreas.cz
280
        /**
281 52997 andreas.cz
         * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
282
         * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
283
         * and work-around for the JSON to XML converting of org.json.XML-package.
284 52970 andreas.cz
         *
285 53163 andreas.cz
         * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
286
         *
287 52970 andreas.cz
         * @param jsonInput
288 52997 andreas.cz
         * @return convertedJsonKeynameOutput
289 52970 andreas.cz
         */
290
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
291 50066 jochen.sch
292 52979 andreas.cz
            log.trace("before convertJsonKeyNames: " + jsonInput);
293 53116 andreas.cz
            // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
294 52970 andreas.cz
            // replace ' 's in JSON Namens with '_'
295
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
296
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
297
            }
298
299
            // replace forward-slash (sign '/' ) in JSON Names with '_'
300
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
301
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
302
            }
303
304
            // replace '(' in JSON Names with ''
305
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
306
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
307
            }
308 52997 andreas.cz
309 52970 andreas.cz
            // replace ')' in JSON Names with ''
310
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
311
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
312
            }
313
314 52997 andreas.cz
            // replace startNumbers in JSON Keynames with 'n_'
315
            while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
316
                jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
317
            }
318
319
            // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
320
            while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
321
                jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
322
            }
323
324 53163 andreas.cz
            // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
325
//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
326
//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
327
//            }
328 52997 andreas.cz
329
            // replace '=' in JSON Keynames with '-'
330
            while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
331
                jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
332
            }
333
334 52979 andreas.cz
            log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
335 52970 andreas.cz
            return jsonInput;
336
        }
337 53123 andreas.cz
338
        /**
339
         *
340 53163 andreas.cz
         * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
341 53123 andreas.cz
         *          *
342 53163 andreas.cz
         * @param bufferStr - XML string
343 53123 andreas.cz
         * @return
344
         */
345 53163 andreas.cz
        private static String cleanUnwantedJsonCharsInXmlTagnames( String bufferStr ) {
346
347
            while ( bufferStr.matches(".*<([^<>].*),(.)>.*") ) {
348
                bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
349
            }
350 52970 andreas.cz
351 53123 andreas.cz
            // replace [#x10-#x1f] with ''
352 53163 andreas.cz
//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
353
//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
354
//            }
355 53123 andreas.cz
356
            return bufferStr;
357
        }
358
359 50066 jochen.sch
}