Project

General

Profile

1
/**
2
 * Note: log.debug(...) is equivalent to log.trace(...) in the application logs.
3
 * <p>
4
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0; work around it by changing the resultSizeValue.
5
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.io.UnsupportedEncodingException;
11
import java.net.URL;
12
import java.net.URLEncoder;
13
import java.nio.charset.StandardCharsets;
14
import java.net.HttpURLConnection;
15
import java.util.Iterator;
16
import java.util.Queue;
17
import java.util.concurrent.PriorityBlockingQueue;
18
import javax.xml.transform.OutputKeys;
19
import javax.xml.transform.Transformer;
20
import javax.xml.transform.TransformerConfigurationException;
21
import javax.xml.transform.TransformerFactory;
22
import javax.xml.transform.dom.DOMSource;
23
import javax.xml.transform.stream.StreamResult;
24
import javax.xml.xpath.*;
25

    
26
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
27
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
28
import org.apache.commons.io.IOUtils;
29
import org.apache.commons.lang3.StringUtils;
30
import org.apache.commons.logging.Log;
31
import org.apache.commons.logging.LogFactory;
32
import org.w3c.dom.Node;
33
import org.w3c.dom.NodeList;
34
import org.xml.sax.InputSource;
35

    
36
/**
37
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
38
 * @date 2020-04-09
39
 *
40
 */
41
public class RestIterator implements Iterator<String> {
42

    
43
	// TODO: clean up the comments of replaced source code
44
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
45

    
46
	private static final String wrapName = "recordWrap";
47
	private String baseUrl;
48
	private String resumptionType;
49
	private String resumptionParam;
50
	private String resultFormatValue;
51
	private String queryParams;
52
	private int resultSizeValue;
53
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
54
	private int resultTotal = -1;
55
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
56
	private InputStream resultStream;
57
	private Transformer transformer;
58
	private XPath xpath;
59
	private String query;
60
	private XPathExpression xprResultTotalPath;
61
	private XPathExpression xprResumptionPath;
62
	private XPathExpression xprEntity;
63
	private String queryFormat;
64
	private String querySize;
65
	private String authMethod;
66
	private String authToken;
67
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
68
	private int discoverResultSize = 0;
69
	private int pagination = 1;
70

    
71
	/**
	 * Creates an iterator that harvests without authentication.
	 * <p>
	 * Signature kept for callers written against versions before 1.3.33: it simply
	 * delegates to the full constructor, passing an empty string for both
	 * {@code authMethod} and {@code authToken}.
	 *
	 * @param baseUrl            endpoint URL; the query string is appended after a "?"
	 * @param resumptionType     paging strategy (scan, count, discover, page/pagination, deep-cursor)
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption token inside a response
	 * @param resultTotalXpath   XPath of the total number of results inside a response
	 * @param resultFormatParam  name of the URL parameter selecting the response format
	 * @param resultFormatValue  value of the format parameter
	 * @param resultSizeParam    name of the URL parameter selecting the page size
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        additional, pre-built query parameters
	 * @param entityXpath        XPath selecting the individual records inside a response
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
	) {
		this(
				baseUrl,
				resumptionType,
				resumptionParam,
				resumptionXpath,
				resultTotalXpath,
				resultFormatParam,
				resultFormatValue,
				resultSizeParam,
				resultSizeValueStr,
				queryParams,
				entityXpath,
				"",   // authMethod: none
				""    // authToken: none
		);
	}
103

    
104
	/**
	 * Creates an iterator over the records of a REST endpoint (signature of version 1.3.33).
	 * <p>
	 * Stores the configuration, compiles the XPath expressions and prepares the URL of the
	 * first page. Nothing is downloaded here.
	 *
	 * @param baseUrl            endpoint URL; the query string is appended after a "?"
	 * @param resumptionType     paging strategy (scan, count, discover, page/pagination, deep-cursor)
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption token inside a response; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results inside a response
	 * @param resultFormatParam  name of the URL parameter selecting the response format; blank to omit it
	 * @param resultFormatValue  value of the format parameter ("json" triggers the JSON to XML conversion)
	 * @param resultSizeParam    name of the URL parameter selecting the page size; blank to omit it
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        additional, pre-built query parameters
	 * @param entityXpath        XPath selecting the individual records inside a response
	 * @param authMethod         authentication method; "bearer" is the only value evaluated by this class
	 * @param authToken          token sent with the request when bearer authentication is used
	 * @throws NumberFormatException if {@code resultSizeValueStr} is not a valid integer
	 * @throws IllegalStateException if the transformer or one of the XPath expressions cannot be set up
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken
	) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly instead of boxing and unboxing an Integer
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
		this.authMethod = authMethod;
		this.authToken = authToken;

		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the original exception as cause so the failing XPath / transformer setup stays traceable
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
154
	
155
	
156
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
157
			throws TransformerConfigurationException, XPathExpressionException {
158
		transformer = TransformerFactory.newInstance().newTransformer();
159
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
160
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
161
		xpath = XPathFactory.newInstance().newXPath();
162
		xprResultTotalPath = xpath.compile(resultTotalXpath);
163
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
164
		xprEntity = xpath.compile(entityXpath);
165
	}
166

    
167
	private void initQueue() {
168
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
169
	}
170

    
171
	private void disconnect() {
172
		// TODO close inputstream
173
	}
174

    
175
	/* (non-Javadoc)
176
	 * @see java.util.Iterator#hasNext()
177
	 */
178
	@Override
179
	public boolean hasNext() {
180
		if (recordQueue.isEmpty() && query.isEmpty()) {
181
			disconnect();
182
			return false;
183
		} else {
184
			return true;
185
		}
186
	}
187

    
188
	/* (non-Javadoc)
189
	 * @see java.util.Iterator#next()
190
	 */
191
	@Override
192
	public String next() {
193
		synchronized (recordQueue) {
194
			while (recordQueue.isEmpty() && !query.isEmpty()) {
195
				try {
196
					log.info("get Query: " + query);
197
					query = downloadPage(query);
198
					log.debug("next queryURL from downloadPage(): " + query);
199
				} catch (CollectorServiceException e) {
200
					log.debug("CollectorPlugin.next()-Exception: " + e);
201
					throw new RuntimeException(e);
202
				}
203
			}
204
			return recordQueue.poll();
205
		}
206
	}
207

    
208
	/*
209
	 * download page and return nextQuery
210
	 */
211
	private String downloadPage(String query) throws CollectorServiceException {
212
		String resultJson;
213
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
214
		String nextQuery = "";
215
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
216
		Node resultNode = null;
217
		NodeList nodeList = null;
218
		String qUrlArgument = "";
219
		int urlOldResumptionSize = 0;
220
		InputStream theHttpInputStream;
221

    
222
		try {
223
			URL qUrl = new URL(query);
224
			
225
			if (this.authMethod == "bearer") {
226
				log.trace("authMethode before inputStream: " + resultXml);
227

    
228
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
229
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
230
	        	conn.setRequestProperty("Content-Type","application/json");
231
	        	conn.setRequestMethod("GET");   
232
	        	theHttpInputStream = conn.getInputStream();
233
			} else {
234
				theHttpInputStream = qUrl.openStream();
235
			}
236
			
237
			resultStream = theHttpInputStream;
238
			if ("json".equals(resultFormatValue.toLowerCase())) {
239
				resultJson = IOUtils.toString(resultStream, "UTF-8");
240
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
241
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
242
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
243
				log.trace("before inputStream: " + resultXml);
244
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
245
				log.trace("after cleaning: " + resultXml);
246
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
247
			}
248

    
249
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
250
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
251
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
252
				log.debug("nodeList.length: " + nodeList.getLength());
253
				for (int i = 0; i < nodeList.getLength(); i++) {
254
					StringWriter sw = new StringWriter();
255
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
256
					recordQueue.add(sw.toString());
257
				}
258
			} else { log.info("resultXml is equal with emptyXml"); }
259

    
260
			resumptionInt += resultSizeValue;
261

    
262
			switch (resumptionType.toLowerCase()) {
263
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
264
				resumptionStr = xprResumptionPath.evaluate(resultNode);
265
				break;
266

    
267
			case "count":   // begin at one step for all records, iterate over items
268
				resumptionStr = Integer.toString(resumptionInt);
269
				break;
270

    
271
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
272
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
273
				qUrlArgument = qUrl.getQuery();
274
				String[] arrayQUrlArgument = qUrlArgument.split("&");
275
				for (String arrayUrlArgStr : arrayQUrlArgument) {
276
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
277
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
278
						if(isInteger(resumptionKeyValue[1])) {
279
							urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
280
							log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
281
						} else {
282
							log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
283
						}
284
					}
285
				}
286

    
287
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
288
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
289
				) {
290
					// resumptionStr = "";
291
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
292
					resultTotal = discoverResultSize;
293
				} else {
294
					resumptionStr = Integer.toString(resumptionInt);
295
					resultTotal = resumptionInt + 1;
296
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
297
				}
298
				log.info("discoverResultSize:  " + discoverResultSize);
299
				break;
300

    
301
			case "pagination":
302
			case "page":         // pagination, iterate over page numbers
303
				pagination += 1;
304
				if (nodeList != null) {
305
					discoverResultSize += nodeList.getLength();
306
				} else {
307
					resultTotal = discoverResultSize;
308
					pagination = discoverResultSize;
309
				}
310
				resumptionInt = pagination;
311
				resumptionStr = Integer.toString(resumptionInt);
312
				break;
313

    
314
			case "deep-cursor":   // size of result items unknown, iterate over items  (for supporting deep cursor in solr)
315
				// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");}
316

    
317
				resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
318
				queryParams = queryParams.replace("&cursor=*", "");
319
				
320
				resumptionInt += nodeList.getLength();
321
				log.debug("downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams);
322

    
323
				break;
324
			
325
			default:        // otherwise: abort
326
				// resultTotal = resumptionInt;
327
				break;
328
			}
329

    
330
		} catch (Exception e) {
331
			log.error(e);
332
			throw new IllegalStateException("collection failed: " + e.getMessage());
333
		}			
334
			
335
		try {
336
			if (resultTotal == -1) {
337
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
338
				if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
339
				log.info("resultTotal was -1 is now: " + resultTotal);
340
		}
341
		} catch(Exception e) {
342
			log.error(e);
343
			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
344
		}
345
		log.info("resultTotal: " + resultTotal);
346
		log.info("resInt: " + resumptionInt);
347
		if (resumptionInt <= resultTotal) {
348
			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
349
		} else
350
			nextQuery = "";
351

    
352
		log.debug("nextQueryUrl: " + nextQuery);
353
		return nextQuery;
354

    
355

    
356
	}
357

    
358
	/**
359
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
360
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
361
	 * and work-around for the JSON to XML converting of org.json.XML-package.
362
	 *
363
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
364
	 *
365
	 * @param jsonInput
366
	 * @return convertedJsonKeynameOutput
367
	 */
368
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
369

    
370
		log.trace("before convertJsonKeyNames: " + jsonInput);
371
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
372
		// replace ' 's in JSON Namens with '_'
373
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
374
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
375
		}
376

    
377
		// replace forward-slash (sign '/' ) in JSON Names with '_'
378
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
379
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
380
		}
381

    
382
		// replace '(' in JSON Names with ''
383
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
384
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
385
		}
386

    
387
		// replace ')' in JSON Names with ''
388
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
389
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
390
		}
391

    
392
		// add prefix of startNumbers in JSON Keynames with 'n_'
393
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
394
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
395
		}
396
        // add prefix of only numbers in JSON Keynames with 'm_'
397
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
398
                jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
399
        }
400

    
401
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
402
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
403
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
404
		}
405

    
406
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
407
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
408
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
409
		//            }
410

    
411
		// replace '=' in JSON Keynames with '-'
412
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
413
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
414
		}
415

    
416
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
417
		return jsonInput;
418
	}
419

    
420
	/**
421
	 *
422
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
423
	 *          *
424
	 * @param bufferStr - XML string
425
	 * @return
426
	 */
427
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
428

    
429
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
430
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
431
		}
432

    
433
		// replace [#x10-#x1f] with ''
434
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
435
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
436
		//            }
437

    
438
		return bufferStr;
439
	}
440
	
441
	private boolean isInteger(String s) {
442
		boolean isValidInteger = false;
443
		try {
444
			Integer.parseInt(s);
445

    
446
			// s is a valid integer
447

    
448
			isValidInteger = true;
449
		} catch (NumberFormatException ex) {
450
			// s is not an integer
451
		}
452

    
453
		return isValidInteger;
454
	}
455
	
456
	// Method to encode a string value using `UTF-8` encoding scheme
457
    private String encodeValue(String value) {
458
        try {
459
            return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
460
        } catch (UnsupportedEncodingException ex) {
461
            throw new RuntimeException(ex.getCause());
462
        }
463
    }
464

    
465
}
(2-2/2)