Project

General

Profile

1
/**
 * Note: log.debug(...) is equivalent to log.trace(...) in the application logs.
 * <p>
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0.
 * Workaround: change the resultSizeValue.
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.Locale;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;

import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
32

    
33
/**
34
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
35
 * @date 2020-03-20
36
 *
37
 */
38
public class RestIterator implements Iterator<String> {
39

    
40
	// TODO: clean up the comments of replaced source code
41
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
42

    
43
	private static final String wrapName = "recordWrap";
44
	private String baseUrl;
45
	private String resumptionType;
46
	private String resumptionParam;
47
	private String resultFormatValue;
48
	private String queryParams;
49
	private int resultSizeValue;
50
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
51
	private int resultTotal = -1;
52
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
53
	private InputStream resultStream;
54
	private Transformer transformer;
55
	private XPath xpath;
56
	private String query;
57
	private XPathExpression xprResultTotalPath;
58
	private XPathExpression xprResumptionPath;
59
	private XPathExpression xprEntity;
60
	private String queryFormat;
61
	private String querySize;
62
	private String authMethod;
63
	private String authToken;
64
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
65
	private int discoverResultSize = 0;
66
	private int pagination = 1;
67
	private String resultOffsetParam;
68

    
69
	/**
70
	 * RestIterator class
71
	 * 
72
	 * compatible to version before 1.3.33
73
	 * 
74
	 * @param baseUrl
75
	 * @param resumptionType
76
	 * @param resumptionParam
77
	 * @param resumptionXpath
78
	 * @param resultTotalXpath
79
	 * @param resultFormatParam
80
	 * @param resultFormatValue
81
	 * @param resultSizeParam
82
	 * @param resultSizeValueStr
83
	 * @param queryParams
84
	 * @param entityXpath
85
	 */
86
	public RestIterator(
87
			final String baseUrl,
88
			final String resumptionType,
89
			final String resumptionParam,
90
			final String resumptionXpath,
91
			final String resultTotalXpath,
92
			final String resultFormatParam,
93
			final String resultFormatValue,
94
			final String resultSizeParam,
95
			final String resultSizeValueStr,
96
			final String queryParams,
97
			final String entityXpath
98
	) {
99
		this(baseUrl,resumptionType,resumptionParam,resumptionXpath,resultTotalXpath,resultFormatParam,resultFormatValue,resultSizeParam,resultSizeValueStr,queryParams,entityXpath,"", "","");
100
	}
101

    
102
	/** RestIterator class
103
	 *  compatible to version 1.3.33
104
	 * @param baseUrl
105
	 * @param resumptionType
106
	 * @param resumptionParam
107
	 * @param resumptionXpath
108
	 * @param resultTotalXpath
109
	 * @param resultFormatParam
110
	 * @param resultFormatValue
111
	 * @param resultSizeParam
112
	 * @param resultSizeValueStr
113
	 * @param queryParams
114
	 * @param entityXpath
115
	 * @param authMethod
116
	 * @param authToken
117
	 */
118
	public RestIterator(
119
			final String baseUrl,
120
			final String resumptionType,
121
			final String resumptionParam,
122
			final String resumptionXpath,
123
			final String resultTotalXpath,
124
			final String resultFormatParam,
125
			final String resultFormatValue,
126
			final String resultSizeParam,
127
			final String resultSizeValueStr,
128
			final String queryParams,
129
			final String entityXpath,
130
			final String authMethod,
131
			final String authToken
132
	) {
133
		this(baseUrl,resumptionType,resumptionParam,resumptionXpath,resultTotalXpath,resultFormatParam,resultFormatValue,resultSizeParam,resultSizeValueStr,queryParams,entityXpath,"", "","");
134
	}
135
	
136
	/**
137
	 * 
138
	 * @param baseUrl
139
	 * @param resumptionType
140
	 * @param resumptionParam
141
	 * @param resumptionXpath
142
	 * @param resultTotalXpath
143
	 * @param resultFormatParam
144
	 * @param resultFormatValue
145
	 * @param resultSizeParam
146
	 * @param resultSizeValueStr
147
	 * @param queryParams
148
	 * @param entityXpath
149
	 * @param authMethod
150
	 * @param authToken
151
	 * @param resultOffsetParam
152
	 */
153
	public RestIterator(
154
			final String baseUrl,
155
			final String resumptionType,
156
			final String resumptionParam,
157
			final String resumptionXpath,
158
			final String resultTotalXpath,
159
			final String resultFormatParam,
160
			final String resultFormatValue,
161
			final String resultSizeParam,
162
			final String resultSizeValueStr,
163
			final String queryParams,
164
			final String entityXpath,
165
			final String authMethod,
166
			final String authToken,
167
			final String resultOffsetParam
168
	) {
169
		this.baseUrl = baseUrl;
170
		this.resumptionType = resumptionType;
171
		this.resumptionParam = resumptionParam;
172
		this.resultFormatValue = resultFormatValue;
173
		this.queryParams = queryParams;
174
		this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
175
		this.authMethod = authMethod;
176
		this.authToken = authToken;
177
		this.resultOffsetParam = resultOffsetParam;
178

    
179
		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
180
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
181

    
182
		try {
183
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
184
		} catch (Exception e) {
185
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
186
		}
187
		initQueue();
188
	}
189
	
190
	
191
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
192
			throws TransformerConfigurationException, XPathExpressionException {
193
		transformer = TransformerFactory.newInstance().newTransformer();
194
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
195
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
196
		xpath = XPathFactory.newInstance().newXPath();
197
		xprResultTotalPath = xpath.compile(resultTotalXpath);
198
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
199
		xprEntity = xpath.compile(entityXpath);
200
	}
201

    
202
	private void initQueue() {
203
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
204
	}
205

    
206
	private void disconnect() {
207
		// TODO close inputstream
208
	}
209

    
210
	/* (non-Javadoc)
211
	 * @see java.util.Iterator#hasNext()
212
	 */
213
	@Override
214
	public boolean hasNext() {
215
		if (recordQueue.isEmpty() && query.isEmpty()) {
216
			disconnect();
217
			return false;
218
		} else {
219
			return true;
220
		}
221
	}
222

    
223
	/* (non-Javadoc)
224
	 * @see java.util.Iterator#next()
225
	 */
226
	@Override
227
	public String next() {
228
		synchronized (recordQueue) {
229
			while (recordQueue.isEmpty() && !query.isEmpty()) {
230
				try {
231
					log.info("get Query: " + query);
232
					query = downloadPage(query);
233
					log.debug("next queryURL from downloadPage(): " + query);
234
				} catch (CollectorServiceException e) {
235
					log.debug("CollectorPlugin.next()-Exception: " + e);
236
					throw new RuntimeException(e);
237
				}
238
			}
239
			return recordQueue.poll();
240
		}
241
	}
242

    
243
	/*
244
	 * download page and return nextQuery
245
	 */
246
	private String downloadPage(String query) throws CollectorServiceException {
247
		String resultJson;
248
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
249
		String nextQuery = "";
250
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
251
		Node resultNode = null;
252
		NodeList nodeList = null;
253
		String qUrlArgument = "";
254
		int urlOldResumptionSize = 0;
255
		InputStream theHttpInputStream;
256

    
257
		try {
258
			URL qUrl = new URL(query);
259
			
260
			if (this.authMethod == "bearer") {
261
				log.trace("authMethode before inputStream: " + resultXml);
262

    
263
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
264
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
265
	        	conn.setRequestProperty("Content-Type","application/json");
266
	        	conn.setRequestMethod("GET");   
267
	        	theHttpInputStream = conn.getInputStream();
268
			} else {
269
				theHttpInputStream = qUrl.openStream();
270
			}
271
			
272
			resultStream = theHttpInputStream;
273
			if ("json".equals(resultFormatValue.toLowerCase())) {
274
				resultJson = IOUtils.toString(resultStream, "UTF-8");
275
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
276
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
277
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
278
				log.trace("before inputStream: " + resultXml);
279
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
280
				log.trace("after cleaning: " + resultXml);
281
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
282
			}
283

    
284
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
285
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
286
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
287
				log.debug("nodeList.length: " + nodeList.getLength());
288
				for (int i = 0; i < nodeList.getLength(); i++) {
289
					StringWriter sw = new StringWriter();
290
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
291
					recordQueue.add(sw.toString());
292
				}
293
			} else { log.info("resultXml is equal with emptyXml"); }
294

    
295
			resumptionInt += resultSizeValue;
296

    
297
			switch (resumptionType.toLowerCase()) {
298
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
299
				resumptionStr = xprResumptionPath.evaluate(resultNode);
300
				break;
301

    
302
			case "count":   // begin at one step for all records, iterate over items
303
				resumptionStr = Integer.toString(resumptionInt);
304
				break;
305

    
306
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
307
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
308
				qUrlArgument = qUrl.getQuery();
309
				String[] arrayQUrlArgument = qUrlArgument.split("&");
310
				for (String arrayUrlArgStr : arrayQUrlArgument) {
311
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
312
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
313
						if(isInteger(resumptionKeyValue[1])) {
314
							urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
315
							log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
316
						} else {
317
							log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
318
						}
319
					}
320
				}
321

    
322
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
323
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
324
				) {
325
					// resumptionStr = "";
326
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
327
					resultTotal = discoverResultSize;
328
				} else {
329
					resumptionStr = Integer.toString(resumptionInt);
330
					resultTotal = resumptionInt + 1;
331
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
332
				}
333
				log.info("discoverResultSize:  " + discoverResultSize);
334
				break;
335

    
336
			case "pagination":
337
			case "page":         // pagination, iterate over page numbers
338
				pagination += 1;
339
				if (nodeList != null) {
340
					discoverResultSize += nodeList.getLength();
341
				} else {
342
					resultTotal = discoverResultSize;
343
					pagination = discoverResultSize;
344
				}
345
				resumptionInt = pagination;
346
				resumptionStr = Integer.toString(resumptionInt);
347
				break;
348

    
349
			case "deep-cursor":   // size of result items unknown, iterate over items  (for supporting deep cursor in solr)
350
				// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");}
351
				// if ( (this.resultOffsetParam.length()) < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultOffsetParam' is less than 2");}
352

    
353
				resumptionStr = xprResumptionPath.evaluate(resultNode);
354
				queryParams = queryParams.replace("&cursor=*", "");
355
				// queryParams = queryParams.replace("&&",  "&");
356
				
357
				//resumptionStr += "&" + resultOffsetParam + "=" + Integer.toString(resumptionInt);
358
				
359
				log.debug("downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams);
360

    
361
				break;
362
			
363
			default:        // otherwise: abort
364
				// resultTotal = resumptionInt;
365
				break;
366
			}
367

    
368
		} catch (Exception e) {
369
			log.error(e);
370
			throw new IllegalStateException("collection failed: " + e.getMessage());
371
		}			
372
			
373
		try {
374
			if (resultTotal == -1) {
375
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
376
				if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
377
				log.info("resultTotal was -1 is now: " + resultTotal);
378
		}
379
		} catch(Exception e) {
380
			log.error(e);
381
			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
382
		}
383
		log.info("resultTotal: " + resultTotal);
384
		log.info("resInt: " + resumptionInt);
385
		if (resumptionInt <= resultTotal) {
386
			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
387
		} else
388
			nextQuery = "";
389

    
390
		log.debug("nextQueryUrl: " + nextQuery);
391
		return nextQuery;
392

    
393

    
394
	}
395

    
396
	/**
397
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
398
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
399
	 * and work-around for the JSON to XML converting of org.json.XML-package.
400
	 *
401
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
402
	 *
403
	 * @param jsonInput
404
	 * @return convertedJsonKeynameOutput
405
	 */
406
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
407

    
408
		log.trace("before convertJsonKeyNames: " + jsonInput);
409
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
410
		// replace ' 's in JSON Namens with '_'
411
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
412
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
413
		}
414

    
415
		// replace forward-slash (sign '/' ) in JSON Names with '_'
416
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
417
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
418
		}
419

    
420
		// replace '(' in JSON Names with ''
421
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
422
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
423
		}
424

    
425
		// replace ')' in JSON Names with ''
426
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
427
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
428
		}
429

    
430
		// add prefix of startNumbers in JSON Keynames with 'n_'
431
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
432
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
433
		}
434
        // add prefix of only numbers in JSON Keynames with 'm_'
435
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
436
                jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
437
        }
438

    
439
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
440
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
441
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
442
		}
443

    
444
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
445
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
446
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
447
		//            }
448

    
449
		// replace '=' in JSON Keynames with '-'
450
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
451
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
452
		}
453

    
454
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
455
		return jsonInput;
456
	}
457

    
458
	/**
459
	 *
460
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
461
	 *          *
462
	 * @param bufferStr - XML string
463
	 * @return
464
	 */
465
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
466

    
467
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
468
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
469
		}
470

    
471
		// replace [#x10-#x1f] with ''
472
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
473
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
474
		//            }
475

    
476
		return bufferStr;
477
	}
478
	
479
	private boolean isInteger(String s) {
480
		boolean isValidInteger = false;
481
		try {
482
			Integer.parseInt(s);
483

    
484
			// s is a valid integer
485

    
486
			isValidInteger = true;
487
		} catch (NumberFormatException ex) {
488
			// s is not an integer
489
		}
490

    
491
		return isValidInteger;
492
	}
493

    
494
}
(2-2/2)