Project

General

Profile

1
/**
 * log.debug(...) is equal to log.trace(...) in the application logs
 * <p>
 * known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0 -> change the resultSizeValue
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.io.UnsupportedEncodingException;
11
import java.net.URL;
12
import java.net.URLEncoder;
13
import java.nio.charset.StandardCharsets;
14
import java.net.HttpURLConnection;
15
import java.util.Iterator;
16
import java.util.Queue;
17
import java.util.concurrent.PriorityBlockingQueue;
18
import javax.xml.transform.OutputKeys;
19
import javax.xml.transform.Transformer;
20
import javax.xml.transform.TransformerConfigurationException;
21
import javax.xml.transform.TransformerFactory;
22
import javax.xml.transform.dom.DOMSource;
23
import javax.xml.transform.stream.StreamResult;
24
import javax.xml.xpath.*;
25

    
26
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
27
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
28
import org.apache.commons.io.IOUtils;
29
import org.apache.commons.lang3.StringUtils;
30
import org.apache.commons.logging.Log;
31
import org.apache.commons.logging.LogFactory;
32
import org.w3c.dom.Node;
33
import org.w3c.dom.NodeList;
34
import org.xml.sax.InputSource;
35

    
36
/**
37
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
38
 * @date 2020-04-09
39
 *
40
 */
41
public class RestIterator implements Iterator<String> {
42

    
43
	// TODO: clean up the comments of replaced source code
44
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
45

    
46
	private static final String wrapName = "recordWrap";
47
	private String baseUrl;
48
	private String resumptionType;
49
	private String resumptionParam;
50
	private String resultFormatValue;
51
	private String queryParams;
52
	private int resultSizeValue;
53
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
54
	private int resultTotal = -1;
55
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
56
	private InputStream resultStream;
57
	private Transformer transformer;
58
	private XPath xpath;
59
	private String query;
60
	private XPathExpression xprResultTotalPath;
61
	private XPathExpression xprResumptionPath;
62
	private XPathExpression xprEntity;
63
	private String queryFormat;
64
	private String querySize;
65
	private String authMethod;
66
	private String authToken;
67
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
68
	private int discoverResultSize = 0;
69
	private int pagination = 1;
70

    
71
	/**
	 * Creates an iterator that sends no authentication information.
	 * <p>
	 * Compatible with versions before 1.3.33: every argument is forwarded unchanged to the
	 * constructor that additionally takes {@code authMethod} and {@code authToken}, and both of
	 * those are passed as empty strings.
	 *
	 * @param baseUrl            URL of the REST endpoint, without query string
	 * @param resumptionType     paging strategy
	 * @param resumptionParam    name of the query parameter carrying the resumption value
	 * @param resumptionXpath    XPath of the resumption token in a response
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the query parameter selecting the response format
	 * @param resultFormatValue  value of the response format parameter
	 * @param resultSizeParam    name of the query parameter selecting the page size
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        additional query parameters
	 * @param entityXpath        XPath selecting the records in a response
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
	) {
		this(
				baseUrl,
				resumptionType,
				resumptionParam,
				resumptionXpath,
				resultTotalXpath,
				resultFormatParam,
				resultFormatValue,
				resultSizeParam,
				resultSizeValueStr,
				queryParams,
				entityXpath,
				"",
				"");
	}
103

    
104
	/**
	 * Creates an iterator from the extended argument list that also carries a result offset parameter.
	 * <p>
	 * The authentication settings are forwarded to the main constructor; previously they were
	 * replaced by empty strings, which made it impossible to enable authentication through this
	 * overload. {@code resultOffsetParam} is accepted for signature compatibility only and is not
	 * used.
	 *
	 * @param baseUrl            URL of the REST endpoint, without query string
	 * @param resumptionType     paging strategy
	 * @param resumptionParam    name of the query parameter carrying the resumption value
	 * @param resumptionXpath    XPath of the resumption token in a response
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the query parameter selecting the response format
	 * @param resultFormatValue  value of the response format parameter
	 * @param resultSizeParam    name of the query parameter selecting the page size
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        additional query parameters
	 * @param entityXpath        XPath selecting the records in a response
	 * @param authMethod         authentication method
	 * @param authToken          token sent when authentication is used
	 * @param resultOffsetParam  currently ignored
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken,
			final String resultOffsetParam
	) {
		this(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam,
				resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath,
				authMethod, authToken);
	}
122

    
123
	/**
	 * Creates an iterator over the records of a REST endpoint (compatible to version 1.3.33).
	 * <p>
	 * Stores the configuration, pre-builds the format and size query fragments, compiles the XPath
	 * expressions and prepares the URL of the first page. No request is sent here.
	 *
	 * @param baseUrl            URL of the REST endpoint, without query string
	 * @param resumptionType     paging strategy
	 * @param resumptionParam    name of the query parameter carrying the resumption value
	 * @param resumptionXpath    XPath of the resumption token in a response; blank means "/"
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the response format parameter; blank omits it from the query
	 * @param resultFormatValue  value of the response format parameter
	 * @param resultSizeParam    name of the page size parameter; blank omits it from the query
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        additional query parameters
	 * @param entityXpath        XPath selecting the records in a response
	 * @param authMethod         authentication method
	 * @param authToken          token sent when authentication is used
	 * @throws NumberFormatException if {@code resultSizeValueStr} is not a valid integer
	 * @throws IllegalStateException if the XML transformer or an XPath expression cannot be set up
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken
	) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly; valueOf would box and immediately unbox
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
		this.authMethod = authMethod;
		this.authToken = authToken;

		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the original exception as cause so the stack trace of the real failure survives
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
173
	
174
	
175
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
176
			throws TransformerConfigurationException, XPathExpressionException {
177
		transformer = TransformerFactory.newInstance().newTransformer();
178
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
179
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
180
		xpath = XPathFactory.newInstance().newXPath();
181
		xprResultTotalPath = xpath.compile(resultTotalXpath);
182
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
183
		xprEntity = xpath.compile(entityXpath);
184
	}
185

    
186
	private void initQueue() {
187
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
188
	}
189

    
190
	private void disconnect() {
191
		// TODO close inputstream
192
	}
193

    
194
	/* (non-Javadoc)
195
	 * @see java.util.Iterator#hasNext()
196
	 */
197
	@Override
198
	public boolean hasNext() {
199
		if (recordQueue.isEmpty() && query.isEmpty()) {
200
			disconnect();
201
			return false;
202
		} else {
203
			return true;
204
		}
205
	}
206

    
207
	/* (non-Javadoc)
208
	 * @see java.util.Iterator#next()
209
	 */
210
	@Override
211
	public String next() {
212
		synchronized (recordQueue) {
213
			while (recordQueue.isEmpty() && !query.isEmpty()) {
214
				try {
215
					log.info("get Query: " + query);
216
					query = downloadPage(query);
217
					log.debug("next queryURL from downloadPage(): " + query);
218
				} catch (CollectorServiceException e) {
219
					log.debug("CollectorPlugin.next()-Exception: " + e);
220
					throw new RuntimeException(e);
221
				}
222
			}
223
			return recordQueue.poll();
224
		}
225
	}
226

    
227
	/*
228
	 * download page and return nextQuery
229
	 */
230
	private String downloadPage(String query) throws CollectorServiceException {
231
		String resultJson;
232
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
233
		String nextQuery = "";
234
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
235
		Node resultNode = null;
236
		NodeList nodeList = null;
237
		String qUrlArgument = "";
238
		int urlOldResumptionSize = 0;
239
		InputStream theHttpInputStream;
240
		
241
		// check if cursor=* is initial set otherwise add it to the queryParam URL
242
		if( resumptionType.equalsIgnoreCase("deep-cursor") ) {
243
			log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
244
			if(!query.contains("&cursor=")) {
245
				query += "&cursor=*";
246
			}
247
		}
248

    
249
		try {
250
			URL qUrl = new URL(query);
251
			
252
			if (this.authMethod == "bearer") {
253
				log.trace("authMethode before inputStream: " + resultXml);
254

    
255
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
256
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
257
	        	conn.setRequestProperty("Content-Type","application/json");
258
	        	conn.setRequestMethod("GET");   
259
	        	theHttpInputStream = conn.getInputStream();
260
			} else {
261
				theHttpInputStream = qUrl.openStream();
262
			}
263
			
264
			resultStream = theHttpInputStream;
265
			if ("json".equals(resultFormatValue.toLowerCase())) {
266
				resultJson = IOUtils.toString(resultStream, "UTF-8");
267
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
268
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
269
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
270
				log.trace("before inputStream: " + resultXml);
271
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
272
				log.trace("after cleaning: " + resultXml);
273
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
274
			}
275

    
276
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
277
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
278
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
279
				log.debug("nodeList.length: " + nodeList.getLength());
280
				for (int i = 0; i < nodeList.getLength(); i++) {
281
					StringWriter sw = new StringWriter();
282
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
283
					recordQueue.add(sw.toString());
284
				}
285
			} else { log.info("resultXml is equal with emptyXml"); }
286

    
287
			resumptionInt += resultSizeValue;
288

    
289
			switch (resumptionType.toLowerCase()) {
290
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
291
				resumptionStr = xprResumptionPath.evaluate(resultNode);
292
				break;
293

    
294
			case "count":   // begin at one step for all records, iterate over items
295
				resumptionStr = Integer.toString(resumptionInt);
296
				break;
297

    
298
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
299
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
300
				qUrlArgument = qUrl.getQuery();
301
				String[] arrayQUrlArgument = qUrlArgument.split("&");
302
				for (String arrayUrlArgStr : arrayQUrlArgument) {
303
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
304
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
305
						if(isInteger(resumptionKeyValue[1])) {
306
							urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
307
							log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
308
						} else {
309
							log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
310
						}
311
					}
312
				}
313

    
314
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
315
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
316
				) {
317
					// resumptionStr = "";
318
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
319
					resultTotal = discoverResultSize;
320
				} else {
321
					resumptionStr = Integer.toString(resumptionInt);
322
					resultTotal = resumptionInt + 1;
323
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
324
				}
325
				log.info("discoverResultSize:  " + discoverResultSize);
326
				break;
327

    
328
			case "pagination":
329
			case "page":         // pagination, iterate over page numbers
330
				pagination += 1;
331
				if (nodeList != null) {
332
					discoverResultSize += nodeList.getLength();
333
				} else {
334
					resultTotal = discoverResultSize;
335
					pagination = discoverResultSize;
336
				}
337
				resumptionInt = pagination;
338
				resumptionStr = Integer.toString(resumptionInt);
339
				break;
340

    
341
			case "deep-cursor":   // size of result items unknown, iterate over items  (for supporting deep cursor in solr)
342
				// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");}
343

    
344
				resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
345
				queryParams = queryParams.replace("&cursor=*", "");
346
				
347
				// terminating if length of nodeList is 0
348
				if(nodeList.getLength() == 0) {
349
					resumptionInt += ( 1 - resultSizeValue);
350
				} else {
351
					resumptionInt += (nodeList.getLength() - resultSizeValue);	// subtract the resultSizeValue because the iteration is over real length and the resultSizeValue is added before the switch()
352
				}
353
				log.debug("downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams + " resumptionLengthIncreased: " + resumptionInt);
354

    
355
				break;
356
			
357
			default:        // otherwise: abort
358
				// resultTotal = resumptionInt;
359
				break;
360
			}
361

    
362
		} catch (Exception e) {
363
			log.error(e);
364
			throw new IllegalStateException("collection failed: " + e.getMessage());
365
		}			
366
			
367
		try {
368
			if (resultTotal == -1) {
369
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
370
				if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
371
				log.info("resultTotal was -1 is now: " + resultTotal);
372
		}
373
		} catch(Exception e) {
374
			log.error(e);
375
			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
376
		}
377
		log.info("resultTotal: " + resultTotal);
378
		log.info("resInt: " + resumptionInt);
379
		if (resumptionInt <= resultTotal) {
380
			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
381
		} else
382
			nextQuery = "";
383

    
384
		log.debug("nextQueryUrl: " + nextQuery);
385
		return nextQuery;
386

    
387

    
388
	}
389

    
390
	/**
391
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
392
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
393
	 * and work-around for the JSON to XML converting of org.json.XML-package.
394
	 *
395
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
396
	 *
397
	 * @param jsonInput
398
	 * @return convertedJsonKeynameOutput
399
	 */
400
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
401

    
402
		log.trace("before convertJsonKeyNames: " + jsonInput);
403
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
404
		// replace ' 's in JSON Namens with '_'
405
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
406
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
407
		}
408

    
409
		// replace forward-slash (sign '/' ) in JSON Names with '_'
410
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
411
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
412
		}
413

    
414
		// replace '(' in JSON Names with ''
415
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
416
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
417
		}
418

    
419
		// replace ')' in JSON Names with ''
420
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
421
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
422
		}
423

    
424
		// add prefix of startNumbers in JSON Keynames with 'n_'
425
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
426
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
427
		}
428
        // add prefix of only numbers in JSON Keynames with 'm_'
429
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
430
                jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
431
        }
432

    
433
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
434
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
435
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
436
		}
437

    
438
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
439
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
440
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
441
		//            }
442

    
443
		// replace '=' in JSON Keynames with '-'
444
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
445
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
446
		}
447

    
448
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
449
		return jsonInput;
450
	}
451

    
452
	/**
453
	 *
454
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
455
	 *          *
456
	 * @param bufferStr - XML string
457
	 * @return
458
	 */
459
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
460

    
461
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
462
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
463
		}
464

    
465
		// replace [#x10-#x1f] with ''
466
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
467
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
468
		//            }
469

    
470
		return bufferStr;
471
	}
472
	
473
	private boolean isInteger(String s) {
474
		boolean isValidInteger = false;
475
		try {
476
			Integer.parseInt(s);
477

    
478
			// s is a valid integer
479

    
480
			isValidInteger = true;
481
		} catch (NumberFormatException ex) {
482
			// s is not an integer
483
		}
484

    
485
		return isValidInteger;
486
	}
487
	
488
	// Method to encode a string value using `UTF-8` encoding scheme
489
    private String encodeValue(String value) {
490
        try {
491
            return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
492
        } catch (UnsupportedEncodingException ex) {
493
            throw new RuntimeException(ex.getCause());
494
        }
495
    }
496

    
497
}
(2-2/2)