Project

General

Profile

1
/**
2
 * Note: log.debug(...) output is equivalent to log.trace(...) in the application logs.
3
 * <p>
4
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0; as a workaround, change the resultSizeValue.
5
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.io.UnsupportedEncodingException;
11
import java.net.URL;
12
import java.net.URLEncoder;
13
import java.nio.charset.StandardCharsets;
14
import java.net.HttpURLConnection;
15
import java.util.Iterator;
16
import java.util.Queue;
17
import java.util.concurrent.PriorityBlockingQueue;
18
import javax.xml.transform.OutputKeys;
19
import javax.xml.transform.Transformer;
20
import javax.xml.transform.TransformerConfigurationException;
21
import javax.xml.transform.TransformerFactory;
22
import javax.xml.transform.dom.DOMSource;
23
import javax.xml.transform.stream.StreamResult;
24
import javax.xml.xpath.*;
25

    
26
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
27
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
28
import org.apache.commons.io.IOUtils;
29
import org.apache.commons.lang3.StringUtils;
30
import org.apache.commons.logging.Log;
31
import org.apache.commons.logging.LogFactory;
32
import org.w3c.dom.Node;
33
import org.w3c.dom.NodeList;
34
import org.xml.sax.InputSource;
35

    
36
/**
37
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
38
 * @date 2020-04-09
39
 *
40
 */
41
public class RestIterator implements Iterator<String> {
42

    
43
	// TODO: clean up the comments of replaced source code
44
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
45

    
46
	private static final String wrapName = "recordWrap";
47
	private String baseUrl;
48
	private String resumptionType;
49
	private String resumptionParam;
50
	private String resultFormatValue;
51
	private String queryParams;
52
	private int resultSizeValue;
53
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
54
	private int resultTotal = -1;
55
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
56
	private InputStream resultStream;
57
	private Transformer transformer;
58
	private XPath xpath;
59
	private String query;
60
	private XPathExpression xprResultTotalPath;
61
	private XPathExpression xprResumptionPath;
62
	private XPathExpression xprEntity;
63
	private String queryFormat;
64
	private String querySize;
65
	private String authMethod;
66
	private String authToken;
67
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
68
	private int discoverResultSize = 0;
69
	private int pagination = 1;
70

    
71
	/**
	 * Creates an iterator that sends its requests without authentication.
	 * <p>
	 * Signature compatible with versions before 1.3.33: it delegates to the 13-argument
	 * constructor and passes an empty string for both the auth method and the auth token.
	 *
	 * @param baseUrl            URL part placed in front of the '?' of every request
	 * @param resumptionType     resumption strategy (scan, count, discover, pagination/page, deep-cursor)
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption value in a response; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the URL parameter selecting the format; skipped when blank
	 * @param resultFormatValue  format value; "json" triggers the JSON to XML conversion
	 * @param resultSizeParam    name of the URL parameter selecting the page size; skipped when blank
	 * @param resultSizeValueStr page size as a decimal integer string
	 * @param queryParams        fixed query parameters appended right after the '?'
	 * @param entityXpath        XPath selecting the records of a response
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
	) {
		this(
				baseUrl,
				resumptionType,
				resumptionParam,
				resumptionXpath,
				resultTotalXpath,
				resultFormatParam,
				resultFormatValue,
				resultSizeParam,
				resultSizeValueStr,
				queryParams,
				entityXpath,
				"",
				"");
	}
103

    
104
	/**
	 * Creates an iterator with authentication settings and an (unused) result offset parameter.
	 * <p>
	 * Delegates to the 13-argument constructor. The given {@code authMethod} and
	 * {@code authToken} are forwarded; previously empty strings were passed instead, which
	 * silently discarded the credentials supplied by the caller.
	 *
	 * @param baseUrl            URL part placed in front of the '?' of every request
	 * @param resumptionType     resumption strategy (scan, count, discover, pagination/page, deep-cursor)
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption value in a response; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the URL parameter selecting the format; skipped when blank
	 * @param resultFormatValue  format value; "json" triggers the JSON to XML conversion
	 * @param resultSizeParam    name of the URL parameter selecting the page size; skipped when blank
	 * @param resultSizeValueStr page size as a decimal integer string
	 * @param queryParams        fixed query parameters appended right after the '?'
	 * @param entityXpath        XPath selecting the records of a response
	 * @param authMethod         authentication method, e.g. "bearer"
	 * @param authToken          token sent when the bearer method is used
	 * @param resultOffsetParam  accepted for signature compatibility only; not evaluated
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken,
			final String resultOffsetParam
	) {
		this(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam,
				resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath,
				authMethod, authToken);
	}
122

    
123
	/**
	 * Creates an iterator over the records of a paged REST API (signature of version 1.3.33).
	 * <p>
	 * Stores the request configuration, compiles the XPath expressions and prepares the URL of
	 * the first page. No request is sent by the constructor itself.
	 *
	 * @param baseUrl            URL part placed in front of the '?' of every request
	 * @param resumptionType     resumption strategy (scan, count, discover, pagination/page, deep-cursor)
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption value in a response; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results in a response
	 * @param resultFormatParam  name of the URL parameter selecting the format; skipped when blank
	 * @param resultFormatValue  format value; "json" triggers the JSON to XML conversion
	 * @param resultSizeParam    name of the URL parameter selecting the page size; skipped when blank
	 * @param resultSizeValueStr page size as a decimal integer string
	 * @param queryParams        fixed query parameters appended right after the '?'
	 * @param entityXpath        XPath selecting the records of a response
	 * @param authMethod         authentication method, e.g. "bearer"
	 * @param authToken          token sent when the bearer method is used
	 * @throws NumberFormatException if {@code resultSizeValueStr} is not a valid integer
	 * @throws IllegalStateException if the transformer or one of the XPath expressions cannot be set up
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken
	) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly; valueOf would box and immediately unbox
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
		this.authMethod = authMethod;
		this.authToken = authToken;

		// optional URL fragments: left empty when the corresponding parameter name is blank
		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the original exception as cause so the stack trace of the real failure survives
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
173
	
174
	
175
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
176
			throws TransformerConfigurationException, XPathExpressionException {
177
		transformer = TransformerFactory.newInstance().newTransformer();
178
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
179
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
180
		xpath = XPathFactory.newInstance().newXPath();
181
		xprResultTotalPath = xpath.compile(resultTotalXpath);
182
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
183
		xprEntity = xpath.compile(entityXpath);
184
	}
185

    
186
	private void initQueue() {
187
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
188
	}
189

    
190
	private void disconnect() {
191
		// TODO close inputstream
192
	}
193

    
194
	/* (non-Javadoc)
195
	 * @see java.util.Iterator#hasNext()
196
	 */
197
	@Override
198
	public boolean hasNext() {
199
		if (recordQueue.isEmpty() && query.isEmpty()) {
200
			disconnect();
201
			return false;
202
		} else {
203
			return true;
204
		}
205
	}
206

    
207
	/* (non-Javadoc)
208
	 * @see java.util.Iterator#next()
209
	 */
210
	@Override
211
	public String next() {
212
		synchronized (recordQueue) {
213
			while (recordQueue.isEmpty() && !query.isEmpty()) {
214
				try {
215
					log.info("get Query: " + query);
216
					query = downloadPage(query);
217
					log.debug("next queryURL from downloadPage(): " + query);
218
				} catch (CollectorServiceException e) {
219
					log.debug("CollectorPlugin.next()-Exception: " + e);
220
					throw new RuntimeException(e);
221
				}
222
			}
223
			return recordQueue.poll();
224
		}
225
	}
226

    
227
	/*
228
	 * download page and return nextQuery
229
	 */
230
	private String downloadPage(String query) throws CollectorServiceException {
231
		String resultJson;
232
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
233
		String nextQuery = "";
234
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
235
		Node resultNode = null;
236
		NodeList nodeList = null;
237
		String qUrlArgument = "";
238
		int urlOldResumptionSize = 0;
239
		InputStream theHttpInputStream;
240

    
241
		try {
242
			URL qUrl = new URL(query);
243
			
244
			if (this.authMethod == "bearer") {
245
				log.trace("authMethode before inputStream: " + resultXml);
246

    
247
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
248
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
249
	        	conn.setRequestProperty("Content-Type","application/json");
250
	        	conn.setRequestMethod("GET");   
251
	        	theHttpInputStream = conn.getInputStream();
252
			} else {
253
				theHttpInputStream = qUrl.openStream();
254
			}
255
			
256
			resultStream = theHttpInputStream;
257
			if ("json".equals(resultFormatValue.toLowerCase())) {
258
				resultJson = IOUtils.toString(resultStream, "UTF-8");
259
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
260
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
261
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
262
				log.trace("before inputStream: " + resultXml);
263
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
264
				log.trace("after cleaning: " + resultXml);
265
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
266
			}
267

    
268
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
269
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
270
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
271
				log.debug("nodeList.length: " + nodeList.getLength());
272
				for (int i = 0; i < nodeList.getLength(); i++) {
273
					StringWriter sw = new StringWriter();
274
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
275
					recordQueue.add(sw.toString());
276
				}
277
			} else { log.info("resultXml is equal with emptyXml"); }
278

    
279
			resumptionInt += resultSizeValue;
280

    
281
			switch (resumptionType.toLowerCase()) {
282
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
283
				resumptionStr = xprResumptionPath.evaluate(resultNode);
284
				break;
285

    
286
			case "count":   // begin at one step for all records, iterate over items
287
				resumptionStr = Integer.toString(resumptionInt);
288
				break;
289

    
290
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
291
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
292
				qUrlArgument = qUrl.getQuery();
293
				String[] arrayQUrlArgument = qUrlArgument.split("&");
294
				for (String arrayUrlArgStr : arrayQUrlArgument) {
295
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
296
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
297
						if(isInteger(resumptionKeyValue[1])) {
298
							urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
299
							log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
300
						} else {
301
							log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
302
						}
303
					}
304
				}
305

    
306
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
307
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
308
				) {
309
					// resumptionStr = "";
310
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
311
					resultTotal = discoverResultSize;
312
				} else {
313
					resumptionStr = Integer.toString(resumptionInt);
314
					resultTotal = resumptionInt + 1;
315
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
316
				}
317
				log.info("discoverResultSize:  " + discoverResultSize);
318
				break;
319

    
320
			case "pagination":
321
			case "page":         // pagination, iterate over page numbers
322
				pagination += 1;
323
				if (nodeList != null) {
324
					discoverResultSize += nodeList.getLength();
325
				} else {
326
					resultTotal = discoverResultSize;
327
					pagination = discoverResultSize;
328
				}
329
				resumptionInt = pagination;
330
				resumptionStr = Integer.toString(resumptionInt);
331
				break;
332

    
333
			case "deep-cursor":   // size of result items unknown, iterate over items  (for supporting deep cursor in solr)
334
				// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");}
335

    
336
				resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
337
				queryParams = queryParams.replace("&cursor=*", "");
338
				
339
				resumptionInt += nodeList.getLength();
340
				log.debug("downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams);
341

    
342
				break;
343
			
344
			default:        // otherwise: abort
345
				// resultTotal = resumptionInt;
346
				break;
347
			}
348

    
349
		} catch (Exception e) {
350
			log.error(e);
351
			throw new IllegalStateException("collection failed: " + e.getMessage());
352
		}			
353
			
354
		try {
355
			if (resultTotal == -1) {
356
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
357
				if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
358
				log.info("resultTotal was -1 is now: " + resultTotal);
359
		}
360
		} catch(Exception e) {
361
			log.error(e);
362
			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
363
		}
364
		log.info("resultTotal: " + resultTotal);
365
		log.info("resInt: " + resumptionInt);
366
		if (resumptionInt <= resultTotal) {
367
			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
368
		} else
369
			nextQuery = "";
370

    
371
		log.debug("nextQueryUrl: " + nextQuery);
372
		return nextQuery;
373

    
374

    
375
	}
376

    
377
	/**
378
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
379
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
380
	 * and work-around for the JSON to XML converting of org.json.XML-package.
381
	 *
382
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
383
	 *
384
	 * @param jsonInput
385
	 * @return convertedJsonKeynameOutput
386
	 */
387
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
388

    
389
		log.trace("before convertJsonKeyNames: " + jsonInput);
390
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
391
		// replace ' 's in JSON Namens with '_'
392
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
393
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
394
		}
395

    
396
		// replace forward-slash (sign '/' ) in JSON Names with '_'
397
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
398
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
399
		}
400

    
401
		// replace '(' in JSON Names with ''
402
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
403
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
404
		}
405

    
406
		// replace ')' in JSON Names with ''
407
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
408
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
409
		}
410

    
411
		// add prefix of startNumbers in JSON Keynames with 'n_'
412
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
413
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
414
		}
415
        // add prefix of only numbers in JSON Keynames with 'm_'
416
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
417
                jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
418
        }
419

    
420
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
421
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
422
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
423
		}
424

    
425
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
426
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
427
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
428
		//            }
429

    
430
		// replace '=' in JSON Keynames with '-'
431
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
432
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
433
		}
434

    
435
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
436
		return jsonInput;
437
	}
438

    
439
	/**
440
	 *
441
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
442
	 *          *
443
	 * @param bufferStr - XML string
444
	 * @return
445
	 */
446
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
447

    
448
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
449
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
450
		}
451

    
452
		// replace [#x10-#x1f] with ''
453
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
454
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
455
		//            }
456

    
457
		return bufferStr;
458
	}
459
	
460
	private boolean isInteger(String s) {
461
		boolean isValidInteger = false;
462
		try {
463
			Integer.parseInt(s);
464

    
465
			// s is a valid integer
466

    
467
			isValidInteger = true;
468
		} catch (NumberFormatException ex) {
469
			// s is not an integer
470
		}
471

    
472
		return isValidInteger;
473
	}
474
	
475
	// Method to encode a string value using `UTF-8` encoding scheme
476
    private String encodeValue(String value) {
477
        try {
478
            return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
479
        } catch (UnsupportedEncodingException ex) {
480
            throw new RuntimeException(ex.getCause());
481
        }
482
    }
483

    
484
}
(2-2/2)