Project

General

Profile

1
/**
 * Note: log.debug(...) output is equivalent to log.trace(...) in the application logs.
 * <p>
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0;
 * work around it by changing the resultSizeValue.
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.net.URL;
11
import java.net.HttpURLConnection;
12
import java.util.Iterator;
13
import java.util.Queue;
14
import java.util.concurrent.PriorityBlockingQueue;
15
import javax.xml.transform.OutputKeys;
16
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerConfigurationException;
18
import javax.xml.transform.TransformerFactory;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21
import javax.xml.xpath.*;
22

    
23
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
24
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27
import org.apache.commons.logging.Log;
28
import org.apache.commons.logging.LogFactory;
29
import org.w3c.dom.Node;
30
import org.w3c.dom.NodeList;
31
import org.xml.sax.InputSource;
32

    
33
/**
34
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
35
 * @date 2020-03-20
36
 *
37
 */
38
public class RestIterator implements Iterator<String> {
39

    
40
	// TODO: clean up the comments of replaced source code
41
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
42

    
43
	private static final String wrapName = "recordWrap";
44
	private String baseUrl;
45
	private String resumptionType;
46
	private String resumptionParam;
47
	private String resultFormatValue;
48
	private String queryParams;
49
	private int resultSizeValue;
50
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
51
	private int resultTotal = -1;
52
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
53
	private InputStream resultStream;
54
	private Transformer transformer;
55
	private XPath xpath;
56
	private String query;
57
	private XPathExpression xprResultTotalPath;
58
	private XPathExpression xprResumptionPath;
59
	private XPathExpression xprEntity;
60
	private String queryFormat;
61
	private String querySize;
62
	private String authMethod;
63
	private String authToken;
64
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
65
	private int discoverResultSize = 0;
66
	private int pagination = 1;
67
	private String resultOffsetParam;
68

    
69
	/**
	 * Creates an iterator for endpoints that need neither authentication nor a
	 * result-offset parameter.
	 * <p>
	 * Signature kept compatible with versions before 1.3.33. All arguments are
	 * forwarded unchanged to the 14-argument constructor; {@code authMethod},
	 * {@code authToken} and {@code resultOffsetParam} are passed as empty strings.
	 *
	 * @param baseUrl            URL the query string is appended to
	 * @param resumptionType     resumption mode evaluated when the next page URL is built
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption value in a result page
	 * @param resultTotalXpath   XPath of the total number of results in a result page
	 * @param resultFormatParam  name of the URL parameter that selects the result format
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the URL parameter that sets the page size
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        query string placed directly after the '?'
	 * @param entityXpath        XPath selecting the records of a result page
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
	) {
		this(
				baseUrl,
				resumptionType,
				resumptionParam,
				resumptionXpath,
				resultTotalXpath,
				resultFormatParam,
				resultFormatValue,
				resultSizeParam,
				resultSizeValueStr,
				queryParams,
				entityXpath,
				"",   // authMethod
				"",   // authToken
				"");  // resultOffsetParam
	}
101

    
102
	/**
	 * Creates an iterator for endpoints that may require authentication but use no
	 * result-offset parameter.
	 * <p>
	 * Signature compatible with version 1.3.33. All arguments, including
	 * {@code authMethod} and {@code authToken}, are forwarded to the 14-argument
	 * constructor; {@code resultOffsetParam} is passed as an empty string.
	 *
	 * @param baseUrl            URL the query string is appended to
	 * @param resumptionType     resumption mode evaluated when the next page URL is built
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption value in a result page
	 * @param resultTotalXpath   XPath of the total number of results in a result page
	 * @param resultFormatParam  name of the URL parameter that selects the result format
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the URL parameter that sets the page size
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        query string placed directly after the '?'
	 * @param entityXpath        XPath selecting the records of a result page
	 * @param authMethod         authentication method checked before a page is requested
	 * @param authToken          token sent in the Authorization header for bearer authentication
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken
	) {
		// Forward the credentials: passing "" here would silently disable authentication
		// for every caller of this constructor.
		this(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam,
				resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath,
				authMethod, authToken, "");
	}
135
	
136
	/**
	 * Creates a fully configured iterator: stores the request settings, prepares the
	 * XML tooling and builds the URL of the first page.
	 *
	 * @param baseUrl            URL the query string is appended to
	 * @param resumptionType     resumption mode evaluated when the next page URL is built
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption value in a result page; a blank value is replaced by "/"
	 * @param resultTotalXpath   XPath of the total number of results in a result page
	 * @param resultFormatParam  name of the URL parameter that selects the result format; blank omits the parameter
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the URL parameter that sets the page size; blank omits the parameter
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        query string placed directly after the '?'
	 * @param entityXpath        XPath selecting the records of a result page
	 * @param authMethod         authentication method checked before a page is requested
	 * @param authToken          token sent in the Authorization header for bearer authentication
	 * @param resultOffsetParam  name of the URL parameter that carries the record offset in deep-cursor mode
	 * @throws NumberFormatException if {@code resultSizeValueStr} is not a parsable integer
	 * @throws IllegalStateException if the transformer cannot be created or an XPath expression does not compile
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken,
			final String resultOffsetParam
	) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly; Integer.valueOf would box and unbox for nothing
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
		this.authMethod = authMethod;
		this.authToken = authToken;
		this.resultOffsetParam = resultOffsetParam;

		// optional URL fragments: left empty when the parameter name is not configured
		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the original exception as cause so its type and stack trace are not lost
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
172
	
173
	
174
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
175
			throws TransformerConfigurationException, XPathExpressionException {
176
		transformer = TransformerFactory.newInstance().newTransformer();
177
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
178
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
179
		xpath = XPathFactory.newInstance().newXPath();
180
		xprResultTotalPath = xpath.compile(resultTotalXpath);
181
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
182
		xprEntity = xpath.compile(entityXpath);
183
	}
184

    
185
	private void initQueue() {
186
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
187
	}
188

    
189
	private void disconnect() {
190
		// TODO close inputstream
191
	}
192

    
193
	/* (non-Javadoc)
194
	 * @see java.util.Iterator#hasNext()
195
	 */
196
	@Override
197
	public boolean hasNext() {
198
		if (recordQueue.isEmpty() && query.isEmpty()) {
199
			disconnect();
200
			return false;
201
		} else {
202
			return true;
203
		}
204
	}
205

    
206
	/* (non-Javadoc)
207
	 * @see java.util.Iterator#next()
208
	 */
209
	@Override
210
	public String next() {
211
		synchronized (recordQueue) {
212
			while (recordQueue.isEmpty() && !query.isEmpty()) {
213
				try {
214
					log.info("get Query: " + query);
215
					query = downloadPage(query);
216
					log.debug("next queryURL from downloadPage(): " + query);
217
				} catch (CollectorServiceException e) {
218
					log.debug("CollectorPlugin.next()-Exception: " + e);
219
					throw new RuntimeException(e);
220
				}
221
			}
222
			return recordQueue.poll();
223
		}
224
	}
225

    
226
	/*
227
	 * download page and return nextQuery
228
	 */
229
	private String downloadPage(String query) throws CollectorServiceException {
230
		String resultJson;
231
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
232
		String nextQuery = "";
233
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
234
		Node resultNode = null;
235
		NodeList nodeList = null;
236
		String qUrlArgument = "";
237
		int urlOldResumptionSize = 0;
238
		InputStream theHttpInputStream;
239

    
240
		try {
241
			URL qUrl = new URL(query);
242
			
243
			if (this.authMethod == "bearer") {
244
				log.trace("authMethode before inputStream: " + resultXml);
245

    
246
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
247
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
248
	        	conn.setRequestProperty("Content-Type","application/json");
249
	        	conn.setRequestMethod("GET");   
250
	        	theHttpInputStream = conn.getInputStream();
251
			} else {
252
				theHttpInputStream = qUrl.openStream();
253
			}
254
			
255
			resultStream = theHttpInputStream;
256
			if ("json".equals(resultFormatValue.toLowerCase())) {
257
				resultJson = IOUtils.toString(resultStream, "UTF-8");
258
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
259
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
260
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
261
				log.trace("before inputStream: " + resultXml);
262
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
263
				log.trace("after cleaning: " + resultXml);
264
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
265
			}
266

    
267
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
268
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
269
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
270
				log.debug("nodeList.length: " + nodeList.getLength());
271
				for (int i = 0; i < nodeList.getLength(); i++) {
272
					StringWriter sw = new StringWriter();
273
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
274
					recordQueue.add(sw.toString());
275
				}
276
			} else { log.info("resultXml is equal with emptyXml"); }
277

    
278
			resumptionInt += resultSizeValue;
279

    
280
			switch (resumptionType.toLowerCase()) {
281
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
282
				resumptionStr = xprResumptionPath.evaluate(resultNode);
283
				break;
284

    
285
			case "count":   // begin at one step for all records, iterate over items
286
				resumptionStr = Integer.toString(resumptionInt);
287
				break;
288

    
289
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
290
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
291
				qUrlArgument = qUrl.getQuery();
292
				String[] arrayQUrlArgument = qUrlArgument.split("&");
293
				for (String arrayUrlArgStr : arrayQUrlArgument) {
294
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
295
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
296
						if(isInteger(resumptionKeyValue[1])) {
297
							urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
298
							log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
299
						} else {
300
							log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
301
						}
302
					}
303
				}
304

    
305
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
306
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
307
				) {
308
					// resumptionStr = "";
309
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
310
					resultTotal = discoverResultSize;
311
				} else {
312
					resumptionStr = Integer.toString(resumptionInt);
313
					resultTotal = resumptionInt + 1;
314
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
315
				}
316
				log.info("discoverResultSize:  " + discoverResultSize);
317
				break;
318

    
319
			case "pagination":
320
			case "page":         // pagination, iterate over page numbers
321
				pagination += 1;
322
				if (nodeList != null) {
323
					discoverResultSize += nodeList.getLength();
324
				} else {
325
					resultTotal = discoverResultSize;
326
					pagination = discoverResultSize;
327
				}
328
				resumptionInt = pagination;
329
				resumptionStr = Integer.toString(resumptionInt);
330
				break;
331

    
332
			case "deep-cursor":   // size of result items unknown, iterate over items  (for supporting deep cursor in solr)
333
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");}
334
				if (resultOffsetParam.length() < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultOffset' is less than 2");}
335

    
336
				resumptionStr = xprResumptionPath.evaluate(resultNode);
337
				queryParams.replace("cursor=*", "");
338
				queryParams.replace("&&",  "&");
339
				
340
				resumptionStr += "&" + resultOffsetParam + "=" + Integer.toString(resumptionInt);
341
				
342
				log.debug("downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams);
343

    
344
				break;
345
			
346
			default:        // otherwise: abort
347
				// resultTotal = resumptionInt;
348
				break;
349
			}
350

    
351
		} catch (Exception e) {
352
			log.error(e);
353
			throw new IllegalStateException("collection failed: " + e.getMessage());
354
		}			
355
			
356
		if (resultTotal == -1) {
357
			resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
358
			if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
359
			log.info("resultTotal was -1 is now: " + resultTotal);
360
		}
361
		log.info("resultTotal: " + resultTotal);
362
		log.info("resInt: " + resumptionInt);
363
		if (resumptionInt <= resultTotal) {
364
			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
365
		} else
366
			nextQuery = "";
367

    
368
		log.debug("nextQueryUrl: " + nextQuery);
369
		return nextQuery;
370

    
371

    
372
	}
373

    
374
	/**
375
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
376
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
377
	 * and work-around for the JSON to XML converting of org.json.XML-package.
378
	 *
379
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
380
	 *
381
	 * @param jsonInput
382
	 * @return convertedJsonKeynameOutput
383
	 */
384
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
385

    
386
		log.trace("before convertJsonKeyNames: " + jsonInput);
387
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
388
		// replace ' 's in JSON Namens with '_'
389
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
390
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
391
		}
392

    
393
		// replace forward-slash (sign '/' ) in JSON Names with '_'
394
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
395
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
396
		}
397

    
398
		// replace '(' in JSON Names with ''
399
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
400
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
401
		}
402

    
403
		// replace ')' in JSON Names with ''
404
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
405
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
406
		}
407

    
408
		// add prefix of startNumbers in JSON Keynames with 'n_'
409
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
410
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
411
		}
412
        // add prefix of only numbers in JSON Keynames with 'm_'
413
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
414
                jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
415
        }
416

    
417
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
418
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
419
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
420
		}
421

    
422
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
423
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
424
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
425
		//            }
426

    
427
		// replace '=' in JSON Keynames with '-'
428
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
429
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
430
		}
431

    
432
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
433
		return jsonInput;
434
	}
435

    
436
	/**
437
	 *
438
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
439
	 *          *
440
	 * @param bufferStr - XML string
441
	 * @return
442
	 */
443
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
444

    
445
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
446
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
447
		}
448

    
449
		// replace [#x10-#x1f] with ''
450
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
451
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
452
		//            }
453

    
454
		return bufferStr;
455
	}
456
	
457
	private boolean isInteger(String s) {
458
		boolean isValidInteger = false;
459
		try {
460
			Integer.parseInt(s);
461

    
462
			// s is a valid integer
463

    
464
			isValidInteger = true;
465
		} catch (NumberFormatException ex) {
466
			// s is not an integer
467
		}
468

    
469
		return isValidInteger;
470
	}
471

    
472
}
(2-2/2)