Project

General

Profile

1
/**
 * Note: log.debug(...) is equivalent to log.trace(...) in the application logs.
 * <p>
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0;
 * work around it by changing the resultSizeValue.
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.net.URL;
11
import java.net.HttpURLConnection;
12
import java.util.Iterator;
13
import java.util.Queue;
14
import java.util.concurrent.PriorityBlockingQueue;
15
import javax.xml.transform.OutputKeys;
16
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerConfigurationException;
18
import javax.xml.transform.TransformerFactory;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21
import javax.xml.xpath.*;
22

    
23
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
24
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27
import org.apache.commons.logging.Log;
28
import org.apache.commons.logging.LogFactory;
29
import org.w3c.dom.Node;
30
import org.w3c.dom.NodeList;
31
import org.xml.sax.InputSource;
32

    
33
/**
 * Iterator over the records of a REST API that is downloaded page by page.
 *
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
 * @date 2020-03-20
 *
 */
38
public class RestIterator implements Iterator<String> {
39

    
40
	// TODO: clean up the comments of replaced source code
41
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
42

    
43
	private static final String wrapName = "recordWrap";
44
	private String baseUrl;
45
	private String resumptionType;
46
	private String resumptionParam;
47
	private String resultFormatValue;
48
	private String queryParams;
49
	private int resultSizeValue;
50
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
51
	private int resultTotal = -1;
52
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
53
	private InputStream resultStream;
54
	private Transformer transformer;
55
	private XPath xpath;
56
	private String query;
57
	private XPathExpression xprResultTotalPath;
58
	private XPathExpression xprResumptionPath;
59
	private XPathExpression xprEntity;
60
	private String queryFormat;
61
	private String querySize;
62
	private String authMethod;
63
	private String authToken;
64
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
65
	private int discoverResultSize = 0;
66
	private int pagination = 1;
67

    
68
	/**
	 * Creates an iterator that is configured without authentication.
	 * <p>
	 * Delegates to the 13-argument constructor, passing an empty string for both
	 * {@code authMethod} and {@code authToken}.
	 *
	 * @param baseUrl            URL of the API, without query string
	 * @param resumptionType     resumption mode evaluated when a page is downloaded
	 * @param resumptionParam    name of the URL parameter carrying the resumption value
	 * @param resumptionXpath    XPath of the resumption token; a blank value is replaced by "/"
	 * @param resultTotalXpath   XPath of the total number of results
	 * @param resultFormatParam  name of the URL parameter selecting the result format; may be blank
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the URL parameter selecting the page size; may be blank
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        fixed query string appended directly after "?"
	 * @param entityXpath        XPath selecting the record nodes of a page
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
	) {
		this(
				baseUrl,
				resumptionType,
				resumptionParam,
				resumptionXpath,
				resultTotalXpath,
				resultFormatParam,
				resultFormatValue,
				resultSizeParam,
				resultSizeValueStr,
				queryParams,
				entityXpath,
				"",
				"");
	}
99

    
100
	/**
	 * Creates an iterator, compiles the XPath expressions and builds the URL of the first page.
	 *
	 * @param baseUrl            URL of the API, without query string
	 * @param resumptionType     resumption mode evaluated when a page is downloaded
	 * @param resumptionParam    name of the URL parameter carrying the resumption value
	 * @param resumptionXpath    XPath of the resumption token; a blank value is replaced by "/"
	 * @param resultTotalXpath   XPath of the total number of results
	 * @param resultFormatParam  name of the URL parameter selecting the result format; when blank,
	 *                           no format parameter is added to the URL
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the URL parameter selecting the page size; when blank,
	 *                           no size parameter is added to the URL
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        fixed query string appended directly after "?"
	 * @param entityXpath        XPath selecting the record nodes of a page
	 * @param authMethod         authentication method; "bearer" is compared against when a page is downloaded
	 * @param authToken          token sent in the Authorization header for bearer authentication
	 * @throws NumberFormatException if {@code resultSizeValueStr} is not a valid integer
	 * @throws IllegalStateException if the transformer or one of the XPath expressions cannot be set up
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken
	) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly; valueOf would box and immediately unbox
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
		this.authMethod = authMethod;
		this.authToken = authToken;

		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the original exception as cause so the stack trace of the real failure is not lost
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
134
	
135
	
136
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
137
			throws TransformerConfigurationException, XPathExpressionException {
138
		transformer = TransformerFactory.newInstance().newTransformer();
139
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
140
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
141
		xpath = XPathFactory.newInstance().newXPath();
142
		xprResultTotalPath = xpath.compile(resultTotalXpath);
143
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
144
		xprEntity = xpath.compile(entityXpath);
145
	}
146

    
147
	private void initQueue() {
148
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
149
	}
150

    
151
	private void disconnect() {
152
		// TODO close inputstream
153
	}
154

    
155
	/* (non-Javadoc)
156
	 * @see java.util.Iterator#hasNext()
157
	 */
158
	@Override
159
	public boolean hasNext() {
160
		if (recordQueue.isEmpty() && query.isEmpty()) {
161
			disconnect();
162
			return false;
163
		} else {
164
			return true;
165
		}
166
	}
167

    
168
	/* (non-Javadoc)
169
	 * @see java.util.Iterator#next()
170
	 */
171
	@Override
172
	public String next() {
173
		synchronized (recordQueue) {
174
			while (recordQueue.isEmpty() && !query.isEmpty()) {
175
				try {
176
					log.info("get Query: " + query);
177
					query = downloadPage(query);
178
					log.debug("next queryURL from downloadPage(): " + query);
179
				} catch (CollectorServiceException e) {
180
					log.debug("CollectorPlugin.next()-Exception: " + e);
181
					throw new RuntimeException(e);
182
				}
183
			}
184
			return recordQueue.poll();
185
		}
186
	}
187

    
188
	/*
189
	 * download page and return nextQuery
190
	 */
191
	private String downloadPage(String query) throws CollectorServiceException {
192
		String resultJson;
193
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
194
		String nextQuery = "";
195
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
196
		Node resultNode = null;
197
		NodeList nodeList = null;
198
		String qUrlArgument = "";
199
		int urlOldResumptionSize = 0;
200
		InputStream theHttpInputStream;
201

    
202
		try {
203
			URL qUrl = new URL(query);
204
			
205
			if (this.authMethod == "bearer") {
206
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
207
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
208
	        	conn.setRequestProperty("Content-Type","application/json");
209
	        	conn.setRequestMethod("GET");   
210
	        	theHttpInputStream = conn.getInputStream();
211
			} else {
212
				theHttpInputStream = qUrl.openStream();
213
			}
214
			
215
			resultStream = theHttpInputStream;
216
			if ("json".equals(resultFormatValue.toLowerCase())) {
217
				resultJson = IOUtils.toString(resultStream, "UTF-8");
218
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
219
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
220
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
221
				log.trace("before inputStream: " + resultXml);
222
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
223
				log.trace("after cleaning: " + resultXml);
224
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
225
			}
226

    
227
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
228
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
229
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
230
				log.debug("nodeList.length: " + nodeList.getLength());
231
				for (int i = 0; i < nodeList.getLength(); i++) {
232
					StringWriter sw = new StringWriter();
233
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
234
					recordQueue.add(sw.toString());
235
				}
236
			} else { log.info("resultXml is equal with emptyXml"); }
237

    
238
			resumptionInt += resultSizeValue;
239

    
240
			switch (resumptionType.toLowerCase()) {
241
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
242
				resumptionStr = xprResumptionPath.evaluate(resultNode);
243
				break;
244

    
245
			case "count":   // begin at one step for all records, iterate over items
246
				resumptionStr = Integer.toString(resumptionInt);
247
				break;
248

    
249
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
250
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
251
				qUrlArgument = qUrl.getQuery();
252
				String[] arrayQUrlArgument = qUrlArgument.split("&");
253
				for (String arrayUrlArgStr : arrayQUrlArgument) {
254
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
255
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
256
						urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
257
						log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
258
					}
259
				}
260

    
261
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
262
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
263
				) {
264
					// resumptionStr = "";
265
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
266
					resultTotal = discoverResultSize;
267
				} else {
268
					resumptionStr = Integer.toString(resumptionInt);
269
					resultTotal = resumptionInt + 1;
270
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
271
				}
272
				log.info("discoverResultSize:  " + discoverResultSize);
273
				break;
274

    
275
			case "pagination":
276
			case "page":         // pagination, iterate over pages
277
				pagination += 1;
278
				if (nodeList != null) {
279
					discoverResultSize += nodeList.getLength();
280
				} else {
281
					resultTotal = discoverResultSize;
282
					pagination = discoverResultSize;
283
				}
284
				resumptionInt = pagination;
285
				resumptionStr = Integer.toString(resumptionInt);
286
				break;
287

    
288
			default:        // otherwise: abort
289
				// resultTotal = resumptionInt;
290
				break;
291
			}
292

    
293
			if (resultTotal == -1) {
294
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
295
				if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
296
				log.info("resultTotal was -1 is now: " + resultTotal);
297
			}
298
			log.info("resultTotal: " + resultTotal);
299
			log.info("resInt: " + resumptionInt);
300
			if (resumptionInt <= resultTotal) {
301
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
302
			} else
303
				nextQuery = "";
304

    
305
			log.debug("nextQueryUrl: " + nextQuery);
306
			return nextQuery;
307

    
308
		} catch (Exception e) {
309
			log.error(e);
310
			throw new IllegalStateException("collection failed: " + e.getMessage());
311
		}
312
	}
313

    
314
	/**
315
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
316
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
317
	 * and work-around for the JSON to XML converting of org.json.XML-package.
318
	 *
319
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
320
	 *
321
	 * @param jsonInput
322
	 * @return convertedJsonKeynameOutput
323
	 */
324
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
325

    
326
		log.trace("before convertJsonKeyNames: " + jsonInput);
327
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
328
		// replace ' 's in JSON Namens with '_'
329
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
330
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
331
		}
332

    
333
		// replace forward-slash (sign '/' ) in JSON Names with '_'
334
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
335
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
336
		}
337

    
338
		// replace '(' in JSON Names with ''
339
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
340
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
341
		}
342

    
343
		// replace ')' in JSON Names with ''
344
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
345
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
346
		}
347

    
348
		// replace startNumbers in JSON Keynames with 'n_'
349
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
350
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
351
		}
352

    
353
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
354
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
355
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
356
		}
357

    
358
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
359
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
360
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
361
		//            }
362

    
363
		// replace '=' in JSON Keynames with '-'
364
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
365
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
366
		}
367

    
368
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
369
		return jsonInput;
370
	}
371

    
372
	/**
373
	 *
374
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
375
	 *          *
376
	 * @param bufferStr - XML string
377
	 * @return
378
	 */
379
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
380

    
381
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
382
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
383
		}
384

    
385
		// replace [#x10-#x1f] with ''
386
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
387
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
388
		//            }
389

    
390
		return bufferStr;
391
	}
392

    
393
}
(2-2/2)