Project

General

Profile

1
/**
 * Note: log.debug(...) output is equivalent to log.trace(...) in the application logs.
 * <p>
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0.
 * Workaround: change the resultSizeValue.
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.net.URL;
11
import java.net.HttpURLConnection;
12
import java.util.Iterator;
13
import java.util.Queue;
14
import java.util.concurrent.PriorityBlockingQueue;
15
import javax.xml.transform.OutputKeys;
16
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerConfigurationException;
18
import javax.xml.transform.TransformerFactory;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21
import javax.xml.xpath.*;
22

    
23
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
24
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27
import org.apache.commons.logging.Log;
28
import org.apache.commons.logging.LogFactory;
29
import org.w3c.dom.Node;
30
import org.w3c.dom.NodeList;
31
import org.xml.sax.InputSource;
32

    
33
/**
34
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
35
 * @date 2020-03-20
36
 *
37
 */
38
public class RestIterator implements Iterator<String> {
39

    
40
	// TODO: clean up the comments of replaced source code
41
	private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
42

    
43
	private static final String wrapName = "recordWrap";
44
	private String baseUrl;
45
	private String resumptionType;
46
	private String resumptionParam;
47
	private String resultFormatValue;
48
	private String queryParams;
49
	private int resultSizeValue;
50
	private int resumptionInt = 0;            // integer resumption token (first record to harvest)
51
	private int resultTotal = -1;
52
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
53
	private InputStream resultStream;
54
	private Transformer transformer;
55
	private XPath xpath;
56
	private String query;
57
	private XPathExpression xprResultTotalPath;
58
	private XPathExpression xprResumptionPath;
59
	private XPathExpression xprEntity;
60
	private String queryFormat;
61
	private String querySize;
62
	private String authMethod;
63
	private String authToken;
64
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
65
	private int discoverResultSize = 0;
66
	private int pagination = 1;
67

    
68
	/**
	 * Creates an iterator without authentication settings.
	 * <p>
	 * Delegates to the 13-argument constructor, passing empty strings for
	 * {@code authMethod} and {@code authToken}.
	 *
	 * @param baseUrl            URL of the endpoint; the query string is appended after a "?"
	 * @param resumptionType     paging strategy, passed through unchanged
	 * @param resumptionParam    name of the query parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption token
	 * @param resultTotalXpath   XPath of the total number of results
	 * @param resultFormatParam  name of the query parameter selecting the result format
	 * @param resultFormatValue  value of the result format parameter
	 * @param resultSizeParam    name of the query parameter selecting the page size
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        fixed part of the query string
	 * @param entityXpath        XPath selecting the records inside a result page
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
	) {
		this(
				baseUrl,
				resumptionType,
				resumptionParam,
				resumptionXpath,
				resultTotalXpath,
				resultFormatParam,
				resultFormatValue,
				resultSizeParam,
				resultSizeValueStr,
				queryParams,
				entityXpath,
				"",
				"");
	}
99

    
100
	/**
	 * Creates an iterator over the records of a REST endpoint.
	 * <p>
	 * Stores the configuration, compiles the XPath expressions and prepares the first query.
	 * No request is sent here.
	 *
	 * @param baseUrl            URL of the endpoint; the query string is appended after a "?"
	 * @param resumptionType     paging strategy; the values handled while downloading are
	 *                           "scan", "count", "discover", "pagination" and "page"
	 * @param resumptionParam    name of the query parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption token; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results
	 * @param resultFormatParam  name of the query parameter selecting the result format; when blank no
	 *                           format parameter is added to the query
	 * @param resultFormatValue  value of the result format parameter; "json" results are converted to XML
	 * @param resultSizeParam    name of the query parameter selecting the page size; when blank no
	 *                           size parameter is added to the query
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        fixed part of the query string
	 * @param entityXpath        XPath selecting the records inside a result page
	 * @param authMethod         authentication method; "bearer" makes the download send an Authorization header
	 * @param authToken          token sent with the bearer authentication
	 * @throws NumberFormatException if {@code resultSizeValueStr} is not a parsable integer
	 * @throws IllegalStateException if the XML tooling cannot be initialised; the cause is attached
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath,
			final String authMethod,
			final String authToken
	) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly instead of boxing and unboxing an Integer
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
		this.authMethod = authMethod;
		this.authToken = authToken;

		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the original exception as cause so the stack trace is not lost
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
134
	
135
	
136
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
137
			throws TransformerConfigurationException, XPathExpressionException {
138
		transformer = TransformerFactory.newInstance().newTransformer();
139
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
140
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
141
		xpath = XPathFactory.newInstance().newXPath();
142
		xprResultTotalPath = xpath.compile(resultTotalXpath);
143
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
144
		xprEntity = xpath.compile(entityXpath);
145
	}
146

    
147
	private void initQueue() {
148
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
149
	}
150

    
151
	private void disconnect() {
152
		// TODO close inputstream
153
	}
154

    
155
	/* (non-Javadoc)
156
	 * @see java.util.Iterator#hasNext()
157
	 */
158
	@Override
159
	public boolean hasNext() {
160
		if (recordQueue.isEmpty() && query.isEmpty()) {
161
			disconnect();
162
			return false;
163
		} else {
164
			return true;
165
		}
166
	}
167

    
168
	/* (non-Javadoc)
169
	 * @see java.util.Iterator#next()
170
	 */
171
	@Override
172
	public String next() {
173
		synchronized (recordQueue) {
174
			while (recordQueue.isEmpty() && !query.isEmpty()) {
175
				try {
176
					log.info("get Query: " + query);
177
					query = downloadPage(query);
178
					log.debug("next queryURL from downloadPage(): " + query);
179
				} catch (CollectorServiceException e) {
180
					log.debug("CollectorPlugin.next()-Exception: " + e);
181
					throw new RuntimeException(e);
182
				}
183
			}
184
			return recordQueue.poll();
185
		}
186
	}
187

    
188
	/*
189
	 * download page and return nextQuery
190
	 */
191
	private String downloadPage(String query) throws CollectorServiceException {
192
		String resultJson;
193
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
194
		String nextQuery = "";
195
		String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
196
		Node resultNode = null;
197
		NodeList nodeList = null;
198
		String qUrlArgument = "";
199
		int urlOldResumptionSize = 0;
200
		InputStream theHttpInputStream;
201

    
202
		try {
203
			URL qUrl = new URL(query);
204
			
205
			if (this.authMethod == "bearer") {
206
				log.trace("authMethode before inputStream: " + resultXml);
207

    
208
				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
209
	        	conn.setRequestProperty("Authorization","Bearer "+authToken);
210
	        	conn.setRequestProperty("Content-Type","application/json");
211
	        	conn.setRequestMethod("GET");   
212
	        	theHttpInputStream = conn.getInputStream();
213
			} else {
214
				theHttpInputStream = qUrl.openStream();
215
			}
216
			
217
			resultStream = theHttpInputStream;
218
			if ("json".equals(resultFormatValue.toLowerCase())) {
219
				resultJson = IOUtils.toString(resultStream, "UTF-8");
220
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
221
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
222
				resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
223
				log.trace("before inputStream: " + resultXml);
224
				resultXml = XmlCleaner.cleanAllEntities(resultXml);
225
				log.trace("after cleaning: " + resultXml);
226
				resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
227
			}
228

    
229
			if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
230
				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
231
				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
232
				log.debug("nodeList.length: " + nodeList.getLength());
233
				for (int i = 0; i < nodeList.getLength(); i++) {
234
					StringWriter sw = new StringWriter();
235
					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
236
					recordQueue.add(sw.toString());
237
				}
238
			} else { log.info("resultXml is equal with emptyXml"); }
239

    
240
			resumptionInt += resultSizeValue;
241

    
242
			switch (resumptionType.toLowerCase()) {
243
			case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
244
				resumptionStr = xprResumptionPath.evaluate(resultNode);
245
				break;
246

    
247
			case "count":   // begin at one step for all records, iterate over items
248
				resumptionStr = Integer.toString(resumptionInt);
249
				break;
250

    
251
			case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
252
				if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
253
				qUrlArgument = qUrl.getQuery();
254
				String[] arrayQUrlArgument = qUrlArgument.split("&");
255
				for (String arrayUrlArgStr : arrayQUrlArgument) {
256
					if (arrayUrlArgStr.startsWith(resumptionParam)) {
257
						String[] resumptionKeyValue = arrayUrlArgStr.split("=");
258
						urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
259
						log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
260
					}
261
				}
262

    
263
				if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
264
						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
265
				) {
266
					// resumptionStr = "";
267
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
268
					resultTotal = discoverResultSize;
269
				} else {
270
					resumptionStr = Integer.toString(resumptionInt);
271
					resultTotal = resumptionInt + 1;
272
					if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
273
				}
274
				log.info("discoverResultSize:  " + discoverResultSize);
275
				break;
276

    
277
			case "pagination":
278
			case "page":         // pagination, iterate over pages
279
				pagination += 1;
280
				if (nodeList != null) {
281
					discoverResultSize += nodeList.getLength();
282
				} else {
283
					resultTotal = discoverResultSize;
284
					pagination = discoverResultSize;
285
				}
286
				resumptionInt = pagination;
287
				resumptionStr = Integer.toString(resumptionInt);
288
				break;
289

    
290
			default:        // otherwise: abort
291
				// resultTotal = resumptionInt;
292
				break;
293
			}
294

    
295
			if (resultTotal == -1) {
296
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
297
				if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
298
				log.info("resultTotal was -1 is now: " + resultTotal);
299
			}
300
			log.info("resultTotal: " + resultTotal);
301
			log.info("resInt: " + resumptionInt);
302
			if (resumptionInt <= resultTotal) {
303
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
304
			} else
305
				nextQuery = "";
306

    
307
			log.debug("nextQueryUrl: " + nextQuery);
308
			return nextQuery;
309

    
310
		} catch (Exception e) {
311
			log.error(e);
312
			throw new IllegalStateException("collection failed: " + e.getMessage());
313
		}
314
	}
315

    
316
	/**
317
	 * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
318
	 * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
319
	 * and work-around for the JSON to XML converting of org.json.XML-package.
320
	 *
321
	 * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
322
	 *
323
	 * @param jsonInput
324
	 * @return convertedJsonKeynameOutput
325
	 */
326
	private String syntaxConvertJsonKeyNamens(String jsonInput) {
327

    
328
		log.trace("before convertJsonKeyNames: " + jsonInput);
329
		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
330
		// replace ' 's in JSON Namens with '_'
331
		while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
332
			jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
333
		}
334

    
335
		// replace forward-slash (sign '/' ) in JSON Names with '_'
336
		while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
337
			jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
338
		}
339

    
340
		// replace '(' in JSON Names with ''
341
		while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
342
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
343
		}
344

    
345
		// replace ')' in JSON Names with ''
346
		while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
347
			jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
348
		}
349

    
350
		// add prefix of startNumbers in JSON Keynames with 'n_'
351
		while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
352
			jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
353
		}
354
        // add prefix of only numbers in JSON Keynames with 'm_'
355
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
356
                jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
357
        }
358

    
359
		// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
360
		while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
361
			jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
362
		}
363

    
364
		// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
365
		//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
366
		//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
367
		//            }
368

    
369
		// replace '=' in JSON Keynames with '-'
370
		while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
371
			jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
372
		}
373

    
374
		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
375
		return jsonInput;
376
	}
377

    
378
	/**
379
	 *
380
	 * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
381
	 *          *
382
	 * @param bufferStr - XML string
383
	 * @return
384
	 */
385
	private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
386

    
387
		while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
388
			bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
389
		}
390

    
391
		// replace [#x10-#x1f] with ''
392
		//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
393
		//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
394
		//            }
395

    
396
		return bufferStr;
397
	}
398

    
399
}
(2-2/2)