Project

General

Profile

1
/**
 * Note: log.debug(...) is equivalent to log.trace(...) in the application logs.
 *
 * Known bug: with resumptionType 'discover', collecting fails if
 * (resultTotal % resultSizeValue) == 0; work around it by changing the resultSizeValue.
 */
8
package eu.dnetlib.data.collector.plugins.rest;
9

    
10
import java.io.InputStream;
11
import java.io.StringWriter;
12
import java.net.URL;
13
import java.util.Iterator;
14
import java.util.Queue;
15
import java.util.concurrent.PriorityBlockingQueue;
16

    
17
import javax.xml.transform.OutputKeys;
18
import javax.xml.transform.Transformer;
19
import javax.xml.transform.TransformerConfigurationException;
20
import javax.xml.transform.TransformerFactory;
21
import javax.xml.transform.dom.DOMSource;
22
import javax.xml.transform.stream.StreamResult;
23
import javax.xml.xpath.XPath;
24
import javax.xml.xpath.XPathConstants;
25
import javax.xml.xpath.XPathExpression;
26
import javax.xml.xpath.XPathExpressionException;
27
import javax.xml.xpath.XPathFactory;
28

    
29
import org.apache.commons.io.IOUtils;
30
import org.apache.commons.lang3.StringUtils;
31
import org.apache.commons.logging.Log;
32
import org.apache.commons.logging.LogFactory;
33
import org.w3c.dom.Node;
34
import org.w3c.dom.NodeList;
35
import org.xml.sax.InputSource;
36

    
37
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
38
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
39
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
40

    
41
/**
42
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
43
 * @date 2018-09-03
44
 *
45
 */
46
public class RestIterator implements Iterator<String> {
47

    
48
    // TODO: clean up the comments of replaced source code
49
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
50

    
51
	private static final String wrapName = "recordWrap";
52
	private String baseUrl;
53
	private String resumptionType;
54
	private String resumptionParam;
55
	private String resultFormatValue;
56
	private String queryParams;
57
	private int resultSizeValue;
58
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
59
	private int resultTotal = -1;
60
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
61
	private InputStream resultStream;
62
	private Transformer transformer;
63
	private XPath xpath;
64
	private String query;
65
	private XPathExpression xprResultTotalPath;
66
	private XPathExpression xprResumptionPath;
67
	private XPathExpression xprEntity;
68
	private String queryFormat;
69
	private String querySize;
70
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
71
        private int discoverResultSize = 0;
72
        private int pagination = 1;
73

    
74
	public RestIterator(
75
			final String baseUrl,
76
			final String resumptionType,
77
			final String resumptionParam,
78
			final String resumptionXpath,
79
			final String resultTotalXpath,
80
			final String resultFormatParam,
81
			final String resultFormatValue,
82
			final String resultSizeParam,
83
                        final String resultSizeValueStr,
84
			final String queryParams,
85
			final String entityXpath
86
			) {
87
		this.baseUrl = baseUrl;
88
		this.resumptionType = resumptionType;
89
		this.resumptionParam = resumptionParam;
90
		this.resultFormatValue = resultFormatValue;
91
		this.queryParams = queryParams;
92
                this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
93
                        
94
        queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
95
        querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
96

    
97
		try {
98
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
99
		} catch(Exception e) {
100
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
101
		}
102
        initQueue();
103
	}
104
	
105
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
106
		transformer = TransformerFactory.newInstance().newTransformer();
107
                transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
108
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
109
		xpath              = XPathFactory.newInstance().newXPath();
110
		xprResultTotalPath = xpath.compile(resultTotalXpath);
111
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
112
		xprEntity          = xpath.compile(entityXpath);
113
	}
114
	
115
	private void initQueue() {
116
//		query = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
117
                //  change at 2018-08-30 and remove resumptionParam&-Type from "query" URL because first request starts mostly at the beginning.
118
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
119

    
120
	}
121
	
122
	private void disconnect() {
123
		// TODO close inputstream
124
	}
125
	
126
	/* (non-Javadoc)
127
	 * @see java.util.Iterator#hasNext()
128
	 */
129
	@Override
130
	public boolean hasNext() {
131
		if (recordQueue.isEmpty() && query.isEmpty()) {
132
			disconnect();
133
			return false;
134
		} else {
135
			return true;
136
		}
137
	}
138

    
139
	/* (non-Javadoc)
140
	 * @see java.util.Iterator#next()
141
	 */
142
	@Override
143
	public String next() {
144
		synchronized (recordQueue) {
145
			while (recordQueue.isEmpty() && !query.isEmpty() ) {
146
				try {
147
                                        log.info("get Query: " + query);
148
					query = downloadPage(query);
149
                                        log.debug("next queryURL from downloadPage(): " + query);
150
				} catch(CollectorServiceException e) {
151
                                        log.debug("CollectorPlugin.next()-Exception: " + e);
152
					throw new RuntimeException(e);
153
				}
154
			}
155
			return recordQueue.poll();
156
		}
157
	}
158
	
159
	
160
	/*
161
	 * download page and return nextQuery
162
	 */
163
	private String downloadPage(String query) throws CollectorServiceException{
164
		String resultJson;
165
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
166
		String nextQuery = "";
167
                String emptyXml = resultXml + "<"+wrapName+"></"+wrapName+">";
168
                Node resultNode = null;
169
                NodeList nodeList = null;
170
                String qUrlArgument = "";
171
                int urlOldResumptionSize = 0;
172
                
173
		try {
174
                        URL qUrl = new URL(query);
175
                        
176
                        resultStream = qUrl.openStream();
177
			if("json".equals(resultFormatValue.toLowerCase())){	
178
                            
179
				resultJson = IOUtils.toString(resultStream,"UTF-8");
180
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
181
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
182
                                resultXml += org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
183
				log.trace("before inputStream: " + resultXml);
184
                                resultXml = XmlCleaner.cleanAllEntities(resultXml);
185
                                log.trace("after cleaning: " + resultXml);
186
				resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
187
			}
188
			
189
                        if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
190
                            resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
191
                            nodeList   = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
192
                            log.debug("nodeList.length: " + nodeList.getLength());
193
                            for (int i = 0; i < nodeList.getLength(); i++) {
194
                                StringWriter sw = new StringWriter();
195
                                transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
196
                                recordQueue.add(sw.toString());
197
                            }
198
                        } else { log.info("resultXml is equal with emptyXml"); }	
199
                        
200
			resumptionInt += resultSizeValue;
201
                        
202
/*	replaced by switch statement as follow		
203
                        if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
204
			if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); }
205
*/
206
                        switch(resumptionType.toLowerCase()) {
207
                            case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
208
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
209
                                            break;
210
                                            
211
                            case "count":   // begin at one step for all records, iterate over items
212
                                            resumptionStr = Integer.toString(resumptionInt);
213
                                            break;
214
                                            
215
                            case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)                                   
216
                                            if (resultSizeValue < 2 ) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
217
                                            qUrlArgument = qUrl.getQuery();
218
                                            String[] arrayQUrlArgument = qUrlArgument.split("&");
219
                                            for(String arrayUrlArgStr : arrayQUrlArgument ) {
220
                                                if(arrayUrlArgStr.startsWith(resumptionParam)) {
221
                                                    String[] resumptionKeyValue = arrayUrlArgStr.split("=");
222
                                                    urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
223
                                                    log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
224
                                                }
225
                                            }
226

    
227
                                            // 
228
                                            if(   ( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) 
229
                                               || ( (nodeList != null) && (nodeList.getLength() < resultSizeValue) ) 
230
                                              ) {
231
                                                // resumptionStr = "";
232
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
233
                                                resultTotal   = discoverResultSize;
234
                                            } else {
235
                                                resumptionStr = Integer.toString(resumptionInt);
236
                                                resultTotal   = resumptionInt+1;
237
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
238
                                            }
239
                                            log.info("discoverResultSize:  " + discoverResultSize);
240
                                            break;
241
                                            
242
                            case "pagination":    // pagination, iterate over pages
243
                            case "page":
244
                                            pagination += 1;
245
                                            if (nodeList != null) {
246
                                                discoverResultSize += nodeList.getLength();
247
                                            } else {
248
                                                resultTotal = discoverResultSize;
249
                                                pagination  = discoverResultSize;
250
                                            }
251
                                            resumptionInt = pagination;
252
                                            resumptionStr = Integer.toString(resumptionInt);
253
                                            break;
254
                                            
255
                            default:        // otherwise: abort
256
                                            // resultTotal = resumptionInt;
257
                                            break;
258
                        }
259

    
260
			if (resultTotal == -1) {
261
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
262
				log.info("resultTotal was -1 is now: " + resultTotal);
263
			}
264
			log.info("resultTotal: " + resultTotal);
265
			log.info("resInt: " + resumptionInt);
266
			if (resumptionInt < resultTotal) {
267
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
268
			} else
269
				nextQuery = "";
270
                        
271
                        log.debug("nextQueryUrl: " + nextQuery);
272
			return nextQuery;
273

    
274
		} catch(Exception e) {
275
			log.error(e);
276
			throw new IllegalStateException("collection failed: " + e.getMessage());
277
		}
278
	}
279
        
280
        /**
281
         * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
282
         * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
283
         * and work-around for the JSON to XML converting of org.json.XML-package.
284
         * 
285
         * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
286
         * 
287
         * @param jsonInput
288
         * @return convertedJsonKeynameOutput
289
         */
290
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
291

    
292
            log.trace("before convertJsonKeyNames: " + jsonInput);
293
            // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
294
            // replace ' 's in JSON Namens with '_'
295
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
296
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
297
            }
298

    
299
            // replace forward-slash (sign '/' ) in JSON Names with '_'
300
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
301
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
302
            }
303

    
304
            // replace '(' in JSON Names with ''
305
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
306
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
307
            }
308
            
309
            // replace ')' in JSON Names with ''
310
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
311
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
312
            }
313

    
314
            // replace startNumbers in JSON Keynames with 'n_'
315
            while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
316
                jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
317
            }
318
            
319
            // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
320
            while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
321
                jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
322
            }            
323

    
324
            // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. 
325
//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
326
//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
327
//            }
328
            
329
            // replace '=' in JSON Keynames with '-'
330
            while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
331
                jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
332
            }
333
            
334
            log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
335
            return jsonInput;
336
        }
337
        
338
        /**
339
         * 
340
         * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
341
         *          * 
342
         * @param bufferStr - XML string
343
         * @return 
344
         */
345
        private static String cleanUnwantedJsonCharsInXmlTagnames( String bufferStr ) {
346
            
347
            while ( bufferStr.matches(".*<([^<>].*),(.)>.*") ) {
348
                bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
349
            }
350

    
351
            // replace [#x10-#x1f] with ''
352
//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
353
//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
354
//            }
355
            
356
            return bufferStr;
357
        }
358

    
359
}
(2-2/2)