Project

General

Profile

1
/**
2
 * 
3
 * 
4
 * Note: log.debug(...) is equivalent to log.trace(...) in the application logs.
5
 * 
6
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0; as a workaround, change the resultSizeValue.
7
 */
8
package eu.dnetlib.data.collector.plugins.rest;
9

    
10
import java.io.InputStream;
11
import java.io.StringWriter;
12
import java.net.URL;
13
import java.util.Iterator;
14
import java.util.Queue;
15
import java.util.concurrent.PriorityBlockingQueue;
16

    
17
import javax.xml.transform.OutputKeys;
18
import javax.xml.transform.Transformer;
19
import javax.xml.transform.TransformerConfigurationException;
20
import javax.xml.transform.TransformerFactory;
21
import javax.xml.transform.dom.DOMSource;
22
import javax.xml.transform.stream.StreamResult;
23
import javax.xml.xpath.XPath;
24
import javax.xml.xpath.XPathConstants;
25
import javax.xml.xpath.XPathExpression;
26
import javax.xml.xpath.XPathExpressionException;
27
import javax.xml.xpath.XPathFactory;
28

    
29
import org.apache.commons.io.IOUtils;
30
import org.apache.commons.lang3.StringUtils;
31
import org.apache.commons.logging.Log;
32
import org.apache.commons.logging.LogFactory;
33
import org.w3c.dom.Node;
34
import org.w3c.dom.NodeList;
35
import org.xml.sax.InputSource;
36

    
37
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
38
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
39
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
40

    
41
/**
42
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
43
 * @date 2018-09-03
44
 *
45
 */
46
public class RestIterator implements Iterator<String> {
47

    
48
    // TODO: clean up the comments of replaced source code
49
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
50

    
51
	private static final String wrapName = "recordWrap";
52
	private String baseUrl;
53
	private String resumptionType;
54
	private String resumptionParam;
55
	private String resultFormatValue;
56
	private String queryParams;
57
	private int resultSizeValue;
58
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
59
	private int resultTotal = -1;
60
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
61
	private InputStream resultStream;
62
	private Transformer transformer;
63
	private XPath xpath;
64
	private String query;
65
	private XPathExpression xprResultTotalPath;
66
	private XPathExpression xprResumptionPath;
67
	private XPathExpression xprEntity;
68
	private String queryFormat;
69
	private String querySize;
70
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
71
        private int discoverResultSize = 0;
72
        private int pagination = 1;
73

    
74
	public RestIterator(
75
			final String baseUrl,
76
			final String resumptionType,
77
			final String resumptionParam,
78
			final String resumptionXpath,
79
			final String resultTotalXpath,
80
			final String resultFormatParam,
81
			final String resultFormatValue,
82
			final String resultSizeParam,
83
                        final String resultSizeValueStr,
84
			final String queryParams,
85
			final String entityXpath
86
			) {
87
		this.baseUrl = baseUrl;
88
		this.resumptionType = resumptionType;
89
		this.resumptionParam = resumptionParam;
90
		this.resultFormatValue = resultFormatValue;
91
		this.queryParams = queryParams;
92
                this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
93
                        
94
        queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
95
        querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
96

    
97
		try {
98
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
99
		} catch(Exception e) {
100
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
101
		}
102
        initQueue();
103
	}
104
	
105
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
106
		transformer = TransformerFactory.newInstance().newTransformer();
107
                transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
108
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
109
		xpath              = XPathFactory.newInstance().newXPath();
110
		xprResultTotalPath = xpath.compile(resultTotalXpath);
111
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
112
		xprEntity          = xpath.compile(entityXpath);
113
	}
114
	
115
	private void initQueue() {
116
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
117
	}
118
	
119
	private void disconnect() {
120
		// TODO close inputstream
121
	}
122
	
123
	/* (non-Javadoc)
124
	 * @see java.util.Iterator#hasNext()
125
	 */
126
	@Override
127
	public boolean hasNext() {
128
		if (recordQueue.isEmpty() && query.isEmpty()) {
129
			disconnect();
130
			return false;
131
		} else {
132
			return true;
133
		}
134
	}
135

    
136
	/* (non-Javadoc)
137
	 * @see java.util.Iterator#next()
138
	 */
139
	@Override
140
	public String next() {
141
		synchronized (recordQueue) {
142
			while (recordQueue.isEmpty() && !query.isEmpty() ) {
143
				try {
144
                                        log.info("get Query: " + query);
145
					query = downloadPage(query);
146
                                        log.debug("next queryURL from downloadPage(): " + query);
147
				} catch(CollectorServiceException e) {
148
                                        log.debug("CollectorPlugin.next()-Exception: " + e);
149
					throw new RuntimeException(e);
150
				}
151
			}
152
			return recordQueue.poll();
153
		}
154
	}
155
	
156
	
157
	/*
158
	 * download page and return nextQuery
159
	 */
160
	private String downloadPage(String query) throws CollectorServiceException{
161
		String resultJson;
162
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
163
		String nextQuery = "";
164
                String emptyXml = resultXml + "<"+wrapName+"></"+wrapName+">";
165
                Node resultNode = null;
166
                NodeList nodeList = null;
167
                String qUrlArgument = "";
168
                int urlOldResumptionSize = 0;
169
                
170
		try {
171
                        URL qUrl = new URL(query);
172
                        
173
                        resultStream = qUrl.openStream();
174
			if("json".equals(resultFormatValue.toLowerCase())){	
175
                            
176
				resultJson = IOUtils.toString(resultStream,"UTF-8");
177
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
178
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
179
                                resultXml += org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
180
				log.trace("before inputStream: " + resultXml);
181
                                resultXml = XmlCleaner.cleanAllEntities(resultXml);
182
                                log.trace("after cleaning: " + resultXml);
183
				resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
184
			}
185
			
186
                        if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
187
                            resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
188
                            nodeList   = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
189
                            log.debug("nodeList.length: " + nodeList.getLength());
190
                            for (int i = 0; i < nodeList.getLength(); i++) {
191
                                StringWriter sw = new StringWriter();
192
                                transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
193
                                recordQueue.add(sw.toString());
194
                            }
195
                        } else { log.info("resultXml is equal with emptyXml"); }	
196
                        
197
			resumptionInt += resultSizeValue;
198
                        
199
                        switch(resumptionType.toLowerCase()) {
200
                            case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
201
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
202
                                            break;
203
                                            
204
                            case "count":   // begin at one step for all records, iterate over items
205
                                            resumptionStr = Integer.toString(resumptionInt);
206
                                            break;
207
                                            
208
                            case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)                                   
209
                                            if (resultSizeValue < 2 ) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
210
                                            qUrlArgument = qUrl.getQuery();
211
                                            String[] arrayQUrlArgument = qUrlArgument.split("&");
212
                                            for(String arrayUrlArgStr : arrayQUrlArgument ) {
213
                                                if(arrayUrlArgStr.startsWith(resumptionParam)) {
214
                                                    String[] resumptionKeyValue = arrayUrlArgStr.split("=");
215
                                                    urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
216
                                                    log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
217
                                                }
218
                                            }
219

    
220
                                            if(   ( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) 
221
                                               || ( (nodeList != null) && (nodeList.getLength() < resultSizeValue) ) 
222
                                              ) {
223
                                                // resumptionStr = "";
224
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
225
                                                resultTotal   = discoverResultSize;
226
                                            } else {
227
                                                resumptionStr = Integer.toString(resumptionInt);
228
                                                resultTotal   = resumptionInt+1;
229
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
230
                                            }
231
                                            log.info("discoverResultSize:  " + discoverResultSize);
232
                                            break;
233
                                            
234
                            case "pagination":   
235
                            case "page":         // pagination, iterate over pages
236
                                            pagination += 1;
237
                                            if (nodeList != null) {
238
                                                discoverResultSize += nodeList.getLength();
239
                                            } else {
240
                                                resultTotal = discoverResultSize;
241
                                                pagination  = discoverResultSize;
242
                                            }
243
                                            resumptionInt = pagination;
244
                                            resumptionStr = Integer.toString(resumptionInt);
245
                                            break;
246
                                            
247
                            default:        // otherwise: abort
248
                                            // resultTotal = resumptionInt;
249
                                            break;
250
                        }
251

    
252
			if (resultTotal == -1) {
253
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
254
                                if(resumptionType.toLowerCase().equals("page")) { resultTotal += 1; }           // to correct the upper bound
255
				log.info("resultTotal was -1 is now: " + resultTotal);
256
			}
257
			log.info("resultTotal: " + resultTotal);
258
			log.info("resInt: " + resumptionInt);
259
			if (resumptionInt < resultTotal) {
260
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
261
			} else
262
				nextQuery = "";
263
                        
264
                        log.debug("nextQueryUrl: " + nextQuery);
265
			return nextQuery;
266

    
267
		} catch(Exception e) {
268
			log.error(e);
269
			throw new IllegalStateException("collection failed: " + e.getMessage());
270
		}
271
	}
272
        
273
        /**
274
         * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
275
         * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
276
         * and work-around for the JSON to XML converting of org.json.XML-package.
277
         * 
278
         * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
279
         * 
280
         * @param jsonInput
281
         * @return convertedJsonKeynameOutput
282
         */
283
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
284

    
285
            log.trace("before convertJsonKeyNames: " + jsonInput);
286
            // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
287
            // replace ' 's in JSON Namens with '_'
288
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
289
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
290
            }
291

    
292
            // replace forward-slash (sign '/' ) in JSON Names with '_'
293
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
294
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
295
            }
296

    
297
            // replace '(' in JSON Names with ''
298
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
299
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
300
            }
301
            
302
            // replace ')' in JSON Names with ''
303
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
304
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
305
            }
306

    
307
            // replace startNumbers in JSON Keynames with 'n_'
308
            while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
309
                jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
310
            }
311
            
312
            // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
313
            while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
314
                jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
315
            }            
316

    
317
            // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. 
318
//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
319
//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
320
//            }
321
            
322
            // replace '=' in JSON Keynames with '-'
323
            while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
324
                jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
325
            }
326
            
327
            log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
328
            return jsonInput;
329
        }
330
        
331
        /**
332
         * 
333
         * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
334
         *          * 
335
         * @param bufferStr - XML string
336
         * @return 
337
         */
338
        private static String cleanUnwantedJsonCharsInXmlTagnames( String bufferStr ) {
339
            
340
            while ( bufferStr.matches(".*<([^<>].*),(.)>.*") ) {
341
                bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
342
            }
343

    
344
            // replace [#x10-#x1f] with ''
345
//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
346
//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
347
//            }
348
            
349
            return bufferStr;
350
        }
351

    
352
}
(2-2/2)