Project

General

Profile

1
/**
 * Note: in the application logs, log.debug(...) output is equivalent to log.trace(...).
 *
 * Known bug: with resumptionType 'discover', collecting fails if (resultTotal % resultSizeValue) == 0;
 * as a workaround, change the resultSizeValue.
 */
8
package eu.dnetlib.data.collector.plugins.rest;
9

    
10
import java.io.InputStream;
11
import java.io.StringWriter;
12
import java.net.URL;
13
import java.util.Iterator;
14
import java.util.Queue;
15
import java.util.concurrent.PriorityBlockingQueue;
16

    
17
import javax.xml.transform.OutputKeys;
18
import javax.xml.transform.Transformer;
19
import javax.xml.transform.TransformerConfigurationException;
20
import javax.xml.transform.TransformerFactory;
21
import javax.xml.transform.dom.DOMSource;
22
import javax.xml.transform.stream.StreamResult;
23
import javax.xml.xpath.XPath;
24
import javax.xml.xpath.XPathConstants;
25
import javax.xml.xpath.XPathExpression;
26
import javax.xml.xpath.XPathExpressionException;
27
import javax.xml.xpath.XPathFactory;
28

    
29
import org.apache.commons.io.IOUtils;
30
import org.apache.commons.lang3.StringUtils;
31
import org.apache.commons.logging.Log;
32
import org.apache.commons.logging.LogFactory;
33
import org.w3c.dom.Node;
34
import org.w3c.dom.NodeList;
35
import org.xml.sax.InputSource;
36

    
37
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
38
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
39
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
40

    
41
/**
 * Iterator over the records returned by a paginated REST API.
 *
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
 * @date 2018-09-03
 */
46
public class RestIterator implements Iterator<String> {
47

    
48
    // TODO: clean up the comments of replaced source code
49
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
50

    
51
	private static final String wrapName = "recordWrap";
52
	private String baseUrl;
53
	private String resumptionType;
54
	private String resumptionParam;
55
	private String resultFormatValue;
56
	private String queryParams;
57
	private int resultSizeValue;
58
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
59
	private int resultTotal = -1;
60
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
61
	private InputStream resultStream;
62
	private Transformer transformer;
63
	private XPath xpath;
64
	private String query;
65
	private XPathExpression xprResultTotalPath;
66
	private XPathExpression xprResumptionPath;
67
	private XPathExpression xprEntity;
68
	private String queryFormat;
69
	private String querySize;
70
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
71
        private int discoverResultSize = 0;
72

    
73
	/**
	 * Creates an iterator for one REST endpoint and prepares the URL of the first request.
	 * No request is sent by the constructor itself.
	 *
	 * @param baseUrl            endpoint URL; the query string is appended after a "?"
	 * @param resumptionType     paging strategy ("scan", "count", "discover" or "page")
	 * @param resumptionParam    name of the URL parameter that carries the resumption value
	 * @param resumptionXpath    XPath of the resumption token in a result page; "/" is used when blank
	 * @param resultTotalXpath   XPath of the total number of results in a result page
	 * @param resultFormatParam  name of the URL parameter selecting the result format; omitted from the URL when blank
	 * @param resultFormatValue  value of the format parameter; "json" triggers the JSON to XML conversion
	 * @param resultSizeParam    name of the URL parameter selecting the page size; omitted from the URL when blank
	 * @param resultSizeValueStr page size as a decimal string
	 * @param queryParams        fixed query parameters sent with every request
	 * @param entityXpath        XPath selecting the record nodes inside a result page
	 * @throws NumberFormatException if resultSizeValueStr is not a parsable integer
	 * @throws IllegalStateException if the transformer or one of the XPath expressions cannot be set up
	 */
	public RestIterator(
			final String baseUrl,
			final String resumptionType,
			final String resumptionParam,
			final String resumptionXpath,
			final String resultTotalXpath,
			final String resultFormatParam,
			final String resultFormatValue,
			final String resultSizeParam,
			final String resultSizeValueStr,
			final String queryParams,
			final String entityXpath
			) {
		this.baseUrl = baseUrl;
		this.resumptionType = resumptionType;
		this.resumptionParam = resumptionParam;
		this.resultFormatValue = resultFormatValue;
		this.queryParams = queryParams;
		// parseInt yields the primitive directly; valueOf would box and immediately unbox
		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);

		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

		try {
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
		} catch (Exception e) {
			// keep the exception type callers already see, but preserve the cause for diagnosis
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
		}
		initQueue();
	}
103
	
104
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
105
		transformer = TransformerFactory.newInstance().newTransformer();
106
                transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
107
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
108
		xpath              = XPathFactory.newInstance().newXPath();
109
		xprResultTotalPath = xpath.compile(resultTotalXpath);
110
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
111
		xprEntity          = xpath.compile(entityXpath);
112
	}
113
	
114
	private void initQueue() {
115
//		query = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
116
                //  change at 2018-08-30 and remove resumptionParam&-Type from "query" URL because first request starts mostly at the beginning.
117
		query = baseUrl + "?" + queryParams + querySize + queryFormat;
118

    
119
	}
120
	
121
	private void disconnect() {
122
		// TODO close inputstream
123
	}
124
	
125
	/* (non-Javadoc)
126
	 * @see java.util.Iterator#hasNext()
127
	 */
128
	@Override
129
	public boolean hasNext() {
130
		if (recordQueue.isEmpty() && query.isEmpty()) {
131
			disconnect();
132
			return false;
133
		} else {
134
			return true;
135
		}
136
	}
137

    
138
	/* (non-Javadoc)
139
	 * @see java.util.Iterator#next()
140
	 */
141
	@Override
142
	public String next() {
143
		synchronized (recordQueue) {
144
			while (recordQueue.isEmpty() && !query.isEmpty() ) {
145
				try {
146
                                        log.info("get Query: " + query);
147
					query = downloadPage(query);
148
                                        log.debug("next queryURL from downloadPage(): " + query);
149
				} catch(CollectorServiceException e) {
150
                                        log.debug("CollectorPlugin.next()-Exception: " + e);
151
					throw new RuntimeException(e);
152
				}
153
			}
154
			return recordQueue.poll();
155
		}
156
	}
157
	
158
	
159
	/*
160
	 * download page and return nextQuery
161
	 */
162
	private String downloadPage(String query) throws CollectorServiceException{
163
		String resultJson;
164
		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
165
		String nextQuery = "";
166
                String emptyXml = resultXml + "<"+wrapName+"></"+wrapName+">";
167
                Node resultNode = null;
168
                NodeList nodeList = null;
169
                String qUrlArgument = "";
170
                int urlOldResumptionSize = 0;
171
                
172
		try {
173
                        URL qUrl = new URL(query);
174
                        
175
                        resultStream = qUrl.openStream();
176
			if("json".equals(resultFormatValue.toLowerCase())){	
177
                            
178
				resultJson = IOUtils.toString(resultStream,"UTF-8");
179
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
180
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
181
                                resultXml += org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
182
				log.trace("before inputStream: " + resultXml);
183
                                resultXml = XmlCleaner.cleanAllEntities(resultXml);
184
                                log.trace("after cleaning: " + resultXml);
185
				resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
186
			}
187
			
188
                        if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
189
                            resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
190
                            nodeList   = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
191
                            log.debug("nodeList.length: " + nodeList.getLength());
192
                            for (int i = 0; i < nodeList.getLength(); i++) {
193
                                StringWriter sw = new StringWriter();
194
                                transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
195
                                recordQueue.add(sw.toString());
196
                            }
197
                        } else { log.info("resultXml is equal with emptyXml"); }	
198
                        
199
			resumptionInt += resultSizeValue;
200
                        
201
/*	replaced by switch statement as follow		
202
                        if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
203
			if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); }
204
*/
205
                        switch(resumptionType.toLowerCase()) {
206
                            case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
207
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
208
                                            break;
209
                                            
210
                            case "count":   // begin at one step for all records, iterate over items
211
                                            resumptionStr = Integer.toString(resumptionInt);
212
                                            break;
213
                                            
214
                            case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)                                   
215
                                            if (resultSizeValue < 2 ) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
216
                                            qUrlArgument = qUrl.getQuery();
217
                                            String[] arrayQUrlArgument = qUrlArgument.split("&");
218
                                            for(String arrayUrlArgStr : arrayQUrlArgument ) {
219
                                                if(arrayUrlArgStr.startsWith(resumptionParam)) {
220
                                                    String[] resumptionKeyValue = arrayUrlArgStr.split("=");
221
                                                    urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
222
                                                    log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
223
                                                }
224
                                            }
225

    
226
                                            // 
227
                                            if(   ( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) 
228
                                               || ( (nodeList != null) && (nodeList.getLength() < resultSizeValue) ) 
229
                                              ) {
230
                                                // resumptionStr = "";
231
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
232
                                                resultTotal   = discoverResultSize;
233
                                            } else {
234
                                                resumptionStr = Integer.toString(resumptionInt);
235
                                                resultTotal   = resumptionInt+1;
236
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
237
                                            }
238
                                            log.info("discoverResultSize:  " + discoverResultSize);
239
                                            break;
240
                                            
241
                            case "page":    // pagination, iterate over pages
242
                                            // TODO pagination collecting
243
                                            break;
244
                                            
245
                            default:        // otherwise: abort
246
                                            // resultTotal = resumptionInt;
247
                                            break;
248
                        }
249

    
250
			if (resultTotal == -1) {
251
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
252
				log.info("resultTotal was -1 is now: " + resultTotal);
253
			}
254
			log.info("resultTotal: " + resultTotal);
255
			log.info("resInt: " + resumptionInt);
256
			if (resumptionInt < resultTotal) {
257
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
258
			} else
259
				nextQuery = "";
260
                        
261
                        log.debug("nextQueryUrl: " + nextQuery);
262
			return nextQuery;
263

    
264
		} catch(Exception e) {
265
			log.error(e);
266
			throw new IllegalStateException("collection failed: " + e.getMessage());
267
		}
268
	}
269
        
270
        /**
271
         * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
272
         * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
273
         * and work-around for the JSON to XML converting of org.json.XML-package.
274
         * 
275
         * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
276
         * 
277
         * @param jsonInput
278
         * @return convertedJsonKeynameOutput
279
         */
280
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
281

    
282
            log.trace("before convertJsonKeyNames: " + jsonInput);
283
            // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
284
            // replace ' 's in JSON Namens with '_'
285
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
286
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
287
            }
288

    
289
            // replace forward-slash (sign '/' ) in JSON Names with '_'
290
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
291
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
292
            }
293

    
294
            // replace '(' in JSON Names with ''
295
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
296
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
297
            }
298
            
299
            // replace ')' in JSON Names with ''
300
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
301
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
302
            }
303

    
304
            // replace startNumbers in JSON Keynames with 'n_'
305
            while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
306
                jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
307
            }
308
            
309
            // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
310
            while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
311
                jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
312
            }            
313

    
314
            // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. 
315
//            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
316
//                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
317
//            }
318
            
319
            // replace '=' in JSON Keynames with '-'
320
            while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
321
                jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
322
            }
323
            
324
            log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
325
            return jsonInput;
326
        }
327
        
328
        /**
329
         * 
330
         * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
331
         *          * 
332
         * @param bufferStr - XML string
333
         * @return 
334
         */
335
        private static String cleanUnwantedJsonCharsInXmlTagnames( String bufferStr ) {
336
            
337
            while ( bufferStr.matches(".*<([^<>].*),(.)>.*") ) {
338
                bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
339
            }
340

    
341
            // replace [#x10-#x1f] with ''
342
//            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
343
//                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
344
//            }
345
            
346
            return bufferStr;
347
        }
348

    
349
}
(2-2/2)