Project

General

Profile

1
/**
2
 * 
3
 * 
4
 * Note: log.debug(...) is equivalent to log.trace(...) in the application logs.
5
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

    
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.net.URL;
11
import java.util.Iterator;
12
import java.util.Queue;
13
import java.util.concurrent.PriorityBlockingQueue;
14

    
15
import javax.xml.transform.OutputKeys;
16
import javax.xml.transform.Transformer;
17
import javax.xml.transform.TransformerConfigurationException;
18
import javax.xml.transform.TransformerFactory;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21
import javax.xml.xpath.XPath;
22
import javax.xml.xpath.XPathConstants;
23
import javax.xml.xpath.XPathExpression;
24
import javax.xml.xpath.XPathExpressionException;
25
import javax.xml.xpath.XPathFactory;
26

    
27
import org.apache.commons.io.IOUtils;
28
import org.apache.commons.lang3.StringUtils;
29
import org.apache.commons.logging.Log;
30
import org.apache.commons.logging.LogFactory;
31
import org.w3c.dom.Node;
32
import org.w3c.dom.NodeList;
33
import org.xml.sax.InputSource;
34

    
35
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
36
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
37

    
38
/**
39
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
40
 * @date 2018-08-06
41
 *
42
 */
43
public class RestIterator implements Iterator<String> {
44

    
45
    // TODO: clean up the comments of replaced source code
46
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
47

    
48
	private static final String wrapName = "recordWrap";
49
	private String baseUrl;
50
	private String resumptionType;
51
	private String resumptionParam;
52
	private String resultFormatValue;
53
	private String queryParams;
54
	private int resultSizeValue;
55
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
56
	private int resultTotal = -1;
57
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
58
	private InputStream resultStream;
59
	private Transformer transformer;
60
	private XPath xpath;
61
	private String query;
62
	private XPathExpression xprResultTotalPath;
63
	private XPathExpression xprResumptionPath;
64
	private XPathExpression xprEntity;
65
	private String queryFormat;
66
	private String querySize;
67
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
68
        private int discoverResultSize = 0;
69

    
70
	public RestIterator(
71
			final String baseUrl,
72
			final String resumptionType,
73
			final String resumptionParam,
74
			final String resumptionXpath,
75
			final String resultTotalXpath,
76
			final String resultFormatParam,
77
			final String resultFormatValue,
78
			final String resultSizeParam,
79
                        final String resultSizeValueStr,
80
			final String queryParams,
81
			final String entityXpath
82
			) {
83
		this.baseUrl = baseUrl;
84
		this.resumptionType = resumptionType;
85
		this.resumptionParam = resumptionParam;
86
		this.resultFormatValue = resultFormatValue;
87
		this.queryParams = queryParams;
88
                this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
89
                        
90
        queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
91
        querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
92

    
93
		try {
94
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
95
		} catch(Exception e) {
96
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
97
		}
98
        initQueue();
99
	}
100
	
101
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
102
		transformer = TransformerFactory.newInstance().newTransformer();
103
                transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
104
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
105
		xpath              = XPathFactory.newInstance().newXPath();
106
		xprResultTotalPath = xpath.compile(resultTotalXpath);
107
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
108
		xprEntity          = xpath.compile(entityXpath);
109
	}
110
	
111
	private void initQueue() {
112
		query = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
113
	}
114
	
115
	private void disconnect() {
116
		// TODO close inputstream
117
	}
118
	
119
	/* (non-Javadoc)
120
	 * @see java.util.Iterator#hasNext()
121
	 */
122
	@Override
123
	public boolean hasNext() {
124
		if (recordQueue.isEmpty() && query.isEmpty()) {
125
			disconnect();
126
			return false;
127
		} else {
128
			return true;
129
		}
130
	}
131

    
132
	/* (non-Javadoc)
133
	 * @see java.util.Iterator#next()
134
	 */
135
	@Override
136
	public String next() {
137
		synchronized (recordQueue) {
138
			while (recordQueue.isEmpty() && !query.isEmpty() ) {
139
				try {
140
                                        log.info("get Query: " + query);
141
					query = downloadPage(query);
142
                                        log.debug("next query from downloadPage method: " + query);
143
				} catch(CollectorServiceException e) {
144
                                        log.debug("CollectorPlugin.next()-Exception: " + e);
145
					throw new RuntimeException(e);
146
				}
147
			}
148
			return recordQueue.poll();
149
		}
150
	}
151
	
152
	
153
	/*
154
	 * download page and return nextQuery
155
	 */
156
	private String downloadPage(String query) throws CollectorServiceException{
157
		String resultJson;
158
		String resultXml = "";
159
		String nextQuery = "";
160
                String emptyXml = "<"+wrapName+"></"+wrapName+">";
161
                Node resultNode = null;
162
                NodeList nodeList = null;
163
                String qUrlArgument = "";
164
                int urlOldResumptionSize = 0;
165
                
166
		try {
167
                        URL qUrl = new URL(query);
168
                        
169
                        resultStream = qUrl.openStream();
170
			if("json".equals(resultFormatValue.toLowerCase())){				
171
				resultJson = IOUtils.toString(resultStream,"UTF-8");
172

    
173
				//TODO move regex definitions as constant fields
174
				// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
175
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
176
//                                while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){
177
//					resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
178
//				}
179
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
180
				resultXml = org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
181
				log.trace(resultXml);
182
				resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
183
			}
184
			
185
                        if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
186
                            resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
187
                            nodeList   = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
188
                            log.debug("nodeList.length: " + nodeList.getLength());
189
                            for (int i = 0; i < nodeList.getLength(); i++) {
190
                                StringWriter sw = new StringWriter();
191
                                transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
192
                                recordQueue.add(sw.toString());
193
                            }
194
                        } else { log.info("resultXml is equal with emptyXml"); }	
195
                        
196
			resumptionInt += resultSizeValue;
197
                        
198
/*	replaced by switch statement as follow		
199
                        if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
200
			if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); }
201
*/
202
                        switch(resumptionType.toLowerCase()) {
203
                            case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI
204
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
205
                                            break;
206
                                            
207
                            case "count":   // begin at one step for all records
208
                                            resumptionStr = Integer.toString(resumptionInt);
209
                                            break;
210
                                            
211
                            case "discover":   // length of results unknown                                      
212
                                            qUrlArgument = qUrl.getQuery();
213
                                            String[] arrayQUrlArgument = qUrlArgument.split("&");
214
                                            for(String arrayUrlArgStr : arrayQUrlArgument ) {
215
                                                if(arrayUrlArgStr.startsWith(resumptionParam)) {
216
                                                    String[] resumptionKeyValue = arrayUrlArgStr.split("=");
217
                                                    urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
218
                                                    log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
219
                                                }
220
                                            }
221

    
222
                                            // 
223
                                            if(   ( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) 
224
                                               || ( (nodeList != null) && (nodeList.getLength() < resultSizeValue) ) 
225
                                              ) {
226
                                                // resumptionStr = "";
227
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
228
                                                resultTotal   = discoverResultSize;
229
                                            } else {
230
                                                resumptionStr = Integer.toString(resumptionInt);
231
                                                resultTotal   = resumptionInt+1;
232
                                                if(nodeList != null) { discoverResultSize += nodeList.getLength(); }
233
                                            }
234
                                            log.info("discoverResultSize:  " + discoverResultSize);
235
                                            break;
236
                                            
237
                            default:        // otherwise: abort
238
                                            // resultTotal = resumptionInt;
239
                                            break;
240
                        }
241

    
242
			if (resultTotal == -1) {
243
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
244
				log.info("resultTotal was -1 is now: " + resultTotal);
245
			}
246
			log.info("resultTotal: " + resultTotal);
247
			log.info("resInt: " + resumptionInt);
248
			if (resumptionInt < resultTotal) {
249
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
250
			} else
251
				nextQuery = "";
252
                        
253
                        log.debug("nextQueryUrl: " + nextQuery);
254
			return nextQuery;
255

    
256
		} catch(Exception e) {
257
			log.error(e);
258
			throw new IllegalStateException("collection failed: " + e.getMessage());
259
		}
260
	}
261
        
262
        /**
263
         * convert in Json-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
264
         * 
265
         * @param jsonInput
266
         * @return 
267
         */
268
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
269

    
270
            log.trace("before convertJsonKeyNames: " + jsonInput);
271
            // replace ' 's in JSON Namens with '_'
272
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
273
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
274
            }
275

    
276
            // replace forward-slash (sign '/' ) in JSON Names with '_'
277
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
278
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
279
            }
280

    
281
            // replace '(' in JSON Names with ''
282
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
283
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
284
            }
285
            // replace ')' in JSON Names with ''
286
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
287
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
288
            }
289

    
290
            log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
291
            return jsonInput;
292
        }
293

    
294
}
(2-2/2)