Project

General

Profile

1
/**
2
 * 
3
 */
4
package eu.dnetlib.data.collector.plugins.rest;
5

    
6
import java.io.InputStream;
7
import java.io.StringWriter;
8
import java.net.URL;
9
import java.util.Iterator;
10
import java.util.Queue;
11
import java.util.concurrent.PriorityBlockingQueue;
12

    
13
import javax.xml.transform.OutputKeys;
14
import javax.xml.transform.Transformer;
15
import javax.xml.transform.TransformerConfigurationException;
16
import javax.xml.transform.TransformerFactory;
17
import javax.xml.transform.dom.DOMSource;
18
import javax.xml.transform.stream.StreamResult;
19
import javax.xml.xpath.XPath;
20
import javax.xml.xpath.XPathConstants;
21
import javax.xml.xpath.XPathExpression;
22
import javax.xml.xpath.XPathExpressionException;
23
import javax.xml.xpath.XPathFactory;
24

    
25
import org.apache.commons.io.IOUtils;
26
import org.apache.commons.lang3.StringUtils;
27
import org.apache.commons.logging.Log;
28
import org.apache.commons.logging.LogFactory;
29
import org.w3c.dom.Node;
30
import org.w3c.dom.NodeList;
31
import org.xml.sax.InputSource;
32

    
33
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
34
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
35

    
36
/**
37
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
38
 * @date 2018-08-06
39
 *
40
 */
41
public class RestIterator implements Iterator<String> {
42

    
43
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
44

    
45
	private static final String wrapName = "recordWrap";
46
	private String baseUrl;
47
	private String resumptionType;
48
	private String resumptionParam;
49
	private String resultFormatValue;
50
	private String queryParams;
51
	private int resultSizeValue;
52
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
53
	private int resultTotal = -1;
54
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
55
	private InputStream resultStream;
56
	private Transformer transformer;
57
	private XPath xpath;
58
	private String query;
59
	private XPathExpression xprResultTotalPath;
60
	private XPathExpression xprResumptionPath;
61
	private XPathExpression xprEntity;
62
	private String queryFormat;
63
	private String querySize;
64
	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
65
	
66
	public RestIterator(
67
			final String baseUrl,
68
			final String resumptionType,
69
			final String resumptionParam,
70
			final String resumptionXpath,
71
			final String resultTotalXpath,
72
			final String resultFormatParam,
73
			final String resultFormatValue,
74
			final String resultSizeParam,
75
                        final String resultSizeValue,
76
			final String queryParams,
77
			final String entityXpath
78
			) {
79
		this.baseUrl = baseUrl;
80
		this.resumptionType = resumptionType;
81
		this.resumptionParam = resumptionParam;
82
		this.resultFormatValue = resultFormatValue;
83
		this.queryParams = queryParams;
84
		
85
        queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
86
        querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValue : "";
87

    
88
		try {
89
			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
90
		} catch(Exception e) {
91
			throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
92
		}
93
        initQueue();
94
	}
95
	
96
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
97
		transformer = TransformerFactory.newInstance().newTransformer();
98
                transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
99
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
100
		xpath              = XPathFactory.newInstance().newXPath();
101
		xprResultTotalPath = xpath.compile(resultTotalXpath);
102
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
103
		xprEntity          = xpath.compile(entityXpath);
104
	}
105
	
106
	private void initQueue() {
107
		query = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
108
	}
109
	
110
	private void disconnect() {
111
		// TODO close inputstream
112
	}
113
	
114
	/* (non-Javadoc)
115
	 * @see java.util.Iterator#hasNext()
116
	 */
117
	@Override
118
	public boolean hasNext() {
119
		if (recordQueue.isEmpty() && query.isEmpty()) {
120
			disconnect();
121
			return false;
122
		} else {
123
			return true;
124
		}
125
	}
126

    
127
	/* (non-Javadoc)
128
	 * @see java.util.Iterator#next()
129
	 */
130
	@Override
131
	public String next() {
132
		synchronized (recordQueue) {
133
			while (recordQueue.isEmpty() && !query.isEmpty() ) {
134
				try {
135
					query = downloadPage(query);
136
				} catch(CollectorServiceException e) {
137
					throw new RuntimeException(e);
138
				}
139
			}
140
			return recordQueue.poll();
141
		}
142
	}
143
	
144
	
145
	/*
146
	 * download page and return nextQuery
147
	 */
148
	private String downloadPage(String query) throws CollectorServiceException{
149
		String resultJson;
150
		String resultXml = "";
151
		String nextQuery = "";
152
		try {
153
            resultStream = new URL(query).openStream();
154
			if("json".equals(resultFormatValue.toLowerCase())){				
155
				resultJson = IOUtils.toString(resultStream,"UTF-8");
156

    
157
				//TODO move regex definitions as constant fields
158
				// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
159
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
160
//                                while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){
161
//					resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
162
//				}
163
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
164
				resultXml = org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
165
//				log.info(resultXml);
166
				resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
167
			}
168
			
169
			Node resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
170
			NodeList nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
171
			
172
			for (int i = 0; i < nodeList.getLength(); i++) {
173
				StringWriter sw = new StringWriter();
174
				transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
175
				recordQueue.add(sw.toString());
176
			}
177
				
178
			resumptionInt += resultSizeValue;
179
                        
180
                        switch(resumptionType.toLowerCase()) {
181
                            case "scan":
182
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
183
                                            break;
184
                            case "count":
185
                                            resumptionStr = Integer.toString(resumptionInt);
186
                                            break;
187
                            case "discover":
188
                                            String emptyXml = "<"+wrapName+"></"+wrapName+">";
189
                                            if( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) {
190
                                                resumptionStr = "";
191
                                                resultTotal   = resumptionInt;
192
                                            } else {
193
                                                resumptionStr = Integer.toString(resumptionInt);
194
                                                resultTotal   = resumptionInt+1;
195
                                            }
196
                                            break;
197
                            default:
198
                        }
199
/*			if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
200
			if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); }
201
*/
202
			if (resultTotal == -1) {
203
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
204
				log.info("resultTotal: " + resultTotal);
205
			}
206
			log.info("resultTotal: " + resultTotal);
207
			log.info("resInt: " + resumptionInt);
208
			if (resumptionInt < resultTotal) {
209
				nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
210
			} else
211
				nextQuery = "";
212
			return nextQuery;
213

    
214
		} catch(Exception e) {
215
			log.error(e);
216
			throw new IllegalStateException("collection failed: " + e.getMessage());
217
		}
218
	}
219
        
220
        /**
221
         * convert in Json-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
222
         * 
223
         * @param jsonInput
224
         * @return 
225
         */
226
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
227

    
228
            // replace ' 's in JSON Namens with '_'
229
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
230
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
231
            }
232

    
233
            // replace forward-slash (sign '/' ) in JSON Names with '_'
234
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
235
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
236
            }
237

    
238
            // replace '(' in JSON Names with ''
239
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
240
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
241
            }
242
            // replace ')' in JSON Names with ''
243
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
244
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
245
            }
246

    
247
            return jsonInput;
248
        }
249

    
250
}
(2-2/2)