Revision 53116
Added by Andreas Czerniak over 5 years ago
RestIterator.java | ||
---|---|---|
35 | 35 |
import org.xml.sax.InputSource; |
36 | 36 |
|
37 | 37 |
import eu.dnetlib.data.collector.plugins.oai.OaiIterator; |
38 |
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; |
|
38 | 39 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
39 | 40 |
|
40 | 41 |
/** |
... | ... | |
144 | 145 |
try { |
145 | 146 |
log.info("get Query: " + query); |
146 | 147 |
query = downloadPage(query); |
147 |
log.debug("next query from downloadPage method: " + query);
|
|
148 |
log.debug("next queryURL from downloadPage(): " + query);
|
|
148 | 149 |
} catch(CollectorServiceException e) { |
149 | 150 |
log.debug("CollectorPlugin.next()-Exception: " + e); |
150 | 151 |
throw new RuntimeException(e); |
... | ... | |
160 | 161 |
*/ |
161 | 162 |
private String downloadPage(String query) throws CollectorServiceException{ |
162 | 163 |
String resultJson; |
163 |
String resultXml = ""; |
|
164 |
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
|
164 | 165 |
String nextQuery = ""; |
165 |
String emptyXml = "<"+wrapName+"></"+wrapName+">"; |
|
166 |
String emptyXml = resultXml + "<"+wrapName+"></"+wrapName+">";
|
|
166 | 167 |
Node resultNode = null; |
167 | 168 |
NodeList nodeList = null; |
168 | 169 |
String qUrlArgument = ""; |
... | ... | |
172 | 173 |
URL qUrl = new URL(query); |
173 | 174 |
|
174 | 175 |
resultStream = qUrl.openStream(); |
175 |
if("json".equals(resultFormatValue.toLowerCase())){ |
|
176 |
if("json".equals(resultFormatValue.toLowerCase())){ |
|
177 |
|
|
176 | 178 |
resultJson = IOUtils.toString(resultStream,"UTF-8"); |
177 |
|
|
178 |
//TODO move regex definitions as constant fields |
|
179 |
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) |
|
180 | 179 |
resultJson = syntaxConvertJsonKeyNamens(resultJson); |
181 |
// while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){ |
|
182 |
// resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); |
|
183 |
// } |
|
184 | 180 |
org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson); |
185 |
resultXml = org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element |
|
186 |
log.trace(resultXml); |
|
181 |
resultXml += org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element |
|
182 |
log.trace("before inputStream: " + resultXml); |
|
183 |
// resultXml = XmlCleaner.cleanAllEntities(resultXml); |
|
184 |
// log.trace("after cleaning: " + resultXml); |
|
187 | 185 |
resultStream = IOUtils.toInputStream(resultXml,"UTF-8"); |
188 | 186 |
} |
189 | 187 |
|
... | ... | |
213 | 211 |
resumptionStr = Integer.toString(resumptionInt); |
214 | 212 |
break; |
215 | 213 |
|
216 |
case "discover": // size of result items unknown, iterate over items |
|
214 |
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
|
217 | 215 |
if (resultSizeValue < 2 ) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");} |
218 | 216 |
qUrlArgument = qUrl.getQuery(); |
219 | 217 |
String[] arrayQUrlArgument = qUrlArgument.split("&"); |
... | ... | |
280 | 278 |
private String syntaxConvertJsonKeyNamens(String jsonInput) { |
281 | 279 |
|
282 | 280 |
log.trace("before convertJsonKeyNames: " + jsonInput); |
281 |
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) |
|
283 | 282 |
// replace ' 's in JSON Namens with '_' |
284 | 283 |
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { |
285 | 284 |
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); |
Also available in: Unified diff
org.json.XML - update Maven package version to 20180813
better Unicode support