35 |
35 |
import org.xml.sax.InputSource;
|
36 |
36 |
|
37 |
37 |
import eu.dnetlib.data.collector.plugins.oai.OaiIterator;
|
|
38 |
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
38 |
39 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
39 |
40 |
|
40 |
41 |
/**
|
... | ... | |
144 |
145 |
try {
|
145 |
146 |
log.info("get Query: " + query);
|
146 |
147 |
query = downloadPage(query);
|
147 |
|
log.debug("next query from downloadPage method: " + query);
|
|
148 |
log.debug("next queryURL from downloadPage(): " + query);
|
148 |
149 |
} catch(CollectorServiceException e) {
|
149 |
150 |
log.debug("CollectorPlugin.next()-Exception: " + e);
|
150 |
151 |
throw new RuntimeException(e);
|
... | ... | |
160 |
161 |
*/
|
161 |
162 |
private String downloadPage(String query) throws CollectorServiceException{
|
162 |
163 |
String resultJson;
|
163 |
|
String resultXml = "";
|
|
164 |
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
164 |
165 |
String nextQuery = "";
|
165 |
|
String emptyXml = "<"+wrapName+"></"+wrapName+">";
|
|
166 |
String emptyXml = resultXml + "<"+wrapName+"></"+wrapName+">";
|
166 |
167 |
Node resultNode = null;
|
167 |
168 |
NodeList nodeList = null;
|
168 |
169 |
String qUrlArgument = "";
|
... | ... | |
172 |
173 |
URL qUrl = new URL(query);
|
173 |
174 |
|
174 |
175 |
resultStream = qUrl.openStream();
|
175 |
|
if("json".equals(resultFormatValue.toLowerCase())){
|
|
176 |
if("json".equals(resultFormatValue.toLowerCase())){
|
|
177 |
|
176 |
178 |
resultJson = IOUtils.toString(resultStream,"UTF-8");
|
177 |
|
|
178 |
|
//TODO move regex definitions as constant fields
|
179 |
|
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
|
180 |
179 |
resultJson = syntaxConvertJsonKeyNamens(resultJson);
|
181 |
|
// while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){
|
182 |
|
// resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
|
183 |
|
// }
|
184 |
180 |
org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
|
185 |
|
resultXml = org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
|
186 |
|
log.trace(resultXml);
|
|
181 |
resultXml += org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
|
|
182 |
log.trace("before inputStream: " + resultXml);
|
|
183 |
// resultXml = XmlCleaner.cleanAllEntities(resultXml);
|
|
184 |
// log.trace("after cleaning: " + resultXml);
|
187 |
185 |
resultStream = IOUtils.toInputStream(resultXml,"UTF-8");
|
188 |
186 |
}
|
189 |
187 |
|
... | ... | |
213 |
211 |
resumptionStr = Integer.toString(resumptionInt);
|
214 |
212 |
break;
|
215 |
213 |
|
216 |
|
case "discover": // size of result items unknown, iterate over items
|
|
214 |
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
217 |
215 |
if (resultSizeValue < 2 ) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
|
218 |
216 |
qUrlArgument = qUrl.getQuery();
|
219 |
217 |
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
... | ... | |
280 |
278 |
private String syntaxConvertJsonKeyNamens(String jsonInput) {
|
281 |
279 |
|
282 |
280 |
log.trace("before convertJsonKeyNames: " + jsonInput);
|
|
281 |
// pre-clean JSON: remove whitespace from key names (otherwise they are misinterpreted as elements with attributes in XML)
|
283 |
282 |
// replace whitespace in JSON key names with '_'
|
284 |
283 |
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
|
285 |
284 |
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
|
org.json.XML - update maven package version to 20180813
better unicode support