Revision 52970
Added by Andreas Czerniak over 5 years ago
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/rest/RestCollectorPluginTest.java | ||
---|---|---|
15 | 15 |
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor; |
16 | 16 |
|
17 | 17 |
/** |
18 |
* @author js |
|
18 |
* @author js, Andreas Czerniak
|
|
19 | 19 |
* |
20 | 20 |
*/ |
21 | 21 |
public class RestCollectorPluginTest { |
... | ... | |
29 | 29 |
private String resultFormatParam = "format"; |
30 | 30 |
private String resultFormatValue = "json"; |
31 | 31 |
private String resultSizeParam = "size"; |
32 |
private String resultSizeValue = "10"; |
|
32 | 33 |
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; |
33 | 34 |
private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; |
34 | 35 |
// private String query = "=(sources:engrXiv AND type:preprint)"; |
... | ... | |
47 | 48 |
params.put("resultFormatParam", resultFormatParam); |
48 | 49 |
params.put("resultFormatValue", resultFormatValue); |
49 | 50 |
params.put("resultSizeParam", resultSizeParam); |
51 |
params.put("resultSizeValue", resultSizeValue); |
|
50 | 52 |
params.put("queryParams", query); |
51 | 53 |
params.put("entityXpath", entityXpath); |
52 | 54 |
|
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/rest/RestIteratorTest.java | ||
---|---|---|
7 | 7 |
import org.junit.Test; |
8 | 8 |
|
9 | 9 |
/** |
10 |
* @author js |
|
11 |
* |
|
10 |
* |
|
11 |
* @author js, Andreas Czerniak |
|
12 |
* @date 2018-08-06 |
|
12 | 13 |
*/ |
13 | 14 |
public class RestIteratorTest { |
14 | 15 |
|
... | ... | |
19 | 20 |
private String resultTotalXpath = "//hits/total"; |
20 | 21 |
private String entityXpath = "//hits/hits"; |
21 | 22 |
private String resultFormatParam = "format"; |
22 |
private String resultFormatValue = "json";
|
|
23 |
private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase
|
|
23 | 24 |
private String resultSizeParam = "size"; |
25 |
private String resultSizeValue = "10"; // add new |
|
24 | 26 |
private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; |
25 | 27 |
|
26 | 28 |
|
27 | 29 |
@Ignore |
28 | 30 |
@Test |
29 | 31 |
public void test(){ |
30 |
final RestIterator iterator = new RestIterator(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, query, entityXpath); |
|
32 |
final RestIterator iterator = new RestIterator(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, query, entityXpath);
|
|
31 | 33 |
int i =20; |
32 | 34 |
while (iterator.hasNext() && i > 0) { |
33 | 35 |
String result = iterator.next(); |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestCollectorPlugin.java | ||
---|---|---|
9 | 9 |
import org.apache.commons.lang3.StringUtils; |
10 | 10 |
|
11 | 11 |
/** |
12 |
* @author js |
|
12 |
* @author js, Andreas Czerniak
|
|
13 | 13 |
* |
14 | 14 |
*/ |
15 | 15 |
public class RestCollectorPlugin extends AbstractCollectorPlugin { |
... | ... | |
25 | 25 |
final String resultFormatParam = ifDescriptor.getParams().get("resultFormatParam"); |
26 | 26 |
final String resultFormatValue = ifDescriptor.getParams().get("resultFormatValue"); |
27 | 27 |
final String resultSizeParam = ifDescriptor.getParams().get("resultSizeParam"); |
28 |
final String queryParams = ifDescriptor.getParams().get("queryParams"); |
|
28 |
final String resultSizeValue = (StringUtils.isBlank(ifDescriptor.getParams().get("resultSizeValue"))) ? "100" : ifDescriptor.getParams().get("resultSizeValue"); |
|
29 |
final String queryParams = ifDescriptor.getParams().get("queryParams"); |
|
29 | 30 |
final String entityXpath = ifDescriptor.getParams().get("entityXpath"); |
30 | 31 |
|
31 | 32 |
if (StringUtils.isBlank(baseUrl)) {throw new CollectorServiceException("Param 'baseUrl' is null or empty");} |
... | ... | |
33 | 34 |
if (StringUtils.isBlank(resumptionParam)) {throw new CollectorServiceException("Param 'resumptionParam' is null or empty");} |
34 | 35 |
// if (StringUtils.isBlank(resumptionXpath)) {throw new CollectorServiceException("Param 'resumptionXpath' is null or empty");} |
35 | 36 |
// if (StringUtils.isBlank(resultTotalXpath)) {throw new CollectorServiceException("Param 'resultTotalXpath' is null or empty");} |
36 |
if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null or empty");} |
|
37 |
// resultFormatParam may be empty because some REST APIs do not accept this argument in the query |
|
38 |
//if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null, empty or whitespace");} |
|
37 | 39 |
if (StringUtils.isBlank(resultFormatValue)) {throw new CollectorServiceException("Param 'resultFormatValue' is null or empty");} |
38 | 40 |
if (StringUtils.isBlank(resultSizeParam)) {throw new CollectorServiceException("Param 'resultSizeParam' is null or empty");} |
39 |
if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
|
|
41 |
if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
|
|
40 | 42 |
if (StringUtils.isBlank(entityXpath)) {throw new CollectorServiceException("Param 'entityXpath' is null or empty");} |
41 | 43 |
|
42 | 44 |
return () -> new RestIterator( |
... | ... | |
48 | 50 |
resultFormatParam, |
49 | 51 |
resultFormatValue, |
50 | 52 |
resultSizeParam, |
53 |
resultSizeValue, |
|
51 | 54 |
queryParams, |
52 | 55 |
entityXpath); |
53 | 56 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestIterator.java | ||
---|---|---|
34 | 34 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
35 | 35 |
|
36 | 36 |
/** |
37 |
* @author Jochen Schirrwagen, Aenne Loehden |
|
37 |
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak |
|
38 |
* @date 2018-08-06 |
|
38 | 39 |
* |
39 | 40 |
*/ |
40 | 41 |
public class RestIterator implements Iterator<String> { |
... | ... | |
47 | 48 |
private String resumptionParam; |
48 | 49 |
private String resultFormatValue; |
49 | 50 |
private String queryParams; |
50 |
private int resultSizeValue = 100;
|
|
51 |
private int resultSizeValue; |
|
51 | 52 |
private int resumptionInt = 0; // integer resumption token (first record to harvest) |
52 | 53 |
private int resultTotal = -1; |
53 | 54 |
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest or token scanned from results) |
... | ... | |
71 | 72 |
final String resultFormatParam, |
72 | 73 |
final String resultFormatValue, |
73 | 74 |
final String resultSizeParam, |
75 |
final String resultSizeValue, |
|
74 | 76 |
final String queryParams, |
75 | 77 |
final String entityXpath |
76 | 78 |
) { |
... | ... | |
93 | 95 |
|
94 | 96 |
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{ |
95 | 97 |
transformer = TransformerFactory.newInstance().newTransformer(); |
96 |
transformer.setOutputProperty(OutputKeys.INDENT,"yes"); |
|
97 |
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3"); |
|
98 |
xpath = XPathFactory.newInstance().newXPath(); |
|
98 |
transformer.setOutputProperty(OutputKeys.INDENT,"yes");
|
|
99 |
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
|
|
100 |
xpath = XPathFactory.newInstance().newXPath();
|
|
99 | 101 |
xprResultTotalPath = xpath.compile(resultTotalXpath); |
100 |
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); |
|
101 |
xprEntity = xpath.compile(entityXpath); |
|
102 |
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
|
103 |
xprEntity = xpath.compile(entityXpath);
|
|
102 | 104 |
} |
103 | 105 |
|
104 | 106 |
private void initQueue() { |
... | ... | |
149 | 151 |
String nextQuery = ""; |
150 | 152 |
try { |
151 | 153 |
resultStream = new URL(query).openStream(); |
152 |
if("json".equals(resultFormatValue)){ |
|
154 |
if("json".equals(resultFormatValue.toLowerCase())){
|
|
153 | 155 |
resultJson = IOUtils.toString(resultStream,"UTF-8"); |
154 | 156 |
|
155 | 157 |
//TODO move regex definitions as constant fields |
156 | 158 |
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) |
157 |
while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){ |
|
158 |
resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); |
|
159 |
} |
|
159 |
resultJson = syntaxConvertJsonKeyNamens(resultJson); |
|
160 |
// while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){ |
|
161 |
// resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); |
|
162 |
// } |
|
160 | 163 |
org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson); |
161 | 164 |
resultXml = org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element |
162 | 165 |
// log.info(resultXml); |
... | ... | |
173 | 176 |
} |
174 | 177 |
|
175 | 178 |
resumptionInt += resultSizeValue; |
176 |
if("scan".equals(resumptionType)) { resumptionStr = xprResumptionPath.evaluate(resultNode);} |
|
177 |
if("count".equals(resumptionType)){ resumptionStr = Integer.toString(resumptionInt); } |
|
178 |
|
|
179 |
|
|
180 |
switch(resumptionType.toLowerCase()) { |
|
181 |
case "scan": |
|
182 |
resumptionStr = xprResumptionPath.evaluate(resultNode); |
|
183 |
break; |
|
184 |
case "count": |
|
185 |
resumptionStr = Integer.toString(resumptionInt); |
|
186 |
break; |
|
187 |
case "discover": |
|
188 |
String emptyXml = "<"+wrapName+"></"+wrapName+">"; |
|
189 |
if( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) { |
|
190 |
resumptionStr = ""; |
|
191 |
resultTotal = resumptionInt; |
|
192 |
} else { |
|
193 |
resumptionStr = Integer.toString(resumptionInt); |
|
194 |
resultTotal = resumptionInt+1; |
|
195 |
} |
|
196 |
break; |
|
197 |
default: |
|
198 |
} |
|
199 |
/* if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);} |
|
200 |
if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); } |
|
201 |
*/ |
|
179 | 202 |
if (resultTotal == -1) { |
180 | 203 |
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); |
181 | 204 |
log.info("resultTotal: " + resultTotal); |
... | ... | |
193 | 216 |
throw new IllegalStateException("collection failed: " + e.getMessage()); |
194 | 217 |
} |
195 | 218 |
} |
219 |
|
|
220 |
/** |
|
221 |
* Converts JSON key names: replaces whitespace and '/' with '_', and removes '(' and ')' |
|
222 |
* |
|
223 |
* @param jsonInput |
|
224 |
* @return |
|
225 |
*/ |
|
226 |
private String syntaxConvertJsonKeyNamens(String jsonInput) { |
|
196 | 227 |
|
228 |
// replace whitespace in JSON names with '_' |
|
229 |
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { |
|
230 |
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); |
|
231 |
} |
|
232 |
|
|
233 |
// replace forward-slash (sign '/' ) in JSON Names with '_' |
|
234 |
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { |
|
235 |
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); |
|
236 |
} |
|
237 |
|
|
238 |
// replace '(' in JSON Names with '' |
|
239 |
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { |
|
240 |
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); |
|
241 |
} |
|
242 |
// replace ')' in JSON Names with '' |
|
243 |
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { |
|
244 |
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); |
|
245 |
} |
|
246 |
|
|
247 |
return jsonInput; |
|
248 |
} |
|
249 |
|
|
197 | 250 |
} |
modules/dnet-collector-plugins/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml | ||
---|---|---|
99 | 99 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter" |
100 | 100 |
p:name="resultSizeParam" /> |
101 | 101 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter" |
102 |
p:name="resultSizeValue" /> |
|
103 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter" |
|
102 | 104 |
p:name="queryParams" /> |
103 | 105 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter" |
104 | 106 |
p:name="entityXpath" /> |
Also available in: Unified diff
Changes to the Rest_Json CollectorPlugin, with enhancements for the new OpenDOAR API provided by JISC at https://v2.sherpa.ac.uk/opendoar/