Project

General

Profile

« Previous | Next » 

Revision 52970

Added by Andreas Czerniak over 5 years ago

Changes to the Rest_Json CollectorPlugin, with enhancements to support the new OpenDOAR API at JISC (https://v2.sherpa.ac.uk/opendoar/)

View differences:

modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/rest/RestCollectorPluginTest.java
15 15
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;
16 16

  
17 17
/**
18
 * @author js
18
 * @author js, Andreas Czerniak
19 19
 *
20 20
 */
21 21
public class RestCollectorPluginTest {
......
29 29
	private String resultFormatParam = "format";
30 30
	private String resultFormatValue = "json";
31 31
	private String resultSizeParam = "size";
32
        private String resultSizeValue = "10";
32 33
	// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
33 34
	private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
34 35
    // private String query = "=(sources:engrXiv AND type:preprint)";
......
47 48
		params.put("resultFormatParam", resultFormatParam);
48 49
		params.put("resultFormatValue", resultFormatValue);
49 50
		params.put("resultSizeParam", resultSizeParam);
51
                params.put("resultSizeValue", resultSizeValue);
50 52
		params.put("queryParams", query);
51 53
		params.put("entityXpath", entityXpath);
52 54
		
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/rest/RestIteratorTest.java
7 7
import org.junit.Test;
8 8

  
9 9
/**
10
 * @author js
11
 *
10
 * 
11
 * @author js, Andreas Czerniak
12
 * @date 2018-08-06
12 13
 */
13 14
public class RestIteratorTest {
14 15

  
......
19 20
	private String resultTotalXpath = "//hits/total";
20 21
	private String entityXpath = "//hits/hits";
21 22
	private String resultFormatParam = "format";
22
	private String resultFormatValue = "json";
23
	private String resultFormatValue = "Json";              //  Change from lowerCase to one UpperCase
23 24
	private String resultSizeParam = "size";
25
        private String resultSizeValue = "10";                  //  add  new
24 26
	private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
25 27

  
26 28

  
27 29
	@Ignore
28 30
	@Test
29 31
	public void test(){
30
		final RestIterator iterator = new RestIterator(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, query, entityXpath);
32
		final RestIterator iterator = new RestIterator(baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, query, entityXpath);
31 33
		int i =20;
32 34
		while (iterator.hasNext() && i > 0) {
33 35
			String result = iterator.next();
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestCollectorPlugin.java
9 9
import org.apache.commons.lang3.StringUtils;
10 10

  
11 11
/**
12
 * @author js
12
 * @author js, Andreas Czerniak
13 13
 *
14 14
 */
15 15
public class RestCollectorPlugin extends AbstractCollectorPlugin {
......
25 25
		final String resultFormatParam = ifDescriptor.getParams().get("resultFormatParam");
26 26
		final String resultFormatValue = ifDescriptor.getParams().get("resultFormatValue");
27 27
		final String resultSizeParam = ifDescriptor.getParams().get("resultSizeParam");
28
		final String queryParams = ifDescriptor.getParams().get("queryParams");
28
		final String resultSizeValue = (StringUtils.isBlank(ifDescriptor.getParams().get("resultSizeValue"))) ? "100" : ifDescriptor.getParams().get("resultSizeValue");
29
                final String queryParams = ifDescriptor.getParams().get("queryParams");
29 30
		final String entityXpath = ifDescriptor.getParams().get("entityXpath");
30 31
		
31 32
		if (StringUtils.isBlank(baseUrl)) {throw new CollectorServiceException("Param 'baseUrl' is null or empty");}
......
33 34
		if (StringUtils.isBlank(resumptionParam)) {throw new CollectorServiceException("Param 'resumptionParam' is null or empty");}
34 35
		// if (StringUtils.isBlank(resumptionXpath)) {throw new CollectorServiceException("Param 'resumptionXpath' is null or empty");}
35 36
		// if (StringUtils.isBlank(resultTotalXpath)) {throw new CollectorServiceException("Param 'resultTotalXpath' is null or empty");}
36
		if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null or empty");}
37
		// resultFormatParam can be emtpy because some Rest-APIs doesn't like this argument in the query
38
                //if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null, empty or whitespace");}
37 39
		if (StringUtils.isBlank(resultFormatValue)) {throw new CollectorServiceException("Param 'resultFormatValue' is null or empty");}
38 40
		if (StringUtils.isBlank(resultSizeParam)) {throw new CollectorServiceException("Param 'resultSizeParam' is null or empty");}
39
		if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
41
                if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
40 42
		if (StringUtils.isBlank(entityXpath)) {throw new CollectorServiceException("Param 'entityXpath' is null or empty");}
41 43
		
42 44
		return () -> new RestIterator(
......
48 50
				resultFormatParam,
49 51
				resultFormatValue,
50 52
				resultSizeParam,
53
                                resultSizeValue,
51 54
				queryParams,
52 55
				entityXpath);
53 56
	}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestIterator.java
34 34
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
35 35

  
36 36
/**
37
 * @author Jochen Schirrwagen, Aenne Loehden
37
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
38
 * @date 2018-08-06
38 39
 *
39 40
 */
40 41
public class RestIterator implements Iterator<String> {
......
47 48
	private String resumptionParam;
48 49
	private String resultFormatValue;
49 50
	private String queryParams;
50
	private int resultSizeValue = 100;
51
	private int resultSizeValue;
51 52
	private int resumptionInt = 0;			// integer resumption token (first record to harvest)
52 53
	private int resultTotal = -1;
53 54
	private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
......
71 72
			final String resultFormatParam,
72 73
			final String resultFormatValue,
73 74
			final String resultSizeParam,
75
                        final String resultSizeValue,
74 76
			final String queryParams,
75 77
			final String entityXpath
76 78
			) {
......
93 95
	
94 96
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) throws TransformerConfigurationException, XPathExpressionException{
95 97
		transformer = TransformerFactory.newInstance().newTransformer();
96
        transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
97
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
98
		xpath = XPathFactory.newInstance().newXPath();
98
                transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
99
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount","3");
100
		xpath              = XPathFactory.newInstance().newXPath();
99 101
		xprResultTotalPath = xpath.compile(resultTotalXpath);
100
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
101
		xprEntity = xpath.compile(entityXpath);
102
		xprResumptionPath  = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
103
		xprEntity          = xpath.compile(entityXpath);
102 104
	}
103 105
	
104 106
	private void initQueue() {
......
149 151
		String nextQuery = "";
150 152
		try {
151 153
            resultStream = new URL(query).openStream();
152
			if("json".equals(resultFormatValue)){				
154
			if("json".equals(resultFormatValue.toLowerCase())){				
153 155
				resultJson = IOUtils.toString(resultStream,"UTF-8");
154 156

  
155 157
				//TODO move regex definitions as constant fields
156 158
				// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
157
				while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){
158
					resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
159
				}
159
				resultJson = syntaxConvertJsonKeyNamens(resultJson);
160
//                                while(resultJson.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")){
161
//					resultJson = resultJson.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
162
//				}
160 163
				org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
161 164
				resultXml = org.json.XML.toString(jsonObject,wrapName); // wrap xml in single root element
162 165
//				log.info(resultXml);
......
173 176
			}
174 177
				
175 178
			resumptionInt += resultSizeValue;
176
			if("scan".equals(resumptionType)) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
177
			if("count".equals(resumptionType)){ resumptionStr = Integer.toString(resumptionInt); }
178

  
179
                        
180
                        switch(resumptionType.toLowerCase()) {
181
                            case "scan":
182
                                            resumptionStr = xprResumptionPath.evaluate(resultNode);
183
                                            break;
184
                            case "count":
185
                                            resumptionStr = Integer.toString(resumptionInt);
186
                                            break;
187
                            case "discover":
188
                                            String emptyXml = "<"+wrapName+"></"+wrapName+">";
189
                                            if( (emptyXml.toLowerCase()).equals(resultXml.toLowerCase()) ) {
190
                                                resumptionStr = "";
191
                                                resultTotal   = resumptionInt;
192
                                            } else {
193
                                                resumptionStr = Integer.toString(resumptionInt);
194
                                                resultTotal   = resumptionInt+1;
195
                                            }
196
                                            break;
197
                            default:
198
                        }
199
/*			if("scan".equals(resumptionType.toLowerCase())) { resumptionStr = xprResumptionPath.evaluate(resultNode);}
200
			if("count".equals(resumptionType.toLowerCase())){ resumptionStr = Integer.toString(resumptionInt); }
201
*/
179 202
			if (resultTotal == -1) {
180 203
				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
181 204
				log.info("resultTotal: " + resultTotal);
......
193 216
			throw new IllegalStateException("collection failed: " + e.getMessage());
194 217
		}
195 218
	}
219
        
220
        /**
221
         * convert in Json-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
222
         * 
223
         * @param jsonInput
224
         * @return 
225
         */
226
        private String syntaxConvertJsonKeyNamens(String jsonInput) {
196 227

  
228
            // replace ' 's in JSON Namens with '_'
229
            while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
230
                jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
231
            }
232

  
233
            // replace forward-slash (sign '/' ) in JSON Names with '_'
234
            while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
235
                jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
236
            }
237

  
238
            // replace '(' in JSON Names with ''
239
            while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
240
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
241
            }
242
            // replace ')' in JSON Names with ''
243
            while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
244
                jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
245
            }
246

  
247
            return jsonInput;
248
        }
249

  
197 250
}
modules/dnet-collector-plugins/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml
99 99
						<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter"
100 100
							p:name="resultSizeParam" />					
101 101
						<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter"
102
							p:name="resultSizeValue" />
103
                                                <bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter"
102 104
							p:name="queryParams" />					
103 105
						<bean class="eu.dnetlib.data.collector.rmi.ProtocolParameter"
104 106
							p:name="entityXpath" />

Also available in: Unified diff