Project

General

Profile

1
package eu.dnetlib.validator.service.impls.providers;
2

    
3
import java.io.IOException;
4
import java.io.StringReader;
5
import java.net.MalformedURLException;
6
import java.net.URL;
7
import java.net.URLEncoder;
8
import java.util.ArrayList;
9
import java.util.List;
10

    
11
import javax.xml.parsers.DocumentBuilder;
12
import javax.xml.parsers.DocumentBuilderFactory;
13
import javax.xml.parsers.ParserConfigurationException;
14
import javax.xml.xpath.XPath;
15
import javax.xml.xpath.XPathConstants;
16
import javax.xml.xpath.XPathExpression;
17
import javax.xml.xpath.XPathExpressionException;
18
import javax.xml.xpath.XPathFactory;
19

    
20
import org.springframework.beans.factory.annotation.Value;
21
import org.w3c.dom.Document;
22
import org.w3c.dom.Element;
23
import org.w3c.dom.Node;
24
import org.w3c.dom.NodeList;
25
import org.w3c.dom.ls.DOMImplementationLS;
26
import org.w3c.dom.ls.LSSerializer;
27
import org.xml.sax.InputSource;
28
import org.xml.sax.SAXException;
29

    
30
import se.kb.oai.pmh.OaiPmhServer;
31
import se.kb.oai.pmh.ResumptionToken;
32
import se.kb.oai.pmh.Set;
33
import se.kb.oai.pmh.SetsList;
34
import eu.dnetlib.validator.engine.data.DataException;
35
import eu.dnetlib.validator.engine.data.Provider;
36
import eu.dnetlib.validator.engine.data.ResultSet;
37
import eu.dnetlib.validator.engine.execution.ValidationObject;
38
import eu.dnetlib.validator.service.impls.valobjs.XMLTextValidationObject;
39

    
40
/**
 * A provider that retrieves records from an OAI-PMH repository. Resumption
 * tokens are handled transparently.
 * 
 * @author Manos Karvounis
 * 
 */
47

    
48

    
49
public class OAIPMHRecordProviderNew extends Provider {
50
	
51
	private static final long serialVersionUID = 3386029339653670731L;
52

    
53
	@Value("${services.validator.provider.timeout}")
54
	private int timeout;
55
	private int delay;
56
	private int retryDelay;
57
	private int retryEfforts;
58
	
59
	public static final String BASEURL = "BASEURL";
60
	public static final String METADATA_PREFIX = "metadataPrefix";
61
	/**
62
	 * optional
63
	 */
64
	public static final String FROM = "from";
65
	/**
66
	 * optional
67
	 */
68
	public static final String UNTIL = "until";
69
	/**
70
	 * optional
71
	 */
72
	public static final String SET = "set";
73

    
74
	/**
75
	 * The maximum time to wait for a response from the repository (in millis)
76
	 */
77
//	public static final String TIMEOUT = "TIMEOUT";
78
	/**
79
	 * How much time to wait between consecutive HTTP requests to the repository
80
	 * (in millis).
81
	 */
82
//	public static final String DELAY = "DELAY";
83
	/**
84
	 * How much to wait if an HTTP request fails before trying again by
85
	 * resending the request.
86
	 */
87
//	public static final String RETRY_DELAY = "RETRY_DELAY";
88
	/**
89
	 * If an HTTP requests fails, how many times to try to resend the request.
90
	 */
91
//	public static final String RETRY_EFFORTS = "RETRY_EFFORTS";
92
	/**
93
	 * How many records to test.
94
	 */
95
	public static final String RECORDS = "records";
96
	
97
	/**
	 * Creates the provider, handing the fixed value {@code 1} to the
	 * {@link Provider} super constructor.
	 */
	public OAIPMHRecordProviderNew() {
		super(1);
	}
100

    
101
	@Override
102
	public ResultSet<ValidationObject> getValidationObjects() throws ProviderException {
103
		return new OAIPMHRecordResultSet(this);
104
	}
105

    
106
	@Override
107
	public ResultSet<String> getValidationObjectIds() throws ProviderException {
108
		return new OAIPMHRecordIdentifierResultSet(this);
109
	}
110

    
111
	@Override
112
	public ValidationObject getValidationObject(String valObjId) throws ProviderException {
113
		OAIPMHResultSet oai = new OAIPMHResultSet(this);
114
		try {
115
			return new XMLTextValidationObject(oai.getRecord(valObjId));
116
		} catch (DataException e) {
117
			log.error("", e);
118
			throw new ProviderException();
119
		}
120
	}
121

    
122
	private class OAIPMHResultSet {
123
		protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
124
		protected DocumentBuilder builder;
125
		protected XPathFactory xfactory = XPathFactory.newInstance();
126
		protected NodeList recordIds = null;
127
		protected NodeList records = null;
128
		protected int index = -1;
129
		protected String resumptionToken = null;
130
		protected String error = null;
131
		private URLStreamer streamer = new URLStreamer();
132
		
133

    
134
		public OAIPMHResultSet(OAIPMHRecordProviderNew prv) {
135
			super();
136
			try {
137
				builder = factory.newDocumentBuilder();
138
			} catch (ParserConfigurationException e) {
139
				log.error("", e);
140
			}
141
		}
142

    
143
		protected NodeList getIds() throws DataException {
144
			NodeList recordIds = null;
145
			String surl = pros.getProperty(BASEURL) + "?verb=ListIdentifiers";
146
			if (resumptionToken == null || resumptionToken.trim().length() == 0) {
147
				surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
148
				if (pros.getProperty(FROM) != null)
149
					surl += "&from=" + pros.getProperty(FROM);
150
				if (pros.getProperty(UNTIL) != null)
151
					surl += "&until=" + pros.getProperty(UNTIL);
152
				if (!pros.getProperty(SET).equals("none"))
153
					surl += "&set=" + pros.getProperty(SET);
154
			} else {
155
				surl += "&resumptionToken=" + resumptionToken;
156
			}
157
			
158
			log.debug("Issuing request "+surl);
159
			
160
			String response = null;
161
			try {
162
				response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
163
			} catch (NumberFormatException e) {
164
				log.error("", e);
165
				throw new DataException();
166
			} catch (MalformedURLException e) {
167
				log.error("", e);
168
				throw new DataException();
169
			} catch (IOException e) {
170
				log.error("", e);
171
				throw new DataException();
172
			}
173
			try {
174
				InputSource is = new InputSource(new StringReader(response));
175
				Document doc = builder.parse(is);
176
				XPath xpath = xfactory.newXPath();
177
				XPathExpression expr = xpath.compile("OAI-PMH/ListIdentifiers/header/identifier/text()");
178
				recordIds = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
179
				XPath xpath2 = xfactory.newXPath();
180
				XPathExpression expr2 = xpath2.compile("OAI-PMH/ListIdentifiers/resumptionToken/text()");
181
				NodeList rtl = (NodeList) expr2.evaluate(doc, XPathConstants.NODESET);
182
				if (rtl == null || rtl.getLength() == 0) {
183
					log.debug("There seems to be no resumption token present");
184
					resumptionToken = null;
185
				}
186
				else {
187
					resumptionToken = rtl.item(0).getNodeValue();
188
					resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
189
					log.debug("Found resumption token: "+resumptionToken);
190
				}
191
			} catch (Exception e) {
192
				log.error("", e);
193
				throw new DataException();
194
			}
195
			return recordIds;
196
		}
197

    
198
		protected NodeList getRecords() throws DataException {
199
			NodeList records = null;
200
			String surl = pros.getProperty(BASEURL) + "?verb=ListRecords";
201
			if (resumptionToken == null || resumptionToken.trim().length() == 0) {
202
				surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
203
				if (pros.getProperty(FROM) != null)
204
					surl += "&from=" + pros.getProperty(FROM);
205
				if (pros.getProperty(UNTIL) != null)
206
					surl += "&until=" + pros.getProperty(UNTIL);
207
				if (!pros.getProperty(SET).equals("none"))
208
					surl += "&set=" + pros.getProperty(SET);
209
			} else {
210
				surl += "&resumptionToken=" + resumptionToken;
211
			}
212
			String setError=null;
213
			if (!pros.getProperty(SET).equals("none")) {
214
				OaiPmhServer harvester = new OaiPmhServer(pros.getProperty(BASEURL));
215
				try {
216
					SetsList setList = harvester.listSets();
217
					ResumptionToken token = setList.getResumptionToken();
218
					List<Set> sets = new ArrayList<Set>();
219
					sets.addAll(setList.asList());
220
					while (token != null) {
221
						setList = harvester.listSets(token);
222
						token = setList.getResumptionToken();
223
						sets.addAll(setList.asList());
224
					}
225
					List<String> ret = new ArrayList<String>();
226
					for (Set set : sets) {
227
						ret.add(set.getSpec().trim());
228
					}
229
					if (!ret.contains(pros.getProperty(SET))){
230
						error =  "Set: <b>'" + pros.getProperty(SET) + "'</b> is not exposed by the repository. \n Please make sure that 'ListSets' verb is configured correctly on your server as well as that the exposed sets list includes this set.";
231
						setError = error;
232
					}
233
				} catch (Exception e) {
234
					log.error("error getting sets from url: " + pros.getProperty(BASEURL), e);
235
				}
236
			}
237
			log.debug("Issuing request "+surl);
238
			
239
			String response = null;
240
			try {
241
				response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
242
			} catch (NumberFormatException e) {
243
				log.error("", e);
244
				throw new DataException();
245
			} catch (MalformedURLException e) {
246
				log.error("", e);
247
				throw new DataException();
248
			} catch (IOException e) {
249
				log.error("", e);
250
				throw new DataException();
251
			}
252
			try {
253
				InputSource is = new InputSource(new StringReader(response));
254
				Document doc = builder.parse(is);
255
				XPath xpath = xfactory.newXPath();
256
//				xpath.setNamespaceContext(new NamespaceResolver(doc));
257
				XPathExpression expr = xpath.compile("OAI-PMH/ListRecords/record");
258
				records = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
259
				XPath xpath2 = xfactory.newXPath();
260
				XPathExpression expr2 = xpath2.compile("OAI-PMH/ListRecords/resumptionToken/text()");
261
				NodeList rtl = (NodeList) expr2.evaluate(doc, XPathConstants.NODESET);
262
				log.debug("Check number of records: "+ records.getLength());
263
				if (records.getLength() == 0) {
264
					log.debug("There are no records: "+ records.getLength());
265
					XPath xpath3 = xfactory.newXPath();
266
					XPathExpression expr3 = xpath3.compile("OAI-PMH/error/text()");
267
					error = "The response on request: <a href=\""+ surl + "\">"+surl+"</a> was the following: ";
268
					error += "<br><b>" + (String) expr3.evaluate(doc, XPathConstants.STRING) + "</b>";
269
					if (setError != null) {
270
						error += "<br>" + setError;
271
					}
272
					log.debug("Error: "+ error);
273
				}
274
				
275
				if (rtl == null || rtl.getLength() == 0) {
276
					log.debug("There seems to be no resumption token present");
277
					resumptionToken = null;
278
				}
279
				else {
280
					resumptionToken = rtl.item(0).getNodeValue();
281
					resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
282
					log.debug("Found resumption token: "+resumptionToken);
283
				}
284
			} catch (Exception e) {
285
				log.error("", e);
286
				throw new DataException();
287
			}
288
			return records;
289
		}
290
		
291
		protected Document getRecord(String id) throws DataException {
292
			String surl = pros.getProperty(BASEURL) + "?verb=GetRecord";
293
			surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
294
			surl += "&identifier=" + id;
295
			
296
			log.debug("Issuing request: "+surl);
297
			
298
			String response = null;
299
			try {
300
				response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
301
			} catch (NumberFormatException e) {
302
				log.error("", e);
303
				throw new DataException();
304
			} catch (MalformedURLException e) {
305
				log.error("", e);
306
				throw new DataException();
307
			} catch (IOException e) {
308
				log.error("", e);
309
				throw new DataException();
310
			}
311

    
312
			InputSource is = new InputSource(new StringReader(response));
313
			Document doc;
314
			try {
315
				doc = builder.parse(is);
316
			} catch (SAXException e) {
317
				log.error("", e);
318
				throw new DataException();
319
			} catch (IOException e) {
320
				log.error("", e);
321
				throw new DataException();
322
			}
323

    
324
			return doc;
325
		}
326
	}
327

    
328
	private class OAIPMHRecordResultSet extends OAIPMHResultSet implements ResultSet<ValidationObject> {
329

    
330
		public OAIPMHRecordResultSet(OAIPMHRecordProviderNew prv) {
331
			super(prv);
332
			// TODO Auto-generated constructor stub
333
		}
334

    
335
		@Override
336
		public String getError() {
337
			if (error != null)
338
				log.debug("An error occured "+ this.error);
339
			else
340
				log.debug("No errors on request");
341
			return this.error;
342
		}
343
		
344
		@Override
345
		public boolean next() throws DataException {
346
			index++;
347

    
348
			log.debug("Moving cursor to result "+index);
349
			if (records == null || index >= records.getLength()) {
350
				// if we have previously received some results and there no more to take
351
				if (records != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
352
					return false;
353
				index = -1;
354
				records = getRecords();
355
				return next();
356
			}
357
			return true;
358
		}
359

    
360
		@Override
361
		public ValidationObject get() throws DataException {
362
			XMLTextValidationObject ret = null;
363
			
364
			Document newXmlDocument;
365
			try {
366
				newXmlDocument = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
367

    
368
	        Element root = newXmlDocument.createElement("root");
369
	        newXmlDocument.appendChild(root);
370
	            Node node = records.item(index);
371
	            Node copyNode = newXmlDocument.importNode(node, true);
372
	            root.appendChild(copyNode);
373
//	            printXmlDocument(newXmlDocument);
374
	            ret = new XMLTextValidationObject(newXmlDocument);
375
	            XPathFactory factory = XPathFactory.newInstance();
376
	            XPath xPath = factory.newXPath();
377
	            ret.setId(xPath.evaluate("header/identifier", records.item(index)));
378
	            ret.setStatus(xPath.evaluate("header/@status", records.item(index)));
379
	           
380
			} catch (ParserConfigurationException e) {
381
				log.error("error getting object"+ e);
382
			} catch (XPathExpressionException e) {
383
				log.error("error getting object"+ e);
384
			}			
385
			return ret;
386
		}
387

    
388
	}
389
	
390
	public static void printXmlDocument(Document document) {
391
	    DOMImplementationLS domImplementationLS = 
392
	        (DOMImplementationLS) document.getImplementation();
393
	    LSSerializer lsSerializer = 
394
	        domImplementationLS.createLSSerializer();
395
	    String string = lsSerializer.writeToString(document);
396
	    System.out.println(string);
397
	}
398

    
399
	private class OAIPMHRecordIdentifierResultSet extends OAIPMHResultSet implements ResultSet<String> {
400

    
401
		public OAIPMHRecordIdentifierResultSet(OAIPMHRecordProviderNew prv) {
402
			super(prv);
403
			// TODO Auto-generated constructor stub
404
		}
405

    
406
		@Override
407
		public boolean next() throws DataException {
408
			index++;
409
			log.debug("Moving cursor to result "+index);
410
			if (recordIds == null || index >= recordIds.getLength()) {
411
				// if we have previously received some results and there no more to take
412
				if (recordIds != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
413
					return false;
414
				index = -1;
415
				recordIds = getIds();
416
				return next();
417
			}
418
			return true;
419
		}
420

    
421
		@Override
422
		public String get() throws DataException {
423
			String id = recordIds.item(index).getNodeValue();
424

    
425
			log.debug("Returing object with id "+id);
426
			
427
			return id;
428
		}
429

    
430
		@Override
431
		public String getError() {
432
			// TODO Auto-generated method stub
433
			return null;
434
		}
435

    
436
	}
437

    
438
	@Override
439
	public ResultSet<ValidationObject> getValidationObjects(String entity)
440
			throws ProviderException {
441
		// TODO Auto-generated method stub
442
		return null;
443
	}
444

    
445
	public Integer getTimeout() {
446
		return timeout;
447
	}
448

    
449
	public void setTimeout(Integer timeout) {
450
		this.timeout = timeout;
451
	}
452

    
453
	public Integer getDelay() {
454
		return delay;
455
	}
456

    
457
	public void setDelay(Integer delay) {
458
		this.delay = delay;
459
	}
460

    
461
	public Integer getRetryDelay() {
462
		return retryDelay;
463
	}
464

    
465
	public void setRetryDelay(Integer retryDelay) {
466
		this.retryDelay = retryDelay;
467
	}
468

    
469
	public Integer getRetryEfforts() {
470
		return retryEfforts;
471
	}
472

    
473
	public void setRetryEfforts(Integer retryEfforts) {
474
		this.retryEfforts = retryEfforts;
475
	}
476

    
477
	/*
478
	class NamespaceResolver implements NamespaceContext {
479
		
480
		private static final String OAI_NS = "http://www.openarchives.org/OAI/2.0/";
481
		private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";
482
		private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
483
		
484
		private final Document document;
485
		
486
		public NamespaceResolver(Document document) {
487
			this.document = document;
488
		}
489
		
490
		public String getNamespaceURI(String prefix) {
491
			log.debug("prefix: " + prefix);
492
			if ("".equals(prefix) || "oai".equals(prefix)){
493
				return OAI_NS;                     
494
			}else if ("oai_dc".equals(prefix)){
495
				return OAI_DC_NS;
496
			}else if ("dc".equals(prefix)){
497
				return DC_NS;
498
			}
499
			return "";
500
		}
501
		
502
		public String getPrefix(String namespaceURI) {
503
			log.debug("prefix: " + namespaceURI);
504
			return "";
505
		}
506
		
507
		@SuppressWarnings("rawtypes")
508
		public Iterator getPrefixes(String namespaceURI) {
509
			// not implemented
510
			return null;
511
		}
512
		
513
	}	*/
514
	
515
}
(7-7/11)