Project

General

Profile

1
package eu.dnetlib.validator.service.impls.providers;
2

    
3
import java.io.IOException;
4
import java.io.StringReader;
5
import java.net.MalformedURLException;
6
import java.net.URL;
7
import java.net.URLEncoder;
8
import java.util.ArrayList;
9
import java.util.List;
10

    
11
import javax.xml.parsers.DocumentBuilder;
12
import javax.xml.parsers.DocumentBuilderFactory;
13
import javax.xml.parsers.ParserConfigurationException;
14
import javax.xml.xpath.XPath;
15
import javax.xml.xpath.XPathConstants;
16
import javax.xml.xpath.XPathExpression;
17
import javax.xml.xpath.XPathExpressionException;
18
import javax.xml.xpath.XPathFactory;
19

    
20
import org.w3c.dom.Document;
21
import org.w3c.dom.Element;
22
import org.w3c.dom.Node;
23
import org.w3c.dom.NodeList;
24
import org.w3c.dom.ls.DOMImplementationLS;
25
import org.w3c.dom.ls.LSSerializer;
26
import org.xml.sax.InputSource;
27
import org.xml.sax.SAXException;
28

    
29
import se.kb.oai.pmh.OaiPmhServer;
30
import se.kb.oai.pmh.ResumptionToken;
31
import se.kb.oai.pmh.Set;
32
import se.kb.oai.pmh.SetsList;
33
import eu.dnetlib.validator.engine.data.DataException;
34
import eu.dnetlib.validator.engine.data.Provider;
35
import eu.dnetlib.validator.engine.data.ResultSet;
36
import eu.dnetlib.validator.engine.execution.ValidationObject;
37
import eu.dnetlib.validator.service.impls.valobjs.XMLTextValidationObject;
38

    
39
/**
40
 * A provider that retrieves records from an OAI-PMH repository. Resumption
41
 * tokens are handled transparently.
42
 * 
43
 * @author Manos Karvounis
44
 * 
45
 */
46
public class OAIPMHRecordProvider extends Provider {
47
	
48
	private static final long serialVersionUID = 3386029339653670731L;
49

    
50
	private int timeout;
51
	private int delay;
52
	private int retryDelay;
53
	private int retryEfforts;
54
	
55
	public static final String BASEURL = "BASEURL";
56
	public static final String METADATA_PREFIX = "metadataPrefix";
57
	/**
58
	 * optional
59
	 */
60
	public static final String FROM = "from";
61
	/**
62
	 * optional
63
	 */
64
	public static final String UNTIL = "until";
65
	/**
66
	 * optional
67
	 */
68
	public static final String SET = "set";
69

    
70
	/**
71
	 * The maximum time to wait for a response from the repository (in millis)
72
	 */
73
//	public static final String TIMEOUT = "TIMEOUT";
74
	/**
75
	 * How much time to wait between consecutive HTTP requests to the repository
76
	 * (in millis).
77
	 */
78
//	public static final String DELAY = "DELAY";
79
	/**
80
	 * How much to wait if an HTTP request fails before trying again by
81
	 * resending the request.
82
	 */
83
//	public static final String RETRY_DELAY = "RETRY_DELAY";
84
	/**
85
	 * If an HTTP request fails, how many times to try to resend the request.
86
	 */
87
//	public static final String RETRY_EFFORTS = "RETRY_EFFORTS";
88
	/**
89
	 * How many records to test.
90
	 */
91
	public static final String RECORDS = "records";
92
	
93
	/**
	 * Creates the provider, handing the constant {@code 1} to the
	 * {@link Provider} constructor.
	 * NOTE(review): the meaning of that argument is defined by Provider and is
	 * not visible in this file - confirm before changing it.
	 */
	public OAIPMHRecordProvider() {
		super(1);
	}
96

    
97
	@Override
98
	public ResultSet<ValidationObject> getValidationObjects() throws ProviderException {
99
		return new OAIPMHRecordResultSet(this);
100
	}
101

    
102
	@Override
103
	public ResultSet<String> getValidationObjectIds() throws ProviderException {
104
		return new OAIPMHRecordIdentifierResultSet(this);
105
	}
106

    
107
	@Override
108
	public ValidationObject getValidationObject(String valObjId) throws ProviderException {
109
		OAIPMHResultSet oai = new OAIPMHResultSet(this);
110
		try {
111
			return new XMLTextValidationObject(oai.getRecord(valObjId));
112
		} catch (DataException e) {
113
			log.error("", e);
114
			throw new ProviderException();
115
		}
116
	}
117

    
118
	private class OAIPMHResultSet {
119
		protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
120
		protected DocumentBuilder builder;
121
		protected XPathFactory xfactory = XPathFactory.newInstance();
122
		protected NodeList recordIds = null;
123
		protected NodeList records = null;
124
		protected int index = -1;
125
		protected String resumptionToken = null;
126
		protected String error = null;
127
		private URLStreamer streamer = new URLStreamer();
128
		
129

    
130
		public OAIPMHResultSet(OAIPMHRecordProvider prv) {
131
			super();
132
			try {
133
				builder = factory.newDocumentBuilder();
134
			} catch (ParserConfigurationException e) {
135
				log.error("", e);
136
			}
137
		}
138

    
139
		protected NodeList getIds() throws DataException {
140
			NodeList recordIds = null;
141
			String surl = pros.getProperty(BASEURL) + "?verb=ListIdentifiers";
142
			if (resumptionToken == null || resumptionToken.trim().length() == 0) {
143
				surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
144
				if (pros.getProperty(FROM) != null)
145
					surl += "&from=" + pros.getProperty(FROM);
146
				if (pros.getProperty(UNTIL) != null)
147
					surl += "&until=" + pros.getProperty(UNTIL);
148
				if (!pros.getProperty(SET).equals("none"))
149
					surl += "&set=" + pros.getProperty(SET);
150
			} else {
151
				surl += "&resumptionToken=" + resumptionToken;
152
			}
153
			
154
			log.debug("Issuing request "+surl);
155
			
156
			String response = null;
157
			try {
158
				response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
159
			} catch (NumberFormatException e) {
160
				log.error("", e);
161
				throw new DataException();
162
			} catch (MalformedURLException e) {
163
				log.error("", e);
164
				throw new DataException();
165
			} catch (IOException e) {
166
				log.error("", e);
167
				throw new DataException();
168
			}
169
			try {
170
				InputSource is = new InputSource(new StringReader(response));
171
				Document doc = builder.parse(is);
172
				XPath xpath = xfactory.newXPath();
173
				XPathExpression expr = xpath.compile("OAI-PMH/ListIdentifiers/header/identifier/text()");
174
				recordIds = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
175
				XPath xpath2 = xfactory.newXPath();
176
				XPathExpression expr2 = xpath2.compile("OAI-PMH/ListIdentifiers/resumptionToken/text()");
177
				NodeList rtl = (NodeList) expr2.evaluate(doc, XPathConstants.NODESET);
178
				if (rtl == null || rtl.getLength() == 0) {
179
					log.debug("There seems to be no resumption token present");
180
					resumptionToken = null;
181
				}
182
				else {
183
					resumptionToken = rtl.item(0).getNodeValue();
184
					resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
185
					log.debug("Found resumption token: "+resumptionToken);
186
				}
187
			} catch (Exception e) {
188
				log.error("", e);
189
				throw new DataException();
190
			}
191
			return recordIds;
192
		}
193

    
194
		protected NodeList getRecords() throws DataException {
195
			NodeList records = null;
196
			String surl = pros.getProperty(BASEURL) + "?verb=ListRecords";
197
			if (resumptionToken == null || resumptionToken.trim().length() == 0) {
198
				surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
199
				if (pros.getProperty(FROM) != null)
200
					surl += "&from=" + pros.getProperty(FROM);
201
				if (pros.getProperty(UNTIL) != null)
202
					surl += "&until=" + pros.getProperty(UNTIL);
203
				if (!pros.getProperty(SET).equals("none"))
204
					surl += "&set=" + pros.getProperty(SET);
205
			} else {
206
				surl += "&resumptionToken=" + resumptionToken;
207
			}
208
			String setError=null;
209
			if (!pros.getProperty(SET).equals("none")) {
210
				OaiPmhServer harvester = new OaiPmhServer(pros.getProperty(BASEURL));
211
				try {
212
					SetsList setList = harvester.listSets();
213
					ResumptionToken token = setList.getResumptionToken();
214
					List<Set> sets = new ArrayList<Set>();
215
					sets.addAll(setList.asList());
216
					while (token != null) {
217
						setList = harvester.listSets(token);
218
						token = setList.getResumptionToken();
219
						sets.addAll(setList.asList());
220
					}
221
					List<String> ret = new ArrayList<String>();
222
					for (Set set : sets) {
223
						ret.add(set.getSpec().trim());
224
					}
225
					if (!ret.contains(pros.getProperty(SET))){
226
						error =  "Set: <b>'" + pros.getProperty(SET) + "'</b> is not exposed by the repository. \n Please make sure that 'ListSets' verb is configured correctly on your server as well as that the exposed sets list includes this set.";
227
						setError = error;
228
					}
229
				} catch (Exception e) {
230
					log.error("error getting sets from url: " + pros.getProperty(BASEURL), e);
231
				}
232
			}
233
			log.debug("Issuing request "+surl);
234
			
235
			String response = null;
236
			try {
237
				response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
238
			} catch (NumberFormatException e) {
239
				log.error("", e);
240
				throw new DataException();
241
			} catch (MalformedURLException e) {
242
				log.error("", e);
243
				throw new DataException();
244
			} catch (IOException e) {
245
				log.error("", e);
246
				throw new DataException();
247
			}
248
			try {
249
				InputSource is = new InputSource(new StringReader(response));
250
				Document doc = builder.parse(is);
251
				XPath xpath = xfactory.newXPath();
252
//				xpath.setNamespaceContext(new NamespaceResolver(doc));
253
				XPathExpression expr = xpath.compile("OAI-PMH/ListRecords/record");
254
				records = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
255
				XPath xpath2 = xfactory.newXPath();
256
				XPathExpression expr2 = xpath2.compile("OAI-PMH/ListRecords/resumptionToken/text()");
257
				NodeList rtl = (NodeList) expr2.evaluate(doc, XPathConstants.NODESET);
258
				log.debug("Check number of records: "+ records.getLength());
259
				if (records.getLength() == 0) {
260
					log.debug("There are no records: "+ records.getLength());
261
					XPath xpath3 = xfactory.newXPath();
262
					XPathExpression expr3 = xpath3.compile("OAI-PMH/error/text()");
263
					error = "The response on request: <a href=\""+ surl + "\">"+surl+"</a> was the following: ";
264
					error += "<br><b>" + (String) expr3.evaluate(doc, XPathConstants.STRING) + "</b>";
265
					if (setError != null) {
266
						error += "<br>" + setError;
267
					}
268
					log.debug("Error: "+ error);
269
				}
270
				
271
				if (rtl == null || rtl.getLength() == 0) {
272
					log.debug("There seems to be no resumption token present");
273
					resumptionToken = null;
274
				}
275
				else {
276
					resumptionToken = rtl.item(0).getNodeValue();
277
					resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
278
					log.debug("Found resumption token: "+resumptionToken);
279
				}
280
			} catch (Exception e) {
281
				log.error("", e);
282
				throw new DataException();
283
			}
284
			return records;
285
		}
286
		
287
		protected Document getRecord(String id) throws DataException {
288
			String surl = pros.getProperty(BASEURL) + "?verb=GetRecord";
289
			surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
290
			surl += "&identifier=" + id;
291
			
292
			log.debug("Issuing request: "+surl);
293
			
294
			String response = null;
295
			try {
296
				response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
297
			} catch (NumberFormatException e) {
298
				log.error("", e);
299
				throw new DataException();
300
			} catch (MalformedURLException e) {
301
				log.error("", e);
302
				throw new DataException();
303
			} catch (IOException e) {
304
				log.error("", e);
305
				throw new DataException();
306
			}
307

    
308
			InputSource is = new InputSource(new StringReader(response));
309
			Document doc;
310
			try {
311
				doc = builder.parse(is);
312
			} catch (SAXException e) {
313
				log.error("", e);
314
				throw new DataException();
315
			} catch (IOException e) {
316
				log.error("", e);
317
				throw new DataException();
318
			}
319

    
320
			return doc;
321
		}
322
	}
323

    
324
	private class OAIPMHRecordResultSet extends OAIPMHResultSet implements ResultSet<ValidationObject> {
325

    
326
		public OAIPMHRecordResultSet(OAIPMHRecordProvider prv) {
327
			super(prv);
328
			// TODO Auto-generated constructor stub
329
		}
330

    
331
		@Override
332
		public String getError() {
333
			if (error != null)
334
				log.debug("An error occured "+ this.error);
335
			else
336
				log.debug("No errors on request");
337
			return this.error;
338
		}
339
		
340
		@Override
341
		public boolean next() throws DataException {
342
			index++;
343

    
344
			log.debug("Moving cursor to result "+index);
345
			if (records == null || index >= records.getLength()) {
346
				// if we have previously received some results and there no more to take
347
				if (records != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
348
					return false;
349
				index = -1;
350
				records = getRecords();
351
				return next();
352
			}
353
			return true;
354
		}
355

    
356
		@Override
357
		public ValidationObject get() throws DataException {
358
			XMLTextValidationObject ret = null;
359
			
360
			Document newXmlDocument;
361
			try {
362
				newXmlDocument = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
363

    
364
	        Element root = newXmlDocument.createElement("root");
365
	        newXmlDocument.appendChild(root);
366
	            Node node = records.item(index);
367
	            Node copyNode = newXmlDocument.importNode(node, true);
368
	            root.appendChild(copyNode);
369
//	            printXmlDocument(newXmlDocument);
370
	            ret = new XMLTextValidationObject(newXmlDocument);
371
	            XPathFactory factory = XPathFactory.newInstance();
372
	            XPath xPath = factory.newXPath();
373
	            ret.setId(xPath.evaluate("header/identifier", records.item(index)));
374
	            ret.setStatus(xPath.evaluate("header/@status", records.item(index)));
375
	           
376
			} catch (ParserConfigurationException e) {
377
				log.error("error getting object"+ e);
378
			} catch (XPathExpressionException e) {
379
				log.error("error getting object"+ e);
380
			}			
381
			return ret;
382
		}
383

    
384
	}
385
	
386
	public static void printXmlDocument(Document document) {
387
	    DOMImplementationLS domImplementationLS = 
388
	        (DOMImplementationLS) document.getImplementation();
389
	    LSSerializer lsSerializer = 
390
	        domImplementationLS.createLSSerializer();
391
	    String string = lsSerializer.writeToString(document);
392
	    System.out.println(string);
393
	}
394

    
395
	private class OAIPMHRecordIdentifierResultSet extends OAIPMHResultSet implements ResultSet<String> {
396

    
397
		public OAIPMHRecordIdentifierResultSet(OAIPMHRecordProvider prv) {
398
			super(prv);
399
			// TODO Auto-generated constructor stub
400
		}
401

    
402
		@Override
403
		public boolean next() throws DataException {
404
			index++;
405
			log.debug("Moving cursor to result "+index);
406
			if (recordIds == null || index >= recordIds.getLength()) {
407
				// if we have previously received some results and there no more to take
408
				if (recordIds != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
409
					return false;
410
				index = -1;
411
				recordIds = getIds();
412
				return next();
413
			}
414
			return true;
415
		}
416

    
417
		@Override
418
		public String get() throws DataException {
419
			String id = recordIds.item(index).getNodeValue();
420

    
421
			log.debug("Returing object with id "+id);
422
			
423
			return id;
424
		}
425

    
426
		@Override
427
		public String getError() {
428
			// TODO Auto-generated method stub
429
			return null;
430
		}
431

    
432
	}
433

    
434
	@Override
435
	public ResultSet<ValidationObject> getValidationObjects(String entity)
436
			throws ProviderException {
437
		// TODO Auto-generated method stub
438
		return null;
439
	}
440

    
441
	public Integer getTimeout() {
442
		return timeout;
443
	}
444

    
445
	public void setTimeout(Integer timeout) {
446
		this.timeout = timeout;
447
	}
448

    
449
	public Integer getDelay() {
450
		return delay;
451
	}
452

    
453
	public void setDelay(Integer delay) {
454
		this.delay = delay;
455
	}
456

    
457
	public Integer getRetryDelay() {
458
		return retryDelay;
459
	}
460

    
461
	public void setRetryDelay(Integer retryDelay) {
462
		this.retryDelay = retryDelay;
463
	}
464

    
465
	public Integer getRetryEfforts() {
466
		return retryEfforts;
467
	}
468

    
469
	public void setRetryEfforts(Integer retryEfforts) {
470
		this.retryEfforts = retryEfforts;
471
	}
472

    
473
	/*
474
	class NamespaceResolver implements NamespaceContext {
475
		
476
		private static final String OAI_NS = "http://www.openarchives.org/OAI/2.0/";
477
		private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";
478
		private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
479
		
480
		private final Document document;
481
		
482
		public NamespaceResolver(Document document) {
483
			this.document = document;
484
		}
485
		
486
		public String getNamespaceURI(String prefix) {
487
			log.debug("prefix: " + prefix);
488
			if ("".equals(prefix) || "oai".equals(prefix)){
489
				return OAI_NS;                     
490
			}else if ("oai_dc".equals(prefix)){
491
				return OAI_DC_NS;
492
			}else if ("dc".equals(prefix)){
493
				return DC_NS;
494
			}
495
			return "";
496
		}
497
		
498
		public String getPrefix(String namespaceURI) {
499
			log.debug("prefix: " + namespaceURI);
500
			return "";
501
		}
502
		
503
		@SuppressWarnings("rawtypes")
504
		public Iterator getPrefixes(String namespaceURI) {
505
			// not implemented
506
			return null;
507
		}
508
		
509
	}	*/
510
	
511
}
(4-4/8)