Project

General

Profile

1
package eu.dnetlib.validator.service.impls.providers;
2

    
3
import java.io.IOException;
4
import java.io.StringReader;
5
import java.net.URL;
6
import java.net.URLEncoder;
7
import java.util.ArrayList;
8
import java.util.HashMap;
9
import java.util.List;
10
import java.util.Map;
11
import java.util.Map.Entry;
12
import java.util.Set;
13

    
14
import javax.xml.parsers.DocumentBuilder;
15
import javax.xml.parsers.DocumentBuilderFactory;
16
import javax.xml.parsers.ParserConfigurationException;
17
import javax.xml.xpath.XPath;
18
import javax.xml.xpath.XPathConstants;
19
import javax.xml.xpath.XPathExpression;
20
import javax.xml.xpath.XPathExpressionException;
21
import javax.xml.xpath.XPathFactory;
22

    
23
import net.sf.ehcache.Cache;
24

    
25
import org.apache.log4j.Logger;
26
import org.w3c.dom.Document;
27
import org.w3c.dom.Element;
28
import org.w3c.dom.Node;
29
import org.w3c.dom.NodeList;
30
import org.w3c.dom.ls.DOMImplementationLS;
31
import org.w3c.dom.ls.LSSerializer;
32
import org.xml.sax.InputSource;
33
import org.xml.sax.SAXException;
34

    
35
import se.kb.oai.pmh.OaiPmhServer;
36
import se.kb.oai.pmh.ResumptionToken;
37
import se.kb.oai.pmh.SetsList;
38
import eu.dnetlib.validator.engine.data.DataException;
39
import eu.dnetlib.validator.engine.data.Provider;
40
import eu.dnetlib.validator.engine.data.ResultSet;
41
import eu.dnetlib.validator.engine.execution.ValidationObject;
42
import eu.dnetlib.validator.service.impls.valobjs.XMLTextValidationObject;
43

    
44
/**
45
 * A provider that retrieves records from an OAI-PMH repository. Resumption
46
 * tokens are handled transparently.
47
 * 
48
 * @author Nikon Gasparis
49
 * 
50
 */
51
public class CrisProvider extends Provider {
52
	
53
	/**
54
	 * 
55
	 */
56
	private static final long serialVersionUID = -8496954108693674745L;
57

    
58
	private String baseUrl;
59
	private String metadataPrefix;
60
	private String set;
61
	private String records;
62
	private String from;
63
	private String until;
64
	private int timeout;
65
	private int delay;
66
	private int retryDelay;
67
	private int retryEfforts;
68
	
69
	private Cache cache = null;
70
	private Set<String> entities;
71
	private Map<String, OAIPMHResultSet> entityResultSetMap = new HashMap<String, OAIPMHResultSet>();
72
	
73
	/**
	 * Creates a provider, handing the constant {@code 4} to the
	 * {@link Provider} constructor.
	 * NOTE(review): the meaning of that argument is defined by
	 * {@code Provider}, which is not visible in this file - confirm.
	 */
	public CrisProvider() {
		super(4);
	}
76

    
77
	@SuppressWarnings("unchecked")
78
	@Override
79
	public synchronized ResultSet<ValidationObject> getValidationObjects(String entity) throws ProviderException {
80
		if (!entityResultSetMap.containsKey(entity))
81
		{	
82
			OAIPMHRecordResultSet resultSet = new OAIPMHRecordResultSet();
83
			resultSet.setProvider(this);
84
			resultSet.setValidationSet(entity);
85
			this.entityResultSetMap.put(entity, resultSet);
86
		}
87
		return (ResultSet<ValidationObject>) entityResultSetMap.get(entity);
88
	}
89
	
90
	
91
	public void restartResultSets() {
92
		Map<String, OAIPMHResultSet> newEntityResultSetMap = new HashMap<String, CrisProvider.OAIPMHResultSet>();
93
		for (Entry<String, OAIPMHResultSet> ent : entityResultSetMap.entrySet()) {
94
			OAIPMHReferentialRecordResultSet resultSet = new OAIPMHReferentialRecordResultSet();
95
			resultSet.setProvider(this);
96
			resultSet.setValidationSet(ent.getKey());
97
			newEntityResultSetMap.put(ent.getKey(), resultSet);
98
		}
99
		 entityResultSetMap = null;
100
		 entityResultSetMap = newEntityResultSetMap;
101
	}
102

    
103
	@Override
104
	public ResultSet<String> getValidationObjectIds() throws ProviderException {
105
		return new OAIPMHRecordIdentifierResultSet();
106
	}
107

    
108
	@Override
109
	public synchronized ValidationObject getValidationObject(String valObjId) throws ProviderException {
110
		if (cache != null) {
111
			net.sf.ehcache.Element element = cache.get(valObjId);
112
			if (element != null) {
113
				log.debug("fetching from cache..");
114
				return (ValidationObject) element.getObjectValue();
115
			} else {
116
				log.debug("fetching from server..");
117
				ValidationObject ret = this.fetchValidationObject(valObjId);
118
//				net.sf.ehcache.Element ele = new net.sf.ehcache.Element(valObjId, ret);
119
//				cache.put(ele);
120
				return ret;
121
			}
122
		} else
123
			return this.fetchValidationObject(valObjId);
124
	}
125
	
126
	@Override
127
	public ResultSet<ValidationObject> getValidationObjects()
128
			throws ProviderException {
129
		// TODO Auto-generated method stub
130
		return null;
131
	}
132
	
133
	public synchronized ValidationObject fetchValidationObject(String valObjId) throws ProviderException {
134
		log.debug("fetching object with id: " + valObjId);
135
		try {
136
			OAIPMHResultSet oai = new OAIPMHResultSet();
137
			return new XMLTextValidationObject(oai.getRecord(valObjId));
138
		} catch (DataException e) {
139
			log.error("error fetching object with id: " + valObjId, e);
140
			throw new ProviderException();
141
		}
142
	}
143
	
144
	private class OAIPMHResultSet {
145
		Logger log = Logger.getLogger(OAIPMHResultSet.class);
146
		protected DocumentBuilderFactory factory = DocumentBuilderFactory
147
				.newInstance();
148
		protected DocumentBuilder builder;
149
		protected XPathFactory xfactory = XPathFactory.newInstance();
150
		protected NodeList recordIds = null;
151
		protected NodeList records = null;
152
		protected String validationSet;
153
		protected int index = -1;
154
		protected String resumptionToken = null;
155
		protected String error = null;
156
		private URLStreamer streamer = new URLStreamer();
157
		private CrisProvider provider;
158

    
159
		public void setValidationSet(String validationSet) {
160
			this.validationSet = validationSet;
161
		}
162

    
163
		public void setProvider(CrisProvider provider) {
164
			this.provider = provider;
165
		}
166

    
167
		/**
		 * Creates the result set and its XML document builder. If the builder
		 * cannot be configured the failure is only logged and {@code builder}
		 * stays {@code null}.
		 */
		public OAIPMHResultSet() {
			try {
				this.builder = this.factory.newDocumentBuilder();
			} catch (ParserConfigurationException e) {
				log.error("", e);
			}
		}
174

    
175
		protected NodeList getIds() throws DataException {
176
			NodeList recordIds = null;
177
			String surl = provider.getBaseUrl() + "?verb=ListIdentifiers";
178
			if (resumptionToken == null || resumptionToken.trim().length() == 0) {
179
				surl += "&metadataPrefix=" + provider.getMetadataPrefix();
180
				if (provider.getFrom() != null)
181
					surl += "&from=" + provider.getFrom();
182
				if (provider.getUntil() != null)
183
					surl += "&until=" + provider.getUntil();
184
				if (!validationSet.equals("none"))
185
					surl += "&set=" + validationSet;
186
			} else {
187
				surl += "&resumptionToken=" + resumptionToken;
188
			}
189

    
190
			log.debug("Issuing request " + surl);
191

    
192
			String response = null;
193
			try {
194
				response = streamer.getResponse(new URL(surl),
195
						provider.getTimeout(),
196
						provider.getDelay(),
197
						provider.getRetryDelay(),
198
						provider.getRetryEfforts());
199
			} catch (Exception e) {
200
				log.error("", e);
201
				throw new DataException();
202
			}
203
			try {
204
				InputSource is = new InputSource(new StringReader(response));
205
				Document doc = builder.parse(is);
206
				XPath xpath = xfactory.newXPath();
207
				XPathExpression expr = xpath
208
						.compile("OAI-PMH/ListIdentifiers/header/identifier/text()");
209
				recordIds = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
210
				XPath xpath2 = xfactory.newXPath();
211
				XPathExpression expr2 = xpath2
212
						.compile("OAI-PMH/ListIdentifiers/resumptionToken/text()");
213
				NodeList rtl = (NodeList) expr2.evaluate(doc,
214
						XPathConstants.NODESET);
215
				if (rtl == null || rtl.getLength() == 0) {
216
					log.debug("There seems to be no resumption token present");
217
					resumptionToken = null;
218
				} else {
219
					resumptionToken = rtl.item(0).getNodeValue();
220
					resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
221
					log.debug("Found resumption token: " + resumptionToken);
222
				}
223
			} catch (Exception e) {
224
				log.error("", e);
225
				throw new DataException();
226
			}
227
			return recordIds;
228
		}
229

    
230
		protected NodeList getRecords() throws DataException {
231
			NodeList records = null;
232
			String surl = provider.getBaseUrl() + "?verb=ListRecords";
233
			if (resumptionToken == null || resumptionToken.trim().length() == 0) {
234
				surl += "&metadataPrefix=" + provider.getMetadataPrefix();
235
				if (provider.getFrom() != null)
236
					surl += "&from=" + provider.getFrom();
237
				if (provider.getUntil() != null)
238
					surl += "&until=" + provider.getUntil();
239
				if (!validationSet.equals("none"))
240
					surl += "&set=" + validationSet;
241
			} else {
242
				surl += "&resumptionToken=" + resumptionToken;
243
			}
244

    
245

    
246
			String setError = null;
247
			if (!validationSet.equals("none")) {
248
				OaiPmhServer harvester = new OaiPmhServer(provider.getBaseUrl());
249
				try {
250
					SetsList setList = harvester.listSets();
251
					ResumptionToken token = setList.getResumptionToken();
252
					List<se.kb.oai.pmh.Set> sets = new ArrayList<se.kb.oai.pmh.Set>();
253
					sets.addAll(setList.asList());
254
					while (token != null) {
255
						setList = harvester.listSets(token);
256
						token = setList.getResumptionToken();
257
						sets.addAll(setList.asList());
258
					}
259
					List<String> ret = new ArrayList<String>();
260
					for (se.kb.oai.pmh.Set set : sets) {
261
						ret.add(set.getSpec().trim());
262
					}
263
					if (!ret.contains(validationSet)) {
264
						error = "Set: <b>'"
265
								+ validationSet
266
								+ "'</b> is not exposed by the repository. \n Please make sure that 'ListSets' verb is configured correctly on your server as well as that the exposed sets list includes this set.";
267
						setError = error;
268
					}
269
				} catch (Exception e) {
270
					log.error(
271
							"error getting sets from url: "
272
									+ provider.getBaseUrl(), e);
273
				}
274
			}
275
			log.debug("Issuing request " + surl);
276

    
277
			String response = null;
278
			try {
279
				response = streamer.getResponse(new URL(surl),
280
						provider.getTimeout(),
281
						provider.getDelay(),
282
						provider.getRetryDelay(),
283
						provider.getRetryEfforts());
284
			} catch (Exception e) {
285
				log.error("", e);
286
				throw new DataException();
287
			}
288
			
289
			try {
290
				InputSource is = new InputSource(new StringReader(response));
291
				Document doc = builder.parse(is);
292
				XPath xpath = xfactory.newXPath();
293
				// xpath.setNamespaceContext(new NamespaceResolver(doc));
294
				XPathExpression expr = xpath.compile("OAI-PMH/ListRecords/record");
295
				records = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
296
				XPath xpath2 = xfactory.newXPath();
297
				XPathExpression expr2 = xpath2
298
						.compile("OAI-PMH/ListRecords/resumptionToken/text()");
299
				NodeList rtl = (NodeList) expr2.evaluate(doc,
300
						XPathConstants.NODESET);
301
				log.debug("Check number of records: " + records.getLength());
302
				if (records.getLength() == 0) {
303
					log.debug("There are no records: " + records.getLength());
304
					XPath xpath3 = xfactory.newXPath();
305
					XPathExpression expr3 = xpath3.compile("OAI-PMH/error/text()");
306
					error = "The response on request: <a href=\"" + surl + "\">"
307
							+ surl + "</a> was the following: ";
308
					error += "<br><b>"
309
							+ (String) expr3.evaluate(doc, XPathConstants.STRING)
310
							+ "</b>";
311
					if (setError != null) {
312
						error += "<br>" + setError;
313
					}
314
					log.debug("Error: " + error);
315
				}
316

    
317
				if (rtl == null || rtl.getLength() == 0) {
318
					log.debug("There seems to be no resumption token present");
319
					resumptionToken = null;
320
				} else {
321
					resumptionToken = rtl.item(0).getNodeValue();
322
					resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
323
					log.debug("Found resumption token: " + resumptionToken);
324
				}
325
			} catch (Exception e) {
326
				log.error("", e);
327
				throw new DataException();
328
			}
329
			return records;
330
		}
331

    
332
		protected Document getRecord(String id) throws DataException {
333
			String surl = provider.getBaseUrl() + "?verb=GetRecord";
334
			surl += "&metadataPrefix=" + provider.getMetadataPrefix();
335
			surl += "&identifier=" + id;
336

    
337
			log.debug("Issuing request: " + surl);
338

    
339
			String response = null;
340
			try {
341
				response = streamer.getResponse(new URL(surl),
342
						provider.getTimeout(),
343
						provider.getDelay(),
344
						provider.getRetryDelay(),
345
						provider.getRetryEfforts());
346
			} catch (Exception e) {
347
				log.error("", e);
348
				throw new DataException();
349
			}
350

    
351
			InputSource is = new InputSource(new StringReader(response));
352
			Document doc;
353
			try {
354
				doc = builder.parse(is);
355
			} catch (SAXException e) {
356
				log.error("", e);
357
				throw new DataException();
358
			} catch (IOException e) {
359
				log.error("", e);
360
				throw new DataException();
361
			}
362

    
363
			return doc;
364
		}
365
	}
366

    
367
	private class OAIPMHRecordResultSet extends OAIPMHResultSet implements ResultSet<ValidationObject> {
368

    
369
		@Override
370
		public String getError() {
371
			if (error != null)
372
				log.debug("An error occured "+ this.error);
373
			else
374
				log.debug("No errors on request");
375
			return this.error;
376
		}
377
		
378
		@Override
379
		public boolean next() throws DataException {
380
			index++;
381

    
382
			log.debug("Moving cursor to result "+index);
383
			if (records == null || index >= records.getLength()) {
384
				// if we have previously received some results and there no more to take
385
				if (records != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
386
					return false;
387
				index = -1;
388
				records = getRecords();
389
				return next();
390
			}
391
			return true;
392
		}
393

    
394
		@Override
395
		public ValidationObject get() throws DataException {
396
			XMLTextValidationObject ret = null;
397
			
398
			Document newXmlDocument;
399
			try {
400
				newXmlDocument = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
401

    
402
	        Element root = newXmlDocument.createElement("root");
403
	        newXmlDocument.appendChild(root);
404
	            Node node = records.item(index);
405
	            Node copyNode = newXmlDocument.importNode(node, true);
406
	            root.appendChild(copyNode);
407
//	            printXmlDocument(newXmlDocument);
408
	            ret = new XMLTextValidationObject(newXmlDocument);
409
	            XPathFactory factory = XPathFactory.newInstance();
410
	            XPath xPath = factory.newXPath();
411
	            ret.setId(xPath.evaluate("header/identifier", records.item(index)));
412
	            ret.setStatus(xPath.evaluate("header/@status", records.item(index)));
413
	           
414
			} catch (ParserConfigurationException e) {
415
				log.error("error getting object"+ e);
416
			} catch (XPathExpressionException e) {
417
				log.error("error getting object"+ e);
418
			}			
419
			if (cache != null) {
420
				log.debug("adding to cache..");
421
				net.sf.ehcache.Element ele = new net.sf.ehcache.Element(ret.getId(), ret);
422
				cache.put(ele);
423
			}
424
			return ret;
425
		}
426

    
427
	}
428
	
429
	public static void printXmlDocument(Document document) {
430
	    DOMImplementationLS domImplementationLS = 
431
	        (DOMImplementationLS) document.getImplementation();
432
	    LSSerializer lsSerializer = 
433
	        domImplementationLS.createLSSerializer();
434
	    String string = lsSerializer.writeToString(document);
435
	    System.out.println(string);
436
	}
437

    
438
	private class OAIPMHRecordIdentifierResultSet extends OAIPMHResultSet implements ResultSet<String> {
439

    
440
		public OAIPMHRecordIdentifierResultSet() {
441
		}
442

    
443
		@Override
444
		public boolean next() throws DataException {
445
			index++;
446
			log.debug("Moving cursor to result "+index);
447
			if (recordIds == null || index >= recordIds.getLength()) {
448
				// if we have previously received some results and there no more to take
449
				if (recordIds != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
450
					return false;
451
				index = -1;
452
				recordIds = getIds();
453
				return next();
454
			}
455
			return true;
456
		}
457

    
458
		@Override
459
		public String get() throws DataException {
460
			String id = recordIds.item(index).getNodeValue();
461

    
462
			log.debug("Returing object with id "+id);
463
			
464
			return id;
465
		}
466

    
467
		@Override
468
		public String getError() {
469
			// TODO Auto-generated method stub
470
			return null;
471
		}
472

    
473
	}
474
	
475
	private class OAIPMHReferentialRecordResultSet extends OAIPMHResultSet implements ResultSet<ValidationObject> {
476

    
477
		public OAIPMHReferentialRecordResultSet() {
478
		}
479

    
480
		@Override
481
		public boolean next() throws DataException {
482
			index++;
483
			log.debug("Moving cursor to result "+index);
484
			if (recordIds == null || index >= recordIds.getLength()) {
485
				// if we have previously received some results and there no more to take
486
				if (recordIds != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
487
					return false;
488
				index = -1;
489
				recordIds = getIds();
490
				return next();
491
			}
492
			return true;
493
		}
494

    
495
		@Override
496
		public ValidationObject get() throws DataException {
497
			String id = recordIds.item(index).getNodeValue();
498
			log.debug("Returing object with id "+id);
499
			net.sf.ehcache.Element element = cache.get(id);
500
			if (element != null) {
501
				return (ValidationObject) element.getObjectValue();
502
			} else {
503
				return new XMLTextValidationObject(this.getRecord(id));
504
			}
505
//			return (ValidationObject) selfPopulatingCache.get(id).getObjectValue();
506
		}
507

    
508
		@Override
509
		public String getError() {
510
			// TODO Auto-generated method stub
511
			return null;
512
		}
513

    
514
	}
515

    
516
	public Set<String> getEntities() {
517
		return entities;
518
	}
519

    
520
	public Cache getCache() {
521
		return cache;
522
	}
523

    
524
	public void setCache(Cache cache) {
525
		this.cache = cache;
526
	}
527

    
528
	public void setEntities(Set<String> entities) {
529
		this.entities = entities;
530
	}
531
	public String getBaseUrl() {
532
		return baseUrl;
533
	}
534

    
535
	public void setBaseUrl(String baseUrl) {
536
		this.baseUrl = baseUrl;
537
	}
538

    
539
	public String getMetadataPrefix() {
540
		return metadataPrefix;
541
	}
542

    
543
	public void setMetadataPrefix(String metadataPrefix) {
544
		this.metadataPrefix = metadataPrefix;
545
	}
546

    
547

    
548
	public String getRecords() {
549
		return records;
550
	}
551

    
552
	public void setRecords(String records) {
553
		this.records = records;
554
	}
555

    
556
	public String getFrom() {
557
		return from;
558
	}
559

    
560
	public void setFrom(String from) {
561
		this.from = from;
562
	}
563

    
564
	public String getUntil() {
565
		return until;
566
	}
567

    
568
	public void setUntil(String until) {
569
		this.until = until;
570
	}
571

    
572
	public String getSet() {
573
		return set;
574
	}
575

    
576
	public void setSet(String set) {
577
		this.set = set;
578
	}
579

    
580
	public Integer getTimeout() {
581
		return timeout;
582
	}
583

    
584
	public void setTimeout(Integer timeout) {
585
		this.timeout = timeout;
586
	}
587

    
588
	public Integer getDelay() {
589
		return delay;
590
	}
591

    
592
	public void setDelay(Integer delay) {
593
		this.delay = delay;
594
	}
595

    
596
	public Integer getRetryDelay() {
597
		return retryDelay;
598
	}
599

    
600
	public void setRetryDelay(Integer retryDelay) {
601
		this.retryDelay = retryDelay;
602
	}
603

    
604
	public Integer getRetryEfforts() {
605
		return retryEfforts;
606
	}
607

    
608
	public void setRetryEfforts(Integer retryEfforts) {
609
		this.retryEfforts = retryEfforts;
610
	}
611

    
612

    
613
}
(1-1/11)