Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.collector.plugins.oai;
2
3 28959 michele.ar
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
18 26600 sandro.lab
19
public class OaiIterator implements Iterator<String> {
20
21
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
22
23
	private Queue<String> queue = new PriorityBlockingQueue<String>();
24
	private SAXReader reader = new SAXReader();
25 28959 michele.ar
26 26600 sandro.lab
	private String baseUrl;
27 27189 michele.ar
	private String set;
28
	private String mdFormat;
29 27303 michele.ar
	private String fromDate;
30
	private String untilDate;
31 26600 sandro.lab
	private String token;
32 27189 michele.ar
	private boolean started;
33 28959 michele.ar
	private HttpConnector httpConnector;
34 26600 sandro.lab
35 28959 michele.ar
	public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, final HttpConnector httpConnector) {
36 26600 sandro.lab
		this.baseUrl = baseUrl;
37 27189 michele.ar
		this.mdFormat = mdFormat;
38
		this.set = set;
39 27303 michele.ar
		this.fromDate = fromDate;
40
		this.untilDate = untilDate;
41 27189 michele.ar
		this.started = false;
42 28959 michele.ar
		this.httpConnector = httpConnector;
43 27189 michele.ar
	}
44
45
	private void verifyStarted() {
46
		if (!this.started) {
47
			this.started = true;
48 28959 michele.ar
			try {
49
				this.token = firstPage();
50 29018 michele.ar
			} catch (CollectorServiceException e) {
51 28959 michele.ar
				throw new RuntimeException(e);
52
			}
53 26600 sandro.lab
		}
54
	}
55
56
	@Override
57
	public boolean hasNext() {
58
		synchronized (queue) {
59 27189 michele.ar
			verifyStarted();
60 26600 sandro.lab
			return !queue.isEmpty();
61
		}
62
	}
63
64
	@Override
65
	public String next() {
66
		synchronized (queue) {
67 27189 michele.ar
			verifyStarted();
68 26600 sandro.lab
			final String res = queue.poll();
69
			while (queue.isEmpty() && (token != null) && !token.isEmpty()) {
70 28959 michele.ar
				try {
71
					token = otherPages(token);
72 29018 michele.ar
				} catch (CollectorServiceException e) {
73 28959 michele.ar
					throw new RuntimeException(e);
74
				}
75 26600 sandro.lab
			}
76
			return res;
77
		}
78
	}
79
80
	@Override
81
	public void remove() {}
82
83 29018 michele.ar
	private String firstPage() throws CollectorServiceException {
84 27189 michele.ar
85 26600 sandro.lab
		String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + mdFormat;
86
		if ((set != null) && !set.isEmpty()) {
87
			url += "&set=" + set;
88
		}
89 27189 michele.ar
90 27303 michele.ar
		if ((fromDate != null) && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
91
			url += "&from=" + fromDate;
92
		}
93
94
		if ((untilDate != null) && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
95
			url += "&until=" + untilDate;
96
		}
97
98 28959 michele.ar
		log.info("Start harvesting using url: " + url);
99 27189 michele.ar
100 26600 sandro.lab
		return downloadPage(url);
101
	}
102
103 29018 michele.ar
	private String otherPages(final String resumptionToken) throws CollectorServiceException {
104 26600 sandro.lab
		return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + resumptionToken);
105
	}
106
107 29018 michele.ar
	private String downloadPage(final String url) throws CollectorServiceException {
108 28959 michele.ar
109
		final String xml = httpConnector.getInputSource(url);
110 27189 michele.ar
111 28959 michele.ar
		Document doc;
112 26600 sandro.lab
		try {
113 28959 michele.ar
			doc = reader.read(new StringReader(xml));
114
		} catch (DocumentException e) {
115
			log.warn("Error parsing xml, I try to clean it: " + xml, e);
116
			final String cleaned = XmlCleaner.cleanAllEntities(xml);
117
			try {
118
				doc = reader.read(new StringReader(cleaned));
119
			} catch (DocumentException e1) {
120 29018 michele.ar
				throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
121 26600 sandro.lab
			}
122
		}
123 28959 michele.ar
124
		for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
125
			queue.add(((Node) o).asXML());
126
		}
127
128
		return doc.valueOf("//*[local-name()='resumptionToken']");
129
130 26600 sandro.lab
	}
131
132
}