Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.oai;
2

    
3
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
18

    
19
public class OaiIterator implements Iterator<String> {
20

    
21
	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
22

    
23
	private Queue<String> queue = new PriorityBlockingQueue<String>();
24
	private SAXReader reader = new SAXReader();
25

    
26
	private String baseUrl;
27
	private String set;
28
	private String mdFormat;
29
	private String fromDate;
30
	private String untilDate;
31
	private String token;
32
	private boolean started;
33
	private HttpConnector httpConnector;
34

    
35
	public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, final HttpConnector httpConnector) {
36
		this.baseUrl = baseUrl;
37
		this.mdFormat = mdFormat;
38
		this.set = set;
39
		this.fromDate = fromDate;
40
		this.untilDate = untilDate;
41
		this.started = false;
42
		this.httpConnector = httpConnector;
43
	}
44
	
45
	private void verifyStarted() {
46
		if (!this.started) {
47
			this.started = true;
48
			try {
49
				this.token = firstPage();
50
			} catch (CollectorServiceException e) {
51
				throw new RuntimeException(e);
52
			}
53
		}
54
	}
55

    
56
	@Override
57
	public boolean hasNext() {
58
		synchronized (queue) {
59
			verifyStarted();
60
			return !queue.isEmpty();
61
		}
62
	}
63

    
64
	@Override
65
	public String next() {
66
		synchronized (queue) {
67
			verifyStarted();
68
			final String res = queue.poll();
69
			while (queue.isEmpty() && (token != null) && !token.isEmpty()) {
70
				try {
71
					token = otherPages(token);
72
				} catch (CollectorServiceException e) {
73
					throw new RuntimeException(e);
74
				}
75
			}
76
			return res;
77
		}
78
	}
79

    
80
	@Override
81
	public void remove() {}
82

    
83
	private String firstPage() throws CollectorServiceException {
84
		
85
		String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + mdFormat;
86
		if ((set != null) && !set.isEmpty()) {
87
			url += "&set=" + set;
88
		}
89
		
90
		if ((fromDate != null) && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
91
			url += "&from=" + fromDate;
92
		}
93
		
94
		if ((untilDate != null) && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
95
			url += "&until=" + untilDate;
96
		}
97
		
98
		log.info("Start harvesting using url: " + url);
99

    
100
		return downloadPage(url);
101
	}
102

    
103
	private String otherPages(final String resumptionToken) throws CollectorServiceException {
104
		return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + resumptionToken);
105
	}
106

    
107
	private String downloadPage(final String url) throws CollectorServiceException {
108
	
109
		final String xml = httpConnector.getInputSource(url);
110
		
111
		Document doc;
112
		try {
113
			doc = reader.read(new StringReader(xml));
114
		} catch (DocumentException e) {
115
			log.warn("Error parsing xml, I try to clean it: " + xml, e);
116
			final String cleaned = XmlCleaner.cleanAllEntities(xml);
117
			try {
118
				doc = reader.read(new StringReader(cleaned));
119
			} catch (DocumentException e1) {
120
				throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
121
			}
122
		}
123
		
124
		for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
125
			queue.add(((Node) o).asXML());
126
		}
127
	
128
		return doc.valueOf("//*[local-name()='resumptionToken']");
129
		
130
	}
131

    
132
}
(2-2/3)