Revision 54119
Added by Sandro La Bruzzo over 5 years ago
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiIterator.java | ||
---|---|---|
7 | 7 |
import java.util.Queue; |
8 | 8 |
import java.util.concurrent.PriorityBlockingQueue; |
9 | 9 |
|
10 |
import org.apache.commons.lang3.StringUtils; |
|
10 | 11 |
import org.apache.commons.logging.Log; |
11 | 12 |
import org.apache.commons.logging.LogFactory; |
12 | 13 |
import org.dom4j.Document; |
... | ... | |
103 | 104 |
} |
104 | 105 |
} |
105 | 106 |
|
107 |
private String extractResumptionToken(final String xml) { |
|
108 |
|
|
109 |
final String s = StringUtils.substringAfter(xml, "<resumptionToken"); |
|
110 |
if (s == null){ |
|
111 |
return null; |
|
112 |
} |
|
113 |
|
|
114 |
final String result = StringUtils.substringBetween(s, ">", "</"); |
|
115 |
if (result == null) |
|
116 |
return null; |
|
117 |
return result.trim(); |
|
118 |
|
|
119 |
|
|
120 |
} |
|
121 |
|
|
106 | 122 |
private String otherPages(final String resumptionToken) throws CollectorServiceException { |
107 | 123 |
try { |
108 | 124 |
return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken,"UTF-8")); |
... | ... | |
111 | 127 |
} |
112 | 128 |
} |
113 | 129 |
|
114 |
/**
 * Previous implementation of downloadPage, shown as the removed side of this changeset; it is
 * superseded by the version that follows it in the diff.
 *
 * Fetches the page at the given url through httpConnector, parses it, adds every ListRecords
 * record to the queue as an XML string and returns the value of the resumptionToken XPath.
 *
 * @param url the OAI-PMH request url to download
 * @return the string value of the resumptionToken XPath, or null when the response carries a
 *         noRecordsMatch error
 * @throws CollectorServiceException if the page cannot be parsed even after cleaning, or if the
 *         response carries an OAI-PMH error other than noRecordsMatch
 */
private String downloadPage(final String url) throws CollectorServiceException { |

115 |
|

116 |
final String xml = httpConnector.getInputSource(url); |

117 |
|

118 |
Document doc; |

119 |
try { |

120 |
doc = reader.read(new StringReader(xml)); |

121 |
} catch (DocumentException e) { |

122 |
// First parse failed: log at debug level and retry once on the output of XmlCleaner.
log.debug("Error parsing xml, I try to clean it: " + xml, e); |

123 |
final String cleaned = XmlCleaner.cleanAllEntities(xml); |

124 |
try { |

125 |
doc = reader.read(new StringReader(cleaned)); |

126 |
} catch (DocumentException e1) { |

127 |
// In this version a page that is still unparsable aborts the download with an exception.
throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1); |

128 |
} |

129 |
} |

130 |
|

131 |
// An error element directly under the OAI-PMH root; local-name() is used in the XPath
// instead of a namespace-qualified name.
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']"); |

132 |
if (errorNode != null) { |

133 |
final String code = errorNode.valueOf("@code"); |

134 |
// noRecordsMatch is only logged as a warning and reported to the caller as null.
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { |

135 |
log.warn("noRecordsMatch for oai call: " + url); |

136 |
return null; |

137 |
} else { |

138 |
throw new CollectorServiceException(code + " - " + errorNode.getText()); |

139 |
} |

140 |
} |

141 |
|

142 |
// Every record under ListRecords is serialized back to XML text and added to the queue.
for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { |

143 |
queue.add(((Node) o).asXML()); |

144 |
} |

145 |
|

146 |
return doc.valueOf("//*[local-name()='resumptionToken']"); |

147 |
|

148 |
} |
|
130 |
private String downloadPage(final String url) throws CollectorServiceException { |
|
149 | 131 |
|
132 |
final String xml = httpConnector.getInputSource(url); |
|
133 |
Document doc; |
|
134 |
try { |
|
135 |
doc = reader.read(new StringReader(xml)); |
|
136 |
} catch (DocumentException e) { |
|
137 |
log.warn("Error parsing xml, I try to clean it: " + xml, e); |
|
138 |
final String cleaned = XmlCleaner.cleanAllEntities(xml); |
|
139 |
try { |
|
140 |
doc = reader.read(new StringReader(cleaned)); |
|
141 |
} catch (DocumentException e1) { |
|
142 |
final String resumptionToken = extractResumptionToken(xml); |
|
143 |
if (resumptionToken == null) |
|
144 |
throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1); |
|
145 |
return resumptionToken; |
|
146 |
} |
|
147 |
} |
|
148 |
|
|
149 |
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']"); |
|
150 |
if (errorNode != null) { |
|
151 |
final String code = errorNode.valueOf("@code"); |
|
152 |
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { |
|
153 |
log.warn("noRecordsMatch for oai call: " + url); |
|
154 |
return null; |
|
155 |
} else { |
|
156 |
throw new CollectorServiceException(code + " - " + errorNode.getText()); |
|
157 |
} |
|
158 |
} |
|
159 |
|
|
160 |
for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { |
|
161 |
queue.add(((Node) o).asXML()); |
|
162 |
} |
|
163 |
|
|
164 |
return doc.valueOf("//*[local-name()='resumptionToken']"); |
|
165 |
|
|
166 |
} |
|
167 |
|
|
150 | 168 |
} |
Also available in: Unified diff
Changed the OAI iterator so that it can still find the next resumption token even when the page is not well-formed XML