Project

General

Profile

« Previous | Next » 

Revision 54119

Changed the OAI iterator so that it can still find the next resumption token even when the page is not well-formed XML

View differences:

modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiIterator.java
7 7
import java.util.Queue;
8 8
import java.util.concurrent.PriorityBlockingQueue;
9 9

  
10
import org.apache.commons.lang3.StringUtils;
10 11
import org.apache.commons.logging.Log;
11 12
import org.apache.commons.logging.LogFactory;
12 13
import org.dom4j.Document;
......
103 104
		}
104 105
	}
105 106

  
107
	/**
	 * Pulls the resumption token out of a raw OAI page using plain string searches
	 * rather than an XML parser.
	 *
	 * @param xml the raw page text
	 * @return the trimmed text located after the first {@code <resumptionToken} occurrence,
	 *         between the next {@code >} and the following {@code </}; {@code null} when
	 *         nothing can be extracted
	 */
	private String extractResumptionToken(final String xml) {
108

  
109
		// Keep only what follows the first "<resumptionToken"; any attributes of the tag are still in s.
		final String s = StringUtils.substringAfter(xml, "<resumptionToken");
110
		// NOTE(review): per the commons-lang3 docs, substringAfter returns "" (not null) when the
		// separator is absent, so this guard appears to cover only a null xml - confirm.
		if (s == null){
111
			return null;
112
		}
113

  
114
		// The token text is whatever sits between the end of the opening tag and the next closing tag.
		final String result = StringUtils.substringBetween(s, ">", "</");
115
		// No ">" ... "</" pair after the tag start: nothing to extract.
		if (result == null)
116
			return null;
117
		return  result.trim();
118

  
119

  
120
	}
121

  
106 122
	private String otherPages(final String resumptionToken) throws CollectorServiceException {
107 123
		try {
108 124
			return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken,"UTF-8"));
......
111 127
		}
112 128
	}
113 129

  
114
	/**
	 * Downloads one OAI page, adds the XML of each record it contains to {@code queue} and
	 * returns the value of the page's resumptionToken element.
	 *
	 * @param url the full request URL
	 * @return the value of the resumptionToken XPath, or {@code null} when the page carries a
	 *         {@code noRecordsMatch} error
	 * @throws CollectorServiceException if the page cannot be parsed even after entity cleaning,
	 *         or if it carries any error other than {@code noRecordsMatch}
	 */
	private String downloadPage(final String url) throws CollectorServiceException {
115
	
116
		final String xml = httpConnector.getInputSource(url);
117
		
118
		Document doc;
119
		try {
120
			doc = reader.read(new StringReader(xml));
121
		} catch (DocumentException e) {
122
			log.debug("Error parsing xml, I try to clean it: " + xml, e);
123
			// Second attempt: parse again after passing the text through XmlCleaner.cleanAllEntities.
			final String cleaned = XmlCleaner.cleanAllEntities(xml);
124
			try {
125
				doc = reader.read(new StringReader(cleaned));
126
			} catch (DocumentException e1) {
127
				// Both parse attempts failed: give up on this page.
				throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
128
			}
129
		}
130
		
131
		// Look for an <error> element directly under the OAI-PMH root, ignoring namespaces.
		final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
132
		if (errorNode != null) {
133
			final String code = errorNode.valueOf("@code");
134
			// noRecordsMatch is only logged and reported as null; every other code is raised.
			if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
135
				log.warn("noRecordsMatch for oai call: " + url);
136
				return null;
137
			} else {
138
				throw new CollectorServiceException(code + " - " + errorNode.getText());
139
			}
140
		}
141
		
142
		// Queue the serialized XML of every record found under ListRecords.
		for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
143
			queue.add(((Node) o).asXML());
144
		}
145
	
146
		return doc.valueOf("//*[local-name()='resumptionToken']");
147
		
148
	}
130
    /**
     * Downloads one OAI page, adds the XML of each record it contains to {@code queue} and
     * returns the value of the page's resumptionToken element.
     *
     * <p>If the page cannot be parsed, not even after entity cleaning, the resumption token is
     * searched for in the raw text instead; when one is found it is returned and no record of
     * that page is added to the queue.
     *
     * @param url the full request URL
     * @return the value of the resumptionToken XPath (or the token extracted from the raw text
     *         of an unparsable page), or {@code null} when the page carries a
     *         {@code noRecordsMatch} error
     * @throws CollectorServiceException if the page cannot be parsed and no resumption token
     *         can be extracted from it, or if it carries any error other than
     *         {@code noRecordsMatch}
     */
    private String downloadPage(final String url) throws CollectorServiceException {
149 131

  
132
        final String xml = httpConnector.getInputSource(url);
133
        Document doc;
134
        try {
135
            doc = reader.read(new StringReader(xml));
136
        } catch (DocumentException e) {
137
            log.warn("Error parsing xml, I try to clean it: " + xml, e);
138
            // Second attempt: parse again after passing the text through XmlCleaner.cleanAllEntities.
            final String cleaned = XmlCleaner.cleanAllEntities(xml);
139
            try {
140
                doc = reader.read(new StringReader(cleaned));
141
            } catch (DocumentException e1) {
142
                // Last resort: look for the token in the raw (uncleaned) text.
                final String resumptionToken = extractResumptionToken(xml);
143
                // Without a token there is no way to continue, so the parse failure is raised.
                if (resumptionToken == null)
144
                    throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
145
                // Returning here bypasses the error check and the record loop below.
                return resumptionToken;
146
            }
147
        }
148

  
149
        // Look for an <error> element directly under the OAI-PMH root, ignoring namespaces.
        final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
150
        if (errorNode != null) {
151
            final String code = errorNode.valueOf("@code");
152
            // noRecordsMatch is only logged and reported as null; every other code is raised.
            if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
153
                log.warn("noRecordsMatch for oai call: " + url);
154
                return null;
155
            } else {
156
                throw new CollectorServiceException(code + " - " + errorNode.getText());
157
            }
158
        }
159

  
160
        // Queue the serialized XML of every record found under ListRecords.
        for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
161
            queue.add(((Node) o).asXML());
162
        }
163

  
164
        return doc.valueOf("//*[local-name()='resumptionToken']");
165

  
166
    }
167

  
150 168
}

Also available in: Unified diff