1
|
package eu.dnetlib.validator.service.impls.providers;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.StringReader;
|
5
|
import java.net.MalformedURLException;
|
6
|
import java.net.URL;
|
7
|
import java.net.URLEncoder;
|
8
|
import java.util.ArrayList;
|
9
|
import java.util.List;
|
10
|
|
11
|
import javax.xml.parsers.DocumentBuilder;
|
12
|
import javax.xml.parsers.DocumentBuilderFactory;
|
13
|
import javax.xml.parsers.ParserConfigurationException;
|
14
|
import javax.xml.xpath.XPath;
|
15
|
import javax.xml.xpath.XPathConstants;
|
16
|
import javax.xml.xpath.XPathExpression;
|
17
|
import javax.xml.xpath.XPathExpressionException;
|
18
|
import javax.xml.xpath.XPathFactory;
|
19
|
|
20
|
import org.springframework.beans.factory.annotation.Value;
|
21
|
import org.w3c.dom.Document;
|
22
|
import org.w3c.dom.Element;
|
23
|
import org.w3c.dom.Node;
|
24
|
import org.w3c.dom.NodeList;
|
25
|
import org.w3c.dom.ls.DOMImplementationLS;
|
26
|
import org.w3c.dom.ls.LSSerializer;
|
27
|
import org.xml.sax.InputSource;
|
28
|
import org.xml.sax.SAXException;
|
29
|
|
30
|
import se.kb.oai.pmh.OaiPmhServer;
|
31
|
import se.kb.oai.pmh.ResumptionToken;
|
32
|
import se.kb.oai.pmh.Set;
|
33
|
import se.kb.oai.pmh.SetsList;
|
34
|
import eu.dnetlib.validator.engine.data.DataException;
|
35
|
import eu.dnetlib.validator.engine.data.Provider;
|
36
|
import eu.dnetlib.validator.engine.data.ResultSet;
|
37
|
import eu.dnetlib.validator.engine.execution.ValidationObject;
|
38
|
import eu.dnetlib.validator.service.impls.valobjs.XMLTextValidationObject;
|
39
|
|
40
|
/**
|
41
|
* A provider that retrieves records from an OAI-PMH repository. Resumption
|
42
|
* tokens are handled transparently.
|
43
|
*
|
44
|
* @author Manos Karvounis
|
45
|
*
|
46
|
*/
|
47
|
|
48
|
|
49
|
public class OAIPMHRecordProvider extends Provider {
|
50
|
|
51
|
private static final long serialVersionUID = 3386029339653670731L;
|
52
|
|
53
|
@Value("${services.validator.provider.timeout}")
|
54
|
private int timeout;
|
55
|
private int delay;
|
56
|
private int retryDelay;
|
57
|
private int retryEfforts;
|
58
|
|
59
|
public static final String BASEURL = "BASEURL";
|
60
|
public static final String METADATA_PREFIX = "metadataPrefix";
|
61
|
/**
|
62
|
* optional
|
63
|
*/
|
64
|
public static final String FROM = "from";
|
65
|
/**
|
66
|
* optional
|
67
|
*/
|
68
|
public static final String UNTIL = "until";
|
69
|
/**
|
70
|
* optional
|
71
|
*/
|
72
|
public static final String SET = "set";
|
73
|
|
74
|
/**
|
75
|
* The maximum time to wait for a response from the repository (in millis)
|
76
|
*/
|
77
|
// public static final String TIMEOUT = "TIMEOUT";
|
78
|
/**
|
79
|
* How much time to wait between consecutive HTTP requests to the repository
|
80
|
* (in millis).
|
81
|
*/
|
82
|
// public static final String DELAY = "DELAY";
|
83
|
/**
|
84
|
* How much to wait if an HTTP request fails before trying again by
|
85
|
* resending the request.
|
86
|
*/
|
87
|
// public static final String RETRY_DELAY = "RETRY_DELAY";
|
88
|
/**
|
89
|
* If an HTTP requests fails, how many times to try to resend the request.
|
90
|
*/
|
91
|
// public static final String RETRY_EFFORTS = "RETRY_EFFORTS";
|
92
|
/**
|
93
|
* How many records to test.
|
94
|
*/
|
95
|
public static final String RECORDS = "records";
|
96
|
|
97
|
/**
 * Creates the provider, handing the fixed argument {@code 1} to the
 * {@link Provider} super constructor.
 */
public OAIPMHRecordProvider() {
    super(1);
}
|
100
|
|
101
|
@Override
|
102
|
public ResultSet<ValidationObject> getValidationObjects() throws ProviderException {
|
103
|
return new OAIPMHRecordResultSet(this);
|
104
|
}
|
105
|
|
106
|
@Override
|
107
|
public ResultSet<String> getValidationObjectIds() throws ProviderException {
|
108
|
return new OAIPMHRecordIdentifierResultSet(this);
|
109
|
}
|
110
|
|
111
|
@Override
|
112
|
public ValidationObject getValidationObject(String valObjId) throws ProviderException {
|
113
|
OAIPMHResultSet oai = new OAIPMHResultSet(this);
|
114
|
try {
|
115
|
return new XMLTextValidationObject(oai.getRecord(valObjId));
|
116
|
} catch (DataException e) {
|
117
|
log.error("", e);
|
118
|
throw new ProviderException();
|
119
|
}
|
120
|
}
|
121
|
|
122
|
private class OAIPMHResultSet {
|
123
|
protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
124
|
protected DocumentBuilder builder;
|
125
|
protected XPathFactory xfactory = XPathFactory.newInstance();
|
126
|
protected NodeList recordIds = null;
|
127
|
protected NodeList records = null;
|
128
|
protected int index = -1;
|
129
|
protected String resumptionToken = null;
|
130
|
protected String error = null;
|
131
|
private URLStreamer streamer = new URLStreamer();
|
132
|
|
133
|
|
134
|
public OAIPMHResultSet(OAIPMHRecordProvider prv) {
|
135
|
super();
|
136
|
try {
|
137
|
builder = factory.newDocumentBuilder();
|
138
|
} catch (ParserConfigurationException e) {
|
139
|
log.error("", e);
|
140
|
}
|
141
|
}
|
142
|
|
143
|
protected NodeList getIds() throws DataException {
|
144
|
NodeList recordIds = null;
|
145
|
String surl = pros.getProperty(BASEURL) + "?verb=ListIdentifiers";
|
146
|
if (resumptionToken == null || resumptionToken.trim().length() == 0) {
|
147
|
surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
|
148
|
if (pros.getProperty(FROM) != null)
|
149
|
surl += "&from=" + pros.getProperty(FROM);
|
150
|
if (pros.getProperty(UNTIL) != null)
|
151
|
surl += "&until=" + pros.getProperty(UNTIL);
|
152
|
if (!pros.getProperty(SET).equals("none"))
|
153
|
surl += "&set=" + pros.getProperty(SET);
|
154
|
} else {
|
155
|
surl += "&resumptionToken=" + resumptionToken;
|
156
|
}
|
157
|
|
158
|
log.debug("Issuing request "+surl);
|
159
|
|
160
|
String response = null;
|
161
|
try {
|
162
|
response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
|
163
|
} catch (NumberFormatException e) {
|
164
|
log.error("", e);
|
165
|
throw new DataException();
|
166
|
} catch (MalformedURLException e) {
|
167
|
log.error("", e);
|
168
|
throw new DataException();
|
169
|
} catch (IOException e) {
|
170
|
log.error("", e);
|
171
|
throw new DataException();
|
172
|
}
|
173
|
try {
|
174
|
InputSource is = new InputSource(new StringReader(response));
|
175
|
Document doc = builder.parse(is);
|
176
|
XPath xpath = xfactory.newXPath();
|
177
|
XPathExpression expr = xpath.compile("OAI-PMH/ListIdentifiers/header/identifier/text()");
|
178
|
recordIds = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
|
179
|
XPath xpath2 = xfactory.newXPath();
|
180
|
XPathExpression expr2 = xpath2.compile("OAI-PMH/ListIdentifiers/resumptionToken/text()");
|
181
|
NodeList rtl = (NodeList) expr2.evaluate(doc, XPathConstants.NODESET);
|
182
|
if (rtl == null || rtl.getLength() == 0) {
|
183
|
log.debug("There seems to be no resumption token present");
|
184
|
resumptionToken = null;
|
185
|
}
|
186
|
else {
|
187
|
resumptionToken = rtl.item(0).getNodeValue();
|
188
|
resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
|
189
|
log.debug("Found resumption token: "+resumptionToken);
|
190
|
}
|
191
|
} catch (Exception e) {
|
192
|
log.error("", e);
|
193
|
throw new DataException();
|
194
|
}
|
195
|
return recordIds;
|
196
|
}
|
197
|
|
198
|
protected NodeList getRecords() throws DataException {
|
199
|
NodeList records = null;
|
200
|
String surl = pros.getProperty(BASEURL) + "?verb=ListRecords";
|
201
|
if (resumptionToken == null || resumptionToken.trim().length() == 0) {
|
202
|
surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
|
203
|
if (pros.getProperty(FROM) != null)
|
204
|
surl += "&from=" + pros.getProperty(FROM);
|
205
|
if (pros.getProperty(UNTIL) != null)
|
206
|
surl += "&until=" + pros.getProperty(UNTIL);
|
207
|
if (!pros.getProperty(SET).equals("none"))
|
208
|
surl += "&set=" + pros.getProperty(SET);
|
209
|
} else {
|
210
|
surl += "&resumptionToken=" + resumptionToken;
|
211
|
}
|
212
|
String setError=null;
|
213
|
if (!pros.getProperty(SET).equals("none")) {
|
214
|
OaiPmhServer harvester = new OaiPmhServer(pros.getProperty(BASEURL));
|
215
|
try {
|
216
|
SetsList setList = harvester.listSets();
|
217
|
ResumptionToken token = setList.getResumptionToken();
|
218
|
List<Set> sets = new ArrayList<Set>();
|
219
|
sets.addAll(setList.asList());
|
220
|
while (token != null) {
|
221
|
setList = harvester.listSets(token);
|
222
|
token = setList.getResumptionToken();
|
223
|
sets.addAll(setList.asList());
|
224
|
}
|
225
|
List<String> ret = new ArrayList<String>();
|
226
|
for (Set set : sets) {
|
227
|
ret.add(set.getSpec().trim());
|
228
|
}
|
229
|
if (!ret.contains(pros.getProperty(SET))){
|
230
|
error = "Set: <b>'" + pros.getProperty(SET) + "'</b> is not exposed by the repository. \n Please make sure that 'ListSets' verb is configured correctly on your server as well as that the exposed sets list includes this set.";
|
231
|
setError = error;
|
232
|
}
|
233
|
} catch (Exception e) {
|
234
|
log.error("error getting sets from url: " + pros.getProperty(BASEURL), e);
|
235
|
}
|
236
|
}
|
237
|
log.debug("Issuing request "+surl);
|
238
|
|
239
|
String response = null;
|
240
|
try {
|
241
|
response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
|
242
|
} catch (NumberFormatException e) {
|
243
|
log.error("", e);
|
244
|
throw new DataException();
|
245
|
} catch (MalformedURLException e) {
|
246
|
log.error("", e);
|
247
|
throw new DataException();
|
248
|
} catch (IOException e) {
|
249
|
log.error("", e);
|
250
|
throw new DataException();
|
251
|
}
|
252
|
try {
|
253
|
InputSource is = new InputSource(new StringReader(response));
|
254
|
Document doc = builder.parse(is);
|
255
|
XPath xpath = xfactory.newXPath();
|
256
|
// xpath.setNamespaceContext(new NamespaceResolver(doc));
|
257
|
XPathExpression expr = xpath.compile("OAI-PMH/ListRecords/record");
|
258
|
records = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
|
259
|
XPath xpath2 = xfactory.newXPath();
|
260
|
XPathExpression expr2 = xpath2.compile("OAI-PMH/ListRecords/resumptionToken/text()");
|
261
|
NodeList rtl = (NodeList) expr2.evaluate(doc, XPathConstants.NODESET);
|
262
|
log.debug("Check number of records: "+ records.getLength());
|
263
|
if (records.getLength() == 0) {
|
264
|
log.debug("There are no records: "+ records.getLength());
|
265
|
XPath xpath3 = xfactory.newXPath();
|
266
|
XPathExpression expr3 = xpath3.compile("OAI-PMH/error/text()");
|
267
|
error = "The response on request: <a href=\""+ surl + "\">"+surl+"</a> was the following: ";
|
268
|
error += "<br><b>" + (String) expr3.evaluate(doc, XPathConstants.STRING) + "</b>";
|
269
|
if (setError != null) {
|
270
|
error += "<br>" + setError;
|
271
|
}
|
272
|
log.debug("Error: "+ error);
|
273
|
}
|
274
|
|
275
|
if (rtl == null || rtl.getLength() == 0) {
|
276
|
log.debug("There seems to be no resumption token present");
|
277
|
resumptionToken = null;
|
278
|
}
|
279
|
else {
|
280
|
resumptionToken = rtl.item(0).getNodeValue();
|
281
|
resumptionToken = URLEncoder.encode(resumptionToken, "UTF-8");
|
282
|
log.debug("Found resumption token: "+resumptionToken);
|
283
|
}
|
284
|
} catch (Exception e) {
|
285
|
log.error("", e);
|
286
|
throw new DataException();
|
287
|
}
|
288
|
return records;
|
289
|
}
|
290
|
|
291
|
protected Document getRecord(String id) throws DataException {
|
292
|
String surl = pros.getProperty(BASEURL) + "?verb=GetRecord";
|
293
|
surl += "&metadataPrefix=" + pros.getProperty(METADATA_PREFIX);
|
294
|
surl += "&identifier=" + id;
|
295
|
|
296
|
log.debug("Issuing request: "+surl);
|
297
|
|
298
|
String response = null;
|
299
|
try {
|
300
|
response = streamer.getResponse(new URL(surl), timeout, delay, retryDelay, retryEfforts);
|
301
|
} catch (NumberFormatException e) {
|
302
|
log.error("", e);
|
303
|
throw new DataException();
|
304
|
} catch (MalformedURLException e) {
|
305
|
log.error("", e);
|
306
|
throw new DataException();
|
307
|
} catch (IOException e) {
|
308
|
log.error("", e);
|
309
|
throw new DataException();
|
310
|
}
|
311
|
|
312
|
InputSource is = new InputSource(new StringReader(response));
|
313
|
Document doc;
|
314
|
try {
|
315
|
doc = builder.parse(is);
|
316
|
} catch (SAXException e) {
|
317
|
log.error("", e);
|
318
|
throw new DataException();
|
319
|
} catch (IOException e) {
|
320
|
log.error("", e);
|
321
|
throw new DataException();
|
322
|
}
|
323
|
|
324
|
return doc;
|
325
|
}
|
326
|
}
|
327
|
|
328
|
private class OAIPMHRecordResultSet extends OAIPMHResultSet implements ResultSet<ValidationObject> {
|
329
|
|
330
|
public OAIPMHRecordResultSet(OAIPMHRecordProvider prv) {
|
331
|
super(prv);
|
332
|
// TODO Auto-generated constructor stub
|
333
|
}
|
334
|
|
335
|
@Override
|
336
|
public String getError() {
|
337
|
if (error != null)
|
338
|
log.debug("An error occured "+ this.error);
|
339
|
else
|
340
|
log.debug("No errors on request");
|
341
|
return this.error;
|
342
|
}
|
343
|
|
344
|
@Override
|
345
|
public boolean next() throws DataException {
|
346
|
index++;
|
347
|
|
348
|
log.debug("Moving cursor to result "+index);
|
349
|
if (records == null || index >= records.getLength()) {
|
350
|
// if we have previously received some results and there no more to take
|
351
|
if (records != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
|
352
|
return false;
|
353
|
index = -1;
|
354
|
records = getRecords();
|
355
|
return next();
|
356
|
}
|
357
|
return true;
|
358
|
}
|
359
|
|
360
|
@Override
|
361
|
public ValidationObject get() throws DataException {
|
362
|
XMLTextValidationObject ret = null;
|
363
|
|
364
|
Document newXmlDocument;
|
365
|
try {
|
366
|
newXmlDocument = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
|
367
|
|
368
|
Element root = newXmlDocument.createElement("root");
|
369
|
newXmlDocument.appendChild(root);
|
370
|
Node node = records.item(index);
|
371
|
Node copyNode = newXmlDocument.importNode(node, true);
|
372
|
root.appendChild(copyNode);
|
373
|
// printXmlDocument(newXmlDocument);
|
374
|
ret = new XMLTextValidationObject(newXmlDocument);
|
375
|
XPathFactory factory = XPathFactory.newInstance();
|
376
|
XPath xPath = factory.newXPath();
|
377
|
ret.setId(xPath.evaluate("header/identifier", records.item(index)));
|
378
|
ret.setStatus(xPath.evaluate("header/@status", records.item(index)));
|
379
|
|
380
|
} catch (ParserConfigurationException e) {
|
381
|
log.error("error getting object"+ e);
|
382
|
} catch (XPathExpressionException e) {
|
383
|
log.error("error getting object"+ e);
|
384
|
}
|
385
|
return ret;
|
386
|
}
|
387
|
|
388
|
}
|
389
|
|
390
|
public static void printXmlDocument(Document document) {
|
391
|
DOMImplementationLS domImplementationLS =
|
392
|
(DOMImplementationLS) document.getImplementation();
|
393
|
LSSerializer lsSerializer =
|
394
|
domImplementationLS.createLSSerializer();
|
395
|
String string = lsSerializer.writeToString(document);
|
396
|
System.out.println(string);
|
397
|
}
|
398
|
|
399
|
private class OAIPMHRecordIdentifierResultSet extends OAIPMHResultSet implements ResultSet<String> {
|
400
|
|
401
|
public OAIPMHRecordIdentifierResultSet(OAIPMHRecordProvider prv) {
|
402
|
super(prv);
|
403
|
// TODO Auto-generated constructor stub
|
404
|
}
|
405
|
|
406
|
@Override
|
407
|
public boolean next() throws DataException {
|
408
|
index++;
|
409
|
log.debug("Moving cursor to result "+index);
|
410
|
if (recordIds == null || index >= recordIds.getLength()) {
|
411
|
// if we have previously received some results and there no more to take
|
412
|
if (recordIds != null && (resumptionToken == null || resumptionToken.trim().length() == 0))
|
413
|
return false;
|
414
|
index = -1;
|
415
|
recordIds = getIds();
|
416
|
return next();
|
417
|
}
|
418
|
return true;
|
419
|
}
|
420
|
|
421
|
@Override
|
422
|
public String get() throws DataException {
|
423
|
String id = recordIds.item(index).getNodeValue();
|
424
|
|
425
|
log.debug("Returing object with id "+id);
|
426
|
|
427
|
return id;
|
428
|
}
|
429
|
|
430
|
@Override
|
431
|
public String getError() {
|
432
|
// TODO Auto-generated method stub
|
433
|
return null;
|
434
|
}
|
435
|
|
436
|
}
|
437
|
|
438
|
@Override
|
439
|
public ResultSet<ValidationObject> getValidationObjects(String entity)
|
440
|
throws ProviderException {
|
441
|
// TODO Auto-generated method stub
|
442
|
return null;
|
443
|
}
|
444
|
|
445
|
public Integer getTimeout() {
|
446
|
return timeout;
|
447
|
}
|
448
|
|
449
|
public void setTimeout(Integer timeout) {
|
450
|
this.timeout = timeout;
|
451
|
}
|
452
|
|
453
|
public Integer getDelay() {
|
454
|
return delay;
|
455
|
}
|
456
|
|
457
|
public void setDelay(Integer delay) {
|
458
|
this.delay = delay;
|
459
|
}
|
460
|
|
461
|
public Integer getRetryDelay() {
|
462
|
return retryDelay;
|
463
|
}
|
464
|
|
465
|
public void setRetryDelay(Integer retryDelay) {
|
466
|
this.retryDelay = retryDelay;
|
467
|
}
|
468
|
|
469
|
public Integer getRetryEfforts() {
|
470
|
return retryEfforts;
|
471
|
}
|
472
|
|
473
|
public void setRetryEfforts(Integer retryEfforts) {
|
474
|
this.retryEfforts = retryEfforts;
|
475
|
}
|
476
|
|
477
|
/*
|
478
|
class NamespaceResolver implements NamespaceContext {
|
479
|
|
480
|
private static final String OAI_NS = "http://www.openarchives.org/OAI/2.0/";
|
481
|
private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";
|
482
|
private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
|
483
|
|
484
|
private final Document document;
|
485
|
|
486
|
public NamespaceResolver(Document document) {
|
487
|
this.document = document;
|
488
|
}
|
489
|
|
490
|
public String getNamespaceURI(String prefix) {
|
491
|
log.debug("prefix: " + prefix);
|
492
|
if ("".equals(prefix) || "oai".equals(prefix)){
|
493
|
return OAI_NS;
|
494
|
}else if ("oai_dc".equals(prefix)){
|
495
|
return OAI_DC_NS;
|
496
|
}else if ("dc".equals(prefix)){
|
497
|
return DC_NS;
|
498
|
}
|
499
|
return "";
|
500
|
}
|
501
|
|
502
|
public String getPrefix(String namespaceURI) {
|
503
|
log.debug("prefix: " + namespaceURI);
|
504
|
return "";
|
505
|
}
|
506
|
|
507
|
@SuppressWarnings("rawtypes")
|
508
|
public Iterator getPrefixes(String namespaceURI) {
|
509
|
// not implemented
|
510
|
return null;
|
511
|
}
|
512
|
|
513
|
} */
|
514
|
|
515
|
}
|