Revision 27189
Added by Michele Artini over 10 years ago
modules/dnet-modular-collector-service/trunk/src/test/java/eu/dnetlib/data/collector/plugins/oai/OaiIteratorTest.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.collector.plugins.oai; |
2 | 2 |
|
3 |
import org.junit.Before; |
|
4 |
import org.junit.Ignore; |
|
3 | 5 |
import org.junit.Test; |
4 | 6 |
|
5 | 7 |
public class OaiIteratorTest { |
6 |
|
|
7 |
// TODO implement a local test (not depending on external services) |
|
8 |
|
|
9 |
private static final String BASE_URL = "http://oai.d.efg.research-infrastructures.eu/oai.do"; |
|
10 |
private static final String FORMAT = "oai_dc"; |
|
11 |
private static final String SET = "d937bab1-d44c-44aa-bf7d-df5312a3b623"; |
|
12 |
|
|
13 |
private OaiIterator oai; |
|
14 |
|
|
15 |
@Before |
|
16 |
public void setUp() { |
|
17 |
oai = new OaiIterator(BASE_URL, FORMAT, SET); |
|
18 |
} |
|
19 |
|
|
8 | 20 |
@Test |
21 |
@Ignore |
|
9 | 22 |
public void test() { |
10 |
|
|
11 |
// final Iterator<String> iter = new OaiIterator("http://zenodo.org/oai2d", "oai_dc", null); |
|
12 |
// |
|
13 |
// int count = 0; |
|
14 |
// System.out.println("START: " + count); |
|
15 |
// while (iter.hasNext()) { |
|
16 |
// iter.next(); |
|
17 |
// count++; |
|
18 |
// if ((count % 100) == 0) { |
|
19 |
// System.out.println(" - " + count); |
|
20 |
// } |
|
21 |
// } |
|
22 |
// System.out.println("TOTAL: " + count); |
|
23 |
int count = 0; |
|
24 |
while (oai.hasNext()) { |
|
25 |
oai.next(); |
|
26 |
count++; |
|
27 |
} |
|
28 |
System.out.println("TOTAL: " + count); |
|
23 | 29 |
} |
24 | 30 |
} |
modules/dnet-modular-collector-service/trunk/src/test/java/eu/dnetlib/data/collector/plugins/oai/OaiCollectorPluginRealTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import java.util.HashMap; |
|
4 |
|
|
5 |
import org.junit.Before; |
|
6 |
import org.junit.Ignore; |
|
7 |
import org.junit.Test; |
|
8 |
|
|
9 |
import com.google.common.collect.Iterables; |
|
10 |
|
|
11 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
12 |
|
|
13 |
public class OaiCollectorPluginRealTest { |
|
14 |
|
|
15 |
private OaiCollectorPlugin oai; |
|
16 |
|
|
17 |
private static final String BASE_URL = "http://oai.d.efg.research-infrastructures.eu/oai.do"; |
|
18 |
private static final String FORMAT = "oai_dc"; |
|
19 |
private static final String SETS = "d937bab1-d44c-44aa-bf7d-df5312a3b623, e5b14959-1e87-4c07-9f85-942c9cdd9136, 13302eb6-764a-4ed2-8d08-2a1c9526f442, 31701e97-096f-4266-81b5-30b9bc3a06b0"; |
|
20 |
|
|
21 |
@Before |
|
22 |
public void setUp() { |
|
23 |
oai = new OaiCollectorPlugin(); |
|
24 |
oai.setOaiIteratorFactory(new OaiIteratorFactory()); |
|
25 |
} |
|
26 |
|
|
27 |
@Test |
|
28 |
@Ignore |
|
29 |
public void testCollect() throws Exception { |
|
30 |
final InterfaceDescriptor iface = new InterfaceDescriptor(); |
|
31 |
iface.setId("123"); |
|
32 |
iface.setProtocol("OAI"); |
|
33 |
iface.setBaseUrl(BASE_URL); |
|
34 |
iface.setParams(new HashMap<String, String>()); |
|
35 |
iface.getParams().put("format", FORMAT); |
|
36 |
iface.getParams().put("set", SETS); |
|
37 |
|
|
38 |
int count = 0; |
|
39 |
for(String s : oai.collect(iface)) { |
|
40 |
count++; |
|
41 |
} |
|
42 |
System.out.println("TOTAL: " + count); |
|
43 |
} |
|
44 |
|
|
45 |
} |
modules/dnet-modular-collector-service/trunk/src/test/java/eu/dnetlib/data/collector/plugins/oai/OaiCollectorPluginTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
import static org.mockito.Mockito.verify; |
|
6 |
import static org.mockito.Mockito.when; |
|
7 |
|
|
8 |
import java.util.HashMap; |
|
9 |
import java.util.Iterator; |
|
10 |
import java.util.List; |
|
11 |
|
|
12 |
import org.junit.Before; |
|
13 |
import org.junit.Test; |
|
14 |
import org.junit.runner.RunWith; |
|
15 |
import org.mockito.Mock; |
|
16 |
import org.mockito.internal.verification.Times; |
|
17 |
import org.mockito.runners.MockitoJUnit44Runner; |
|
18 |
|
|
19 |
import com.google.common.base.Joiner; |
|
20 |
import com.google.common.collect.Lists; |
|
21 |
|
|
22 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
23 |
|
|
24 |
@RunWith(MockitoJUnit44Runner.class) |
|
25 |
public class OaiCollectorPluginTest { |
|
26 |
|
|
27 |
private OaiCollectorPlugin oai; |
|
28 |
|
|
29 |
@Mock |
|
30 |
private OaiIteratorFactory oaiIteratorFactory; |
|
31 |
|
|
32 |
private List<String> elements = Lists.newArrayList("0", "1", "2", "3", "4", "5", "6", "7", "8" , "9", "10", "11", "12"); |
|
33 |
|
|
34 |
private Iterator<String> oaiIterator1 = elements.subList(0, 3).iterator(); |
|
35 |
private Iterator<String> oaiIterator2 = elements.subList(3, 7).iterator(); |
|
36 |
private Iterator<String> oaiIterator3 = elements.subList(7, elements.size()).iterator(); |
|
37 |
|
|
38 |
private static final String BASE_URL = "http://oai.test.it/oai"; |
|
39 |
private static final String FORMAT = "oai_dc"; |
|
40 |
private static final String PROTOCOL = "OAI"; |
|
41 |
private static final String SET_1 = "set01"; |
|
42 |
private static final String SET_2 = "set02"; |
|
43 |
private static final String SET_3 = "set03"; |
|
44 |
|
|
45 |
@Before |
|
46 |
public void setUp() { |
|
47 |
oai = new OaiCollectorPlugin(); |
|
48 |
oai.setOaiIteratorFactory(oaiIteratorFactory); |
|
49 |
when(oaiIteratorFactory.newIterator(BASE_URL, FORMAT, SET_1)).thenReturn(oaiIterator1); |
|
50 |
when(oaiIteratorFactory.newIterator(BASE_URL, FORMAT, SET_2)).thenReturn(oaiIterator2); |
|
51 |
when(oaiIteratorFactory.newIterator(BASE_URL, FORMAT, SET_3)).thenReturn(oaiIterator3); |
|
52 |
} |
|
53 |
|
|
54 |
public void test() { |
|
55 |
oai = new OaiCollectorPlugin(); |
|
56 |
} |
|
57 |
|
|
58 |
@Test |
|
59 |
public void testGetProtocol() { |
|
60 |
assertEquals(PROTOCOL, oai.getProtocol()); |
|
61 |
} |
|
62 |
|
|
63 |
@Test |
|
64 |
public void testCollect() throws Exception { |
|
65 |
final InterfaceDescriptor iface = new InterfaceDescriptor(); |
|
66 |
iface.setId("123"); |
|
67 |
iface.setProtocol(PROTOCOL); |
|
68 |
iface.setBaseUrl(BASE_URL); |
|
69 |
iface.setParams(new HashMap<String, String>()); |
|
70 |
iface.getParams().put("format", FORMAT); |
|
71 |
iface.getParams().put("set", Joiner.on(", ").join(SET_1, SET_2, SET_3)); |
|
72 |
|
|
73 |
final Iterable<String> records = oai.collect(iface); |
|
74 |
|
|
75 |
assertNotNull(records); |
|
76 |
verify(oaiIteratorFactory, new Times(0)).newIterator(BASE_URL, FORMAT, SET_1); |
|
77 |
verify(oaiIteratorFactory, new Times(0)).newIterator(BASE_URL, FORMAT, SET_2); |
|
78 |
verify(oaiIteratorFactory, new Times(0)).newIterator(BASE_URL, FORMAT, SET_3); |
|
79 |
|
|
80 |
int count = 0; |
|
81 |
for (String s : records) { |
|
82 |
System.out.println("RECORD: " + s); |
|
83 |
assertEquals("" + count, s); |
|
84 |
count++; |
|
85 |
} |
|
86 |
assertEquals(elements.size(), count); |
|
87 |
verify(oaiIteratorFactory).newIterator(BASE_URL, FORMAT, SET_1); |
|
88 |
verify(oaiIteratorFactory).newIterator(BASE_URL, FORMAT, SET_2); |
|
89 |
verify(oaiIteratorFactory).newIterator(BASE_URL, FORMAT, SET_3); |
|
90 |
} |
|
91 |
} |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiIterator.java | ||
---|---|---|
24 | 24 |
private SAXReader reader = new SAXReader(); |
25 | 25 |
private HttpClient client = new HttpClient(); |
26 | 26 |
private String baseUrl; |
27 |
private String set; |
|
28 |
private String mdFormat; |
|
27 | 29 |
private String token; |
30 |
private boolean started; |
|
28 | 31 |
|
29 | 32 |
/**
 * Stores the harvesting parameters. The constructor itself does not call
 * firstPage(); that happens in verifyStarted(), which hasNext() and next()
 * invoke before touching the queue.
 *
 * @param baseUrl  base URL of the OAI endpoint
 * @param mdFormat value sent as the metadataPrefix request parameter
 * @param set      set to harvest; firstPage() omits the set parameter when this is null or empty
 */
public OaiIterator(final String baseUrl, final String mdFormat, final String set) {
    this.started = false;
    this.set = set;
    this.mdFormat = mdFormat;
    this.baseUrl = baseUrl;
}
|
38 |
|
|
39 |
private void verifyStarted() { |
|
40 |
if (!this.started) { |
|
41 |
this.token = firstPage(); |
|
42 |
this.started = true; |
|
35 | 43 |
} |
36 | 44 |
} |
37 | 45 |
|
38 | 46 |
@Override |
39 | 47 |
public boolean hasNext() { |
40 | 48 |
synchronized (queue) { |
49 |
verifyStarted(); |
|
41 | 50 |
return !queue.isEmpty(); |
42 | 51 |
} |
43 | 52 |
} |
... | ... | |
45 | 54 |
@Override |
46 | 55 |
public String next() { |
47 | 56 |
synchronized (queue) { |
57 |
verifyStarted(); |
|
48 | 58 |
final String res = queue.poll(); |
49 | 59 |
while (queue.isEmpty() && (token != null) && !token.isEmpty()) { |
50 | 60 |
token = otherPages(token); |
... | ... | |
56 | 66 |
@Override |
57 | 67 |
public void remove() {} |
58 | 68 |
|
59 |
private String firstPage(final String mdFormat, final String set) { |
|
69 |
private String firstPage() { |
|
70 |
|
|
60 | 71 |
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + mdFormat; |
61 | 72 |
if ((set != null) && !set.isEmpty()) { |
62 | 73 |
url += "&set=" + set; |
63 | 74 |
} |
75 |
|
|
76 |
log.info("Downloading first page using url: " + url); |
|
77 |
|
|
64 | 78 |
return downloadPage(url); |
65 | 79 |
} |
66 | 80 |
|
... | ... | |
69 | 83 |
} |
70 | 84 |
|
71 | 85 |
private String downloadPage(final String url) { |
86 |
|
|
87 |
System.out.println("URL: " + url); |
|
72 | 88 |
try { |
73 | 89 |
log.info("HTTP GET: " + url); |
74 | 90 |
final HttpMethod method = new GetMethod(url); |
... | ... | |
80 | 96 |
for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { |
81 | 97 |
queue.add(((Node) o).asXML()); |
82 | 98 |
} |
83 |
|
|
99 |
|
|
100 |
System.out.println("Done"); |
|
84 | 101 |
return doc.valueOf("//*[local-name()='resumptionToken']"); |
85 | 102 |
} catch (Exception e) { |
86 | 103 |
throw new RuntimeException("Error obtaining records from: " + url, e); |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiIteratorFactory.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
|
|
5 |
public class OaiIteratorFactory { |
|
6 |
public Iterator<String> newIterator(final String baseUrl, final String mdFormat, final String set) { |
|
7 |
return new OaiIterator(baseUrl, mdFormat, set); |
|
8 |
} |
|
9 |
} |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiCollectorPlugin.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.util.Iterator; |
4 | 4 |
|
5 |
import org.springframework.beans.factory.annotation.Required; |
|
6 |
|
|
7 |
import com.google.common.base.Function; |
|
8 |
import com.google.common.base.Splitter; |
|
9 |
import com.google.common.collect.Iterables; |
|
10 |
import com.google.common.collect.Iterators; |
|
11 |
|
|
5 | 12 |
import eu.dnetlib.data.collector.plugin.CollectorPlugin; |
6 | 13 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
7 | 14 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
... | ... | |
10 | 17 |
|
11 | 18 |
private static final String FORMAT_PARAM = "format"; |
12 | 19 |
private static final String OAI_SET_PARAM = "set"; |
20 |
|
|
21 |
private OaiIteratorFactory oaiIteratorFactory; |
|
13 | 22 |
|
14 | 23 |
@Override |
15 | 24 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException { |
16 | 25 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
17 | 26 |
final String mdFormat = interfaceDescriptor.getParams().get(FORMAT_PARAM); |
18 |
final String set = interfaceDescriptor.getParams().get(OAI_SET_PARAM); |
|
27 |
final String setParam = interfaceDescriptor.getParams().get(OAI_SET_PARAM);
|
|
19 | 28 |
|
29 |
final Iterable<String> sets = Splitter.on(",").omitEmptyStrings().trimResults().split(setParam); |
|
30 |
|
|
20 | 31 |
if ((baseUrl == null) || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } |
21 | 32 |
|
22 | 33 |
if ((mdFormat == null) || mdFormat.isEmpty()) { throw new CollectorServiceException("Param 'mdFormat' is null or empty"); } |
23 | 34 |
|
24 | 35 |
return new Iterable<String>() { |
25 |
|
|
36 |
@SuppressWarnings("unchecked") |
|
26 | 37 |
@Override |
27 | 38 |
public Iterator<String> iterator() { |
28 |
return new OaiIterator(baseUrl, mdFormat, set); |
|
39 |
final Iterable<Iterator<String>> iter = Iterables.transform(sets, new Function<String, Iterator<String>>() { |
|
40 |
@Override |
|
41 |
public Iterator<String> apply(String set) { |
|
42 |
return oaiIteratorFactory.newIterator(baseUrl, mdFormat, set); |
|
43 |
} |
|
44 |
}); |
|
45 |
return Iterators.concat(Iterables.toArray(iter, Iterator.class)); |
|
29 | 46 |
} |
30 | 47 |
}; |
31 | 48 |
} |
... | ... | |
35 | 52 |
return "OAI"; |
36 | 53 |
} |
37 | 54 |
|
55 |
public OaiIteratorFactory getOaiIteratorFactory() { |
|
56 |
return oaiIteratorFactory; |
|
57 |
} |
|
58 |
|
|
59 |
@Required |
|
60 |
public void setOaiIteratorFactory(OaiIteratorFactory oaiIteratorFactory) { |
|
61 |
this.oaiIteratorFactory = oaiIteratorFactory; |
|
62 |
} |
|
63 |
|
|
38 | 64 |
} |
modules/dnet-modular-collector-service/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml | ||
---|---|---|
13 | 13 |
http://www.springframework.org/schema/util http://www.springframework.org/schema/util/spring-util-2.0.xsd |
14 | 14 |
http://dnetlib.eu/springbeans/template http://dnetlib.eu/springbeans/template.xsd"> |
15 | 15 |
|
16 |
<bean id="oaiCollectorPlugin" class="eu.dnetlib.data.collector.plugins.oai.OaiCollectorPlugin"> |
|
17 |
<property name="oaiIteratorFactory"> |
|
18 |
<bean class="eu.dnetlib.data.collector.plugins.oai.OaiIteratorFactory" /> |
|
19 |
</property> |
|
20 |
</bean> |
|
21 |
|
|
16 | 22 |
<bean id="httpCollectorPlugin" class="eu.dnetlib.data.collector.plugins.HttpCollectorPlugin" /> |
17 | 23 |
<bean id="fileCollectorPlugin" class="eu.dnetlib.data.collector.plugins.FileCollectorPlugin" /> |
18 | 24 |
<bean id="classpathCollectorPlugin" class="eu.dnetlib.data.collector.plugins.ClasspathCollectorPlugin" /> |
19 |
<bean id="oaiCollectorPlugin" class="eu.dnetlib.data.collector.plugins.oai.OaiCollectorPlugin" /> |
|
20 | 25 |
<bean id="csvFileCollectorPlugin" class="eu.dnetlib.data.collector.plugins.FileCSVCollectorPlugin" /> |
21 | 26 |
<bean id="ftpCollectorPlugin" class="eu.dnetlib.data.collector.plugins.ftp.FtpCollectorPlugin" /> |
22 | 27 |
<bean id="filesystemCollectorPlugin" class="eu.dnetlib.data.collector.plugins.filesystem.FilesystemCollectorPlugin" /> |
modules/dnet-modular-collector-service/trunk/pom.xml | ||
---|---|---|
65 | 65 |
<!-- <artifactId>je</artifactId> --> |
66 | 66 |
<!-- <version>5.0.73</version> --> |
67 | 67 |
<!-- </dependency> --> |
68 |
<dependency> |
|
69 |
<groupId>org.mockito</groupId> |
|
70 |
<artifactId>mockito-core</artifactId> |
|
71 |
<version>1.6</version> |
|
72 |
<scope>test</scope> |
|
73 |
</dependency> |
|
68 | 74 |
|
69 |
|
|
70 | 75 |
<dependency> |
71 | 76 |
<groupId>commons-httpclient</groupId> |
72 | 77 |
<artifactId>commons-httpclient</artifactId> |
Also available in: Unified diff
multiple sets management