Revision 31237
Added by Sandro La Bruzzo about 10 years ago
modules/dnet-modular-collector-service/trunk/src/test/java/eu/dnetlib/data/collector/filesystem/csv/HTTPCSVCollectorPluginTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.filesystem.csv; |
|
2 |
|
|
3 |
import java.net.URISyntaxException; |
|
4 |
import java.util.HashMap; |
|
5 |
|
|
6 |
import org.junit.Assert; |
|
7 |
import org.junit.Test; |
|
8 |
|
|
9 |
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin; |
|
10 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
11 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
12 |
|
|
13 |
public class HTTPCSVCollectorPluginTest { |
|
14 |
|
|
15 |
@Test |
|
16 |
public void testCSVHeader() throws URISyntaxException, CollectorServiceException { |
|
17 |
|
|
18 |
InterfaceDescriptor descr = new InterfaceDescriptor(); |
|
19 |
HashMap<String, String> params = new HashMap<String, String>(); |
|
20 |
|
|
21 |
params.put("separator", ","); |
|
22 |
params.put("quote", "\""); |
|
23 |
params.put("identifier", "ISSN"); |
|
24 |
descr.setBaseUrl("http://doaj.org/csv"); |
|
25 |
descr.setParams(params); |
|
26 |
HttpCSVCollectorPlugin plugin = new HttpCSVCollectorPlugin(); |
|
27 |
int i = 0; |
|
28 |
for (String s : plugin.collect(descr, null, null)) { |
|
29 |
Assert.assertTrue(s.length() > 0); |
|
30 |
i++; |
|
31 |
|
|
32 |
} |
|
33 |
Assert.assertTrue(i > 0); |
|
34 |
|
|
35 |
} |
|
36 |
} |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpCSVCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins; |
|
2 |
|
|
3 |
import java.io.InputStreamReader; |
|
4 |
import java.io.Reader; |
|
5 |
import java.net.URL; |
|
6 |
import java.util.Iterator; |
|
7 |
import java.util.List; |
|
8 |
import java.util.Set; |
|
9 |
|
|
10 |
import org.apache.commons.csv.CSVFormat; |
|
11 |
import org.apache.commons.csv.CSVParser; |
|
12 |
import org.apache.commons.csv.CSVRecord; |
|
13 |
import org.dom4j.Document; |
|
14 |
import org.dom4j.DocumentHelper; |
|
15 |
import org.dom4j.Element; |
|
16 |
|
|
17 |
import com.google.common.base.Function; |
|
18 |
import com.google.common.collect.Iterators; |
|
19 |
import com.google.common.collect.Lists; |
|
20 |
|
|
21 |
import eu.dnetlib.data.collector.plugin.CollectorPlugin; |
|
22 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
23 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
24 |
|
|
25 |
/** |
|
26 |
* The Class HttpCSVCollectorPlugin. |
|
27 |
*/ |
|
28 |
public class HttpCSVCollectorPlugin implements CollectorPlugin { |
|
29 |
|
|
30 |
/** |
|
31 |
* The Class HTTPCSVIterator. |
|
32 |
*/ |
|
33 |
class HTTPCSVIterator implements Iterable<String> { |
|
34 |
|
|
35 |
/** The descriptor. */ |
|
36 |
private InterfaceDescriptor descriptor; |
|
37 |
|
|
38 |
/** |
|
39 |
* Instantiates a new HTTPCSV iterator. |
|
40 |
* |
|
41 |
* @param descriptor |
|
42 |
* the descriptor |
|
43 |
*/ |
|
44 |
public HTTPCSVIterator(final InterfaceDescriptor descriptor) { |
|
45 |
this.descriptor = descriptor; |
|
46 |
} |
|
47 |
|
|
48 |
/** |
|
49 |
* Iterator. |
|
50 |
* |
|
51 |
* @return the iterator |
|
52 |
*/ |
|
53 |
@SuppressWarnings("resource") |
|
54 |
@Override |
|
55 |
public Iterator<String> iterator() { |
|
56 |
CSVParser parser; |
|
57 |
Reader reader; |
|
58 |
try { |
|
59 |
String separator = descriptor.getParams().get("separator"); |
|
60 |
final String identifier = descriptor.getParams().get("identifier"); |
|
61 |
String quote = descriptor.getParams().get("quote"); |
|
62 |
URL url = new URL(descriptor.getBaseUrl()); |
|
63 |
url.openConnection(); |
|
64 |
|
|
65 |
reader = new InputStreamReader(url.openStream()); |
|
66 |
|
|
67 |
CSVFormat format = CSVFormat.EXCEL.withHeader().withDelimiter(separator.charAt(0)).withQuote(quote.charAt(0)); |
|
68 |
|
|
69 |
parser = new CSVParser(reader, format); |
|
70 |
final Set<String> headers = parser.getHeaderMap().keySet(); |
|
71 |
Function<CSVRecord, String> function = new Function<CSVRecord, String>() { |
|
72 |
|
|
73 |
@Override |
|
74 |
public String apply(final CSVRecord input) { |
|
75 |
Document document = DocumentHelper.createDocument(); |
|
76 |
Element root = document.addElement("csvRecord"); |
|
77 |
for (String key : headers) { |
|
78 |
Element row = root.addElement("column"); |
|
79 |
row.addAttribute("name", key).addText(input.get(key)); |
|
80 |
if (key.equals(identifier)) { |
|
81 |
row.addAttribute("isID", "true"); |
|
82 |
} |
|
83 |
|
|
84 |
} |
|
85 |
return document.asXML(); |
|
86 |
} |
|
87 |
}; |
|
88 |
Iterator<String> result = Iterators.transform(parser.iterator(), function); |
|
89 |
return result; |
|
90 |
|
|
91 |
} catch (Exception e) { |
|
92 |
return null; |
|
93 |
} |
|
94 |
} |
|
95 |
} |
|
96 |
|
|
97 |
/* |
|
98 |
* (non-Javadoc) |
|
99 |
* |
|
100 |
* @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String, |
|
101 |
* java.lang.String) |
|
102 |
*/ |
|
103 |
@Override |
|
104 |
public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException { |
|
105 |
|
|
106 |
return new HTTPCSVIterator(descriptor); |
|
107 |
} |
|
108 |
|
|
109 |
/* |
|
110 |
* (non-Javadoc) |
|
111 |
* |
|
112 |
* @see eu.dnetlib.data.collector.plugin.CollectorPlugin#getProtocol() |
|
113 |
*/ |
|
114 |
@Override |
|
115 |
public String getProtocol() { |
|
116 |
|
|
117 |
return "HttpCSVCollectorPlugin"; |
|
118 |
} |
|
119 |
|
|
120 |
/* |
|
121 |
* (non-Javadoc) |
|
122 |
* |
|
123 |
* @see eu.dnetlib.data.collector.plugin.CollectorPlugin#listNameParameters() |
|
124 |
*/ |
|
125 |
@Override |
|
126 |
public List<String> listNameParameters() { |
|
127 |
return Lists.newArrayList("separator", "identifier", "quote"); |
|
128 |
} |
|
129 |
|
|
130 |
} |
modules/dnet-modular-collector-service/trunk/pom.xml | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|
3 |
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 | 4 |
<parent> |
4 | 5 |
<groupId>eu.dnetlib</groupId> |
5 | 6 |
<artifactId>dnet-parent</artifactId> |
... | ... | |
12 | 13 |
<packaging>jar</packaging> |
13 | 14 |
<version>2.0.0-SNAPSHOT</version> |
14 | 15 |
<scm> |
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-modular-collector-service/trunk</developerConnection>
|
|
16 |
</scm>
|
|
16 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-modular-collector-service/trunk</developerConnection> |
|
17 |
</scm> |
|
17 | 18 |
<dependencies> |
18 | 19 |
<dependency> |
19 | 20 |
<groupId>eu.dnetlib</groupId> |
... | ... | |
67 | 68 |
<artifactId>commons-httpclient</artifactId> |
68 | 69 |
<version>3.1</version> |
69 | 70 |
</dependency> |
71 |
<dependency> |
|
72 |
<groupId>org.apache.commons</groupId> |
|
73 |
<artifactId>commons-csv</artifactId> |
|
74 |
<version>1.0</version> |
|
75 |
</dependency> |
|
70 | 76 |
</dependencies> |
71 | 77 |
</project> |
Also available in: Unified diff
Added new collector plugin for CSV retrieved from an HTTP URL