Project

General

Profile

« Previous | Next » 

Revision 31237

Added new collector plugin for CSV retrieved from HTTP URL

View differences:

modules/dnet-modular-collector-service/trunk/src/test/java/eu/dnetlib/data/collector/filesystem/csv/HTTPCSVCollectorPluginTest.java
1
package eu.dnetlib.data.collector.filesystem.csv;
2

  
3
import java.net.URISyntaxException;
4
import java.util.HashMap;
5

  
6
import org.junit.Assert;
7
import org.junit.Test;
8

  
9
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
12

  
13
public class HTTPCSVCollectorPluginTest {
14

  
15
	@Test
16
	public void testCSVHeader() throws URISyntaxException, CollectorServiceException {
17

  
18
		InterfaceDescriptor descr = new InterfaceDescriptor();
19
		HashMap<String, String> params = new HashMap<String, String>();
20

  
21
		params.put("separator", ",");
22
		params.put("quote", "\"");
23
		params.put("identifier", "ISSN");
24
		descr.setBaseUrl("http://doaj.org/csv");
25
		descr.setParams(params);
26
		HttpCSVCollectorPlugin plugin = new HttpCSVCollectorPlugin();
27
		int i = 0;
28
		for (String s : plugin.collect(descr, null, null)) {
29
			Assert.assertTrue(s.length() > 0);
30
			i++;
31

  
32
		}
33
		Assert.assertTrue(i > 0);
34

  
35
	}
36
}
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpCSVCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins;
2

  
3
import java.io.InputStreamReader;
4
import java.io.Reader;
5
import java.net.URL;
6
import java.util.Iterator;
7
import java.util.List;
8
import java.util.Set;
9

  
10
import org.apache.commons.csv.CSVFormat;
11
import org.apache.commons.csv.CSVParser;
12
import org.apache.commons.csv.CSVRecord;
13
import org.dom4j.Document;
14
import org.dom4j.DocumentHelper;
15
import org.dom4j.Element;
16

  
17
import com.google.common.base.Function;
18
import com.google.common.collect.Iterators;
19
import com.google.common.collect.Lists;
20

  
21
import eu.dnetlib.data.collector.plugin.CollectorPlugin;
22
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
23
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
24

  
25
/**
26
 * The Class HttpCSVCollectorPlugin.
27
 */
28
public class HttpCSVCollectorPlugin implements CollectorPlugin {
29

  
30
	/**
31
	 * The Class HTTPCSVIterator.
32
	 */
33
	class HTTPCSVIterator implements Iterable<String> {
34

  
35
		/** The descriptor. */
36
		private InterfaceDescriptor descriptor;
37

  
38
		/**
39
		 * Instantiates a new HTTPCSV iterator.
40
		 *
41
		 * @param descriptor
42
		 *            the descriptor
43
		 */
44
		public HTTPCSVIterator(final InterfaceDescriptor descriptor) {
45
			this.descriptor = descriptor;
46
		}
47

  
48
		/**
49
		 * Iterator.
50
		 *
51
		 * @return the iterator
52
		 */
53
		@SuppressWarnings("resource")
54
		@Override
55
		public Iterator<String> iterator() {
56
			CSVParser parser;
57
			Reader reader;
58
			try {
59
				String separator = descriptor.getParams().get("separator");
60
				final String identifier = descriptor.getParams().get("identifier");
61
				String quote = descriptor.getParams().get("quote");
62
				URL url = new URL(descriptor.getBaseUrl());
63
				url.openConnection();
64

  
65
				reader = new InputStreamReader(url.openStream());
66

  
67
				CSVFormat format = CSVFormat.EXCEL.withHeader().withDelimiter(separator.charAt(0)).withQuote(quote.charAt(0));
68

  
69
				parser = new CSVParser(reader, format);
70
				final Set<String> headers = parser.getHeaderMap().keySet();
71
				Function<CSVRecord, String> function = new Function<CSVRecord, String>() {
72

  
73
					@Override
74
					public String apply(final CSVRecord input) {
75
						Document document = DocumentHelper.createDocument();
76
						Element root = document.addElement("csvRecord");
77
						for (String key : headers) {
78
							Element row = root.addElement("column");
79
							row.addAttribute("name", key).addText(input.get(key));
80
							if (key.equals(identifier)) {
81
								row.addAttribute("isID", "true");
82
							}
83

  
84
						}
85
						return document.asXML();
86
					}
87
				};
88
				Iterator<String> result = Iterators.transform(parser.iterator(), function);
89
				return result;
90

  
91
			} catch (Exception e) {
92
				return null;
93
			}
94
		}
95
	}
96

  
97
	/*
98
	 * (non-Javadoc)
99
	 *
100
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String,
101
	 * java.lang.String)
102
	 */
103
	@Override
104
	public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
105

  
106
		return new HTTPCSVIterator(descriptor);
107
	}
108

  
109
	/*
110
	 * (non-Javadoc)
111
	 * 
112
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#getProtocol()
113
	 */
114
	@Override
115
	public String getProtocol() {
116

  
117
		return "HttpCSVCollectorPlugin";
118
	}
119

  
120
	/*
121
	 * (non-Javadoc)
122
	 * 
123
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#listNameParameters()
124
	 */
125
	@Override
126
	public List<String> listNameParameters() {
127
		return Lists.newArrayList("separator", "identifier", "quote");
128
	}
129

  
130
}
modules/dnet-modular-collector-service/trunk/pom.xml
1 1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3 4
	<parent>
4 5
		<groupId>eu.dnetlib</groupId>
5 6
		<artifactId>dnet-parent</artifactId>
......
12 13
	<packaging>jar</packaging>
13 14
	<version>2.0.0-SNAPSHOT</version>
14 15
	<scm>
15
 		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-modular-collector-service/trunk</developerConnection>
16
  	</scm>
16
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-modular-collector-service/trunk</developerConnection>
17
	</scm>
17 18
	<dependencies>
18 19
		<dependency>
19 20
			<groupId>eu.dnetlib</groupId>
......
67 68
			<artifactId>commons-httpclient</artifactId>
68 69
			<version>3.1</version>
69 70
		</dependency>
71
		<dependency>
72
			<groupId>org.apache.commons</groupId>
73
			<artifactId>commons-csv</artifactId>
74
			<version>1.0</version>
75
		</dependency>
70 76
	</dependencies>
71 77
</project>

Also available in: Unified diff