Project

General

Profile

« Previous | Next » 

Revision 48023

Integrated latest changes from dnet40.

View differences:

modules/dnet-modular-collector-service/trunk/src/test/java/eu/dnetlib/data/collector/plugins/csv/HTTPCSVCollectorPluginTest.java
3 3
import java.net.URISyntaxException;
4 4
import java.util.HashMap;
5 5

  
6
import org.junit.Assert;
7
import org.junit.Test;
8

  
9 6
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
10 7
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11 8
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
9
import org.junit.Test;
12 10

  
11
import static org.junit.Assert.assertFalse;
12
import static org.junit.Assert.assertTrue;
13

  
13 14
public class HTTPCSVCollectorPluginTest {
14 15

  
15 16
	private String FILE_URL = HTTPCSVCollectorPluginTest.class.getResource("testCSVwithBOM.csv").toString();
17
	final HttpCSVCollectorPlugin plugin = new HttpCSVCollectorPlugin();
16 18

  
17
	// private String FILE_URL = "file:///Users/michele/Downloads/P3_GrantExport.csv";
18

  
19 19
	@Test
20 20
	public void testCSVHeader() throws URISyntaxException, CollectorServiceException {
21 21

  
......
27 27
		params.put("identifier", "ID");
28 28
		descr.setBaseUrl(FILE_URL);
29 29
		descr.setParams(params);
30
		final HttpCSVCollectorPlugin plugin = new HttpCSVCollectorPlugin();
30

  
31 31
		int i = 0;
32 32
		for (final String s : plugin.collect(descr, null, null)) {
33
			Assert.assertTrue(s.length() > 0);
33
			assertTrue(s.length() > 0);
34 34
			System.out.println(s);
35

  
36 35
			i++;
37 36
		}
38
		Assert.assertTrue(i > 0);
37
		System.out.println(i);
38
		assertTrue(i > 0);
39 39
	}
40

  
41
	@Test
42
	public void testVerifyQuotesOk(){
43
		String correct = "\"5\",\"Il Padrino\",\"EEEEEEEE \"\"ZZZZZ\"\" EEEEEEEEEE\",1970";
44
		assertTrue(plugin.verifyQuotes(correct, ','));
45
	}
46

  
47
	@Test
48
	public void testVerifyQuotesWRONG(){
49
		String correct = "5\",\"Il Padrino\",\"EEEEEEEE \"ZZZZZ\" EEEEEEEEEE\",1970";
50
		assertFalse(plugin.verifyQuotes(correct, ','));
51
	}
52

  
53
	@Test
54
	public void testSNSF(){
55
		String s = "\"8773\";\"3101-008773\";\"EMBO workshop on structure, function and regulation of membrane transport proteins\";\"\";\"Rossier Bernard C.\";\"Scientific Conferences\";\"Science communication\";\"Département de Pharmacologie & Toxicologie Faculté de Biologie et de Médecine Université de Lausanne\";\"Université de Lausanne - LA\";\"30103\";\"Cellular Biology, Cytology\";\"Biology and Medicine;Basic Biological Research\";\"01.04.1987\";\"30.09.1987\";\"10000.00\";\"\";\"30103\"" ;
56
		assertTrue(plugin.verifyQuotes(s, ';'));
57
	}
58

  
59
	@Test
60
	public void testSNSF2(){
61
		String s = "\"11000\";\"4021-011000\";\"Literarische und nationale Erziehung : Schweizerisches Selbstverständnis in der Literatur für Kinder und Jugend- liche\";\"\";\"Tschirky Rosmarie\";\"NRP 21 Cultural Diversity and National Identity\";\"Programmes;National Research Programmes (NRPs)\";\"Schweiz. Inst. für Kinder- und Jugendmedien\";\"Universität Zürich - ZH\";\"10501\";\"German and English languages and literature\";\"Human and Social Sciences;Linguistics and literature, philosophy\";\"10501\";\"01.10.1986\";\"31.03.1990\";\"308807.00\";\"\"";
62
		assertTrue(plugin.verifyQuotes(s, ';'));
63
	}
64

  
65
	@Test
66
	public void testSNSFInvalid(){
67
		String s = "\"35918\";\"1113-035918\";\"Entwicklung eines dreisprachigen Thesaurus des schweizerischen Rechts zur Unterstützung der Suche in Volltextdatenbanken.\";\"\";\"Verein \"Schweizerische Juristische Datenbank\"\";\"Project funding (Div. I-III)\";\"Project funding\";\"Verein \"\"Schweizerische Juristische Datenbank\"\"\";\"NPO (Biblioth., Museen, Verwalt.) - NPO\";\"10205\";\"Legal sciences\";\"Human and Social Sciences;Economics, law\";\"10205\";\"01.12.1992\";\"31.03.1995\";\"500366.00\";\"\"";
68
		assertFalse(plugin.verifyQuotes(s, ';'));
69
	}
70

  
40 71
}
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FilesystemIterable.java
103 103
				if (setObjIdentifierFromFileName) {
104 104
					return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName));
105 105
				} else return cleanedXML;
106
			}catch (VTDException e) {
106
			} catch (VTDException e) {
107 107
				log.error("Cannot process with VTD to set the objIdentifier " + inputFileName);
108 108
				return "";
109
			}
110
			catch (Exception e) {
109
			} catch (Exception e) {
111 110
				log.error("Unable to read " + inputFileName);
112 111
				return "";
113 112
			} finally {
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpCSVCollectorPlugin.java
1 1
package eu.dnetlib.data.collector.plugins;
2 2

  
3
import java.io.BufferedReader;
4
import java.io.File;
5
import java.io.FileReader;
6
import java.io.FileWriter;
7
import java.io.InputStream;
8
import java.io.InputStreamReader;
3
import java.io.*;
9 4
import java.net.URL;
10 5
import java.util.Iterator;
11 6
import java.util.Set;
12 7

  
8
import com.google.common.collect.Iterators;
9
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
13 12
import org.apache.commons.csv.CSVFormat;
14 13
import org.apache.commons.csv.CSVParser;
15
import org.apache.commons.csv.CSVRecord;
16 14
import org.apache.commons.io.input.BOMInputStream;
17 15
import org.apache.commons.lang.StringUtils;
18 16
import org.apache.commons.logging.Log;
......
21 19
import org.dom4j.DocumentHelper;
22 20
import org.dom4j.Element;
23 21

  
24
import com.google.common.base.Function;
25
import com.google.common.collect.Iterators;
26

  
27
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
28
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
29
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
30

  
31 22
/**
32 23
 * The Class HttpCSVCollectorPlugin.
33 24
 */
......
81 72

  
82 73
					String line;
83 74
					while ((line = reader.readLine()) != null) {
84
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '\"') || verifyQuotes(line)) {
75
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
85 76
							fw.write(line);
86 77
							fw.write("\n");
87 78
							nLines++;
......
101 92

  
102 93
				final long nRecords = nLines - 1;
103 94

  
104
				return Iterators.transform(parser.iterator(), new Function<CSVRecord, String>() {
105

  
106
					@Override
107
					public String apply(final CSVRecord input) {
108
						try {
109
							final Document document = DocumentHelper.createDocument();
110
							final Element root = document.addElement("csvRecord");
111
							for (final String key : headers) {
112
								final Element row = root.addElement("column");
113
								row.addAttribute("name", key).addText(input.get(key));
114
								if (key.equals(identifier)) {
115
									row.addAttribute("isID", "true");
116
								}
95
				return Iterators.transform(parser.iterator(), input -> {
96
					try {
97
						final Document document = DocumentHelper.createDocument();
98
						final Element root = document.addElement("csvRecord");
99
						for (final String key : headers) {
100
							final Element row = root.addElement("column");
101
							row.addAttribute("name", key).addText(input.get(key));
102
							if (key.equals(identifier)) {
103
								row.addAttribute("isID", "true");
117 104
							}
105
						}
118 106

  
119
							return document.asXML();
120
						} finally {
121
							System.out.println(tempFile.getAbsolutePath());
122
							if (parser.getRecordNumber() == nRecords) {
123
								System.out.println("DELETING " + tempFile.getAbsolutePath());
124
								tempFile.delete();
125
							}
107
						return document.asXML();
108
					} finally {
109
						System.out.println(tempFile.getAbsolutePath());
110
						if (parser.getRecordNumber() == nRecords) {
111
							System.out.println("DELETING " + tempFile.getAbsolutePath());
112
							tempFile.delete();
126 113
						}
127 114
					}
128 115
				});
......
132 119
			}
133 120
		}
134 121

  
135
		private boolean verifyQuotes(final String line) {
136
			final char[] cs = line.trim().toCharArray();
137
			boolean inField = false;
138
			boolean skipNext = false;
139
			for (int i = 0; i < cs.length; i++) {
140
				if (skipNext) {
141
					skipNext = false;
142
				} else if (inField) {
143
					if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == ','))) {
144
						inField = false;
145
					} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
146
						if ((cs[i + 1] == '\"')) {
147
							skipNext = true;
148
						} else {
149
							log.warn("Skipped invalid line: " + line);
150
							return false;
151
						}
152
					}
153
				} else {
154
					if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == ','))) {
155
						inField = true;
156
					}
157
				}
158
			}
159

  
160
			if (inField) {
161
				log.warn("Skipped invalid line: " + line);
162
				return false;
163
			}
164

  
165
			return true;
166
		}
167 122
	}
168 123

  
169 124
	/*
......
178 133
		return new HTTPCSVIterator(descriptor);
179 134
	}
180 135

  
136
	public boolean verifyQuotes(final String line, final char separator) {
137
		final char[] cs = line.trim().toCharArray();
138
		boolean inField = false;
139
		boolean skipNext = false;
140
		for (int i = 0; i < cs.length; i++) {
141
			if (skipNext) {
142
				skipNext = false;
143
			} else if (inField) {
144
				if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
145
					inField = false;
146
				} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
147
					if ((cs[i + 1] == '\"')) {
148
						skipNext = true;
149
					} else {
150
						log.warn("Skipped invalid line: " + line);
151
						return false;
152
					}
153
				}
154
			} else {
155
				if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
156
					inField = true;
157
				}
158
			}
159
		}
160

  
161
		if (inField) {
162
			log.warn("Skipped invalid line: " + line);
163
			return false;
164
		}
165

  
166
		return true;
167
	}
168

  
181 169
}
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/FileCSVCollectorPlugin.java
21 21
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
22 22
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
23 23

  
24
/**
25
 * Please use eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin instead
26
 */
27
@Deprecated
24 28
public class FileCSVCollectorPlugin extends AbstractCollectorPlugin {
25 29

  
26 30
	private static final Log log = LogFactory.getLog(FileCSVCollectorPlugin.class);
......
136 140
					}
137 141
				} else headers = tmpHeader;
138 142
			}
139
			return new Iterable<String>() {
140

  
141
				@Override
142
				public Iterator<String> iterator() {
143
					return new FileCSVIterator(br, separator, quote);
144
				}
145
			};
143
			return () -> new FileCSVIterator(br, separator, quote);
146 144
		} catch (final Exception e) {
147 145
			throw new CollectorServiceException(e);
148 146
		}

Also available in: Unified diff