Project

General

Profile

« Previous | Next » 

Revision 48023

integrated latest changes from dnet40

View differences:

HttpCSVCollectorPlugin.java
1 1
package eu.dnetlib.data.collector.plugins;
2 2

  
3
import java.io.BufferedReader;
4
import java.io.File;
5
import java.io.FileReader;
6
import java.io.FileWriter;
7
import java.io.InputStream;
8
import java.io.InputStreamReader;
3
import java.io.*;
9 4
import java.net.URL;
10 5
import java.util.Iterator;
11 6
import java.util.Set;
12 7

  
8
import com.google.common.collect.Iterators;
9
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
13 12
import org.apache.commons.csv.CSVFormat;
14 13
import org.apache.commons.csv.CSVParser;
15
import org.apache.commons.csv.CSVRecord;
16 14
import org.apache.commons.io.input.BOMInputStream;
17 15
import org.apache.commons.lang.StringUtils;
18 16
import org.apache.commons.logging.Log;
......
21 19
import org.dom4j.DocumentHelper;
22 20
import org.dom4j.Element;
23 21

  
24
import com.google.common.base.Function;
25
import com.google.common.collect.Iterators;
26

  
27
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
28
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
29
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
30

  
31 22
/**
32 23
 * The Class HttpCSVCollectorPlugin.
33 24
 */
......
81 72

  
82 73
					String line;
83 74
					while ((line = reader.readLine()) != null) {
84
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '\"') || verifyQuotes(line)) {
75
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
85 76
							fw.write(line);
86 77
							fw.write("\n");
87 78
							nLines++;
......
101 92

  
102 93
				final long nRecords = nLines - 1;
103 94

  
104
				return Iterators.transform(parser.iterator(), new Function<CSVRecord, String>() {
105

  
106
					@Override
107
					public String apply(final CSVRecord input) {
108
						try {
109
							final Document document = DocumentHelper.createDocument();
110
							final Element root = document.addElement("csvRecord");
111
							for (final String key : headers) {
112
								final Element row = root.addElement("column");
113
								row.addAttribute("name", key).addText(input.get(key));
114
								if (key.equals(identifier)) {
115
									row.addAttribute("isID", "true");
116
								}
95
				return Iterators.transform(parser.iterator(), input -> {
96
					try {
97
						final Document document = DocumentHelper.createDocument();
98
						final Element root = document.addElement("csvRecord");
99
						for (final String key : headers) {
100
							final Element row = root.addElement("column");
101
							row.addAttribute("name", key).addText(input.get(key));
102
							if (key.equals(identifier)) {
103
								row.addAttribute("isID", "true");
117 104
							}
105
						}
118 106

  
119
							return document.asXML();
120
						} finally {
121
							System.out.println(tempFile.getAbsolutePath());
122
							if (parser.getRecordNumber() == nRecords) {
123
								System.out.println("DELETING " + tempFile.getAbsolutePath());
124
								tempFile.delete();
125
							}
107
						return document.asXML();
108
					} finally {
109
						System.out.println(tempFile.getAbsolutePath());
110
						if (parser.getRecordNumber() == nRecords) {
111
							System.out.println("DELETING " + tempFile.getAbsolutePath());
112
							tempFile.delete();
126 113
						}
127 114
					}
128 115
				});
......
132 119
			}
133 120
		}
134 121

  
135
		private boolean verifyQuotes(final String line) {
136
			final char[] cs = line.trim().toCharArray();
137
			boolean inField = false;
138
			boolean skipNext = false;
139
			for (int i = 0; i < cs.length; i++) {
140
				if (skipNext) {
141
					skipNext = false;
142
				} else if (inField) {
143
					if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == ','))) {
144
						inField = false;
145
					} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
146
						if ((cs[i + 1] == '\"')) {
147
							skipNext = true;
148
						} else {
149
							log.warn("Skipped invalid line: " + line);
150
							return false;
151
						}
152
					}
153
				} else {
154
					if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == ','))) {
155
						inField = true;
156
					}
157
				}
158
			}
159

  
160
			if (inField) {
161
				log.warn("Skipped invalid line: " + line);
162
				return false;
163
			}
164

  
165
			return true;
166
		}
167 122
	}
168 123

  
169 124
	/*
......
178 133
		return new HTTPCSVIterator(descriptor);
179 134
	}
180 135

  
136
	public boolean verifyQuotes(final String line, final char separator) {
137
		final char[] cs = line.trim().toCharArray();
138
		boolean inField = false;
139
		boolean skipNext = false;
140
		for (int i = 0; i < cs.length; i++) {
141
			if (skipNext) {
142
				skipNext = false;
143
			} else if (inField) {
144
				if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
145
					inField = false;
146
				} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
147
					if ((cs[i + 1] == '\"')) {
148
						skipNext = true;
149
					} else {
150
						log.warn("Skipped invalid line: " + line);
151
						return false;
152
					}
153
				}
154
			} else {
155
				if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
156
					inField = true;
157
				}
158
			}
159
		}
160

  
161
		if (inField) {
162
			log.warn("Skipped invalid line: " + line);
163
			return false;
164
		}
165

  
166
		return true;
167
	}
168

  
181 169
}

Also available in: Unified diff