Project

General

Profile

1
package eu.dnetlib.data.collector.plugins;
2

    
3
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;

import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
21

    
22
/**
23
 * The Class HttpCSVCollectorPlugin.
24
 */
25
public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin {
26

    
27
	private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class);
28

    
29
	public static final String UTF8_BOM = "\uFEFF";
30

    
31
	/**
32
	 * The Class HTTPCSVIterator.
33
	 */
34
	class HTTPCSVIterator implements Iterable<String> {
35

    
36
		/** The descriptor. */
37
		private InterfaceDescriptor descriptor;
38

    
39
		/**
40
		 * Instantiates a new HTTPCSV iterator.
41
		 *
42
		 * @param descriptor
43
		 *            the descriptor
44
		 */
45
		public HTTPCSVIterator(final InterfaceDescriptor descriptor) {
46
			this.descriptor = descriptor;
47
		}
48

    
49
		/**
50
		 * Iterator.
51
		 *
52
		 * @return the iterator
53
		 */
54
		@SuppressWarnings("resource")
55
		@Override
56
		public Iterator<String> iterator() {
57

    
58
			try {
59
				final String separator = descriptor.getParams().get("separator");
60
				final String identifier = descriptor.getParams().get("identifier");
61
				final String quote = descriptor.getParams().get("quote");
62
				final URL url = new URL(descriptor.getBaseUrl());
63
				long nLines = 0;
64

    
65
				// FIX
66
				// This code should skip the lines with invalid quotes
67
				final File tempFile = File.createTempFile("csv-", ".tmp");
68
				try (InputStream is = url.openConnection().getInputStream();
69
						BOMInputStream bomIs = new BOMInputStream(is);
70
						BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));
71
						FileWriter fw = new FileWriter(tempFile)) {
72

    
73
					String line;
74
					while ((line = reader.readLine()) != null) {
75
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
76
							fw.write(line);
77
							fw.write("\n");
78
							nLines++;
79
						}
80
					}
81
				}
82
				// END FIX
83

    
84
				final CSVFormat format = CSVFormat.EXCEL
85
						.withHeader()
86
						.withDelimiter(separator.equals("\\t") || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0))
87
						.withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0))
88
						.withTrim();
89

    
90
				final CSVParser parser = new CSVParser(new FileReader(tempFile), format);
91
				final Set<String> headers = parser.getHeaderMap().keySet();
92

    
93
				final long nRecords = nLines - 1;
94

    
95
				return Iterators.transform(parser.iterator(), input -> {
96
					try {
97
						final Document document = DocumentHelper.createDocument();
98
						final Element root = document.addElement("csvRecord");
99
						for (final String key : headers) {
100
							final Element row = root.addElement("column");
101
							row.addAttribute("name", key).addText(input.get(key));
102
							if (key.equals(identifier)) {
103
								row.addAttribute("isID", "true");
104
							}
105
						}
106

    
107
						return document.asXML();
108
					} finally {
109
						System.out.println(tempFile.getAbsolutePath());
110
						if (parser.getRecordNumber() == nRecords) {
111
							System.out.println("DELETING " + tempFile.getAbsolutePath());
112
							tempFile.delete();
113
						}
114
					}
115
				});
116
			} catch (final Exception e) {
117
				log.error("Error iterating csv lines", e);
118
				return null;
119
			}
120
		}
121

    
122
	}
123

    
124
	/*
125
	 * (non-Javadoc)
126
	 *
127
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String,
128
	 * java.lang.String)
129
	 */
130
	@Override
131
	public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
132

    
133
		return new HTTPCSVIterator(descriptor);
134
	}
135

    
136
	public boolean verifyQuotes(final String line, final char separator) {
137
		final char[] cs = line.trim().toCharArray();
138
		boolean inField = false;
139
		boolean skipNext = false;
140
		for (int i = 0; i < cs.length; i++) {
141
			if (skipNext) {
142
				skipNext = false;
143
			} else if (inField) {
144
				if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
145
					inField = false;
146
				} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
147
					if ((cs[i + 1] == '\"')) {
148
						skipNext = true;
149
					} else {
150
						log.warn("Skipped invalid line: " + line);
151
						return false;
152
					}
153
				}
154
			} else {
155
				if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
156
					inField = true;
157
				}
158
			}
159
		}
160

    
161
		if (inField) {
162
			log.warn("Skipped invalid line: " + line);
163
			return false;
164
		}
165

    
166
		return true;
167
	}
168

    
169
}
(6-6/7)