Project

General

Profile

1
package eu.dnetlib.data.collector.plugins;
2

    
3
import java.io.*;
4
import java.net.URL;
5
import java.util.Iterator;
6
import java.util.Set;
7

    
8
import com.google.common.collect.Iterators;
9
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
10
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
11
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
12
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
13
import org.apache.commons.csv.CSVFormat;
14
import org.apache.commons.csv.CSVParser;
15
import org.apache.commons.io.input.BOMInputStream;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19
import org.dom4j.Document;
20
import org.dom4j.DocumentHelper;
21
import org.dom4j.Element;
22

    
23
/**
24
 * The Class HttpCSVCollectorPlugin.
25
 */
26
public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin {
27

    
28
	private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class);
29

    
30
	public static final String UTF8_BOM = "\uFEFF";
31

    
32
	/**
33
	 * The Class HTTPCSVIterator.
34
	 */
35
	class HTTPCSVIterator implements Iterable<String> {
36

    
37
		/** The descriptor. */
38
		private InterfaceDescriptor descriptor;
39

    
40
		/**
41
		 * Instantiates a new HTTPCSV iterator.
42
		 *
43
		 * @param descriptor
44
		 *            the descriptor
45
		 */
46
		public HTTPCSVIterator(final InterfaceDescriptor descriptor) {
47
			this.descriptor = descriptor;
48
		}
49

    
50
		/**
51
		 * Iterator.
52
		 *
53
		 * @return the iterator
54
		 */
55
		@SuppressWarnings("resource")
56
		@Override
57
		public Iterator<String> iterator() {
58

    
59
			try {
60
				final String separator = descriptor.getParams().get("separator");
61
				final String identifier = descriptor.getParams().get("identifier");
62
				final String quote = descriptor.getParams().get("quote");
63
				final URL url = new URL(descriptor.getBaseUrl());
64
				long nLines = 0;
65

    
66
				// FIX
67
				// This code should skip the lines with invalid quotes
68
				final File tempFile = File.createTempFile("csv-", ".tmp");
69
				try (InputStream is = url.openConnection().getInputStream();
70
						BOMInputStream bomIs = new BOMInputStream(is);
71
						BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));
72
						FileWriter fw = new FileWriter(tempFile)) {
73

    
74
					String line;
75
					while ((line = reader.readLine()) != null) {
76
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
77
							fw.write(line);
78
							fw.write("\n");
79
							nLines++;
80
						}
81
					}
82
				}
83
				// END FIX
84

    
85
				final CSVFormat format = CSVFormat.EXCEL
86
						.withHeader()
87
						.withDelimiter(separator.equals("\\t") || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0))
88
						.withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0))
89
						.withTrim();
90

    
91
				final CSVParser parser = new CSVParser(new FileReader(tempFile), format);
92
				final Set<String> headers = parser.getHeaderMap().keySet();
93

    
94
				final long nRecords = nLines - 1;
95

    
96
				return Iterators.transform(parser.iterator(), input -> {
97
					try {
98
						final Document document = DocumentHelper.createDocument();
99
						final Element root = document.addElement("csvRecord");
100
						for (final String key : headers) {
101
							final Element row = root.addElement("column");
102
							row.addAttribute("name", key).addText(XmlCleaner.cleanAllEntities(input.get(key)));
103
							if (key.equals(identifier)) {
104
								row.addAttribute("isID", "true");
105
							}
106
						}
107

    
108
						return document.asXML();
109
					} finally {
110
						log.debug(tempFile.getAbsolutePath());
111
						if (parser.getRecordNumber() == nRecords) {
112
							log.debug("DELETING " + tempFile.getAbsolutePath());
113
							tempFile.delete();
114
						}
115
					}
116
				});
117
			} catch (final Exception e) {
118
				log.error("Error iterating csv lines", e);
119
				return null;
120
			}
121
		}
122

    
123
	}
124

    
125
	/*
126
	 * (non-Javadoc)
127
	 *
128
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String,
129
	 * java.lang.String)
130
	 */
131
	@Override
132
	public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
133

    
134
		return new HTTPCSVIterator(descriptor);
135
	}
136

    
137
	public boolean verifyQuotes(final String line, final char separator) {
138
		final char[] cs = line.trim().toCharArray();
139
		boolean inField = false;
140
		boolean skipNext = false;
141
		for (int i = 0; i < cs.length; i++) {
142
			if (skipNext) {
143
				skipNext = false;
144
			} else if (inField) {
145
				if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
146
					inField = false;
147
				} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
148
					if ((cs[i + 1] == '\"')) {
149
						skipNext = true;
150
					} else {
151
						log.warn("Skipped invalid line: " + line);
152
						return false;
153
					}
154
				}
155
			} else {
156
				if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
157
					inField = true;
158
				}
159
			}
160
		}
161

    
162
		if (inField) {
163
			log.warn("Skipped invalid line: " + line);
164
			return false;
165
		}
166

    
167
		return true;
168
	}
169

    
170
}
(6-6/8)