Project

General

Profile

1
package eu.dnetlib.data.collector.plugins;
2

    
3
import java.io.*;
4
import java.net.URL;
5
import java.util.Iterator;
6
import java.util.Set;
7

    
8
import com.google.common.collect.Iterators;
9
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
10
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
11
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
12
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
13
import org.apache.commons.csv.CSVFormat;
14
import org.apache.commons.csv.CSVParser;
15
import org.apache.commons.io.input.BOMInputStream;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19
import org.dom4j.Document;
20
import org.dom4j.DocumentHelper;
21
import org.dom4j.Element;
22

    
23
/**
24
 * The Class HttpCSVCollectorPlugin.
25
 */
26
public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin {
27

    
28
	private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class);
29

    
30
	public static final String UTF8_BOM = "\uFEFF";
31

    
32
	/**
33
	 * The Class HTTPCSVIterator.
34
	 */
35
	class HTTPCSVIterator implements Iterable<String> {
36

    
37
		/** The descriptor. */
38
		private InterfaceDescriptor descriptor;
39

    
40
		/**
41
		 * Instantiates a new HTTPCSV iterator.
42
		 *
43
		 * @param descriptor
44
		 *            the descriptor
45
		 */
46
		public HTTPCSVIterator(final InterfaceDescriptor descriptor) {
47
			this.descriptor = descriptor;
48
		}
49

    
50
		/**
51
		 * Iterator.
52
		 *
53
		 * @return the iterator
54
		 */
55
		@SuppressWarnings("resource")
56
		@Override
57
		public Iterator<String> iterator() {
58

    
59
			try {
60
				final String separator = descriptor.getParams().get("separator");
61
				final String identifier = descriptor.getParams().get("identifier");
62
				final String quote = descriptor.getParams().get("quote");
63
				final URL url = new URL(descriptor.getBaseUrl());
64
				long nLines = 0;
65

    
66
				// FIX
67
				// This code should skip the lines with invalid quotes
68
				final File tempFile = File.createTempFile("csv-", ".tmp");
69
				try (InputStream is = url.openConnection().getInputStream();
70
						BOMInputStream bomIs = new BOMInputStream(is);
71
						BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));
72
						FileWriter fw = new FileWriter(tempFile)) {
73

    
74
					String line;
75
					while ((line = reader.readLine()) != null) {
76
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
77
							fw.write(line);
78
							fw.write("\n");
79
							nLines++;
80
						}
81
					}
82
				}
83
				// END FIX
84

    
85
				final CSVFormat format = CSVFormat.EXCEL
86
						.withHeader()
87
						.withDelimiter(separator.equals("\\t") || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0))
88
						.withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0))
89
						.withTrim();
90

    
91
				final CSVParser parser = new CSVParser(new FileReader(tempFile), format);
92
				final Set<String> headers = parser.getHeaderMap().keySet();
93

    
94
				final long nRecords = nLines - 1;
95

    
96
				return Iterators.transform(parser.iterator(), input -> {
97
					try {
98
						final Document document = DocumentHelper.createDocument();
99
						final Element root = document.addElement("csvRecord");
100
						for (final String key : headers) {
101
							final Element row = root.addElement("column");
102
							final String value = XmlCleaner.cleanAllEntities(input.get(key));
103
							if (value!= null) row.addAttribute("name", key).addText(value);
104
							if (key.equals(identifier)) {
105
								row.addAttribute("isID", "true");
106
							}
107
						}
108

    
109
						return document.asXML();
110
					} finally {
111
						log.debug(tempFile.getAbsolutePath());
112
						if (parser.getRecordNumber() == nRecords) {
113
							log.debug("DELETING " + tempFile.getAbsolutePath());
114
							tempFile.delete();
115
						}
116
					}
117
				});
118
			} catch (final Exception e) {
119
				log.error("Error iterating csv lines", e);
120
				return null;
121
			}
122
		}
123

    
124
	}
125

    
126
	/*
127
	 * (non-Javadoc)
128
	 *
129
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String,
130
	 * java.lang.String)
131
	 */
132
	@Override
133
	public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
134

    
135
		return new HTTPCSVIterator(descriptor);
136
	}
137

    
138
	public boolean verifyQuotes(final String line, final char separator) {
139
		final char[] cs = line.trim().toCharArray();
140
		boolean inField = false;
141
		boolean skipNext = false;
142
		for (int i = 0; i < cs.length; i++) {
143
			if (skipNext) {
144
				skipNext = false;
145
			} else if (inField) {
146
				if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
147
					inField = false;
148
				} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
149
					if ((cs[i + 1] == '\"')) {
150
						skipNext = true;
151
					} else {
152
						log.warn("Skipped invalid line: " + line);
153
						return false;
154
					}
155
				}
156
			} else {
157
				if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
158
					inField = true;
159
				}
160
			}
161
		}
162

    
163
		if (inField) {
164
			log.warn("Skipped invalid line: " + line);
165
			return false;
166
		}
167

    
168
		return true;
169
	}
170

    
171
}
(6-6/8)