1 |
48028
|
claudio.at
|
package eu.dnetlib.data.collector.plugins.excel;
|
2 |
|
|
|
3 |
|
|
/**
|
4 |
|
|
* Created by miriam on 10/05/2017.
|
5 |
|
|
*/
|
6 |
|
|
import java.io.File;
|
7 |
|
|
import java.io.FileInputStream;
|
8 |
|
|
import java.io.IOException;
|
9 |
|
|
import java.net.URL;
|
10 |
|
|
import java.util.ArrayList;
|
11 |
|
|
import java.util.HashMap;
|
12 |
|
|
import java.util.Iterator;
|
13 |
|
|
|
14 |
|
|
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
|
15 |
|
|
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
16 |
51970
|
miriam.bag
|
import org.apache.commons.lang3.StringUtils;
|
17 |
48028
|
claudio.at
|
import org.apache.commons.logging.Log;
|
18 |
|
|
import org.apache.commons.logging.LogFactory;
|
19 |
|
|
import org.apache.poi.ss.usermodel.Cell;
|
20 |
|
|
import org.apache.poi.ss.usermodel.DataFormatter;
|
21 |
|
|
import org.apache.poi.ss.usermodel.Row;
|
22 |
|
|
import org.apache.poi.ss.usermodel.Sheet;
|
23 |
|
|
import org.apache.poi.ss.usermodel.Workbook;
|
24 |
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
25 |
|
|
import org.json.*;
|
26 |
|
|
|
27 |
|
|
import org.apache.commons.io.FileUtils;
|
28 |
|
|
|
29 |
|
|
public class Read {
|
30 |
|
|
|
31 |
|
|
private static final Log log = LogFactory.getLog(Read.class);
|
32 |
|
|
|
33 |
|
|
/** The descriptor. */
|
34 |
|
|
private InterfaceDescriptor descriptor;
|
35 |
|
|
|
36 |
|
|
|
37 |
|
|
/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
|
38 |
|
|
private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
|
39 |
|
|
private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
|
40 |
|
|
"\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
|
41 |
|
|
+ "\"col_currency\":10}"; */
|
42 |
|
|
private Sheet sheet;
|
43 |
|
|
private CSVFileWriter csv_writer = new CSVFileWriter();
|
44 |
|
|
private HashMap<String,String> map_header = new HashMap<String,String>();
|
45 |
|
|
private HashMap<String,String> map_body = new HashMap<String,String>();
|
46 |
|
|
private int header_row;
|
47 |
|
|
private String file_to_save ;
|
48 |
|
|
private boolean replace_currency = false;
|
49 |
|
|
private String from_currency, to_currency;
|
50 |
|
|
private boolean remove_empty, remove_tmp_file;
|
51 |
|
|
private String remove_id;
|
52 |
|
|
private int column_id;
|
53 |
|
|
private int currency_column;
|
54 |
|
|
private int sheet_number;
|
55 |
|
|
private String tmp_file;
|
56 |
|
|
private String argument;
|
57 |
|
|
private String identifier;
|
58 |
|
|
|
59 |
|
|
private HttpCSVCollectorPlugin collector;
|
60 |
|
|
|
61 |
|
|
public HttpCSVCollectorPlugin getCollector() {
|
62 |
|
|
return collector;
|
63 |
|
|
}
|
64 |
|
|
|
65 |
|
|
public void setCollector(HttpCSVCollectorPlugin collector) {
|
66 |
|
|
this.collector = collector;
|
67 |
|
|
}
|
68 |
|
|
|
69 |
|
|
public Read(InterfaceDescriptor descriptor){
|
70 |
|
|
this.descriptor = descriptor;
|
71 |
|
|
|
72 |
|
|
}
|
73 |
|
|
|
74 |
|
|
private static String getCellValue( Cell cell)
|
75 |
|
|
{
|
76 |
|
|
DataFormatter formatter = new DataFormatter();
|
77 |
|
|
String formattedCellValue = formatter.formatCellValue(cell);
|
78 |
|
|
return formattedCellValue;
|
79 |
|
|
|
80 |
|
|
}
|
81 |
|
|
|
82 |
|
|
private void copyFile() throws IOException{
|
83 |
|
|
FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
|
84 |
|
|
|
85 |
|
|
}
|
86 |
|
|
|
87 |
|
|
private void parseDescriptor(){
|
88 |
|
|
HashMap<String, String> params = descriptor.getParams();
|
89 |
|
|
argument = params.get("argument");
|
90 |
|
|
header_row = Integer.parseInt(params.get("header_row"));
|
91 |
|
|
tmp_file = params.get("tmp_file");
|
92 |
|
|
remove_empty = (params.get("remove_empty_lines") == "yes");
|
93 |
|
|
remove_id = params.get("remove_lines_with_id");
|
94 |
|
|
column_id = Integer.parseInt(params.get("col_id"));
|
95 |
|
|
remove_tmp_file = (params.get("remove_tmp_file") == "yes");
|
96 |
|
|
sheet_number = Integer.parseInt(params.get("sheet_number"));
|
97 |
|
|
file_to_save = params.get("file_to_save");
|
98 |
|
|
}
|
99 |
|
|
private void init() throws IOException{
|
100 |
|
|
parseDescriptor();
|
101 |
|
|
log.info("Parsing the arguments");
|
102 |
|
|
parseArguments();
|
103 |
|
|
log.info("Copying the file in temp local file");
|
104 |
|
|
copyFile();
|
105 |
|
|
log.info("Extracting the sheet " + sheet_number);
|
106 |
|
|
FileInputStream fis = new FileInputStream(tmp_file);
|
107 |
|
|
Workbook workbook = new XSSFWorkbook(fis);
|
108 |
|
|
sheet = workbook.getSheetAt(sheet_number);
|
109 |
|
|
fis.close();
|
110 |
|
|
if(remove_tmp_file) {
|
111 |
|
|
File f = new File(tmp_file);
|
112 |
|
|
f.delete();
|
113 |
|
|
}
|
114 |
|
|
|
115 |
|
|
}
|
116 |
|
|
|
117 |
|
|
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
|
118 |
51970
|
miriam.bag
|
try{
|
119 |
|
|
final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
120 |
|
|
for(Object entry: arr)
|
121 |
48028
|
claudio.at
|
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
|
122 |
51970
|
miriam.bag
|
}catch(Throwable e){
|
123 |
|
|
log.error("Problems filling the map for " + elem);
|
124 |
|
|
throw(e);
|
125 |
48028
|
claudio.at
|
}
|
126 |
|
|
|
127 |
|
|
}
|
128 |
|
|
|
129 |
51970
|
miriam.bag
|
|
130 |
|
|
|
131 |
48028
|
claudio.at
|
private void parseArguments() {
|
132 |
51970
|
miriam.bag
|
if (StringUtils.isNotEmpty(argument)){
|
133 |
|
|
try{
|
134 |
|
|
final JSONObject json = new JSONObject(argument);
|
135 |
|
|
if(json.has("header"))
|
136 |
|
|
fillMap(json, map_header,"header");
|
137 |
|
|
if (json.has("body"))
|
138 |
|
|
fillMap(json,map_body,"body");
|
139 |
48028
|
claudio.at
|
|
140 |
51970
|
miriam.bag
|
if(json.has("replace_currency"))
|
141 |
|
|
{
|
142 |
|
|
replace_currency = true ;
|
143 |
|
|
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
|
144 |
|
|
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
|
145 |
|
|
|
146 |
|
|
}
|
147 |
|
|
|
148 |
|
|
if (json.has("col_currency"))
|
149 |
|
|
currency_column = json.getInt("col_currency");
|
150 |
|
|
}catch(Throwable e){
|
151 |
|
|
log.error("Problems while parsing the argument parameter.");
|
152 |
|
|
throw (e);
|
153 |
|
|
}
|
154 |
48028
|
claudio.at
|
}
|
155 |
|
|
|
156 |
|
|
|
157 |
51970
|
miriam.bag
|
|
158 |
48028
|
claudio.at
|
}
|
159 |
|
|
|
160 |
|
|
private String applyReplace(String row, HashMap<String,String>replace){
|
161 |
|
|
for(String key: replace.keySet()){
|
162 |
|
|
if(row.contains(key))
|
163 |
|
|
row = row.replace(key, replace.get(key));
|
164 |
|
|
}
|
165 |
|
|
return row;
|
166 |
|
|
}
|
167 |
|
|
|
168 |
|
|
private void getHeader(){
|
169 |
|
|
Row row = sheet.getRow(header_row);
|
170 |
|
|
Iterator<Cell> cellIterator = row.cellIterator();
|
171 |
|
|
Cell cell;
|
172 |
|
|
String project = "";
|
173 |
|
|
int count = 0;
|
174 |
|
|
while (cellIterator.hasNext()){
|
175 |
|
|
cell = cellIterator.next();
|
176 |
52643
|
miriam.bag
|
final String stringCellValue = cell.getStringCellValue();
|
177 |
|
|
project += applyReplace(stringCellValue,map_header) + ";";
|
178 |
|
|
if(count++ == column_id) identifier = applyReplace(stringCellValue,map_header);
|
179 |
48028
|
claudio.at
|
}
|
180 |
|
|
project = project.substring(0, project.length() -1 );
|
181 |
|
|
csv_writer.setHeader(project.split(";"));
|
182 |
|
|
|
183 |
|
|
}
|
184 |
|
|
|
185 |
|
|
private void getData(){
|
186 |
|
|
Row row;
|
187 |
|
|
Cell cell;
|
188 |
|
|
String tmp;
|
189 |
|
|
Iterator<Cell>cellIterator;
|
190 |
|
|
for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
|
191 |
|
|
row = sheet.getRow(row_number);
|
192 |
52643
|
miriam.bag
|
if (row != null) {
|
193 |
|
|
cellIterator = row.cellIterator();
|
194 |
48028
|
claudio.at
|
|
195 |
52643
|
miriam.bag
|
int col_number = 0;
|
196 |
48028
|
claudio.at
|
|
197 |
52643
|
miriam.bag
|
boolean discard_row = false;
|
198 |
|
|
ArrayList<String> al = new ArrayList<String>();
|
199 |
|
|
while (cellIterator.hasNext() && !discard_row) {
|
200 |
|
|
cell = cellIterator.next();
|
201 |
|
|
tmp = getCellValue(cell).trim();
|
202 |
52644
|
miriam.bag
|
tmp = tmp.replace("\n"," ");
|
203 |
52643
|
miriam.bag
|
if (col_number == column_id &&
|
204 |
|
|
((remove_empty && tmp.trim().equals("")) ||
|
205 |
|
|
(!remove_id.equals("") && tmp.equals(remove_id))))
|
206 |
|
|
discard_row = true;
|
207 |
48028
|
claudio.at
|
|
208 |
52643
|
miriam.bag
|
if (replace_currency && col_number == currency_column)
|
209 |
|
|
tmp = tmp.replace(from_currency, to_currency);
|
210 |
48028
|
claudio.at
|
|
211 |
52643
|
miriam.bag
|
al.add(applyReplace(tmp, map_body));
|
212 |
|
|
col_number++;
|
213 |
|
|
}
|
214 |
|
|
if (!discard_row) {
|
215 |
|
|
csv_writer.addProject(al);
|
216 |
|
|
|
217 |
|
|
}
|
218 |
48028
|
claudio.at
|
}
|
219 |
|
|
}
|
220 |
|
|
|
221 |
|
|
}
|
222 |
|
|
|
223 |
|
|
private void writeCSVFile(){
|
224 |
|
|
|
225 |
|
|
csv_writer.writeFile(file_to_save);
|
226 |
|
|
}
|
227 |
|
|
|
228 |
|
|
private InterfaceDescriptor prepareHTTPCSVDescriptor(){
|
229 |
|
|
InterfaceDescriptor dex = new InterfaceDescriptor();
|
230 |
|
|
dex.setBaseUrl("file://"+file_to_save);
|
231 |
|
|
HashMap<String, String> params = new HashMap<String, String>();
|
232 |
|
|
params.put("separator", descriptor.getParams().get("separator"));
|
233 |
|
|
params.put("identifier",identifier);
|
234 |
|
|
params.put("quote",descriptor.getParams().get("quote"));
|
235 |
|
|
dex.setParams(params);
|
236 |
|
|
return dex;
|
237 |
|
|
}
|
238 |
|
|
|
239 |
|
|
public Iterable<String> parseFile() throws Exception{
|
240 |
|
|
|
241 |
|
|
|
242 |
|
|
init();
|
243 |
|
|
log.info("Getting header elements");
|
244 |
|
|
getHeader();
|
245 |
|
|
log.info("Getting sheet data");
|
246 |
|
|
getData();
|
247 |
|
|
log.info("Writing the csv file");
|
248 |
|
|
writeCSVFile();
|
249 |
|
|
log.info("Preparing to parse csv");
|
250 |
|
|
|
251 |
|
|
return collector.collect(prepareHTTPCSVDescriptor(),"","");
|
252 |
|
|
|
253 |
|
|
}
|
254 |
|
|
|
255 |
|
|
|
256 |
|
|
}
|