1
|
package eu.dnetlib.data.collector.plugins.excel;
|
2
|
|
3
|
/**
|
4
|
* Created by miriam on 10/05/2017.
|
5
|
*/
|
6
|
import java.io.File;
|
7
|
import java.io.FileInputStream;
|
8
|
import java.io.IOException;
|
9
|
import java.net.URL;
|
10
|
import java.util.ArrayList;
|
11
|
import java.util.HashMap;
|
12
|
import java.util.Iterator;
|
13
|
|
14
|
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
|
15
|
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
16
|
import org.apache.commons.lang3.StringUtils;
|
17
|
import org.apache.commons.logging.Log;
|
18
|
import org.apache.commons.logging.LogFactory;
|
19
|
import org.apache.poi.ss.usermodel.Cell;
|
20
|
import org.apache.poi.ss.usermodel.DataFormatter;
|
21
|
import org.apache.poi.ss.usermodel.Row;
|
22
|
import org.apache.poi.ss.usermodel.Sheet;
|
23
|
import org.apache.poi.ss.usermodel.Workbook;
|
24
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
25
|
import org.json.*;
|
26
|
|
27
|
import org.apache.commons.io.FileUtils;
|
28
|
|
29
|
public class Read {
|
30
|
|
31
|
private static final Log log = LogFactory.getLog(Read.class);
|
32
|
|
33
|
/** The descriptor. */
|
34
|
private InterfaceDescriptor descriptor;
|
35
|
|
36
|
|
37
|
/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
|
38
|
private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
|
39
|
private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
|
40
|
"\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
|
41
|
+ "\"col_currency\":10}"; */
|
42
|
private Sheet sheet;
|
43
|
private CSVFileWriter csv_writer = new CSVFileWriter();
|
44
|
private HashMap<String,String> map_header = new HashMap<String,String>();
|
45
|
private HashMap<String,String> map_body = new HashMap<String,String>();
|
46
|
private int header_row;
|
47
|
private String file_to_save ;
|
48
|
private boolean replace_currency = false;
|
49
|
private String from_currency, to_currency;
|
50
|
private boolean remove_empty, remove_tmp_file;
|
51
|
private String remove_id;
|
52
|
private int column_id;
|
53
|
private int currency_column;
|
54
|
private int sheet_number;
|
55
|
private String tmp_file;
|
56
|
private String argument;
|
57
|
private String identifier;
|
58
|
|
59
|
private HttpCSVCollectorPlugin collector;
|
60
|
|
61
|
public HttpCSVCollectorPlugin getCollector() {
|
62
|
return collector;
|
63
|
}
|
64
|
|
65
|
public void setCollector(HttpCSVCollectorPlugin collector) {
|
66
|
this.collector = collector;
|
67
|
}
|
68
|
|
69
|
public Read(InterfaceDescriptor descriptor){
|
70
|
this.descriptor = descriptor;
|
71
|
|
72
|
}
|
73
|
|
74
|
private static String getCellValue( Cell cell)
|
75
|
{
|
76
|
DataFormatter formatter = new DataFormatter();
|
77
|
String formattedCellValue = formatter.formatCellValue(cell);
|
78
|
return formattedCellValue;
|
79
|
|
80
|
}
|
81
|
|
82
|
private void copyFile() throws IOException{
|
83
|
FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
|
84
|
|
85
|
}
|
86
|
|
87
|
private void parseDescriptor(){
|
88
|
HashMap<String, String> params = descriptor.getParams();
|
89
|
argument = params.get("argument");
|
90
|
header_row = Integer.parseInt(params.get("header_row"));
|
91
|
tmp_file = params.get("tmp_file");
|
92
|
remove_empty = (params.get("remove_empty_lines") == "yes");
|
93
|
remove_id = params.get("remove_lines_with_id");
|
94
|
column_id = Integer.parseInt(params.get("col_id"));
|
95
|
remove_tmp_file = (params.get("remove_tmp_file") == "yes");
|
96
|
sheet_number = Integer.parseInt(params.get("sheet_number"));
|
97
|
file_to_save = params.get("file_to_save");
|
98
|
}
|
99
|
private void init() throws IOException{
|
100
|
parseDescriptor();
|
101
|
log.info("Parsing the arguments");
|
102
|
parseArguments();
|
103
|
log.info("Copying the file in temp local file");
|
104
|
copyFile();
|
105
|
log.info("Extracting the sheet " + sheet_number);
|
106
|
FileInputStream fis = new FileInputStream(tmp_file);
|
107
|
Workbook workbook = new XSSFWorkbook(fis);
|
108
|
sheet = workbook.getSheetAt(sheet_number);
|
109
|
fis.close();
|
110
|
if(remove_tmp_file) {
|
111
|
File f = new File(tmp_file);
|
112
|
f.delete();
|
113
|
}
|
114
|
|
115
|
}
|
116
|
|
117
|
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
|
118
|
try{
|
119
|
final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
120
|
for(Object entry: arr)
|
121
|
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
|
122
|
}catch(Throwable e){
|
123
|
log.error("Problems filling the map for " + elem);
|
124
|
throw(e);
|
125
|
}
|
126
|
|
127
|
}
|
128
|
|
129
|
|
130
|
|
131
|
private void parseArguments() {
|
132
|
if (StringUtils.isNotEmpty(argument)){
|
133
|
try{
|
134
|
final JSONObject json = new JSONObject(argument);
|
135
|
if(json.has("header"))
|
136
|
fillMap(json, map_header,"header");
|
137
|
if (json.has("body"))
|
138
|
fillMap(json,map_body,"body");
|
139
|
|
140
|
if(json.has("replace_currency"))
|
141
|
{
|
142
|
replace_currency = true ;
|
143
|
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
|
144
|
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
|
145
|
|
146
|
}
|
147
|
|
148
|
if (json.has("col_currency"))
|
149
|
currency_column = json.getInt("col_currency");
|
150
|
}catch(Throwable e){
|
151
|
log.error("Problems while parsing the argument parameter.");
|
152
|
throw (e);
|
153
|
}
|
154
|
}
|
155
|
|
156
|
|
157
|
|
158
|
}
|
159
|
|
160
|
private String applyReplace(String row, HashMap<String,String>replace){
|
161
|
for(String key: replace.keySet()){
|
162
|
if(row.contains(key))
|
163
|
row = row.replace(key, replace.get(key));
|
164
|
}
|
165
|
return row;
|
166
|
}
|
167
|
|
168
|
private void getHeader(){
|
169
|
Row row = sheet.getRow(header_row);
|
170
|
Iterator<Cell> cellIterator = row.cellIterator();
|
171
|
Cell cell;
|
172
|
String project = "";
|
173
|
int count = 0;
|
174
|
while (cellIterator.hasNext()){
|
175
|
cell = cellIterator.next();
|
176
|
project += applyReplace(cell.getStringCellValue(),map_header) + ";";
|
177
|
if(count++ == column_id) identifier = applyReplace(cell.getStringCellValue(),map_header);
|
178
|
}
|
179
|
project = project.substring(0, project.length() -1 );
|
180
|
csv_writer.setHeader(project.split(";"));
|
181
|
|
182
|
}
|
183
|
|
184
|
private void getData(){
|
185
|
Row row;
|
186
|
Cell cell;
|
187
|
String tmp;
|
188
|
Iterator<Cell>cellIterator;
|
189
|
for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
|
190
|
row = sheet.getRow(row_number);
|
191
|
cellIterator = row.cellIterator();
|
192
|
int col_number = 0;
|
193
|
|
194
|
boolean discard_row = false;
|
195
|
ArrayList<String> al = new ArrayList<String>();
|
196
|
while(cellIterator.hasNext() && !discard_row){
|
197
|
cell = cellIterator.next();
|
198
|
tmp = getCellValue(cell).trim();
|
199
|
if (col_number == column_id &&
|
200
|
((remove_empty && tmp.trim().equals("")) ||
|
201
|
(!remove_id.equals("") && tmp.equals(remove_id))))
|
202
|
discard_row = true;
|
203
|
|
204
|
if (replace_currency && col_number == currency_column)
|
205
|
tmp = tmp.replace(from_currency,to_currency);
|
206
|
|
207
|
al.add(applyReplace(tmp,map_body));
|
208
|
col_number ++;
|
209
|
}
|
210
|
if(!discard_row){
|
211
|
csv_writer.addProject(al);
|
212
|
|
213
|
}
|
214
|
}
|
215
|
|
216
|
}
|
217
|
|
218
|
private void writeCSVFile(){
|
219
|
|
220
|
csv_writer.writeFile(file_to_save);
|
221
|
}
|
222
|
|
223
|
private InterfaceDescriptor prepareHTTPCSVDescriptor(){
|
224
|
InterfaceDescriptor dex = new InterfaceDescriptor();
|
225
|
dex.setBaseUrl("file://"+file_to_save);
|
226
|
HashMap<String, String> params = new HashMap<String, String>();
|
227
|
params.put("separator", descriptor.getParams().get("separator"));
|
228
|
params.put("identifier",identifier);
|
229
|
params.put("quote",descriptor.getParams().get("quote"));
|
230
|
dex.setParams(params);
|
231
|
return dex;
|
232
|
}
|
233
|
|
234
|
public Iterable<String> parseFile() throws Exception{
|
235
|
|
236
|
|
237
|
init();
|
238
|
log.info("Getting header elements");
|
239
|
getHeader();
|
240
|
log.info("Getting sheet data");
|
241
|
getData();
|
242
|
log.info("Writing the csv file");
|
243
|
writeCSVFile();
|
244
|
log.info("Preparing to parse csv");
|
245
|
|
246
|
return collector.collect(prepareHTTPCSVDescriptor(),"","");
|
247
|
|
248
|
}
|
249
|
|
250
|
|
251
|
}
|