1
|
package eu.dnetlib.data.collector.plugins.excel;
|
2
|
|
3
|
/**
|
4
|
* Created by miriam on 10/05/2017.
|
5
|
*/
|
6
|
import java.io.File;
|
7
|
import java.io.FileInputStream;
|
8
|
import java.io.IOException;
|
9
|
import java.net.URL;
|
10
|
import java.util.ArrayList;
|
11
|
import java.util.HashMap;
|
12
|
import java.util.Iterator;
|
13
|
|
14
|
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
|
15
|
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
16
|
import org.apache.commons.logging.Log;
|
17
|
import org.apache.commons.logging.LogFactory;
|
18
|
import org.apache.poi.ss.usermodel.Cell;
|
19
|
import org.apache.poi.ss.usermodel.DataFormatter;
|
20
|
import org.apache.poi.ss.usermodel.Row;
|
21
|
import org.apache.poi.ss.usermodel.Sheet;
|
22
|
import org.apache.poi.ss.usermodel.Workbook;
|
23
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
24
|
import org.json.*;
|
25
|
|
26
|
import org.apache.commons.io.FileUtils;
|
27
|
|
28
|
public class Read {
|
29
|
|
30
|
private static final Log log = LogFactory.getLog(Read.class);
|
31
|
|
32
|
/** The descriptor. */
|
33
|
private InterfaceDescriptor descriptor;
|
34
|
|
35
|
|
36
|
/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
|
37
|
private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
|
38
|
private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
|
39
|
"\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
|
40
|
+ "\"col_currency\":10}"; */
|
41
|
private Sheet sheet;
|
42
|
private CSVFileWriter csv_writer = new CSVFileWriter();
|
43
|
private HashMap<String,String> map_header = new HashMap<String,String>();
|
44
|
private HashMap<String,String> map_body = new HashMap<String,String>();
|
45
|
private int header_row;
|
46
|
private String file_to_save ;
|
47
|
private boolean replace_currency = false;
|
48
|
private String from_currency, to_currency;
|
49
|
private boolean remove_empty, remove_tmp_file;
|
50
|
private String remove_id;
|
51
|
private int column_id;
|
52
|
private int currency_column;
|
53
|
private int sheet_number;
|
54
|
private String tmp_file;
|
55
|
private String argument;
|
56
|
private String identifier;
|
57
|
|
58
|
private HttpCSVCollectorPlugin collector;
|
59
|
|
60
|
public HttpCSVCollectorPlugin getCollector() {
|
61
|
return collector;
|
62
|
}
|
63
|
|
64
|
public void setCollector(HttpCSVCollectorPlugin collector) {
|
65
|
this.collector = collector;
|
66
|
}
|
67
|
|
68
|
public Read(InterfaceDescriptor descriptor){
|
69
|
this.descriptor = descriptor;
|
70
|
|
71
|
}
|
72
|
|
73
|
private static String getCellValue( Cell cell)
|
74
|
{
|
75
|
DataFormatter formatter = new DataFormatter();
|
76
|
String formattedCellValue = formatter.formatCellValue(cell);
|
77
|
return formattedCellValue;
|
78
|
|
79
|
}
|
80
|
|
81
|
private void copyFile() throws IOException{
|
82
|
FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
|
83
|
|
84
|
}
|
85
|
|
86
|
private void parseDescriptor(){
|
87
|
HashMap<String, String> params = descriptor.getParams();
|
88
|
argument = params.get("argument");
|
89
|
header_row = Integer.parseInt(params.get("header_row"));
|
90
|
tmp_file = params.get("tmp_file");
|
91
|
remove_empty = (params.get("remove_empty_lines") == "yes");
|
92
|
remove_id = params.get("remove_lines_with_id");
|
93
|
column_id = Integer.parseInt(params.get("col_id"));
|
94
|
remove_tmp_file = (params.get("remove_tmp_file") == "yes");
|
95
|
sheet_number = Integer.parseInt(params.get("sheet_number"));
|
96
|
file_to_save = params.get("file_to_save");
|
97
|
}
|
98
|
private void init() throws IOException{
|
99
|
parseDescriptor();
|
100
|
log.info("Parsing the arguments");
|
101
|
parseArguments();
|
102
|
log.info("Copying the file in temp local file");
|
103
|
copyFile();
|
104
|
log.info("Extracting the sheet " + sheet_number);
|
105
|
FileInputStream fis = new FileInputStream(tmp_file);
|
106
|
Workbook workbook = new XSSFWorkbook(fis);
|
107
|
sheet = workbook.getSheetAt(sheet_number);
|
108
|
fis.close();
|
109
|
if(remove_tmp_file) {
|
110
|
File f = new File(tmp_file);
|
111
|
f.delete();
|
112
|
}
|
113
|
|
114
|
}
|
115
|
|
116
|
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
|
117
|
JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
118
|
for(Object entry: arr) {
|
119
|
try {
|
120
|
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
|
121
|
}catch(Exception ex){
|
122
|
ex.printStackTrace();
|
123
|
}
|
124
|
}
|
125
|
|
126
|
}
|
127
|
|
128
|
private void parseArguments() {
|
129
|
JSONObject json = new JSONObject(argument);
|
130
|
fillMap(json, map_header,"header");
|
131
|
fillMap(json,map_body,"body");
|
132
|
|
133
|
if (!(json.getJSONArray("replace_currency")==null)){
|
134
|
replace_currency = true ;
|
135
|
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
|
136
|
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
|
137
|
}
|
138
|
|
139
|
currency_column = json.getInt("col_currency");
|
140
|
|
141
|
}
|
142
|
|
143
|
private String applyReplace(String row, HashMap<String,String>replace){
|
144
|
for(String key: replace.keySet()){
|
145
|
if(row.contains(key))
|
146
|
row = row.replace(key, replace.get(key));
|
147
|
}
|
148
|
return row;
|
149
|
}
|
150
|
|
151
|
private void getHeader(){
|
152
|
Row row = sheet.getRow(header_row);
|
153
|
Iterator<Cell> cellIterator = row.cellIterator();
|
154
|
Cell cell;
|
155
|
String project = "";
|
156
|
int count = 0;
|
157
|
while (cellIterator.hasNext()){
|
158
|
cell = cellIterator.next();
|
159
|
project += applyReplace(cell.getStringCellValue(),map_header) + ";";
|
160
|
if(count++ == column_id) identifier = applyReplace(cell.getStringCellValue(),map_header);
|
161
|
}
|
162
|
project = project.substring(0, project.length() -1 );
|
163
|
csv_writer.setHeader(project.split(";"));
|
164
|
|
165
|
}
|
166
|
|
167
|
private void getData(){
|
168
|
Row row;
|
169
|
Cell cell;
|
170
|
String tmp;
|
171
|
Iterator<Cell>cellIterator;
|
172
|
for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
|
173
|
row = sheet.getRow(row_number);
|
174
|
cellIterator = row.cellIterator();
|
175
|
int col_number = 0;
|
176
|
|
177
|
boolean discard_row = false;
|
178
|
ArrayList<String> al = new ArrayList<String>();
|
179
|
while(cellIterator.hasNext() && !discard_row){
|
180
|
cell = cellIterator.next();
|
181
|
tmp = getCellValue(cell);
|
182
|
if (col_number == column_id &&
|
183
|
((remove_empty && tmp.trim().equals("")) ||
|
184
|
(!remove_id.equals("") && tmp.equals(remove_id))))
|
185
|
discard_row = true;
|
186
|
|
187
|
if (replace_currency && col_number == currency_column)
|
188
|
tmp = tmp.replace(from_currency,to_currency);
|
189
|
|
190
|
al.add(applyReplace(tmp,map_body));
|
191
|
col_number ++;
|
192
|
}
|
193
|
if(!discard_row){
|
194
|
csv_writer.addProject(al);
|
195
|
|
196
|
}
|
197
|
}
|
198
|
|
199
|
}
|
200
|
|
201
|
private void writeCSVFile(){
|
202
|
|
203
|
csv_writer.writeFile(file_to_save);
|
204
|
}
|
205
|
|
206
|
private InterfaceDescriptor prepareHTTPCSVDescriptor(){
|
207
|
InterfaceDescriptor dex = new InterfaceDescriptor();
|
208
|
dex.setBaseUrl("file://"+file_to_save);
|
209
|
HashMap<String, String> params = new HashMap<String, String>();
|
210
|
params.put("separator", descriptor.getParams().get("separator"));
|
211
|
params.put("identifier",identifier);
|
212
|
params.put("quote",descriptor.getParams().get("quote"));
|
213
|
dex.setParams(params);
|
214
|
return dex;
|
215
|
}
|
216
|
|
217
|
public Iterable<String> parseFile() throws Exception{
|
218
|
|
219
|
|
220
|
init();
|
221
|
log.info("Getting header elements");
|
222
|
getHeader();
|
223
|
log.info("Getting sheet data");
|
224
|
getData();
|
225
|
log.info("Writing the csv file");
|
226
|
writeCSVFile();
|
227
|
log.info("Preparing to parse csv");
|
228
|
|
229
|
return collector.collect(prepareHTTPCSVDescriptor(),"","");
|
230
|
|
231
|
}
|
232
|
|
233
|
|
234
|
}
|