Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.excel;
2

    
3
/**
4
 * Created by miriam on 10/05/2017.
5
 */
6
import java.io.File;
7
import java.io.FileInputStream;
8
import java.io.IOException;
9
import java.net.URL;
10
import java.util.ArrayList;
11
import java.util.HashMap;
12
import java.util.Iterator;
13

    
14
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
15
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19
import org.apache.poi.ss.usermodel.Cell;
20
import org.apache.poi.ss.usermodel.DataFormatter;
21
import org.apache.poi.ss.usermodel.Row;
22
import org.apache.poi.ss.usermodel.Sheet;
23
import org.apache.poi.ss.usermodel.Workbook;
24
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
25
import org.json.*;
26

    
27
import org.apache.commons.io.FileUtils;
28

    
29
public class Read {
30

    
31
	private static final Log log = LogFactory.getLog(Read.class);
32

    
33
	/** The descriptor. */
34
	private InterfaceDescriptor descriptor;
35

    
36

    
37
	/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
38
	private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
39
	 private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
40
	  "\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
41
			   + "\"col_currency\":10}"; */
42
	private Sheet sheet;
43
	private CSVFileWriter csv_writer = new CSVFileWriter();
44
	private HashMap<String,String> map_header = new HashMap<String,String>();
45
	private HashMap<String,String> map_body = new HashMap<String,String>();
46
	private int header_row;
47
	private String file_to_save ;
48
	private boolean replace_currency = false;
49
	private String from_currency, to_currency;
50
	private boolean remove_empty, remove_tmp_file;
51
	private String remove_id;
52
	private int column_id;
53
	private int currency_column;
54
	private int sheet_number;
55
	private String tmp_file;
56
	private String argument;
57
	private String identifier;
58

    
59
	private HttpCSVCollectorPlugin collector;
60

    
61
	public HttpCSVCollectorPlugin getCollector() {
62
		return collector;
63
	}
64

    
65
	public void setCollector(HttpCSVCollectorPlugin collector) {
66
		this.collector = collector;
67
	}
68

    
69
	public Read(InterfaceDescriptor descriptor){
70
		this.descriptor = descriptor;
71

    
72
	}
73

    
74
	private static String getCellValue( Cell cell)
75
	{
76
		DataFormatter formatter = new DataFormatter();
77
		String formattedCellValue = formatter.formatCellValue(cell);
78
		return formattedCellValue;
79

    
80
	}
81

    
82
	private void copyFile() throws IOException{
83
		FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
84

    
85
	}
86

    
87
	private void parseDescriptor(){
88
		HashMap<String, String> params = descriptor.getParams();
89
		argument = params.get("argument");
90
		header_row = Integer.parseInt(params.get("header_row"));
91
		tmp_file = params.get("tmp_file");
92
		remove_empty = (params.get("remove_empty_lines") == "yes");
93
		remove_id = params.get("remove_lines_with_id");
94
		column_id = Integer.parseInt(params.get("col_id"));
95
		remove_tmp_file = (params.get("remove_tmp_file") == "yes");
96
		sheet_number = Integer.parseInt(params.get("sheet_number"));
97
		file_to_save = params.get("file_to_save");
98
	}
99
	private void init() throws IOException{
100
		parseDescriptor();
101
		log.info("Parsing the arguments");
102
		parseArguments();
103
		log.info("Copying the file in temp local file");
104
		copyFile();
105
		log.info("Extracting the sheet " + sheet_number);
106
		FileInputStream fis = new FileInputStream(tmp_file);
107
		Workbook workbook = new XSSFWorkbook(fis);
108
		sheet = workbook.getSheetAt(sheet_number);
109
		fis.close();
110
		if(remove_tmp_file) {
111
			File f = new File(tmp_file);
112
			f.delete();
113
		}
114

    
115
	}
116

    
117
	private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
118
		try{
119
			final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
120
			for(Object entry: arr)
121
				map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
122
		}catch(Throwable e){
123
			log.error("Problems filling the map for " + elem);
124
			throw(e);
125
		}
126

    
127
	}
128

    
129

    
130

    
131
	private void parseArguments() {
132
		if (StringUtils.isNotEmpty(argument)){
133
			try{
134
				final JSONObject json = new JSONObject(argument);
135
				if(json.has("header"))
136
					fillMap(json, map_header,"header");
137
				if (json.has("body"))
138
					fillMap(json,map_body,"body");
139

    
140
				if(json.has("replace_currency"))
141
				{
142
					replace_currency = true	;
143
					from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
144
					to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
145

    
146
				}
147

    
148
				if (json.has("col_currency"))
149
					currency_column = json.getInt("col_currency");
150
			}catch(Throwable e){
151
				log.error("Problems while parsing the argument parameter.");
152
				throw (e);
153
			}
154
		}
155

    
156

    
157

    
158
	}
159

    
160
	private String applyReplace(String row, HashMap<String,String>replace){
161
		for(String key: replace.keySet()){
162
			if(row.contains(key))
163
				row = row.replace(key, replace.get(key));
164
		}
165
		return row;
166
	}
167

    
168
	private void getHeader(){
169
		Row row = sheet.getRow(header_row);
170
		Iterator<Cell> cellIterator = row.cellIterator();
171
		Cell cell;
172
		String project = "";
173
		int count = 0;
174
		while (cellIterator.hasNext()){
175
			cell = cellIterator.next();
176
			project += applyReplace(cell.getStringCellValue(),map_header) + ";";
177
			if(count++ == column_id) identifier = applyReplace(cell.getStringCellValue(),map_header);
178
		}
179
		project = project.substring(0, project.length() -1 );
180
		csv_writer.setHeader(project.split(";"));
181

    
182
	}
183

    
184
	private void getData(){
185
		Row row;
186
		Cell cell;
187
		String tmp;
188
		Iterator<Cell>cellIterator;
189
		for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
190
			row = sheet.getRow(row_number);
191
			cellIterator = row.cellIterator();
192
			int col_number = 0;
193

    
194
			boolean discard_row = false;
195
			ArrayList<String> al = new ArrayList<String>();
196
			while(cellIterator.hasNext() && !discard_row){
197
				cell = cellIterator.next();
198
				tmp = getCellValue(cell).trim();
199
				if (col_number == column_id &&
200
						((remove_empty &&  tmp.trim().equals("")) ||
201
								(!remove_id.equals("") && tmp.equals(remove_id))))
202
					discard_row = true;
203

    
204
				if (replace_currency && col_number == currency_column)
205
					tmp = tmp.replace(from_currency,to_currency);
206

    
207
				al.add(applyReplace(tmp,map_body));
208
				col_number ++;
209
			}
210
			if(!discard_row){
211
				csv_writer.addProject(al);
212

    
213
			}
214
		}
215

    
216
	}
217

    
218
	private void writeCSVFile(){
219

    
220
		csv_writer.writeFile(file_to_save);
221
	}
222

    
223
	private InterfaceDescriptor prepareHTTPCSVDescriptor(){
224
		InterfaceDescriptor dex = new InterfaceDescriptor();
225
		dex.setBaseUrl("file://"+file_to_save);
226
		HashMap<String, String> params = new HashMap<String, String>();
227
		params.put("separator", descriptor.getParams().get("separator"));
228
		params.put("identifier",identifier);
229
		params.put("quote",descriptor.getParams().get("quote"));
230
		dex.setParams(params);
231
		return dex;
232
	}
233

    
234
	public Iterable<String> parseFile() throws Exception{
235

    
236

    
237
		init();
238
		log.info("Getting header elements");
239
		getHeader();
240
		log.info("Getting sheet data");
241
		getData();
242
		log.info("Writing the csv file");
243
		writeCSVFile();
244
		log.info("Preparing to parse csv");
245

    
246
		return collector.collect(prepareHTTPCSVDescriptor(),"","");
247

    
248
	}
249

    
250

    
251
}
(2-2/3)