Revision 63319
Added by Michele Artini 2 months ago
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/HttpApiRepositoryIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable; |
|
4 |
|
|
5 |
public interface HttpApiRepositoryIterable extends RepositoryIterable { |
|
6 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/excel/Read.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.excel; |
|
2 |
|
|
3 |
/** |
|
4 |
* Created by miriam on 10/05/2017. |
|
5 |
*/ |
|
6 |
import java.io.File; |
|
7 |
import java.io.FileInputStream; |
|
8 |
import java.io.IOException; |
|
9 |
import java.net.URL; |
|
10 |
import java.util.ArrayList; |
|
11 |
import java.util.HashMap; |
|
12 |
import java.util.Iterator; |
|
13 |
|
|
14 |
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin; |
|
15 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
16 |
import org.apache.commons.lang3.StringUtils; |
|
17 |
import org.apache.commons.logging.Log; |
|
18 |
import org.apache.commons.logging.LogFactory; |
|
19 |
import org.apache.poi.ss.usermodel.Cell; |
|
20 |
import org.apache.poi.ss.usermodel.DataFormatter; |
|
21 |
import org.apache.poi.ss.usermodel.Row; |
|
22 |
import org.apache.poi.ss.usermodel.Sheet; |
|
23 |
import org.apache.poi.ss.usermodel.Workbook; |
|
24 |
import org.apache.poi.xssf.usermodel.XSSFWorkbook; |
|
25 |
import org.json.*; |
|
26 |
|
|
27 |
import org.apache.commons.io.FileUtils; |
|
28 |
|
|
29 |
public class Read { |
|
30 |
|
|
31 |
private static final Log log = LogFactory.getLog(Read.class); |
|
32 |
|
|
33 |
/** The descriptor. */ |
|
34 |
private InterfaceDescriptor descriptor; |
|
35 |
|
|
36 |
|
|
37 |
/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10"; |
|
38 |
private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv"; |
|
39 |
private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," + |
|
40 |
"\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}]," |
|
41 |
+ "\"col_currency\":10}"; */ |
|
42 |
private Sheet sheet; |
|
43 |
private CSVFileWriter csv_writer = new CSVFileWriter(); |
|
44 |
private HashMap<String,String> map_header = new HashMap<String,String>(); |
|
45 |
private HashMap<String,String> map_body = new HashMap<String,String>(); |
|
46 |
private int header_row; |
|
47 |
private String file_to_save ; |
|
48 |
private boolean replace_currency = false; |
|
49 |
private String from_currency, to_currency; |
|
50 |
private boolean remove_empty, remove_tmp_file; |
|
51 |
private String remove_id; |
|
52 |
private int column_id; |
|
53 |
private int currency_column; |
|
54 |
private int sheet_number; |
|
55 |
private String tmp_file; |
|
56 |
private String argument; |
|
57 |
private String identifier; |
|
58 |
|
|
59 |
private HttpCSVCollectorPlugin collector; |
|
60 |
|
|
61 |
public HttpCSVCollectorPlugin getCollector() { |
|
62 |
return collector; |
|
63 |
} |
|
64 |
|
|
65 |
public void setCollector(HttpCSVCollectorPlugin collector) { |
|
66 |
this.collector = collector; |
|
67 |
} |
|
68 |
|
|
69 |
public Read(InterfaceDescriptor descriptor){ |
|
70 |
this.descriptor = descriptor; |
|
71 |
|
|
72 |
} |
|
73 |
|
|
74 |
private static String getCellValue( Cell cell) |
|
75 |
{ |
|
76 |
DataFormatter formatter = new DataFormatter(); |
|
77 |
String formattedCellValue = formatter.formatCellValue(cell); |
|
78 |
return formattedCellValue; |
|
79 |
|
|
80 |
} |
|
81 |
|
|
82 |
private void copyFile() throws IOException{ |
|
83 |
FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file)); |
|
84 |
|
|
85 |
} |
|
86 |
|
|
87 |
private void parseDescriptor(){ |
|
88 |
HashMap<String, String> params = descriptor.getParams(); |
|
89 |
argument = params.get("argument"); |
|
90 |
header_row = Integer.parseInt(params.get("header_row")); |
|
91 |
tmp_file = params.get("tmp_file"); |
|
92 |
remove_empty = (params.get("remove_empty_lines") == "yes"); |
|
93 |
remove_id = params.get("remove_lines_with_id"); |
|
94 |
column_id = Integer.parseInt(params.get("col_id")); |
|
95 |
remove_tmp_file = (params.get("remove_tmp_file") == "yes"); |
|
96 |
sheet_number = Integer.parseInt(params.get("sheet_number")); |
|
97 |
file_to_save = params.get("file_to_save"); |
|
98 |
} |
|
99 |
private void init() throws IOException{ |
|
100 |
parseDescriptor(); |
|
101 |
log.info("Parsing the arguments"); |
|
102 |
parseArguments(); |
|
103 |
log.info("Copying the file in temp local file"); |
|
104 |
copyFile(); |
|
105 |
log.info("Extracting the sheet " + sheet_number); |
|
106 |
FileInputStream fis = new FileInputStream(tmp_file); |
|
107 |
Workbook workbook = new XSSFWorkbook(fis); |
|
108 |
sheet = workbook.getSheetAt(sheet_number); |
|
109 |
fis.close(); |
|
110 |
if(remove_tmp_file) { |
|
111 |
File f = new File(tmp_file); |
|
112 |
f.delete(); |
|
113 |
} |
|
114 |
|
|
115 |
} |
|
116 |
|
|
117 |
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){ |
|
118 |
try{ |
|
119 |
final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem); |
|
120 |
for(Object entry: arr) |
|
121 |
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to")); |
|
122 |
}catch(Throwable e){ |
|
123 |
log.error("Problems filling the map for " + elem); |
|
124 |
throw(e); |
|
125 |
} |
|
126 |
|
|
127 |
} |
|
128 |
|
|
129 |
|
|
130 |
|
|
131 |
private void parseArguments() { |
|
132 |
if (StringUtils.isNotEmpty(argument)){ |
|
133 |
try{ |
|
134 |
final JSONObject json = new JSONObject(argument); |
|
135 |
JSONObject tmp = json.getJSONObject("replace"); |
|
136 |
if(tmp.has("header")) |
|
137 |
fillMap(json, map_header,"header"); |
|
138 |
if(tmp.has("body")) |
|
139 |
fillMap(json,map_body,"body"); |
|
140 |
// if(json.has("header")) |
|
141 |
// fillMap(json, map_header,"header"); |
|
142 |
// if (json.has("body")) |
|
143 |
// fillMap(json,map_body,"body"); |
|
144 |
|
|
145 |
if(json.has("replace_currency")) |
|
146 |
{ |
|
147 |
replace_currency = true ; |
|
148 |
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from"); |
|
149 |
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to"); |
|
150 |
|
|
151 |
} |
|
152 |
|
|
153 |
if (json.has("col_currency")) |
|
154 |
currency_column = json.getInt("col_currency"); |
|
155 |
}catch(Throwable e){ |
|
156 |
log.error("Problems while parsing the argument parameter."); |
|
157 |
throw (e); |
|
158 |
} |
|
159 |
} |
|
160 |
|
|
161 |
|
|
162 |
|
|
163 |
} |
|
164 |
|
|
165 |
private String applyReplace(String row, HashMap<String,String>replace){ |
|
166 |
for(String key: replace.keySet()){ |
|
167 |
if(row.contains(key)) |
|
168 |
row = row.replace(key, replace.get(key)); |
|
169 |
} |
|
170 |
return row; |
|
171 |
} |
|
172 |
|
|
173 |
private void getHeader(){ |
|
174 |
Row row = sheet.getRow(header_row); |
|
175 |
Iterator<Cell> cellIterator = row.cellIterator(); |
|
176 |
Cell cell; |
|
177 |
String project = ""; |
|
178 |
int count = 0; |
|
179 |
while (cellIterator.hasNext()){ |
|
180 |
cell = cellIterator.next(); |
|
181 |
final String stringCellValue = cell.getStringCellValue(); |
|
182 |
project += applyReplace(stringCellValue,map_header) + ";"; |
|
183 |
if(count++ == column_id) identifier = applyReplace(stringCellValue,map_header); |
|
184 |
} |
|
185 |
project = project.substring(0, project.length() -1 ); |
|
186 |
csv_writer.setHeader(project.split(";")); |
|
187 |
|
|
188 |
} |
|
189 |
|
|
190 |
private void getData(){ |
|
191 |
Row row; |
|
192 |
Cell cell; |
|
193 |
String tmp; |
|
194 |
Iterator<Cell>cellIterator; |
|
195 |
for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){ |
|
196 |
row = sheet.getRow(row_number); |
|
197 |
if (row != null) { |
|
198 |
cellIterator = row.cellIterator(); |
|
199 |
|
|
200 |
int col_number = 0; |
|
201 |
|
|
202 |
boolean discard_row = false; |
|
203 |
ArrayList<String> al = new ArrayList<String>(); |
|
204 |
while (cellIterator.hasNext() && !discard_row) { |
|
205 |
cell = cellIterator.next(); |
|
206 |
tmp = getCellValue(cell).trim(); |
|
207 |
tmp = tmp.replace("\n"," "); |
|
208 |
if (col_number == column_id && |
|
209 |
((remove_empty && tmp.trim().equals("")) || |
|
210 |
(!remove_id.equals("") && tmp.equals(remove_id)))) |
|
211 |
discard_row = true; |
|
212 |
|
|
213 |
if (replace_currency && col_number == currency_column) |
|
214 |
tmp = tmp.replace(from_currency, to_currency); |
|
215 |
|
|
216 |
al.add(applyReplace(tmp, map_body)); |
|
217 |
col_number++; |
|
218 |
} |
|
219 |
if (!discard_row) { |
|
220 |
csv_writer.addProject(al); |
|
221 |
|
|
222 |
} |
|
223 |
} |
|
224 |
} |
|
225 |
|
|
226 |
} |
|
227 |
|
|
228 |
private void writeCSVFile(){ |
|
229 |
|
|
230 |
csv_writer.writeFile(file_to_save); |
|
231 |
} |
|
232 |
|
|
233 |
private InterfaceDescriptor prepareHTTPCSVDescriptor(){ |
|
234 |
InterfaceDescriptor dex = new InterfaceDescriptor(); |
|
235 |
dex.setBaseUrl("file://"+file_to_save); |
|
236 |
HashMap<String, String> params = new HashMap<String, String>(); |
|
237 |
params.put("separator", descriptor.getParams().get("separator")); |
|
238 |
params.put("identifier",identifier); |
|
239 |
params.put("quote",descriptor.getParams().get("quote")); |
|
240 |
dex.setParams(params); |
|
241 |
return dex; |
|
242 |
} |
|
243 |
|
|
244 |
public Iterable<String> parseFile() throws Exception{ |
|
245 |
|
|
246 |
|
|
247 |
init(); |
|
248 |
log.info("Getting header elements"); |
|
249 |
getHeader(); |
|
250 |
log.info("Getting sheet data"); |
|
251 |
getData(); |
|
252 |
log.info("Writing the csv file"); |
|
253 |
writeCSVFile(); |
|
254 |
log.info("Preparing to parse csv"); |
|
255 |
|
|
256 |
return collector.collect(prepareHTTPCSVDescriptor(),"",""); |
|
257 |
|
|
258 |
} |
|
259 |
|
|
260 |
|
|
261 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/excel/ReadExcelPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.excel; |
|
2 |
|
|
3 |
|
|
4 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
5 |
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin; |
|
6 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
7 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
8 |
import org.apache.commons.logging.Log; |
|
9 |
import org.apache.commons.logging.LogFactory; |
|
10 |
import org.springframework.beans.factory.annotation.Autowired; |
|
11 |
import org.springframework.beans.factory.annotation.Required; |
|
12 |
|
|
13 |
/** |
|
14 |
* Created by miriam on 10/05/2017. |
|
15 |
*/ |
|
16 |
public class ReadExcelPlugin extends AbstractCollectorPlugin{ |
|
17 |
|
|
18 |
private static final Log log = LogFactory.getLog(ReadExcelPlugin.class); |
|
19 |
@Autowired |
|
20 |
HttpCSVCollectorPlugin httpCSVCollectorPlugin; |
|
21 |
|
|
22 |
|
|
23 |
|
|
24 |
@Override |
|
25 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
26 |
throws CollectorServiceException { |
|
27 |
Read r = new Read(interfaceDescriptor); |
|
28 |
r.setCollector(httpCSVCollectorPlugin); |
|
29 |
|
|
30 |
try { |
|
31 |
return r.parseFile(); |
|
32 |
}catch(Exception e){ |
|
33 |
log.error("Error importing excel file"); |
|
34 |
throw new CollectorServiceException(e); |
|
35 |
} |
|
36 |
|
|
37 |
|
|
38 |
} |
|
39 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/datasources/Re3DataCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.datasources; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
|
|
5 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
6 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
7 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
8 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
9 |
import org.apache.commons.io.IOUtils; |
|
10 |
import org.springframework.beans.factory.annotation.Autowired; |
|
11 |
|
|
12 |
/** |
|
13 |
* Plugin to collect metadata record about data repositories from re3data. |
|
14 |
* <p> |
|
15 |
* Documentation on re3data API: http://service.re3data.org/api/doc. |
|
16 |
* </p> |
|
17 |
* <p> |
|
18 |
* BaseURL: http://service.re3data.org |
|
19 |
* </p> |
|
20 |
* <p> |
|
21 |
* API to get the list of repos: baseURL + /api/v1/repositories |
|
22 |
* </p> |
|
23 |
* <p> |
|
24 |
* API to get a repository: baseURL + content of link/@href of the above list |
|
25 |
* </p> |
|
26 |
* |
|
27 |
* @author alessia |
|
28 |
* |
|
29 |
*/ |
|
30 |
public class Re3DataCollectorPlugin extends AbstractCollectorPlugin { |
|
31 |
|
|
32 |
private String repositoryListPath = "/api/v1/repositories"; |
|
33 |
|
|
34 |
@Autowired |
|
35 |
private HttpConnector httpConnector; |
|
36 |
|
|
37 |
@Override |
|
38 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
39 |
throws CollectorServiceException { |
|
40 |
String repositoryListURL = interfaceDescriptor.getBaseUrl() + repositoryListPath; |
|
41 |
String input; |
|
42 |
try { |
|
43 |
input = httpConnector.getInputSource(repositoryListURL); |
|
44 |
return new Re3DataRepositoriesIterator(IOUtils.toInputStream(input, "UTF-8"), interfaceDescriptor.getBaseUrl(), getHttpConnector()); |
|
45 |
} catch (IOException e) { |
|
46 |
throw new CollectorServiceException(e); |
|
47 |
} |
|
48 |
|
|
49 |
} |
|
50 |
|
|
51 |
public String getRepositoryListPath() { |
|
52 |
return repositoryListPath; |
|
53 |
} |
|
54 |
|
|
55 |
public void setRepositoryListPath(final String repositoryListPath) { |
|
56 |
this.repositoryListPath = repositoryListPath; |
|
57 |
} |
|
58 |
|
|
59 |
public HttpConnector getHttpConnector() { |
|
60 |
return httpConnector; |
|
61 |
} |
|
62 |
|
|
63 |
public void setHttpConnector(final HttpConnector httpConnector) { |
|
64 |
this.httpConnector = httpConnector; |
|
65 |
} |
|
66 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapFileIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.Utils; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import org.apache.commons.io.FileUtils; |
|
6 |
import org.apache.commons.io.IOUtils; |
|
7 |
import org.apache.commons.logging.Log; |
|
8 |
import org.apache.commons.logging.LogFactory; |
|
9 |
|
|
10 |
import java.io.*; |
|
11 |
import java.net.URL; |
|
12 |
import java.nio.charset.Charset; |
|
13 |
import java.util.*; |
|
14 |
|
|
15 |
public class SitemapFileIterator implements Iterator<String> { |
|
16 |
private static final Log log = LogFactory.getLog(SitemapFileIterator.class); |
|
17 |
|
|
18 |
public static class Options { |
|
19 |
|
|
20 |
public enum SitemapFileType{ |
|
21 |
Text, |
|
22 |
GZ |
|
23 |
} |
|
24 |
|
|
25 |
public enum SitemapSchemaType{ |
|
26 |
Text, |
|
27 |
Xml |
|
28 |
} |
|
29 |
|
|
30 |
public Options(){} |
|
31 |
|
|
32 |
public Options(URL fileUrl, Charset charset, SitemapSchemaType schemaType, SitemapFileType fileType) { |
|
33 |
this.fileUrl = fileUrl; |
|
34 |
this.charset = charset; |
|
35 |
this.schemaType = schemaType; |
|
36 |
this.fileType = fileType; |
|
37 |
} |
|
38 |
|
|
39 |
private SitemapFileType fileType; |
|
40 |
private SitemapSchemaType schemaType; |
|
41 |
private URL fileUrl; |
|
42 |
private Charset charset; |
|
43 |
|
|
44 |
public Charset getCharset() { |
|
45 |
return charset; |
|
46 |
} |
|
47 |
|
|
48 |
public void setCharset(Charset charset) { |
|
49 |
this.charset = charset; |
|
50 |
} |
|
51 |
|
|
52 |
public URL getFileUrl() { |
|
53 |
return fileUrl; |
|
54 |
} |
|
55 |
|
|
56 |
public void setFileUrl(URL fileUrl) { |
|
57 |
this.fileUrl = fileUrl; |
|
58 |
} |
|
59 |
|
|
60 |
public SitemapFileType getFileType() { |
|
61 |
return fileType; |
|
62 |
} |
|
63 |
|
|
64 |
public void setFileType(SitemapFileType fileType) { |
|
65 |
this.fileType = fileType; |
|
66 |
} |
|
67 |
|
|
68 |
public SitemapSchemaType getSchemaType() { |
|
69 |
return schemaType; |
|
70 |
} |
|
71 |
|
|
72 |
public void setSchemaType(SitemapSchemaType schemaType) { |
|
73 |
this.schemaType = schemaType; |
|
74 |
} |
|
75 |
|
|
76 |
@Override |
|
77 |
public Object clone(){ |
|
78 |
Options clone = new Options(); |
|
79 |
clone.setCharset(this.getCharset()); |
|
80 |
clone.setFileType(this.getFileType()); |
|
81 |
clone.setFileUrl(this.getFileUrl()); |
|
82 |
clone.setSchemaType(this.getSchemaType()); |
|
83 |
return clone; |
|
84 |
} |
|
85 |
} |
|
86 |
|
|
87 |
private Options options; |
|
88 |
private File downloadedFile; |
|
89 |
private File contentFile; |
|
90 |
private Queue<String> locations; |
|
91 |
|
|
92 |
public SitemapFileIterator(Options options){ |
|
93 |
this.options = options; |
|
94 |
} |
|
95 |
|
|
96 |
public void bootstrap() { |
|
97 |
LinkedList<String> endpoints = null; |
|
98 |
try { |
|
99 |
log.debug(String.format("bootstrapping sitemapindex file access for sitemapindex %s", this.options.getFileUrl())); |
|
100 |
this.downloadedFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp"); |
|
101 |
this.downloadedFile.deleteOnExit(); |
|
102 |
FileUtils.copyURLToFile(this.options.getFileUrl(), this.downloadedFile); |
|
103 |
log.debug(String.format("downloaded file: %s has size %d", this.downloadedFile.toString(), this.downloadedFile.length())); |
|
104 |
|
|
105 |
switch (this.options.getFileType()) { |
|
106 |
case Text: { |
|
107 |
this.contentFile = this.downloadedFile; |
|
108 |
break; |
|
109 |
} |
|
110 |
case GZ: { |
|
111 |
this.contentFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp"); |
|
112 |
this.contentFile.deleteOnExit(); |
|
113 |
Utils.decompressGZipTo(this.downloadedFile, this.contentFile); |
|
114 |
log.debug(String.format("extracted gz file: %s has size %d", this.contentFile.toString(), this.contentFile.length())); |
|
115 |
break; |
|
116 |
} |
|
117 |
default: |
|
118 |
throw new CollectorServiceException("unrecognized file type " + this.options.getFileType()); |
|
119 |
} |
|
120 |
|
|
121 |
List<String> content = this.collectContentLocations(); |
|
122 |
|
|
123 |
log.debug(String.format("extracted %d sitemapindex endpoints", content.size())); |
|
124 |
endpoints = new LinkedList<>(content); |
|
125 |
}catch(Exception ex){ |
|
126 |
log.error(String.format("error processing sitemapindex %s. returning 0 endpoints",this.options.getFileUrl()), ex); |
|
127 |
endpoints = new LinkedList<>(); |
|
128 |
}finally { |
|
129 |
if (this.contentFile != null) { |
|
130 |
this.contentFile.delete(); |
|
131 |
} |
|
132 |
if (this.downloadedFile != null) { |
|
133 |
this.downloadedFile.delete(); |
|
134 |
} |
|
135 |
} |
|
136 |
this.locations = endpoints; |
|
137 |
} |
|
138 |
|
|
139 |
private List<String> collectContentLocations() throws Exception{ |
|
140 |
switch(this.options.getSchemaType()) { |
|
141 |
case Text:{ |
|
142 |
return this.collectTextContentLocations(); |
|
143 |
} |
|
144 |
case Xml:{ |
|
145 |
return this.collectXmlContentLocations(); |
|
146 |
} |
|
147 |
default: throw new CollectorServiceException("unrecognized file type "+this.options.getFileType()); |
|
148 |
} |
|
149 |
} |
|
150 |
|
|
151 |
private List<String> collectTextContentLocations() throws Exception { |
|
152 |
log.debug(String.format("reading endpoint locations from text sitemapindex")); |
|
153 |
try (FileInputStream in = new FileInputStream(this.contentFile)) { |
|
154 |
return IOUtils.readLines(in, this.options.getCharset()); |
|
155 |
} |
|
156 |
} |
|
157 |
|
|
158 |
private List<String> collectXmlContentLocations() throws Exception { |
|
159 |
log.debug(String.format("reading endpoint locations from xml sitemapindex")); |
|
160 |
return Utils.collectAsStrings(this.contentFile,"/urlset/url/loc/text()"); |
|
161 |
} |
|
162 |
|
|
163 |
@Override |
|
164 |
public boolean hasNext() { |
|
165 |
return !this.locations.isEmpty(); |
|
166 |
} |
|
167 |
|
|
168 |
@Override |
|
169 |
public String next() { |
|
170 |
return this.locations.poll(); |
|
171 |
} |
|
172 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryQueueIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.apache.commons.logging.Log; |
|
4 |
import org.apache.commons.logging.LogFactory; |
|
5 |
|
|
6 |
import java.util.Iterator; |
|
7 |
import java.util.NoSuchElementException; |
|
8 |
import java.util.concurrent.ArrayBlockingQueue; |
|
9 |
import java.util.concurrent.TimeUnit; |
|
10 |
|
|
11 |
public class RepositoryQueueIterator implements Iterator<String> { |
|
12 |
private static final Log log = LogFactory.getLog(RepositoryQueueIterator.class); |
|
13 |
|
|
14 |
public static class Options { |
|
15 |
private Boolean blockPolling; |
|
16 |
private long pollTimeout; |
|
17 |
private TimeUnit pollTimeoutUnit; |
|
18 |
|
|
19 |
public Boolean getBlockPolling() { |
|
20 |
return blockPolling; |
|
21 |
} |
|
22 |
|
|
23 |
public void setBlockPolling(Boolean blockPolling) { |
|
24 |
this.blockPolling = blockPolling; |
|
25 |
} |
|
26 |
|
|
27 |
public long getPollTimeout() { |
|
28 |
return pollTimeout; |
|
29 |
} |
|
30 |
|
|
31 |
public void setPollTimeout(long pollTimeout) { |
|
32 |
this.pollTimeout = pollTimeout; |
|
33 |
} |
|
34 |
|
|
35 |
public TimeUnit getPollTimeoutUnit() { |
|
36 |
return pollTimeoutUnit; |
|
37 |
} |
|
38 |
|
|
39 |
public void setPollTimeoutUnit(TimeUnit pollTimeoutUnit) { |
|
40 |
this.pollTimeoutUnit = pollTimeoutUnit; |
|
41 |
} |
|
42 |
} |
|
43 |
|
|
44 |
private ArrayBlockingQueue<String> queue; |
|
45 |
private Options options; |
|
46 |
private boolean hasTerminated; |
|
47 |
|
|
48 |
public RepositoryQueueIterator(Options options, ArrayBlockingQueue<String> queue) { |
|
49 |
this.options = options; |
|
50 |
this.queue = queue; |
|
51 |
this.hasTerminated = false; |
|
52 |
} |
|
53 |
|
|
54 |
@Override |
|
55 |
public boolean hasNext() { |
|
56 |
if(this.hasTerminated) return false; |
|
57 |
return true; |
|
58 |
} |
|
59 |
|
|
60 |
@Override |
|
61 |
public String next() { |
|
62 |
String next = this.poll(); |
|
63 |
log.debug("next endpoint to process: " + next); |
|
64 |
if (next != null && next.equalsIgnoreCase(RepositoryIterable.TerminationHint)) { |
|
65 |
log.debug("no more endpoints to process"); |
|
66 |
this.hasTerminated = true; |
|
67 |
next = null; |
|
68 |
} |
|
69 |
|
|
70 |
return next; |
|
71 |
} |
|
72 |
|
|
73 |
private String poll(){ |
|
74 |
String item = null; |
|
75 |
log.debug("retrieving endpoint from queue"); |
|
76 |
log.debug("queue size: " + queue.size()); |
|
77 |
if(this.options.getBlockPolling()) { |
|
78 |
try { |
|
79 |
item = this.queue.poll(this.options.getPollTimeout(), this.options.getPollTimeoutUnit()); |
|
80 |
} catch (InterruptedException ex) { |
|
81 |
log.warn(String.format("could not poll elements from queue for more than %s %s. throwing", this.options.getPollTimeout(), this.options.getPollTimeoutUnit())); |
|
82 |
throw new NoSuchElementException(ex.getMessage()); |
|
83 |
} |
|
84 |
} |
|
85 |
else { |
|
86 |
item = this.queue.poll(); |
|
87 |
} |
|
88 |
log.debug("retrieved endpoint from queue"); |
|
89 |
log.debug("queue size: " + queue.size()); |
|
90 |
return item; |
|
91 |
} |
|
92 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/pom.xml | ||
---|---|---|
1 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|
2 |
<modelVersion>4.0.0</modelVersion> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet45-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
</parent> |
|
8 |
<groupId>eu.dnetlib</groupId> |
|
9 |
<artifactId>dnet-collector-plugins</artifactId> |
|
10 |
<version>1.8.1</version> |
|
11 |
<scm> |
|
12 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1</developerConnection> |
|
13 |
</scm> |
|
14 |
|
|
15 |
<build> |
|
16 |
<plugins> |
|
17 |
<plugin> |
|
18 |
<artifactId>maven-assembly-plugin</artifactId> |
|
19 |
<configuration> |
|
20 |
<archive> |
|
21 |
<manifest> |
|
22 |
<mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass> |
|
23 |
</manifest> |
|
24 |
</archive> |
|
25 |
<descriptorRefs> |
|
26 |
<descriptorRef>jar-with-dependencies</descriptorRef> |
|
27 |
</descriptorRefs> |
|
28 |
</configuration> |
|
29 |
</plugin> |
|
30 |
</plugins> |
|
31 |
</build> |
|
32 |
|
|
33 |
<dependencies> |
|
34 |
<dependency> |
|
35 |
<groupId>eu.dnetlib</groupId> |
|
36 |
<artifactId>dnet-modular-collector-service-rmi</artifactId> |
|
37 |
<version>[1.3.0,2.0.0)</version> |
|
38 |
</dependency> |
|
39 |
<dependency> |
|
40 |
<groupId>eu.dnetlib</groupId> |
|
41 |
<artifactId>dnet-modular-collector-service</artifactId> |
|
42 |
<version>[3.3.26,4.0.0)</version> |
|
43 |
</dependency> |
|
44 |
<dependency> |
|
45 |
<groupId>com.google.code.gson</groupId> |
|
46 |
<artifactId>gson</artifactId> |
|
47 |
<version>${google.gson.version}</version> |
|
48 |
</dependency> |
|
49 |
<dependency> |
|
50 |
<groupId>commons-io</groupId> |
|
51 |
<artifactId>commons-io</artifactId> |
|
52 |
<version>${commons.io.version}</version> |
|
53 |
</dependency> |
|
54 |
<dependency> |
|
55 |
<groupId>junit</groupId> |
|
56 |
<artifactId>junit</artifactId> |
|
57 |
<version>${junit.version}</version> |
|
58 |
<scope>test</scope> |
|
59 |
</dependency> |
|
60 |
<dependency> |
|
61 |
<groupId>org.apache.httpcomponents</groupId> |
|
62 |
<artifactId>httpclient</artifactId> |
|
63 |
<version>4.5</version> |
|
64 |
</dependency> |
|
65 |
<dependency> |
|
66 |
<groupId>eu.dnetlib</groupId> |
|
67 |
<artifactId>cnr-resultset-service</artifactId> |
|
68 |
<version>[2.0.0, 3.0.0)</version> |
|
69 |
<scope>provided</scope> |
|
70 |
</dependency> |
|
71 |
<dependency> |
|
72 |
<groupId>com.ximpleware</groupId> |
|
73 |
<artifactId>vtd-xml</artifactId> |
|
74 |
<version>[2.12, 3.0.0)</version> |
|
75 |
</dependency> |
|
76 |
<dependency> |
|
77 |
<groupId>joda-time</groupId> |
|
78 |
<artifactId>joda-time</artifactId> |
|
79 |
<version>2.9.2</version> |
|
80 |
</dependency> |
|
81 |
|
|
82 |
<dependency> |
|
83 |
<groupId>org.json</groupId> |
|
84 |
<artifactId>json</artifactId> |
|
85 |
<version>20180813</version> |
|
86 |
<type>jar</type> |
|
87 |
</dependency> |
|
88 |
<dependency> |
|
89 |
<groupId>org.apache.commons</groupId> |
|
90 |
<artifactId>commons-lang3</artifactId> |
|
91 |
<version>3.5</version> |
|
92 |
</dependency> |
|
93 |
|
|
94 |
<dependency> |
|
95 |
<groupId>org.apache.poi</groupId> |
|
96 |
<artifactId>poi</artifactId> |
|
97 |
<version>3.16</version> |
|
98 |
</dependency> |
|
99 |
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> |
|
100 |
<dependency> |
|
101 |
<groupId>org.apache.poi</groupId> |
|
102 |
<artifactId>poi-ooxml</artifactId> |
|
103 |
<version>3.16</version> |
|
104 |
</dependency> |
|
105 |
<dependency> |
|
106 |
<groupId>org.jsoup</groupId> |
|
107 |
<artifactId>jsoup</artifactId> |
|
108 |
<version>1.11.2</version> |
|
109 |
</dependency> |
|
110 |
<dependency> |
|
111 |
<groupId>commons-lang</groupId> |
|
112 |
<artifactId>commons-lang</artifactId> |
|
113 |
<version>2.6</version> |
|
114 |
<scope>compile</scope> |
|
115 |
</dependency> |
|
116 |
<dependency> |
|
117 |
<groupId>org.mockito</groupId> |
|
118 |
<artifactId>mockito-core</artifactId> |
|
119 |
<version>3.3.3</version> |
|
120 |
<scope>test</scope> |
|
121 |
</dependency> |
|
122 |
</dependencies> |
|
123 |
</project> |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
|
|
5 |
public interface RepositoryIterable extends Iterable<String> { |
|
6 |
public static String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a"; |
|
7 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/gtr2/Gtr2Helper.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.gtr2; |
|
2 |
|
|
3 |
import org.apache.commons.lang.StringUtils; |
|
4 |
import org.apache.commons.logging.Log; |
|
5 |
import org.apache.commons.logging.LogFactory; |
|
6 |
import org.dom4j.Document; |
|
7 |
import org.dom4j.DocumentHelper; |
|
8 |
import org.joda.time.DateTime; |
|
9 |
import org.joda.time.format.DateTimeFormat; |
|
10 |
import org.joda.time.format.DateTimeFormatter; |
|
11 |
|
|
12 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
13 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
|
14 |
|
|
15 |
public class Gtr2Helper { |
|
16 |
|
|
17 |
private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
18 |
|
|
19 |
private static final HttpConnector connector = new HttpConnector(); |
|
20 |
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); |
|
21 |
|
|
22 |
private static final int MAX_ATTEMPTS = 10; |
|
23 |
|
|
24 |
public static String cleanURL(final String url) { |
|
25 |
String cleaned = url; |
|
26 |
if (cleaned.contains("gtr.gtr")) { |
|
27 |
cleaned = cleaned.replace("gtr.gtr", "gtr"); |
|
28 |
} |
|
29 |
if (cleaned.startsWith("http://")) { |
|
30 |
cleaned = cleaned.replaceFirst("http://", "https://"); |
|
31 |
} |
|
32 |
return cleaned; |
|
33 |
} |
|
34 |
|
|
35 |
public static Document loadURL(final String url) { |
|
36 |
final String cleanUrl = cleanURL(url); |
|
37 |
return loadURL(cleanUrl, 0); |
|
38 |
} |
|
39 |
|
|
40 |
private static Document loadURL(final String cleanUrl, final int attempt) { |
|
41 |
try { |
|
42 |
log.debug(" * Downloading Url: " + cleanUrl); |
|
43 |
final byte[] bytes = connector.getInputSource(cleanUrl).getBytes("UTF-8"); |
|
44 |
return DocumentHelper.parseText(new String(bytes)); |
|
45 |
} catch (final Throwable e) { |
|
46 |
log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e); |
|
47 |
if (attempt < MAX_ATTEMPTS) { |
|
48 |
try { |
|
49 |
Thread.sleep(60000); // I wait for a minute |
|
50 |
} catch (final InterruptedException e1) { |
|
51 |
throw new CollectorServiceRuntimeException("Error dowloading url: " + cleanUrl, e); |
|
52 |
} |
|
53 |
return loadURL(cleanUrl, attempt + 1); |
|
54 |
} else { |
|
55 |
throw new CollectorServiceRuntimeException("Error dowloading url: " + cleanUrl, e); |
|
56 |
} |
|
57 |
} |
|
58 |
} |
|
59 |
|
|
60 |
public static DateTime parseDate(final String s) { |
|
61 |
return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter); |
|
62 |
} |
|
63 |
|
|
64 |
public static boolean isAfter(final String d, final DateTime fromDate) { |
|
65 |
return StringUtils.isNotBlank(d) && Gtr2Helper.parseDate(d).isAfter(fromDate); |
|
66 |
} |
|
67 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/gtr2/AbstractGtr2CollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.gtr2; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
|
|
5 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
6 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
7 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
8 |
|
|
9 |
public abstract class AbstractGtr2CollectorPlugin extends AbstractCollectorPlugin { |
|
10 |
|
|
11 |
@Override |
|
12 |
public final Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
13 |
throws CollectorServiceException { |
|
14 |
|
|
15 |
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); } |
|
16 |
|
|
17 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
18 |
final String startPage = interfaceDescriptor.getParams().get("startPage"); |
|
19 |
final String endPage = interfaceDescriptor.getParams().get("endPage"); |
|
20 |
|
|
21 |
return () -> { |
|
22 |
try { |
|
23 |
return createIterator(baseUrl, fromDate, startPage, endPage); |
|
24 |
} catch (final CollectorServiceException e) { |
|
25 |
throw new RuntimeException(e); |
|
26 |
} |
|
27 |
}; |
|
28 |
} |
|
29 |
|
|
30 |
protected abstract Iterator<String> createIterator(String baseUrl, final String fromDate, String startPage, String endPage) |
|
31 |
throws CollectorServiceException; |
|
32 |
|
|
33 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.apache.commons.logging.Log; |
|
4 |
import org.apache.commons.logging.LogFactory; |
|
5 |
|
|
6 |
import java.util.Iterator; |
|
7 |
import java.util.concurrent.ArrayBlockingQueue; |
|
8 |
|
|
9 |
public class SchemaOrgIterable implements Iterable<String> { |
|
10 |
private static final Log log = LogFactory.getLog(SchemaOrgIterable.class); |
|
11 |
|
|
12 |
public static class Options { |
|
13 |
private EndpointAccessIterator.Options endpointAccessOptions; |
|
14 |
private DatasetMappingIterator.Options datasetMappingOptions; |
|
15 |
|
|
16 |
public EndpointAccessIterator.Options getEndpointAccessOptions() { |
|
17 |
return endpointAccessOptions; |
|
18 |
} |
|
19 |
|
|
20 |
public void setEndpointAccessOptions(EndpointAccessIterator.Options endpointAccessOptions) { |
|
21 |
this.endpointAccessOptions = endpointAccessOptions; |
|
22 |
} |
|
23 |
|
|
24 |
public DatasetMappingIterator.Options getDatasetMappingOptions() { |
|
25 |
return datasetMappingOptions; |
|
26 |
} |
|
27 |
|
|
28 |
public void setDatasetMappingOptions(DatasetMappingIterator.Options datasetMappingOptions) { |
|
29 |
this.datasetMappingOptions = datasetMappingOptions; |
|
30 |
} |
|
31 |
} |
|
32 |
|
|
33 |
private Options options; |
|
34 |
private RepositoryIterable repository; |
|
35 |
|
|
36 |
public SchemaOrgIterable(Options options, RepositoryIterable repository){ |
|
37 |
this.options = options; |
|
38 |
this.repository = repository; |
|
39 |
} |
|
40 |
|
|
41 |
@Override |
|
42 |
public Iterator<String> iterator() { |
|
43 |
Iterator<String> repositoryIterator = this.repository.iterator(); |
|
44 |
EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessOptions(), repositoryIterator); |
|
45 |
DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingOptions(), endpointAccessIterator); |
|
46 |
|
|
47 |
return datasetMappingIterator; |
|
48 |
} |
|
49 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
|
|
3 |
import java.util.*; |
|
4 |
import java.util.concurrent.ArrayBlockingQueue; |
|
5 |
import java.util.concurrent.TimeUnit; |
|
6 |
|
|
7 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
8 |
import org.apache.commons.logging.Log; |
|
9 |
import org.apache.commons.logging.LogFactory; |
|
10 |
import org.json.JSONObject; |
|
11 |
import org.json.XML; |
|
12 |
import org.jsoup.Jsoup; |
|
13 |
import org.jsoup.nodes.Document; |
|
14 |
import org.jsoup.nodes.Element; |
|
15 |
import org.jsoup.select.Elements; |
|
16 |
|
|
17 |
/** |
|
18 |
* Created by miriam on 04/05/2018. |
|
19 |
*/ |
|
20 |
public class HTTPWithFileNameCollectorIterable implements Iterable<String> { |
|
21 |
|
|
22 |
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class); |
|
23 |
|
|
24 |
private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>"; |
|
25 |
public static final String APP_JSON = "application/json"; |
|
26 |
public static final String APP_XML = "application/xml"; |
|
27 |
public static final String TEXT_HTML = "text/html"; |
|
28 |
private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
29 |
|
|
30 |
|
|
31 |
|
|
32 |
|
|
33 |
private String filterParam; |
|
34 |
|
|
35 |
int total = 0; |
|
36 |
int filtered = 0; |
|
37 |
|
|
38 |
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){ |
|
39 |
|
|
40 |
this.filterParam = filter; |
|
41 |
Thread ft = new Thread(new FillMetaQueue(startUrl) ); |
|
42 |
ft.start(); |
|
43 |
} |
|
44 |
|
|
45 |
|
|
46 |
@Override |
|
47 |
public Iterator<String> iterator() { |
|
48 |
return new HttpWithFileNameCollectorIterator(queue); |
|
49 |
} |
|
50 |
|
|
51 |
private class FillMetaQueue implements Runnable { |
|
52 |
final Connector c = new Connector(); |
|
53 |
|
|
54 |
private final List<String> metas = Collections.synchronizedList(new ArrayList<String>()); |
|
55 |
private final List<String> urls = Collections.synchronizedList(new ArrayList<>()); |
|
56 |
|
|
57 |
/**
 * Registers the start url as the first url to visit; an empty url is ignored.
 *
 * @param startUrl
 *            the url the crawl starts from
 */
public FillMetaQueue(String startUrl){
    if (startUrl.isEmpty()) {
        return;
    }
    urls.add(startUrl);
}
|
62 |
|
|
63 |
|
|
64 |
public void fillQueue() { |
|
65 |
String url; |
|
66 |
|
|
67 |
while((metas.size()>0 || urls.size() > 0 )) { |
|
68 |
log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size()); |
|
69 |
if (metas.size() > 0) { |
|
70 |
url = metas.remove(0); |
|
71 |
try { |
|
72 |
c.get(url); |
|
73 |
} catch (CollectorServiceException e) { |
|
74 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
75 |
} |
|
76 |
if(c.isStatusOk()){ |
|
77 |
try { |
|
78 |
String ret = c.getResponse(); |
|
79 |
if (ret != null && ret.length()>0) { |
|
80 |
if (!containsFilter(ret)) |
|
81 |
queue.put(addFilePath(ret, url, url.endsWith(".json"))); |
|
82 |
//queue.offer(addFilePath(ret, url, url.endsWith(".json")), HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS); |
|
83 |
else |
|
84 |
filtered++; |
|
85 |
total++; |
|
86 |
} |
|
87 |
} catch (InterruptedException e) { |
|
88 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
89 |
|
|
90 |
} |
|
91 |
} |
|
92 |
} else { |
|
93 |
url = urls.remove(0); |
|
94 |
try { |
|
95 |
c.get(url); |
|
96 |
} catch (CollectorServiceException e) { |
|
97 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
98 |
} |
|
99 |
if(c.isStatusOk()) { |
|
100 |
if (c.responseTypeContains(TEXT_HTML)){ |
|
101 |
recurFolder(c.getResponse(), url); |
|
102 |
} else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){ |
|
103 |
try { |
|
104 |
final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON)); |
|
105 |
//queue.offer(element, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS); |
|
106 |
queue.put(element); |
|
107 |
} catch (InterruptedException e) { |
|
108 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
109 |
} |
|
110 |
} |
|
111 |
} |
|
112 |
} |
|
113 |
|
|
114 |
} |
|
115 |
try { |
|
116 |
//queue.offer(HttpWithFileNameCollectorIterator.TERMINATOR, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS); |
|
117 |
queue.put(HttpWithFileNameCollectorIterator.TERMINATOR); |
|
118 |
} catch (InterruptedException e) { |
|
119 |
throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e); |
|
120 |
} |
|
121 |
|
|
122 |
} |
|
123 |
|
|
124 |
private boolean containsFilter(String meta){ |
|
125 |
if (filterParam == null || filterParam.isEmpty()) |
|
126 |
return false; |
|
127 |
String[] filter = filterParam.split(";"); |
|
128 |
for(String item:filter){ |
|
129 |
if (meta.contains(item)) |
|
130 |
return true; |
|
131 |
} |
|
132 |
return false; |
|
133 |
} |
|
134 |
|
|
135 |
private String addFilePath(String meta, String url, boolean isJson){ |
|
136 |
String path = url.replace("metadata", "pdf"); |
|
137 |
|
|
138 |
try { |
|
139 |
if(isJson) |
|
140 |
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
141 |
else { |
|
142 |
|
|
143 |
if (meta.contains("<!DOCTYPE")) { |
|
144 |
meta = meta.substring(meta.indexOf("<!DOCTYPE")); |
|
145 |
meta = meta.substring(meta.indexOf(">") + 1); |
|
146 |
} |
|
147 |
int index = meta.lastIndexOf("</"); |
|
148 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
149 |
} |
|
150 |
} catch(Exception ex) { |
|
151 |
log.info("not file with extension .json or .xml"); |
|
152 |
} |
|
153 |
|
|
154 |
|
|
155 |
if(isJson) { |
|
156 |
try { |
|
157 |
return XML.toString(new JSONObject("{'resource':" + meta + "}")); |
|
158 |
} catch(Exception e) { |
|
159 |
log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url); |
|
160 |
// throw new RuntimeException(); |
|
161 |
final String junk = String.format(JUNK, url); |
|
162 |
log.warn("returning " + junk); |
|
163 |
return junk; |
|
164 |
} |
|
165 |
} |
|
166 |
return meta; |
|
167 |
} |
|
168 |
|
|
169 |
private void recurFolder(String text, String url){ |
|
170 |
Document doc = Jsoup.parse(text); |
|
171 |
Elements links = doc.select("a"); |
|
172 |
for(Element e:links){ |
|
173 |
if (!e.text().equals("../")){ |
|
174 |
String file = e.attr("href"); |
|
175 |
if(file.endsWith(".json") || file.endsWith(".xml")) |
|
176 |
metas.add(url+file); |
|
177 |
else |
|
178 |
urls.add(url+file); |
|
179 |
} |
|
180 |
} |
|
181 |
} |
|
182 |
|
|
183 |
|
|
184 |
@Override |
|
185 |
public void run() { |
|
186 |
fillQueue(); |
|
187 |
} |
|
188 |
} |
|
189 |
|
|
190 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/datasets/DatasetsIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.datasets; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.InputStream; |
|
5 |
import java.util.Iterator; |
|
6 |
|
|
7 |
import org.apache.commons.io.IOUtils; |
|
8 |
import org.apache.commons.lang3.StringEscapeUtils; |
|
9 |
import org.apache.commons.logging.Log; |
|
10 |
import org.apache.commons.logging.LogFactory; |
|
11 |
import org.apache.http.client.methods.CloseableHttpResponse; |
|
12 |
import org.apache.http.client.methods.HttpPost; |
|
13 |
import org.apache.http.entity.StringEntity; |
|
14 |
import org.apache.http.impl.client.CloseableHttpClient; |
|
15 |
import org.apache.http.impl.client.HttpClients; |
|
16 |
|
|
17 |
import com.google.gson.Gson; |
|
18 |
import com.google.gson.GsonBuilder; |
|
19 |
|
|
20 |
/** |
|
21 |
* Iterates over the XML dataset records returned by the Pangaea search endpoint. |
|
22 |
*/ |
|
23 |
public class DatasetsIterator implements Iterable<String>, Iterator<String> { |
|
24 |
|
|
25 |
/** The logger. */ |
|
26 |
private static final Log log = LogFactory.getLog(DatasetsIterator.class); |
|
27 |
|
|
28 |
/** The base url template. */ |
|
29 |
private static String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d"; |
|
30 |
|
|
31 |
/** The journal id. */ |
|
32 |
private String journalId = ""; |
|
33 |
|
|
34 |
/** The journal name. */ |
|
35 |
private String journalName = ""; |
|
36 |
|
|
37 |
/** The journal issn. */ |
|
38 |
private String journalISSN = ""; |
|
39 |
|
|
40 |
/** The openaire datasource. */ |
|
41 |
private String openaireDatasource = ""; |
|
42 |
|
|
43 |
/** The total. */ |
|
44 |
private long total; |
|
45 |
|
|
46 |
/** The from. */ |
|
47 |
private int from; |
|
48 |
|
|
49 |
/** The current iterator. */ |
|
50 |
private int currentIterator; |
|
51 |
|
|
52 |
/** The current response. */ |
|
53 |
private ElasticSearchResponse currentResponse; |
|
54 |
|
|
55 |
/** The request. */ |
|
56 |
private RequestField request; |
|
57 |
|
|
58 |
/** The default size. */ |
|
59 |
private static int DEFAULT_SIZE = 10; |
|
60 |
|
|
61 |
private String projectCordaId; |
|
62 |
|
|
63 |
private static String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>" |
|
64 |
+ "<journal name='%s' issn='%s' datasourceid = '%s'/><metadata>%s</metadata></datasetsRecord>"; |
|
65 |
|
|
66 |
/** |
|
67 |
* Instantiates a new journal iterator. |
|
68 |
* |
|
69 |
* @param request |
|
70 |
* the request |
|
71 |
*/ |
|
72 |
public DatasetsIterator(final RequestField request, final String projectCordaId, final PangaeaJournalInfo info) { |
|
73 |
this.request = request; |
|
74 |
this.setProjectCordaId(projectCordaId); |
|
75 |
|
|
76 |
if (info != null) { |
|
77 |
this.setJournalId(info.getJournalId()); |
|
78 |
this.setJournalName(StringEscapeUtils.escapeXml(info.getJournalName())); |
|
79 |
this.setJournalISSN(info.getJournalISSN()); |
|
80 |
this.setOpenaireDatasource(info.getDatasourceId()); |
|
81 |
} |
|
82 |
log.debug("Start Iterator"); |
|
83 |
} |
|
84 |
|
|
85 |
/** |
|
86 |
* Execute query. |
|
87 |
* |
|
88 |
* @param from |
|
89 |
* the from |
|
90 |
* @param size |
|
91 |
* the size |
|
92 |
* @return the string |
|
93 |
*/ |
|
94 |
private String executeQuery(final int from, final int size) { |
|
95 |
log.debug("executing query " + this.request.getQuery().getTerm()); |
|
96 |
log.debug(String.format("from:%d size:%d", from, size)); |
|
97 |
CloseableHttpResponse response = null; |
|
98 |
InputStream responseBody = null; |
|
99 |
CloseableHttpClient httpclient = HttpClients.createDefault(); |
|
100 |
try { |
|
101 |
|
|
102 |
HttpPost post = new HttpPost(String.format(BASE_URL_TEMPLATE, size, from)); |
|
103 |
Gson g = new GsonBuilder().disableHtmlEscaping().create(); |
|
104 |
StringEntity entry = new StringEntity(g.toJson(this.request)); |
|
105 |
post.setEntity(entry); |
|
106 |
long start = System.currentTimeMillis(); |
|
107 |
response = httpclient.execute(post); |
|
108 |
int statusCode = response.getStatusLine().getStatusCode(); |
|
109 |
if (statusCode == 200) { |
|
110 |
responseBody = response.getEntity().getContent(); |
|
111 |
String s = IOUtils.toString(responseBody); |
|
112 |
log.debug("Request done in " + (System.currentTimeMillis() - start) + " ms"); |
|
113 |
responseBody.close(); |
|
114 |
return s; |
|
115 |
} |
|
116 |
return null; |
|
117 |
} catch (Exception e) { |
|
118 |
log.error("Error on executing query :" + request.getQuery().getTerm(), e); |
|
119 |
return null; |
|
120 |
} finally { |
|
121 |
try { |
|
122 |
responseBody.close(); |
|
123 |
response.close(); |
|
124 |
httpclient.close(); |
|
125 |
} catch (IOException e) { |
|
126 |
log.error("Can't close connections gracefully", e); |
|
127 |
} |
|
128 |
} |
|
129 |
|
|
130 |
} |
|
131 |
|
|
132 |
/** |
|
133 |
* Gets the journal id. |
|
134 |
* |
|
135 |
* @return the journalId |
|
136 |
*/ |
|
137 |
public String getJournalId() { |
|
138 |
return journalId; |
|
139 |
} |
|
140 |
|
|
141 |
/** |
|
142 |
* Sets the journal id. |
|
143 |
* |
|
144 |
* @param journalId |
|
145 |
* the journalId to set |
|
146 |
*/ |
|
147 |
public void setJournalId(final String journalId) { |
|
148 |
this.journalId = journalId; |
|
149 |
} |
|
150 |
|
|
151 |
/* |
|
152 |
* (non-Javadoc) |
|
153 |
* |
|
154 |
* @see java.util.Iterator#hasNext() |
|
155 |
*/ |
|
156 |
@Override |
|
157 |
public boolean hasNext() { |
|
158 |
return (from + currentIterator) < total; |
|
159 |
} |
|
160 |
|
|
161 |
/* |
|
162 |
* (non-Javadoc) |
|
163 |
* |
|
164 |
* @see java.util.Iterator#next() |
|
165 |
*/ |
|
166 |
@Override |
|
167 |
public String next() { |
|
168 |
String xml = String.format(RECORD_TEMPLATE, this.projectCordaId, this.journalName, this.journalISSN, this.openaireDatasource, currentResponse |
|
169 |
.getXmlRecords().get(currentIterator)); |
|
170 |
currentIterator++; |
|
171 |
if (currentIterator == DEFAULT_SIZE) { |
|
172 |
getNextItem(); |
|
173 |
} |
|
174 |
return xml; |
|
175 |
} |
|
176 |
|
|
177 |
/* |
|
178 |
* (non-Javadoc) |
|
179 |
* |
|
180 |
* @see java.util.Iterator#remove() |
|
181 |
*/ |
|
182 |
@Override |
|
183 |
public void remove() { |
|
184 |
throw new UnsupportedOperationException(); |
|
185 |
|
|
186 |
} |
|
187 |
|
|
188 |
/* |
|
189 |
* (non-Javadoc) |
|
190 |
* |
|
191 |
* @see java.lang.Iterable#iterator() |
|
192 |
*/ |
|
193 |
@Override |
|
194 |
public Iterator<String> iterator() { |
|
195 |
from = 0; |
|
196 |
total = 0; |
|
197 |
getNextItem(); |
|
198 |
return this; |
|
199 |
} |
|
200 |
|
|
201 |
/** |
|
202 |
* Gets the next item. |
|
203 |
* |
|
204 |
* @return the next item |
|
205 |
*/ |
|
206 |
private void getNextItem() { |
|
207 |
from += currentIterator; |
|
208 |
currentResponse = ElasticSearchResponse.createNewResponse(executeQuery(from, DEFAULT_SIZE)); |
|
209 |
total = currentResponse == null ? 0 : currentResponse.getTotal(); |
|
210 |
log.debug("from : " + from + " total of the request is " + total); |
|
211 |
currentIterator = 0; |
|
212 |
} |
|
213 |
|
|
214 |
/** |
|
215 |
* @return the projectCordaId |
|
216 |
*/ |
|
217 |
public String getProjectCordaId() { |
|
218 |
return projectCordaId; |
|
219 |
} |
|
220 |
|
|
221 |
/** |
|
222 |
* @param projectCordaId |
|
223 |
* the projectCordaId to set |
|
224 |
*/ |
|
225 |
public void setProjectCordaId(final String projectCordaId) { |
|
226 |
this.projectCordaId = projectCordaId; |
|
227 |
} |
|
228 |
|
|
229 |
/** |
|
230 |
* @return the journalName |
|
231 |
*/ |
|
232 |
public String getJournalName() { |
|
233 |
return journalName; |
|
234 |
} |
|
235 |
|
|
236 |
/** |
|
237 |
* @param journalName |
|
238 |
* the journalName to set |
|
239 |
*/ |
|
240 |
public void setJournalName(final String journalName) { |
|
241 |
this.journalName = journalName; |
|
242 |
} |
|
243 |
|
|
244 |
/** |
|
245 |
* @return the journalISSN |
|
246 |
*/ |
|
247 |
public String getJournalISSN() { |
|
248 |
return journalISSN; |
|
249 |
} |
|
250 |
|
|
251 |
/** |
|
252 |
* @param journalISSN |
|
253 |
* the journalISSN to set |
|
254 |
*/ |
|
255 |
public void setJournalISSN(final String journalISSN) { |
|
256 |
this.journalISSN = journalISSN; |
|
257 |
} |
|
258 |
|
|
259 |
/** |
|
260 |
* @return the openaireDatasource |
|
261 |
*/ |
|
262 |
public String getOpenaireDatasource() { |
|
263 |
return openaireDatasource; |
|
264 |
} |
|
265 |
|
|
266 |
/** |
|
267 |
* @param openaireDatasource |
|
268 |
* the openaireDatasource to set |
|
269 |
*/ |
|
270 |
public void setOpenaireDatasource(final String openaireDatasource) { |
|
271 |
this.openaireDatasource = openaireDatasource; |
|
272 |
} |
|
273 |
|
|
274 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/gtr2/Gtr2Iterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.gtr2; |
|
2 |
|
|
3 |
import java.util.ArrayList; |
|
4 |
import java.util.HashMap; |
|
5 |
import java.util.Iterator; |
|
6 |
import java.util.LinkedList; |
|
7 |
import java.util.List; |
|
8 |
import java.util.Map; |
|
9 |
import java.util.Queue; |
|
10 |
import java.util.function.Function; |
|
11 |
|
|
12 |
import org.apache.commons.lang.math.NumberUtils; |
|
13 |
import org.apache.commons.lang3.StringUtils; |
|
14 |
import org.apache.commons.logging.Log; |
|
15 |
import org.apache.commons.logging.LogFactory; |
|
16 |
import org.dom4j.Document; |
|
17 |
import org.dom4j.DocumentException; |
|
18 |
import org.dom4j.DocumentHelper; |
|
19 |
import org.dom4j.Element; |
|
20 |
import org.joda.time.DateTime; |
|
21 |
|
|
22 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
23 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
|
24 |
|
|
25 |
public abstract class Gtr2Iterator implements Iterator<String> { |
|
26 |
|
|
27 |
public static final int PAGE_SIZE = 20; |
|
28 |
|
|
29 |
private static final Log log = LogFactory.getLog(Gtr2Iterator.class); |
|
30 |
|
|
31 |
private final String baseUrl; |
|
32 |
private int currPage; |
|
33 |
private int endPage; |
|
34 |
private boolean incremental = false; |
|
35 |
private DateTime fromDate; |
|
36 |
|
|
37 |
private final Map<String, String> cache = new HashMap<>(); |
|
38 |
|
|
39 |
private final Queue<String> queue = new LinkedList<>(); |
|
40 |
|
|
41 |
private String nextElement; |
|
42 |
|
|
43 |
public Gtr2Iterator(final String baseUrl, final String fromDate, final String startPage, final String endPage) |
|
44 |
throws CollectorServiceException { |
|
45 |
|
|
46 |
this.baseUrl = baseUrl; |
|
47 |
this.currPage = NumberUtils.toInt(startPage, 1); |
|
48 |
this.endPage = NumberUtils.toInt(endPage, Integer.MAX_VALUE); |
|
49 |
this.incremental = StringUtils.isNotBlank(fromDate); |
|
50 |
|
|
51 |
if (this.incremental) { |
|
52 |
this.fromDate = Gtr2Helper.parseDate(fromDate); |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-collector-plugins-1.8.1