Revision 52235
Added by Miriam Baglioni over 6 years ago
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
2 | 2 |
|
3 |
import eu.dnetlib.data.collector.plugins.projects.gtr2.Gtr2ProjectsIterable; |
|
3 | 4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
4 | 5 |
import org.apache.commons.logging.Log; |
5 | 6 |
import org.apache.commons.logging.LogFactory; |
... | ... | |
21 | 22 |
*/ |
22 | 23 |
public class HTTPWithFileNameCollectorIterable implements Iterable<String> { |
23 | 24 |
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class); |
25 |
private final String TERMINATOR = "FINITO"; |
|
26 |
final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
24 | 27 |
|
25 | 28 |
private final ArrayList<String> urls = new ArrayList<>(); |
26 | 29 |
private final ArrayList<String> metas = new ArrayList<String>(); |
27 |
private String filter; |
|
30 |
private String filterParam;
|
|
28 | 31 |
|
32 |
int total = 0; |
|
33 |
int filtered = 0; |
|
34 |
|
|
29 | 35 |
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){ |
30 |
|
|
31 |
urls.add(startUrl); |
|
32 |
this.filter = filter; |
|
36 |
if (!startUrl.isEmpty()) |
|
37 |
urls.add(startUrl); |
|
38 |
this.filterParam = filter; |
|
39 |
Thread ft = new Thread(new FillMetaQueue()); |
|
40 |
ft.start(); |
|
33 | 41 |
} |
34 | 42 |
|
35 |
private boolean containsFilter(String meta){ |
|
43 |
/* private boolean containsFilter(String meta){
|
|
36 | 44 |
if (filter == null || filter.isEmpty()) |
37 | 45 |
return false; |
38 | 46 |
String[] filter = this.filter.split(";"); |
... | ... | |
92 | 100 |
urls.add(url+file); |
93 | 101 |
} |
94 | 102 |
} |
95 |
} |
|
103 |
}*/
|
|
96 | 104 |
|
97 | 105 |
|
98 | 106 |
@Override |
99 | 107 |
public Iterator<String> iterator() { |
100 |
final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
101 | 108 |
|
102 | 109 |
|
110 |
|
|
103 | 111 |
return new Iterator<String>(){ |
104 |
int total = 0; |
|
112 |
/* int total = 0;
|
|
105 | 113 |
int filtered = 0; |
106 | 114 |
public void fillQueue() { |
107 | 115 |
Connector c = new Connector(); |
... | ... | |
155 | 163 |
|
156 | 164 |
} |
157 | 165 |
|
158 |
} |
|
166 |
}*/
|
|
159 | 167 |
@Override |
160 | 168 |
public boolean hasNext() { |
161 |
if (queue.isEmpty()){ |
|
169 |
while (queue.isEmpty()); |
|
170 |
if (queue.peek().equals(TERMINATOR)) |
|
171 |
log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered)); |
|
172 |
return !queue.peek().equals(TERMINATOR); |
|
173 |
/*if (queue.isEmpty()){ |
|
162 | 174 |
fillQueue(); |
163 |
} |
|
164 |
if(queue.isEmpty()){ |
|
165 |
log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered)); |
|
166 |
return false; |
|
167 |
} |
|
175 |
}*/
|
|
176 |
// if(queue.isEmpty()){
|
|
177 |
//log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered));
|
|
178 |
// return false;
|
|
179 |
//}
|
|
168 | 180 |
|
169 |
return true; |
|
181 |
//return true;
|
|
170 | 182 |
} |
171 | 183 |
|
172 | 184 |
@Override |
... | ... | |
174 | 186 |
return queue.poll(); |
175 | 187 |
} |
176 | 188 |
|
177 |
// @Override |
|
178 |
// public void remove() { |
|
179 |
// |
|
180 |
// } |
|
181 |
// |
|
182 |
// @Override |
|
183 |
// public void forEachRemaining(Consumer<? super String> action) { |
|
184 |
// |
|
185 |
// } |
|
186 | 189 |
}; |
187 | 190 |
} |
188 | 191 |
|
192 |
private class FillMetaQueue implements Runnable{ |
|
189 | 193 |
|
194 |
|
|
195 |
Connector c = new Connector(); |
|
196 |
|
|
197 |
public void fillQueue() { |
|
198 |
String url; |
|
199 |
while((metas.size()>0 || urls.size() > 0 )){ |
|
200 |
log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size()); |
|
201 |
if (metas.size() > 0){ |
|
202 |
url = metas.remove(0); |
|
203 |
try { |
|
204 |
c.get(url); |
|
205 |
} catch (CollectorServiceException e) { |
|
206 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
207 |
} |
|
208 |
if(c.isStatusOk()){ |
|
209 |
try { |
|
210 |
String ret = c.getResponse(); |
|
211 |
if (ret != null && ret.length()>0) { |
|
212 |
if (!containsFilter(ret)) |
|
213 |
queue.put(addFilePath(ret, url, url.endsWith(".json"))); |
|
214 |
else |
|
215 |
filtered++; |
|
216 |
total++; |
|
217 |
} |
|
218 |
} catch (InterruptedException e) { |
|
219 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
220 |
|
|
221 |
|
|
222 |
} |
|
223 |
} |
|
224 |
}else{ |
|
225 |
url = urls.remove(0); |
|
226 |
try { |
|
227 |
c.get(url); |
|
228 |
} catch (CollectorServiceException e) { |
|
229 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
230 |
} |
|
231 |
if(c.isStatusOk()){ |
|
232 |
if (c.responseTypeContains("text/html")){ |
|
233 |
recurFolder(c.getResponse(),url); |
|
234 |
} |
|
235 |
else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){ |
|
236 |
try { |
|
237 |
queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json"))); |
|
238 |
} catch (InterruptedException e) { |
|
239 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
240 |
} |
|
241 |
} |
|
242 |
} |
|
243 |
|
|
244 |
} |
|
245 |
|
|
246 |
} |
|
247 |
try { |
|
248 |
queue.put(TERMINATOR); |
|
249 |
} catch (InterruptedException e) { |
|
250 |
e.printStackTrace(); |
|
251 |
} |
|
252 |
|
|
253 |
} |
|
254 |
|
|
255 |
private boolean containsFilter(String meta){ |
|
256 |
if (filterParam == null || filterParam.isEmpty()) |
|
257 |
return false; |
|
258 |
String[] filter = filterParam.split(";"); |
|
259 |
for(String item:filter){ |
|
260 |
if (meta.contains(item)) |
|
261 |
return true; |
|
262 |
} |
|
263 |
return false; |
|
264 |
} |
|
265 |
|
|
266 |
private String addFilePath(String meta,String url, boolean isJson){ |
|
267 |
String path = url.replace("metadata", "pdf"); |
|
268 |
|
|
269 |
try { |
|
270 |
if(isJson) |
|
271 |
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
272 |
else{ |
|
273 |
|
|
274 |
if (meta.contains("<!DOCTYPE")) { |
|
275 |
meta = meta.substring(meta.indexOf("<!DOCTYPE")); |
|
276 |
meta = meta.substring(meta.indexOf(">") + 1); |
|
277 |
} |
|
278 |
int index = meta.lastIndexOf("</"); |
|
279 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
280 |
|
|
281 |
|
|
282 |
} |
|
283 |
|
|
284 |
}catch(Exception ex){ |
|
285 |
log.info("not file with extension .json or .xml"); |
|
286 |
} |
|
287 |
|
|
288 |
|
|
289 |
if(isJson){ |
|
290 |
JSONObject jsonobj = null; |
|
291 |
try { |
|
292 |
jsonobj = new JSONObject("{'resource':" + meta + "}"); |
|
293 |
|
|
294 |
return XML.toString(jsonobj); |
|
295 |
}catch(Exception e){ |
|
296 |
log.fatal("Impossible to transform json object to xml \n" + jsonobj + "\n " + e.getMessage() + "\n" + url); |
|
297 |
throw new RuntimeException(); |
|
298 |
} |
|
299 |
} |
|
300 |
return meta; |
|
301 |
} |
|
302 |
|
|
303 |
private void recurFolder(String text, String url){ |
|
304 |
Document doc = Jsoup.parse(text); |
|
305 |
Elements links = doc.select("a"); |
|
306 |
for(Element e:links){ |
|
307 |
if (!e.text().equals("../")){ |
|
308 |
String file = e.attr("href"); |
|
309 |
if(file.endsWith(".json") || file.endsWith(".xml")) |
|
310 |
metas.add(url+file); |
|
311 |
else |
|
312 |
urls.add(url+file); |
|
313 |
} |
|
314 |
} |
|
315 |
} |
|
316 |
|
|
317 |
|
|
318 |
@Override |
|
319 |
public void run() { |
|
320 |
fillQueue(); |
|
321 |
} |
|
322 |
} |
|
323 |
|
|
324 |
|
|
325 |
|
|
190 | 326 |
} |
Also available in: Unified diff
Changed the implementation of data gathering: a background thread now fills a shared queue that the iterator reads from.