1 |
51970
|
miriam.bag
|
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
|
2 |
51956
|
miriam.bag
|
|
3 |
|
|
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
4 |
|
|
import org.apache.commons.logging.Log;
|
5 |
|
|
import org.apache.commons.logging.LogFactory;
|
6 |
|
|
import org.json.JSONObject;
|
7 |
|
|
import org.json.XML;
|
8 |
|
|
import org.jsoup.Jsoup;
|
9 |
|
|
import org.jsoup.nodes.Document;
|
10 |
|
|
import org.jsoup.nodes.Element;
|
11 |
|
|
import org.jsoup.select.Elements;
|
12 |
|
|
|
13 |
|
|
import java.util.ArrayList;
|
14 |
|
|
import java.util.Iterator;
|
15 |
|
|
|
16 |
|
|
import java.util.concurrent.ArrayBlockingQueue;
|
17 |
|
|
import java.util.function.Consumer;
|
18 |
|
|
|
19 |
|
|
/**
|
20 |
|
|
* Created by miriam on 04/05/2018.
|
21 |
|
|
*/
|
22 |
51970
|
miriam.bag
|
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
|
23 |
|
|
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
|
24 |
|
|
|
25 |
51956
|
miriam.bag
|
private final ArrayList<String> urls = new ArrayList<>();
|
26 |
52026
|
miriam.bag
|
private final ArrayList<String> metas = new ArrayList<String>();
|
27 |
52054
|
miriam.bag
|
private String filter;
|
28 |
51956
|
miriam.bag
|
|
29 |
52054
|
miriam.bag
|
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
|
30 |
51956
|
miriam.bag
|
|
31 |
|
|
urls.add(startUrl);
|
32 |
52054
|
miriam.bag
|
this.filter = filter;
|
33 |
51956
|
miriam.bag
|
}
|
34 |
|
|
|
35 |
52054
|
miriam.bag
|
private boolean containsFilter(String meta){
|
36 |
|
|
if (filter == null || filter.isEmpty())
|
37 |
|
|
return false;
|
38 |
|
|
String[] filter = this.filter.split(";");
|
39 |
|
|
for(String item:filter){
|
40 |
|
|
if (meta.contains(item))
|
41 |
|
|
return true;
|
42 |
|
|
}
|
43 |
|
|
return false;
|
44 |
|
|
}
|
45 |
|
|
|
46 |
52026
|
miriam.bag
|
private String addFilePath(String meta,String url, boolean isJson){
|
47 |
51956
|
miriam.bag
|
String path = url.replace("metadata", "pdf");
|
48 |
52054
|
miriam.bag
|
|
49 |
51956
|
miriam.bag
|
try {
|
50 |
52026
|
miriam.bag
|
if(isJson)
|
51 |
|
|
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
|
52 |
52031
|
miriam.bag
|
else{
|
53 |
52054
|
miriam.bag
|
|
54 |
52056
|
miriam.bag
|
if (meta.trim().startsWith("<!DOCTYPE"))
|
55 |
52054
|
miriam.bag
|
meta = meta.substring(meta.indexOf(">")+1);
|
56 |
|
|
int index = meta.lastIndexOf("</");
|
57 |
|
|
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
|
58 |
|
|
|
59 |
|
|
|
60 |
52031
|
miriam.bag
|
}
|
61 |
|
|
|
62 |
51956
|
miriam.bag
|
}catch(Exception ex){
|
63 |
52026
|
miriam.bag
|
log.info("not file with extension .json or .xml");
|
64 |
51956
|
miriam.bag
|
}
|
65 |
|
|
|
66 |
52026
|
miriam.bag
|
//JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}");
|
67 |
|
|
if(isJson){
|
68 |
52034
|
miriam.bag
|
JSONObject jsonobj = new JSONObject("{'resource':" + meta + "}");
|
69 |
52026
|
miriam.bag
|
return XML.toString(jsonobj);
|
70 |
|
|
}
|
71 |
|
|
return meta;
|
72 |
51956
|
miriam.bag
|
}
|
73 |
|
|
|
74 |
|
|
private void recurFolder(String text, String url){
|
75 |
|
|
Document doc = Jsoup.parse(text);
|
76 |
|
|
Elements links = doc.select("a");
|
77 |
|
|
for(Element e:links){
|
78 |
|
|
if (!e.text().equals("../")){
|
79 |
|
|
String file = e.attr("href");
|
80 |
52026
|
miriam.bag
|
if(file.endsWith(".json") || file.endsWith(".xml"))
|
81 |
|
|
metas.add(url+file);
|
82 |
51956
|
miriam.bag
|
else
|
83 |
|
|
urls.add(url+file);
|
84 |
|
|
}
|
85 |
|
|
}
|
86 |
|
|
}
|
87 |
|
|
|
88 |
|
|
|
89 |
51970
|
miriam.bag
|
@Override
|
90 |
|
|
public Iterator<String> iterator() {
|
91 |
|
|
final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
|
92 |
|
|
|
93 |
|
|
|
94 |
|
|
return new Iterator<String>(){
|
95 |
|
|
|
96 |
|
|
public void fillQueue() {
|
97 |
|
|
Connector c = new Connector();
|
98 |
|
|
String url;
|
99 |
52026
|
miriam.bag
|
while((metas.size()>0 || urls.size() > 0 ) && queue.size()<100){
|
100 |
|
|
if (metas.size() > 0){
|
101 |
|
|
url = metas.remove(0);
|
102 |
51956
|
miriam.bag
|
try {
|
103 |
51970
|
miriam.bag
|
c.get(url);
|
104 |
|
|
} catch (CollectorServiceException e) {
|
105 |
|
|
log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
|
106 |
51956
|
miriam.bag
|
}
|
107 |
51970
|
miriam.bag
|
if(c.isStatusOk()){
|
108 |
|
|
try {
|
109 |
|
|
String ret = c.getResponse();
|
110 |
52054
|
miriam.bag
|
if (ret != null && ret.length()>0 && !containsFilter(ret))
|
111 |
52026
|
miriam.bag
|
queue.put(addFilePath(ret,url,url.endsWith(".json")));
|
112 |
51970
|
miriam.bag
|
} catch (InterruptedException e) {
|
113 |
|
|
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
|
114 |
|
|
|
115 |
|
|
}
|
116 |
|
|
}
|
117 |
|
|
}else{
|
118 |
|
|
url = urls.remove(0);
|
119 |
|
|
try {
|
120 |
|
|
c.get(url);
|
121 |
|
|
} catch (CollectorServiceException e) {
|
122 |
|
|
log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
|
123 |
|
|
}
|
124 |
|
|
if(c.isStatusOk()){
|
125 |
|
|
if (c.responseTypeContains("text/html")){
|
126 |
|
|
recurFolder(c.getResponse(),url);
|
127 |
|
|
}
|
128 |
52026
|
miriam.bag
|
else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
|
129 |
51970
|
miriam.bag
|
try {
|
130 |
52026
|
miriam.bag
|
queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json")));
|
131 |
51970
|
miriam.bag
|
} catch (InterruptedException e) {
|
132 |
|
|
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
|
133 |
|
|
}
|
134 |
|
|
}
|
135 |
|
|
}
|
136 |
|
|
|
137 |
51956
|
miriam.bag
|
}
|
138 |
51970
|
miriam.bag
|
|
139 |
51956
|
miriam.bag
|
}
|
140 |
|
|
|
141 |
|
|
}
|
142 |
|
|
@Override
|
143 |
|
|
public boolean hasNext() {
|
144 |
|
|
if (queue.isEmpty()){
|
145 |
|
|
fillQueue();
|
146 |
|
|
}
|
147 |
|
|
return (!queue.isEmpty());
|
148 |
|
|
}
|
149 |
|
|
|
150 |
|
|
@Override
|
151 |
|
|
public String next() {
|
152 |
|
|
return queue.poll();
|
153 |
|
|
}
|
154 |
|
|
|
155 |
|
|
@Override
|
156 |
|
|
public void remove() {
|
157 |
|
|
|
158 |
|
|
}
|
159 |
|
|
|
160 |
|
|
@Override
|
161 |
|
|
public void forEachRemaining(Consumer<? super String> action) {
|
162 |
|
|
|
163 |
|
|
}
|
164 |
|
|
};
|
165 |
|
|
}
|
166 |
|
|
|
167 |
|
|
|
168 |
|
|
}
|