Revision 51956
Added by Miriam Baglioni almost 6 years ago
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/RSCollectorIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
import org.apache.commons.logging.Log; |
|
5 |
import org.apache.commons.logging.LogFactory; |
|
6 |
import org.json.JSONObject; |
|
7 |
import org.json.XML; |
|
8 |
import org.jsoup.Jsoup; |
|
9 |
import org.jsoup.nodes.Document; |
|
10 |
import org.jsoup.nodes.Element; |
|
11 |
import org.jsoup.select.Elements; |
|
12 |
|
|
13 |
import java.util.ArrayList; |
|
14 |
import java.util.Iterator; |
|
15 |
|
|
16 |
import java.util.concurrent.ArrayBlockingQueue; |
|
17 |
import java.util.function.Consumer; |
|
18 |
|
|
19 |
/** |
|
20 |
* Created by miriam on 04/05/2018. |
|
21 |
*/ |
|
22 |
public class RSCollectorIterable implements Iterable<String> { |
|
23 |
private static final Log log = LogFactory.getLog(RSCollectorIterable.class); |
|
24 |
private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
25 |
private final ArrayList<String> urls = new ArrayList<>(); |
|
26 |
private final ArrayList<String> jsons = new ArrayList<String>(); |
|
27 |
|
|
28 |
|
|
29 |
public RSCollectorIterable(String startUrl){ |
|
30 |
|
|
31 |
urls.add(startUrl); |
|
32 |
fillQueue(); |
|
33 |
} |
|
34 |
|
|
35 |
private String addFilePath(String json,String url){ |
|
36 |
String path = url.replace("metadata", "pdf"); |
|
37 |
try { |
|
38 |
json = json.substring(0, json.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
39 |
}catch(Exception ex){ |
|
40 |
log.info("not file with extension .json"); |
|
41 |
} |
|
42 |
|
|
43 |
JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}"); |
|
44 |
|
|
45 |
return XML.toString(jsonobj); |
|
46 |
} |
|
47 |
|
|
48 |
private void recurFolder(String text, String url){ |
|
49 |
Document doc = Jsoup.parse(text); |
|
50 |
Elements links = doc.select("a"); |
|
51 |
for(Element e:links){ |
|
52 |
if (!e.text().equals("../")){ |
|
53 |
String file = e.attr("href"); |
|
54 |
if(file.endsWith(".json")) |
|
55 |
jsons.add(url+file); |
|
56 |
else |
|
57 |
urls.add(url+file); |
|
58 |
} |
|
59 |
} |
|
60 |
} |
|
61 |
|
|
62 |
private void fillQueue() { |
|
63 |
Connector c = new Connector(); |
|
64 |
String url; |
|
65 |
while((jsons.size()>0 || urls.size() > 0 ) && queue.size()<100){ |
|
66 |
if (jsons.size() > 0){ |
|
67 |
url = jsons.remove(0); |
|
68 |
try { |
|
69 |
c.get(url); |
|
70 |
} catch (CollectorServiceException e) { |
|
71 |
log.error("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
72 |
} |
|
73 |
if(c.isStatusOk()){ |
|
74 |
try { |
|
75 |
String ret = c.getResponse(); |
|
76 |
if (ret != null && ret.length()>0) |
|
77 |
queue.put(addFilePath(ret,url)); |
|
78 |
} catch (InterruptedException e) { |
|
79 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
80 |
|
|
81 |
} |
|
82 |
} |
|
83 |
}else{ |
|
84 |
url = urls.remove(0); |
|
85 |
try { |
|
86 |
c.get(url); |
|
87 |
} catch (CollectorServiceException e) { |
|
88 |
log.error("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
89 |
} |
|
90 |
if(c.isStatusOk()){ |
|
91 |
if (c.responseTypeContains("text/html")){ |
|
92 |
recurFolder(c.getResponse(),url); |
|
93 |
} |
|
94 |
else if(c.responseTypeContains("application/json")){ |
|
95 |
try { |
|
96 |
queue.put(addFilePath(c.getResponse(),url)); |
|
97 |
} catch (InterruptedException e) { |
|
98 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
99 |
} |
|
100 |
} |
|
101 |
} |
|
102 |
|
|
103 |
} |
|
104 |
|
|
105 |
} |
|
106 |
|
|
107 |
} |
|
108 |
|
|
109 |
@Override |
|
110 |
public Iterator<String> iterator() { |
|
111 |
|
|
112 |
return new Iterator<String>(){ |
|
113 |
|
|
114 |
@Override |
|
115 |
public boolean hasNext() { |
|
116 |
if (queue.isEmpty()){ |
|
117 |
fillQueue(); |
|
118 |
} |
|
119 |
return (!queue.isEmpty()); |
|
120 |
} |
|
121 |
|
|
122 |
@Override |
|
123 |
public String next() { |
|
124 |
return queue.poll(); |
|
125 |
} |
|
126 |
|
|
127 |
@Override |
|
128 |
public void remove() { |
|
129 |
|
|
130 |
} |
|
131 |
|
|
132 |
@Override |
|
133 |
public void forEachRemaining(Consumer<? super String> action) { |
|
134 |
|
|
135 |
} |
|
136 |
}; |
|
137 |
} |
|
138 |
|
|
139 |
|
|
140 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/Connector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
|
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 07/05/2018. |
|
9 |
*/ |
|
10 |
public class Connector extends HttpConnector implements ConnectorInterface { |
|
11 |
private String response; |
|
12 |
|
|
13 |
@Override |
|
14 |
public void get(final String requestUrl) throws CollectorServiceException { |
|
15 |
response = getInputSource(requestUrl); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public String getResponse() { |
|
20 |
return response; |
|
21 |
} |
|
22 |
|
|
23 |
@Override |
|
24 |
public boolean isStatusOk() { |
|
25 |
return (response != null); |
|
26 |
} |
|
27 |
|
|
28 |
@Override |
|
29 |
public boolean responseTypeContains(String string) { |
|
30 |
String responseType = getResponseType(); |
|
31 |
if (responseType != null) |
|
32 |
return getResponseType().contains(string); |
|
33 |
return false; |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/RSCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 04/05/2018. |
|
9 |
*/ |
|
10 |
public class RSCollectorPlugin extends AbstractCollectorPlugin { |
|
11 |
|
|
12 |
@Override |
|
13 |
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException { |
|
14 |
return new RSCollectorIterable(interfaceDescriptor.getBaseUrl()); |
|
15 |
} |
|
16 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/ConnectorInterface.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
|
|
5 |
/** |
|
6 |
* Created by miriam on 07/05/2018. |
|
7 |
*/ |
|
8 |
public interface ConnectorInterface { |
|
9 |
|
|
10 |
public void get(final String requestUrl) throws CollectorServiceException; |
|
11 |
|
|
12 |
public String getResponse(); |
|
13 |
|
|
14 |
public boolean isStatusOk(); |
|
15 |
|
|
16 |
|
|
17 |
public boolean responseTypeContains(String string); |
|
18 |
|
|
19 |
} |
Also available in: Unified diff
Plugin for collecting metadata from files mapped to URLs (related to #3236)