Revision 52518
Added by Miriam Baglioni over 6 years ago
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
import java.util.Iterator; |
|
3 |
|
|
4 |
import org.junit.Ignore; |
|
5 |
import org.junit.Test; |
|
6 |
|
|
7 |
|
|
8 |
/** |
|
9 |
* Created by miriam on 07/05/2018. |
|
10 |
*/ |
|
11 |
@Ignore |
|
12 |
public class HTTPWithFileNameTest { |
|
13 |
|
|
14 |
private void iterate(Iterator<String> iterator, boolean exit){ |
|
15 |
try{ |
|
16 |
while (iterator.hasNext()){ |
|
17 |
|
|
18 |
System.out.println(iterator.next()); |
|
19 |
if(exit) |
|
20 |
System.exit(0); |
|
21 |
|
|
22 |
|
|
23 |
} |
|
24 |
|
|
25 |
}catch(Exception ex){ |
|
26 |
ex.printStackTrace(); |
|
27 |
} |
|
28 |
} |
|
29 |
|
|
30 |
@Test |
|
31 |
@Ignore |
|
32 |
public void testRSCollectorFrontiers() |
|
33 |
{ |
|
34 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Frontiers/data/Frontiers/metadata/000/",null); |
|
35 |
iterate(rsc.iterator(),false); |
|
36 |
|
|
37 |
} |
|
38 |
|
|
39 |
@Test |
|
40 |
@Ignore |
|
41 |
public void testRSCollectorPLOSCount() |
|
42 |
{ |
|
43 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/354/","article-type=\"correction\""); |
|
44 |
Iterator<String> iterator = rsc.iterator(); |
|
45 |
int count = 0; |
|
46 |
int body = 0; |
|
47 |
int corrections = 0; |
|
48 |
try{ |
|
49 |
while (iterator.hasNext()){ |
|
50 |
|
|
51 |
String meta = iterator.next(); |
|
52 |
if (!meta.contains("article-type=\"correction\"")){ |
|
53 |
count++; |
|
54 |
int index = meta.indexOf("<body>"); |
|
55 |
if(meta.substring(index).contains("<sec")) |
|
56 |
body++; |
|
57 |
else { |
|
58 |
System.out.println(meta); |
|
59 |
System.out.println(count); |
|
60 |
} |
|
61 |
|
|
62 |
}else |
|
63 |
corrections++; |
|
64 |
|
|
65 |
} |
|
66 |
System.out.println(count + " " + body + " " + corrections); |
|
67 |
}catch(Exception ex){ |
|
68 |
ex.printStackTrace(); |
|
69 |
} |
|
70 |
} |
|
71 |
|
|
72 |
@Test |
|
73 |
@Ignore |
|
74 |
public void testRSCollectorPLOS() |
|
75 |
{ |
|
76 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/400/","article-type=\"correction\""); |
|
77 |
|
|
78 |
|
|
79 |
iterate(rsc.iterator(),false); |
|
80 |
} |
|
81 |
|
|
82 |
@Test |
|
83 |
@Ignore |
|
84 |
public void testRSCollectorSpringer() |
|
85 |
{ |
|
86 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Springer-OA/data/Springer-OA/metadata/8a0/",null); |
|
87 |
|
|
88 |
iterate(rsc.iterator(),false); |
|
89 |
|
|
90 |
} |
|
91 |
|
|
92 |
@Test |
|
93 |
public void testEmptyCollection() |
|
94 |
{ |
|
95 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("",null); |
|
96 |
|
|
97 |
iterate(rsc.iterator(),true); |
|
98 |
} |
|
99 |
|
|
100 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/Connector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
|
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 07/05/2018. |
|
9 |
*/ |
|
10 |
public class Connector extends HttpConnector implements ConnectorInterface { |
|
11 |
private String response; |
|
12 |
|
|
13 |
@Override |
|
14 |
public void get(final String requestUrl) throws CollectorServiceException { |
|
15 |
response = getInputSource(requestUrl); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public String getResponse() { |
|
20 |
return response; |
|
21 |
} |
|
22 |
|
|
23 |
@Override |
|
24 |
public boolean isStatusOk() { |
|
25 |
return (response != null); |
|
26 |
} |
|
27 |
|
|
28 |
@Override |
|
29 |
public boolean responseTypeContains(String string) { |
|
30 |
String responseType = getResponseType(); |
|
31 |
if (responseType != null) |
|
32 |
return responseType.contains(string); |
|
33 |
return false; |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import java.util.ArrayList; |
|
4 |
import java.util.Iterator; |
|
5 |
import java.util.NoSuchElementException; |
|
6 |
import java.util.Objects; |
|
7 |
import java.util.concurrent.ArrayBlockingQueue; |
|
8 |
import java.util.concurrent.TimeUnit; |
|
9 |
|
|
10 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
11 |
import org.apache.commons.logging.Log; |
|
12 |
import org.apache.commons.logging.LogFactory; |
|
13 |
import org.json.JSONObject; |
|
14 |
import org.json.XML; |
|
15 |
import org.jsoup.Jsoup; |
|
16 |
import org.jsoup.nodes.Document; |
|
17 |
import org.jsoup.nodes.Element; |
|
18 |
import org.jsoup.select.Elements; |
|
19 |
|
|
20 |
/** |
|
21 |
* Created by miriam on 04/05/2018. |
|
22 |
*/ |
|
23 |
public class HTTPWithFileNameCollectorIterable implements Iterable<String> { |
|
24 |
|
|
25 |
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class); |
|
26 |
private static final String TERMINATOR = "FINITO"; |
|
27 |
private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>"; |
|
28 |
public static final String APP_JSON = "application/json"; |
|
29 |
public static final String APP_XML = "application/xml"; |
|
30 |
public static final String TEXT_HTML = "text/html"; |
|
31 |
private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
32 |
|
|
33 |
private long waitTime = 60L; |
|
34 |
|
|
35 |
private final ArrayList<String> urls = new ArrayList<>(); |
|
36 |
private final ArrayList<String> metas = new ArrayList<String>(); |
|
37 |
private String filterParam; |
|
38 |
|
|
39 |
int total = 0; |
|
40 |
int filtered = 0; |
|
41 |
|
|
42 |
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){ |
|
43 |
if (!startUrl.isEmpty()) |
|
44 |
urls.add(startUrl); |
|
45 |
this.filterParam = filter; |
|
46 |
Thread ft = new Thread(new FillMetaQueue()); |
|
47 |
ft.start(); |
|
48 |
} |
|
49 |
|
|
50 |
|
|
51 |
@Override |
|
52 |
public Iterator<String> iterator() { |
|
53 |
return new Iterator<String>(){ |
|
54 |
|
|
55 |
private String last = null; |
|
56 |
|
|
57 |
@Override |
|
58 |
public boolean hasNext() { |
|
59 |
return !Objects.equals(last, TERMINATOR); |
|
60 |
} |
|
61 |
|
|
62 |
@Override |
|
63 |
public String next() { |
|
64 |
try { |
|
65 |
last = queue.poll(waitTime, TimeUnit.SECONDS); |
|
66 |
if (Objects.equals(last, TERMINATOR)) { |
|
67 |
log.info("found terminator, omg!"); |
|
68 |
} |
|
69 |
} catch (InterruptedException e) { |
|
70 |
log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS)); |
|
71 |
throw new NoSuchElementException(e.getMessage()); |
|
72 |
} |
|
73 |
return last; |
|
74 |
} |
|
75 |
|
|
76 |
}; |
|
77 |
} |
|
78 |
|
|
79 |
private class FillMetaQueue implements Runnable { |
|
80 |
|
|
81 |
final Connector c = new Connector(); |
|
82 |
|
|
83 |
public void fillQueue() { |
|
84 |
String url; |
|
85 |
while((metas.size()>0 || urls.size() > 0 )) { |
|
86 |
log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size()); |
|
87 |
if (metas.size() > 0) { |
|
88 |
url = metas.remove(0); |
|
89 |
try { |
|
90 |
c.get(url); |
|
91 |
} catch (CollectorServiceException e) { |
|
92 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
93 |
} |
|
94 |
if(c.isStatusOk()){ |
|
95 |
try { |
|
96 |
String ret = c.getResponse(); |
|
97 |
if (ret != null && ret.length()>0) { |
|
98 |
if (!containsFilter(ret)) |
|
99 |
queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS); |
|
100 |
else |
|
101 |
filtered++; |
|
102 |
total++; |
|
103 |
} |
|
104 |
} catch (InterruptedException e) { |
|
105 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
106 |
|
|
107 |
} |
|
108 |
} |
|
109 |
} else { |
|
110 |
url = urls.remove(0); |
|
111 |
try { |
|
112 |
c.get(url); |
|
113 |
} catch (CollectorServiceException e) { |
|
114 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
115 |
} |
|
116 |
if(c.isStatusOk()) { |
|
117 |
if (c.responseTypeContains(TEXT_HTML)){ |
|
118 |
recurFolder(c.getResponse(), url); |
|
119 |
} else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){ |
|
120 |
try { |
|
121 |
final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON)); |
|
122 |
queue.offer(element, waitTime, TimeUnit.SECONDS); |
|
123 |
} catch (InterruptedException e) { |
|
124 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
125 |
} |
|
126 |
} |
|
127 |
} |
|
128 |
} |
|
129 |
|
|
130 |
} |
|
131 |
try { |
|
132 |
queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS); |
|
133 |
} catch (InterruptedException e) { |
|
134 |
throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e); |
|
135 |
} |
|
136 |
|
|
137 |
} |
|
138 |
|
|
139 |
private boolean containsFilter(String meta){ |
|
140 |
if (filterParam == null || filterParam.isEmpty()) |
|
141 |
return false; |
|
142 |
String[] filter = filterParam.split(";"); |
|
143 |
for(String item:filter){ |
|
144 |
if (meta.contains(item)) |
|
145 |
return true; |
|
146 |
} |
|
147 |
return false; |
|
148 |
} |
|
149 |
|
|
150 |
private String addFilePath(String meta, String url, boolean isJson){ |
|
151 |
String path = url.replace("metadata", "pdf"); |
|
152 |
|
|
153 |
try { |
|
154 |
if(isJson) |
|
155 |
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
156 |
else { |
|
157 |
|
|
158 |
if (meta.contains("<!DOCTYPE")) { |
|
159 |
meta = meta.substring(meta.indexOf("<!DOCTYPE")); |
|
160 |
meta = meta.substring(meta.indexOf(">") + 1); |
|
161 |
} |
|
162 |
int index = meta.lastIndexOf("</"); |
|
163 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
164 |
} |
|
165 |
} catch(Exception ex) { |
|
166 |
log.info("not file with extension .json or .xml"); |
|
167 |
} |
|
168 |
|
|
169 |
|
|
170 |
if(isJson) { |
|
171 |
try { |
|
172 |
return XML.toString(new JSONObject("{'resource':" + meta + "}")); |
|
173 |
} catch(Exception e) { |
|
174 |
log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url); |
|
175 |
// throw new RuntimeException(); |
|
176 |
final String junk = String.format(JUNK, url); |
|
177 |
log.warn("returning " + junk); |
|
178 |
return junk; |
|
179 |
} |
|
180 |
} |
|
181 |
return meta; |
|
182 |
} |
|
183 |
|
|
184 |
private void recurFolder(String text, String url){ |
|
185 |
Document doc = Jsoup.parse(text); |
|
186 |
Elements links = doc.select("a"); |
|
187 |
for(Element e:links){ |
|
188 |
if (!e.text().equals("../")){ |
|
189 |
String file = e.attr("href"); |
|
190 |
if(file.endsWith(".json") || file.endsWith(".xml")) |
|
191 |
metas.add(url+file); |
|
192 |
else |
|
193 |
urls.add(url+file); |
|
194 |
} |
|
195 |
} |
|
196 |
} |
|
197 |
|
|
198 |
|
|
199 |
@Override |
|
200 |
public void run() { |
|
201 |
fillQueue(); |
|
202 |
} |
|
203 |
} |
|
204 |
|
|
205 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/ConnectorInterface.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
|
|
5 |
/** |
|
6 |
* Created by miriam on 07/05/2018. |
|
7 |
*/ |
|
8 |
public interface ConnectorInterface { |
|
9 |
|
|
10 |
public void get(final String requestUrl) throws CollectorServiceException; |
|
11 |
|
|
12 |
public String getResponse(); |
|
13 |
|
|
14 |
public boolean isStatusOk(); |
|
15 |
|
|
16 |
|
|
17 |
public boolean responseTypeContains(String string); |
|
18 |
|
|
19 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 04/05/2018. |
|
9 |
*/ |
|
10 |
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin { |
|
11 |
|
|
12 |
@Override |
|
13 |
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException { |
|
14 |
return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl(), interfaceDescriptor.getParams().get("filter")); |
|
15 |
} |
|
16 |
} |
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
import java.util.Iterator; |
|
3 |
|
|
4 |
import org.junit.Ignore; |
|
5 |
import org.junit.Test; |
|
6 |
|
|
7 |
|
|
8 |
/** |
|
9 |
* Created by miriam on 07/05/2018. |
|
10 |
*/ |
|
11 |
@Ignore |
|
12 |
public class HTTPWithFileNameTest { |
|
13 |
|
|
14 |
private void iterate(Iterator<String> iterator, boolean exit){ |
|
15 |
try{ |
|
16 |
while (iterator.hasNext()){ |
|
17 |
|
|
18 |
System.out.println(iterator.next()); |
|
19 |
if(exit) |
|
20 |
System.exit(0); |
|
21 |
|
|
22 |
|
|
23 |
} |
|
24 |
|
|
25 |
}catch(Exception ex){ |
|
26 |
ex.printStackTrace(); |
|
27 |
} |
|
28 |
} |
|
29 |
|
|
30 |
@Test |
|
31 |
@Ignore |
|
32 |
public void testRSCollectorFrontiers() |
|
33 |
{ |
|
34 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Frontiers/data/Frontiers/metadata/000/",null); |
|
35 |
iterate(rsc.iterator(),false); |
|
36 |
|
|
37 |
} |
|
38 |
|
|
39 |
@Test |
|
40 |
@Ignore |
|
41 |
public void testRSCollectorPLOSCount() |
|
42 |
{ |
|
43 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/354/","article-type=\"correction\""); |
|
44 |
Iterator<String> iterator = rsc.iterator(); |
|
45 |
int count = 0; |
|
46 |
int body = 0; |
|
47 |
int corrections = 0; |
|
48 |
try{ |
|
49 |
while (iterator.hasNext()){ |
|
50 |
|
|
51 |
String meta = iterator.next(); |
|
52 |
if (!meta.contains("article-type=\"correction\"")){ |
|
53 |
count++; |
|
54 |
int index = meta.indexOf("<body>"); |
|
55 |
if(meta.substring(index).contains("<sec")) |
|
56 |
body++; |
|
57 |
else { |
|
58 |
System.out.println(meta); |
|
59 |
System.out.println(count); |
|
60 |
} |
|
61 |
|
|
62 |
}else |
|
63 |
corrections++; |
|
64 |
|
|
65 |
} |
|
66 |
System.out.println(count + " " + body + " " + corrections); |
|
67 |
}catch(Exception ex){ |
|
68 |
ex.printStackTrace(); |
|
69 |
} |
|
70 |
} |
|
71 |
|
|
72 |
@Test |
|
73 |
@Ignore |
|
74 |
public void testRSCollectorPLOS() |
|
75 |
{ |
|
76 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/400/","article-type=\"correction\""); |
|
77 |
|
|
78 |
|
|
79 |
iterate(rsc.iterator(),false); |
|
80 |
} |
|
81 |
|
|
82 |
@Test |
|
83 |
@Ignore |
|
84 |
public void testRSCollectorSpringer() |
|
85 |
{ |
|
86 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Springer-OA/data/Springer-OA/metadata/8a0/",null); |
|
87 |
|
|
88 |
iterate(rsc.iterator(),false); |
|
89 |
|
|
90 |
} |
|
91 |
|
|
92 |
@Test |
|
93 |
public void testEmptyCollection() |
|
94 |
{ |
|
95 |
HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("",null); |
|
96 |
|
|
97 |
iterate(rsc.iterator(),true); |
|
98 |
} |
|
99 |
|
|
100 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 04/05/2018. |
|
9 |
*/ |
|
10 |
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin { |
|
11 |
|
|
12 |
@Override |
|
13 |
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException { |
|
14 |
return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl(), interfaceDescriptor.getParams().get("filter")); |
|
15 |
} |
|
16 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/Connector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
|
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 07/05/2018. |
|
9 |
*/ |
|
10 |
public class Connector extends HttpConnector implements ConnectorInterface { |
|
11 |
private String response; |
|
12 |
|
|
13 |
@Override |
|
14 |
public void get(final String requestUrl) throws CollectorServiceException { |
|
15 |
response = getInputSource(requestUrl); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public String getResponse() { |
|
20 |
return response; |
|
21 |
} |
|
22 |
|
|
23 |
@Override |
|
24 |
public boolean isStatusOk() { |
|
25 |
return (response != null); |
|
26 |
} |
|
27 |
|
|
28 |
@Override |
|
29 |
public boolean responseTypeContains(String string) { |
|
30 |
String responseType = getResponseType(); |
|
31 |
if (responseType != null) |
|
32 |
return responseType.contains(string); |
|
33 |
return false; |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/ConnectorInterface.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
|
|
5 |
/** |
|
6 |
* Created by miriam on 07/05/2018. |
|
7 |
*/ |
|
8 |
public interface ConnectorInterface { |
|
9 |
|
|
10 |
void get(final String requestUrl) throws CollectorServiceException; |
|
11 |
|
|
12 |
String getResponse(); |
|
13 |
|
|
14 |
boolean isStatusOk(); |
|
15 |
|
|
16 |
|
|
17 |
boolean responseTypeContains(String string); |
|
18 |
|
|
19 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
|
|
3 |
import java.util.ArrayList; |
|
4 |
import java.util.Iterator; |
|
5 |
import java.util.NoSuchElementException; |
|
6 |
import java.util.Objects; |
|
7 |
import java.util.concurrent.ArrayBlockingQueue; |
|
8 |
import java.util.concurrent.TimeUnit; |
|
9 |
|
|
10 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
11 |
import org.apache.commons.logging.Log; |
|
12 |
import org.apache.commons.logging.LogFactory; |
|
13 |
import org.json.JSONObject; |
|
14 |
import org.json.XML; |
|
15 |
import org.jsoup.Jsoup; |
|
16 |
import org.jsoup.nodes.Document; |
|
17 |
import org.jsoup.nodes.Element; |
|
18 |
import org.jsoup.select.Elements; |
|
19 |
|
|
20 |
/** |
|
21 |
* Created by miriam on 04/05/2018. |
|
22 |
*/ |
|
23 |
public class HTTPWithFileNameCollectorIterable implements Iterable<String> { |
|
24 |
|
|
25 |
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class); |
|
26 |
private static final String TERMINATOR = "FINITO"; |
|
27 |
private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>"; |
|
28 |
public static final String APP_JSON = "application/json"; |
|
29 |
public static final String APP_XML = "application/xml"; |
|
30 |
public static final String TEXT_HTML = "text/html"; |
|
31 |
private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
32 |
|
|
33 |
private long waitTime = 60L; |
|
34 |
|
|
35 |
private final ArrayList<String> urls = new ArrayList<>(); |
|
36 |
private final ArrayList<String> metas = new ArrayList<String>(); |
|
37 |
private String filterParam; |
|
38 |
|
|
39 |
int total = 0; |
|
40 |
int filtered = 0; |
|
41 |
|
|
42 |
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){ |
|
43 |
if (!startUrl.isEmpty()) |
|
44 |
urls.add(startUrl); |
|
45 |
this.filterParam = filter; |
|
46 |
Thread ft = new Thread(new FillMetaQueue()); |
|
47 |
ft.start(); |
|
48 |
} |
|
49 |
|
|
50 |
|
|
51 |
@Override |
|
52 |
public Iterator<String> iterator() { |
|
53 |
return new Iterator<String>(){ |
|
54 |
|
|
55 |
private String last = null; |
|
56 |
private boolean exec_next = true; |
|
57 |
|
|
58 |
@Override |
|
59 |
public boolean hasNext() { |
|
60 |
if(exec_next){ |
|
61 |
try { |
|
62 |
last = queue.poll(waitTime, TimeUnit.SECONDS); |
|
63 |
exec_next = false; |
|
64 |
}catch(InterruptedException e){ |
|
65 |
log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS)); |
|
66 |
throw new NoSuchElementException(e.getMessage()); |
|
67 |
} |
|
68 |
} |
|
69 |
|
|
70 |
return !Objects.equals(last, TERMINATOR); |
|
71 |
} |
|
72 |
|
|
73 |
@Override |
|
74 |
public String next() { |
|
75 |
exec_next = true; |
|
76 |
return last; |
|
77 |
} |
|
78 |
|
|
79 |
// @Override |
|
80 |
// public boolean hasNext() { |
|
81 |
// |
|
82 |
// return !Objects.equals(last, TERMINATOR); |
|
83 |
// } |
|
84 |
// |
|
85 |
// @Override |
|
86 |
// public String next() { |
|
87 |
// try { |
|
88 |
// last = queue.poll(waitTime, TimeUnit.SECONDS); |
|
89 |
// if (Objects.equals(last, TERMINATOR)) { |
|
90 |
// log.info("found terminator, omg!"); |
|
91 |
// } |
|
92 |
// } catch (InterruptedException e) { |
|
93 |
// log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS)); |
|
94 |
// throw new NoSuchElementException(e.getMessage()); |
|
95 |
// } |
|
96 |
// return last; |
|
97 |
// } |
|
98 |
|
|
99 |
}; |
|
100 |
} |
|
101 |
|
|
102 |
private class FillMetaQueue implements Runnable { |
|
103 |
|
|
104 |
final Connector c = new Connector(); |
|
105 |
|
|
106 |
public void fillQueue() { |
|
107 |
String url; |
|
108 |
while((metas.size()>0 || urls.size() > 0 )) { |
|
109 |
log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size()); |
|
110 |
if (metas.size() > 0) { |
|
111 |
url = metas.remove(0); |
|
112 |
try { |
|
113 |
c.get(url); |
|
114 |
} catch (CollectorServiceException e) { |
|
115 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
116 |
} |
|
117 |
if(c.isStatusOk()){ |
|
118 |
try { |
|
119 |
String ret = c.getResponse(); |
|
120 |
if (ret != null && ret.length()>0) { |
|
121 |
if (!containsFilter(ret)) |
|
122 |
queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS); |
|
123 |
else |
|
124 |
filtered++; |
|
125 |
total++; |
|
126 |
} |
|
127 |
} catch (InterruptedException e) { |
|
128 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
129 |
|
|
130 |
} |
|
131 |
} |
|
132 |
} else { |
|
133 |
url = urls.remove(0); |
|
134 |
try { |
|
135 |
c.get(url); |
|
136 |
} catch (CollectorServiceException e) { |
|
137 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
138 |
} |
|
139 |
if(c.isStatusOk()) { |
|
140 |
if (c.responseTypeContains(TEXT_HTML)){ |
|
141 |
recurFolder(c.getResponse(), url); |
|
142 |
} else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){ |
|
143 |
try { |
|
144 |
final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON)); |
|
145 |
queue.offer(element, waitTime, TimeUnit.SECONDS); |
|
146 |
} catch (InterruptedException e) { |
|
147 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
148 |
} |
|
149 |
} |
|
150 |
} |
|
151 |
} |
|
152 |
|
|
153 |
} |
|
154 |
try { |
|
155 |
queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS); |
|
156 |
} catch (InterruptedException e) { |
|
157 |
throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e); |
|
158 |
} |
|
159 |
|
|
160 |
} |
|
161 |
|
|
162 |
private boolean containsFilter(String meta){ |
|
163 |
if (filterParam == null || filterParam.isEmpty()) |
|
164 |
return false; |
|
165 |
String[] filter = filterParam.split(";"); |
|
166 |
for(String item:filter){ |
|
167 |
if (meta.contains(item)) |
|
168 |
return true; |
|
169 |
} |
|
170 |
return false; |
|
171 |
} |
|
172 |
|
|
173 |
private String addFilePath(String meta, String url, boolean isJson){ |
|
174 |
String path = url.replace("metadata", "pdf"); |
|
175 |
|
|
176 |
try { |
|
177 |
if(isJson) |
|
178 |
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
179 |
else { |
|
180 |
|
|
181 |
if (meta.contains("<!DOCTYPE")) { |
|
182 |
meta = meta.substring(meta.indexOf("<!DOCTYPE")); |
|
183 |
meta = meta.substring(meta.indexOf(">") + 1); |
|
184 |
} |
|
185 |
int index = meta.lastIndexOf("</"); |
|
186 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
187 |
} |
|
188 |
} catch(Exception ex) { |
|
189 |
log.info("not file with extension .json or .xml"); |
|
190 |
} |
|
191 |
|
|
192 |
|
|
193 |
if(isJson) { |
|
194 |
try { |
|
195 |
return XML.toString(new JSONObject("{'resource':" + meta + "}")); |
|
196 |
} catch(Exception e) { |
|
197 |
log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url); |
|
198 |
// throw new RuntimeException(); |
|
199 |
final String junk = String.format(JUNK, url); |
|
200 |
log.warn("returning " + junk); |
|
201 |
return junk; |
|
202 |
} |
|
203 |
} |
|
204 |
return meta; |
|
205 |
} |
|
206 |
|
|
207 |
private void recurFolder(String text, String url){ |
|
208 |
Document doc = Jsoup.parse(text); |
|
209 |
Elements links = doc.select("a"); |
|
210 |
for(Element e:links){ |
|
211 |
if (!e.text().equals("../")){ |
|
212 |
String file = e.attr("href"); |
|
213 |
if(file.endsWith(".json") || file.endsWith(".xml")) |
|
214 |
metas.add(url+file); |
|
215 |
else |
|
216 |
urls.add(url+file); |
|
217 |
} |
|
218 |
} |
|
219 |
} |
|
220 |
|
|
221 |
|
|
222 |
@Override |
|
223 |
public void run() { |
|
224 |
fillQueue(); |
|
225 |
} |
|
226 |
} |
|
227 |
|
|
228 |
} |
modules/dnet-collector-plugins/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml | ||
---|---|---|
32 | 32 |
</property> |
33 | 33 |
</bean> |
34 | 34 |
|
35 |
<bean id="HTTPWithFileNamePlugin" class="eu.dnetlib.data.collector.plugins.HTTPWithFileName.HTTPWithFileNameCollectorPlugin">
|
|
35 |
<bean id="HTTPWithFileNamePlugin" class="eu.dnetlib.data.collector.plugins.httpfilename.HTTPWithFileNameCollectorPlugin">
|
|
36 | 36 |
<property name="protocolDescriptor"> |
37 | 37 |
|
38 | 38 |
<bean class="eu.dnetlib.data.collector.rmi.ProtocolDescriptor" p:name="HTTPWithFileName"> |
Also available in: Unified diff
fix for package name (HTTPWithFileName -> httpfilename) and fixed issue on iterator for HTTPWithFileNameCollectorIterable