Revision 51970
Added by Miriam Baglioni about 6 years ago
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/Connector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
|
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 07/05/2018. |
|
9 |
*/ |
|
10 |
public class Connector extends HttpConnector implements ConnectorInterface { |
|
11 |
private String response; |
|
12 |
|
|
13 |
@Override |
|
14 |
public void get(final String requestUrl) throws CollectorServiceException { |
|
15 |
response = getInputSource(requestUrl); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public String getResponse() { |
|
20 |
return response; |
|
21 |
} |
|
22 |
|
|
23 |
@Override |
|
24 |
public boolean isStatusOk() { |
|
25 |
return (response != null); |
|
26 |
} |
|
27 |
|
|
28 |
@Override |
|
29 |
public boolean responseTypeContains(String string) { |
|
30 |
String responseType = getResponseType(); |
|
31 |
if (responseType != null) |
|
32 |
return responseType.contains(string); |
|
33 |
return false; |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/RSCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 04/05/2018. |
|
9 |
*/ |
|
10 |
public class RSCollectorPlugin extends AbstractCollectorPlugin { |
|
11 |
|
|
12 |
@Override |
|
13 |
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException { |
|
14 |
return new RSCollectorIterable(interfaceDescriptor.getBaseUrl()); |
|
15 |
} |
|
16 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/ConnectorInterface.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
|
|
5 |
/** |
|
6 |
* Created by miriam on 07/05/2018. |
|
7 |
*/ |
|
8 |
public interface ConnectorInterface { |
|
9 |
|
|
10 |
public void get(final String requestUrl) throws CollectorServiceException; |
|
11 |
|
|
12 |
public String getResponse(); |
|
13 |
|
|
14 |
public boolean isStatusOk(); |
|
15 |
|
|
16 |
|
|
17 |
public boolean responseTypeContains(String string); |
|
18 |
|
|
19 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/RSCollectorIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.resourcesynck; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
import org.apache.commons.logging.Log; |
|
5 |
import org.apache.commons.logging.LogFactory; |
|
6 |
import org.json.JSONObject; |
|
7 |
import org.json.XML; |
|
8 |
import org.jsoup.Jsoup; |
|
9 |
import org.jsoup.nodes.Document; |
|
10 |
import org.jsoup.nodes.Element; |
|
11 |
import org.jsoup.select.Elements; |
|
12 |
|
|
13 |
import java.util.ArrayList; |
|
14 |
import java.util.Iterator; |
|
15 |
|
|
16 |
import java.util.concurrent.ArrayBlockingQueue; |
|
17 |
import java.util.function.Consumer; |
|
18 |
|
|
19 |
/** |
|
20 |
* Created by miriam on 04/05/2018. |
|
21 |
*/ |
|
22 |
public class RSCollectorIterable implements Iterable<String> { |
|
23 |
private static final Log log = LogFactory.getLog(RSCollectorIterable.class); |
|
24 |
private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
25 |
private final ArrayList<String> urls = new ArrayList<>(); |
|
26 |
private final ArrayList<String> jsons = new ArrayList<String>(); |
|
27 |
|
|
28 |
|
|
29 |
public RSCollectorIterable(String startUrl){ |
|
30 |
|
|
31 |
urls.add(startUrl); |
|
32 |
fillQueue(); |
|
33 |
} |
|
34 |
|
|
35 |
private String addFilePath(String json,String url){ |
|
36 |
String path = url.replace("metadata", "pdf"); |
|
37 |
try { |
|
38 |
json = json.substring(0, json.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
39 |
}catch(Exception ex){ |
|
40 |
log.info("not file with extension .json"); |
|
41 |
} |
|
42 |
|
|
43 |
JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}"); |
|
44 |
|
|
45 |
return XML.toString(jsonobj); |
|
46 |
} |
|
47 |
|
|
48 |
private void recurFolder(String text, String url){ |
|
49 |
Document doc = Jsoup.parse(text); |
|
50 |
Elements links = doc.select("a"); |
|
51 |
for(Element e:links){ |
|
52 |
if (!e.text().equals("../")){ |
|
53 |
String file = e.attr("href"); |
|
54 |
if(file.endsWith(".json")) |
|
55 |
jsons.add(url+file); |
|
56 |
else |
|
57 |
urls.add(url+file); |
|
58 |
} |
|
59 |
} |
|
60 |
} |
|
61 |
|
|
62 |
private void fillQueue() { |
|
63 |
Connector c = new Connector(); |
|
64 |
String url; |
|
65 |
while((jsons.size()>0 || urls.size() > 0 ) && queue.size()<100){ |
|
66 |
if (jsons.size() > 0){ |
|
67 |
url = jsons.remove(0); |
|
68 |
try { |
|
69 |
c.get(url); |
|
70 |
} catch (CollectorServiceException e) { |
|
71 |
log.error("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
72 |
} |
|
73 |
if(c.isStatusOk()){ |
|
74 |
try { |
|
75 |
String ret = c.getResponse(); |
|
76 |
if (ret != null && ret.length()>0) |
|
77 |
queue.put(addFilePath(ret,url)); |
|
78 |
} catch (InterruptedException e) { |
|
79 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
80 |
|
|
81 |
} |
|
82 |
} |
|
83 |
}else{ |
|
84 |
url = urls.remove(0); |
|
85 |
try { |
|
86 |
c.get(url); |
|
87 |
} catch (CollectorServiceException e) { |
|
88 |
log.error("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
89 |
} |
|
90 |
if(c.isStatusOk()){ |
|
91 |
if (c.responseTypeContains("text/html")){ |
|
92 |
recurFolder(c.getResponse(),url); |
|
93 |
} |
|
94 |
else if(c.responseTypeContains("application/json")){ |
|
95 |
try { |
|
96 |
queue.put(addFilePath(c.getResponse(),url)); |
|
97 |
} catch (InterruptedException e) { |
|
98 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
99 |
} |
|
100 |
} |
|
101 |
} |
|
102 |
|
|
103 |
} |
|
104 |
|
|
105 |
} |
|
106 |
|
|
107 |
} |
|
108 |
|
|
109 |
@Override |
|
110 |
public Iterator<String> iterator() { |
|
111 |
|
|
112 |
return new Iterator<String>(){ |
|
113 |
|
|
114 |
@Override |
|
115 |
public boolean hasNext() { |
|
116 |
if (queue.isEmpty()){ |
|
117 |
fillQueue(); |
|
118 |
} |
|
119 |
return (!queue.isEmpty()); |
|
120 |
} |
|
121 |
|
|
122 |
@Override |
|
123 |
public String next() { |
|
124 |
return queue.poll(); |
|
125 |
} |
|
126 |
|
|
127 |
@Override |
|
128 |
public void remove() { |
|
129 |
|
|
130 |
} |
|
131 |
|
|
132 |
@Override |
|
133 |
public void forEachRemaining(Consumer<? super String> action) { |
|
134 |
|
|
135 |
} |
|
136 |
}; |
|
137 |
} |
|
138 |
|
|
139 |
|
|
140 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/excel/Read.java | ||
---|---|---|
13 | 13 |
|
14 | 14 |
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin; |
15 | 15 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
16 |
import org.apache.commons.lang3.StringUtils; |
|
16 | 17 |
import org.apache.commons.logging.Log; |
17 | 18 |
import org.apache.commons.logging.LogFactory; |
18 | 19 |
import org.apache.poi.ss.usermodel.Cell; |
... | ... | |
114 | 115 |
} |
115 | 116 |
|
116 | 117 |
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){ |
117 |
JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
|
118 |
for(Object entry: arr) {
|
|
119 |
try {
|
|
118 |
try{
|
|
119 |
final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
|
120 |
for(Object entry: arr)
|
|
120 | 121 |
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to")); |
121 |
}catch(Exception ex){
|
|
122 |
ex.printStackTrace();
|
|
123 |
}
|
|
122 |
}catch(Throwable e){
|
|
123 |
log.error("Problems filling the map for " + elem);
|
|
124 |
throw(e);
|
|
124 | 125 |
} |
125 | 126 |
|
126 | 127 |
} |
127 | 128 |
|
129 |
|
|
130 |
|
|
128 | 131 |
private void parseArguments() { |
129 |
JSONObject json = new JSONObject(argument); |
|
130 |
fillMap(json, map_header,"header"); |
|
131 |
fillMap(json,map_body,"body"); |
|
132 |
if (StringUtils.isNotEmpty(argument)){ |
|
133 |
try{ |
|
134 |
final JSONObject json = new JSONObject(argument); |
|
135 |
if(json.has("header")) |
|
136 |
fillMap(json, map_header,"header"); |
|
137 |
if (json.has("body")) |
|
138 |
fillMap(json,map_body,"body"); |
|
132 | 139 |
|
133 |
if (!(json.getJSONArray("replace_currency")==null)){ |
|
134 |
replace_currency = true ; |
|
135 |
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from"); |
|
136 |
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to"); |
|
140 |
if(json.has("replace_currency")) |
|
141 |
{ |
|
142 |
replace_currency = true ; |
|
143 |
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from"); |
|
144 |
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to"); |
|
145 |
|
|
146 |
} |
|
147 |
|
|
148 |
if (json.has("col_currency")) |
|
149 |
currency_column = json.getInt("col_currency"); |
|
150 |
}catch(Throwable e){ |
|
151 |
log.error("Problems while parsing the argument parameter."); |
|
152 |
throw (e); |
|
153 |
} |
|
137 | 154 |
} |
138 | 155 |
|
139 |
currency_column = json.getInt("col_currency"); |
|
140 | 156 |
|
157 |
|
|
141 | 158 |
} |
142 | 159 |
|
143 | 160 |
private String applyReplace(String row, HashMap<String,String>replace){ |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 04/05/2018. |
|
9 |
*/ |
|
10 |
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin { |
|
11 |
|
|
12 |
@Override |
|
13 |
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException { |
|
14 |
return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl()); |
|
15 |
} |
|
16 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/Connector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
|
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 07/05/2018. |
|
9 |
*/ |
|
10 |
public class Connector extends HttpConnector implements ConnectorInterface { |
|
11 |
private String response; |
|
12 |
|
|
13 |
@Override |
|
14 |
public void get(final String requestUrl) throws CollectorServiceException { |
|
15 |
response = getInputSource(requestUrl); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public String getResponse() { |
|
20 |
return response; |
|
21 |
} |
|
22 |
|
|
23 |
@Override |
|
24 |
public boolean isStatusOk() { |
|
25 |
return (response != null); |
|
26 |
} |
|
27 |
|
|
28 |
@Override |
|
29 |
public boolean responseTypeContains(String string) { |
|
30 |
String responseType = getResponseType(); |
|
31 |
if (responseType != null) |
|
32 |
return responseType.contains(string); |
|
33 |
return false; |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/ConnectorInterface.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
|
|
5 |
/** |
|
6 |
* Created by miriam on 07/05/2018. |
|
7 |
*/ |
|
8 |
public interface ConnectorInterface { |
|
9 |
|
|
10 |
public void get(final String requestUrl) throws CollectorServiceException; |
|
11 |
|
|
12 |
public String getResponse(); |
|
13 |
|
|
14 |
public boolean isStatusOk(); |
|
15 |
|
|
16 |
|
|
17 |
public boolean responseTypeContains(String string); |
|
18 |
|
|
19 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.HTTPWithFileName; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
4 |
import org.apache.commons.logging.Log; |
|
5 |
import org.apache.commons.logging.LogFactory; |
|
6 |
import org.json.JSONObject; |
|
7 |
import org.json.XML; |
|
8 |
import org.jsoup.Jsoup; |
|
9 |
import org.jsoup.nodes.Document; |
|
10 |
import org.jsoup.nodes.Element; |
|
11 |
import org.jsoup.select.Elements; |
|
12 |
|
|
13 |
import java.util.ArrayList; |
|
14 |
import java.util.Iterator; |
|
15 |
|
|
16 |
import java.util.concurrent.ArrayBlockingQueue; |
|
17 |
import java.util.function.Consumer; |
|
18 |
|
|
19 |
/** |
|
20 |
* Created by miriam on 04/05/2018. |
|
21 |
*/ |
|
22 |
public class HTTPWithFileNameCollectorIterable implements Iterable<String> { |
|
23 |
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class); |
|
24 |
|
|
25 |
private final ArrayList<String> urls = new ArrayList<>(); |
|
26 |
private final ArrayList<String> jsons = new ArrayList<String>(); |
|
27 |
|
|
28 |
|
|
29 |
public HTTPWithFileNameCollectorIterable(String startUrl){ |
|
30 |
|
|
31 |
urls.add(startUrl); |
|
32 |
} |
|
33 |
|
|
34 |
private String addFilePath(String json,String url){ |
|
35 |
String path = url.replace("metadata", "pdf"); |
|
36 |
try { |
|
37 |
json = json.substring(0, json.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
38 |
}catch(Exception ex){ |
|
39 |
log.info("not file with extension .json"); |
|
40 |
} |
|
41 |
|
|
42 |
JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}"); |
|
43 |
|
|
44 |
return XML.toString(jsonobj); |
|
45 |
} |
|
46 |
|
|
47 |
private void recurFolder(String text, String url){ |
|
48 |
Document doc = Jsoup.parse(text); |
|
49 |
Elements links = doc.select("a"); |
|
50 |
for(Element e:links){ |
|
51 |
if (!e.text().equals("../")){ |
|
52 |
String file = e.attr("href"); |
|
53 |
if(file.endsWith(".json")) |
|
54 |
jsons.add(url+file); |
|
55 |
else |
|
56 |
urls.add(url+file); |
|
57 |
} |
|
58 |
} |
|
59 |
} |
|
60 |
|
|
61 |
|
|
62 |
@Override |
|
63 |
public Iterator<String> iterator() { |
|
64 |
final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100); |
|
65 |
|
|
66 |
|
|
67 |
return new Iterator<String>(){ |
|
68 |
|
|
69 |
public void fillQueue() { |
|
70 |
Connector c = new Connector(); |
|
71 |
String url; |
|
72 |
while((jsons.size()>0 || urls.size() > 0 ) && queue.size()<100){ |
|
73 |
if (jsons.size() > 0){ |
|
74 |
url = jsons.remove(0); |
|
75 |
try { |
|
76 |
c.get(url); |
|
77 |
} catch (CollectorServiceException e) { |
|
78 |
log.error("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
79 |
} |
|
80 |
if(c.isStatusOk()){ |
|
81 |
try { |
|
82 |
String ret = c.getResponse(); |
|
83 |
if (ret != null && ret.length()>0) |
|
84 |
queue.put(addFilePath(ret,url)); |
|
85 |
} catch (InterruptedException e) { |
|
86 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
87 |
|
|
88 |
} |
|
89 |
} |
|
90 |
}else{ |
|
91 |
url = urls.remove(0); |
|
92 |
try { |
|
93 |
c.get(url); |
|
94 |
} catch (CollectorServiceException e) { |
|
95 |
log.error("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
96 |
} |
|
97 |
if(c.isStatusOk()){ |
|
98 |
if (c.responseTypeContains("text/html")){ |
|
99 |
recurFolder(c.getResponse(),url); |
|
100 |
} |
|
101 |
else if(c.responseTypeContains("application/json")){ |
|
102 |
try { |
|
103 |
queue.put(addFilePath(c.getResponse(),url)); |
|
104 |
} catch (InterruptedException e) { |
|
105 |
log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
106 |
} |
|
107 |
} |
|
108 |
} |
|
109 |
|
|
110 |
} |
|
111 |
|
|
112 |
} |
|
113 |
|
|
114 |
} |
|
115 |
@Override |
|
116 |
public boolean hasNext() { |
|
117 |
if (queue.isEmpty()){ |
|
118 |
fillQueue(); |
|
119 |
} |
|
120 |
return (!queue.isEmpty()); |
|
121 |
} |
|
122 |
|
|
123 |
@Override |
|
124 |
public String next() { |
|
125 |
return queue.poll(); |
|
126 |
} |
|
127 |
|
|
128 |
@Override |
|
129 |
public void remove() { |
|
130 |
|
|
131 |
} |
|
132 |
|
|
133 |
@Override |
|
134 |
public void forEachRemaining(Consumer<? super String> action) { |
|
135 |
|
|
136 |
} |
|
137 |
}; |
|
138 |
} |
|
139 |
|
|
140 |
|
|
141 |
} |
Also available in: Unified diff
commit after refactoring