1 |
49643
|
alessia.ba
|
package eu.dnetlib.data.collector.plugins;
|
2 |
28959
|
michele.ar
|
|
3 |
29018
|
michele.ar
|
import java.io.IOException;
|
4 |
29169
|
andrea.man
|
import java.io.InputStream;
|
5 |
51556
|
claudio.at
|
import java.net.*;
|
6 |
28959
|
michele.ar
|
import java.security.GeneralSecurityException;
|
7 |
|
|
import java.security.cert.X509Certificate;
|
8 |
|
|
import java.util.List;
|
9 |
|
|
import java.util.Map;
|
10 |
|
|
import javax.net.ssl.HttpsURLConnection;
|
11 |
|
|
import javax.net.ssl.SSLContext;
|
12 |
|
|
import javax.net.ssl.TrustManager;
|
13 |
|
|
import javax.net.ssl.X509TrustManager;
|
14 |
|
|
|
15 |
51556
|
claudio.at
|
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
|
16 |
|
|
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
17 |
29018
|
michele.ar
|
import org.apache.commons.io.IOUtils;
|
18 |
50665
|
jochen.sch
|
import org.apache.commons.lang3.math.NumberUtils;
|
19 |
28959
|
michele.ar
|
import org.apache.commons.logging.Log;
|
20 |
|
|
import org.apache.commons.logging.LogFactory;
|
21 |
|
|
|
22 |
|
|
/**
|
23 |
57866
|
alessia.ba
|
* @author jochen, michele, andrea, alessia
|
24 |
28959
|
michele.ar
|
*/
|
25 |
|
|
public class HttpConnector {
|
26 |
29020
|
michele.ar
|
|
27 |
28959
|
michele.ar
|
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
28 |
|
|
|
29 |
34867
|
andrea.man
|
private int maxNumberOfRetry = 6;
|
30 |
|
|
private int defaultDelay = 120; // seconds
|
31 |
|
|
private int readTimeOut = 120; // seconds
|
32 |
51955
|
miriam.bag
|
|
33 |
53932
|
alessia.ba
|
private String responseType = null;
|
34 |
51955
|
miriam.bag
|
|
35 |
48804
|
jochen.sch
|
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
36 |
29020
|
michele.ar
|
|
37 |
53932
|
alessia.ba
|
public HttpConnector() {
|
38 |
|
|
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
39 |
|
|
}
|
40 |
|
|
|
41 |
28959
|
michele.ar
|
/**
|
42 |
51556
|
claudio.at
|
* Given the URL returns the content via HTTP GET
|
43 |
|
|
*
|
44 |
|
|
* @param requestUrl the URL
|
45 |
34867
|
andrea.man
|
* @return the content of the downloaded resource
|
46 |
51556
|
claudio.at
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
47 |
28959
|
michele.ar
|
*/
|
48 |
29018
|
michele.ar
|
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
49 |
53932
|
alessia.ba
|
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
50 |
|
|
}
|
51 |
|
|
|
52 |
|
|
/**
|
53 |
|
|
* Given the URL returns the content as a stream via HTTP GET
|
54 |
|
|
*
|
55 |
|
|
* @param requestUrl the URL
|
56 |
|
|
* @return the content of the downloaded resource as InputStream
|
57 |
|
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
58 |
|
|
*/
|
59 |
|
|
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
|
60 |
38175
|
andrea.man
|
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
61 |
29020
|
michele.ar
|
}
|
62 |
29018
|
michele.ar
|
|
63 |
53932
|
alessia.ba
|
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
64 |
|
|
throws CollectorServiceException {
|
65 |
|
|
try {
|
66 |
|
|
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
67 |
|
|
try {
|
68 |
|
|
return IOUtils.toString(s);
|
69 |
|
|
} catch (IOException e) {
|
70 |
|
|
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
71 |
|
|
Thread.sleep(defaultDelay * 1000);
|
72 |
|
|
errorList.add(e.getMessage());
|
73 |
|
|
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
74 |
|
|
}
|
75 |
|
|
finally{
|
76 |
|
|
IOUtils.closeQuietly(s);
|
77 |
|
|
}
|
78 |
|
|
} catch (InterruptedException e) {
|
79 |
|
|
throw new CollectorServiceException(e);
|
80 |
|
|
}
|
81 |
|
|
}
|
82 |
51955
|
miriam.bag
|
|
83 |
53932
|
alessia.ba
|
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
84 |
38175
|
andrea.man
|
throws CollectorServiceException {
|
85 |
29020
|
michele.ar
|
|
86 |
38175
|
andrea.man
|
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
|
87 |
34867
|
andrea.man
|
|
88 |
29020
|
michele.ar
|
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
89 |
28959
|
michele.ar
|
try {
|
90 |
29169
|
andrea.man
|
InputStream input = null;
|
91 |
34867
|
andrea.man
|
|
92 |
29020
|
michele.ar
|
try {
|
93 |
37627
|
michele.ar
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
94 |
34867
|
andrea.man
|
urlConn.setInstanceFollowRedirects(false);
|
95 |
31724
|
michele.ar
|
urlConn.setReadTimeout(readTimeOut * 1000);
|
96 |
51556
|
claudio.at
|
urlConn.addRequestProperty("User-Agent", userAgent);
|
97 |
28959
|
michele.ar
|
|
98 |
29020
|
michele.ar
|
if (log.isDebugEnabled()) {
|
99 |
|
|
logHeaderFields(urlConn);
|
100 |
|
|
}
|
101 |
|
|
|
102 |
|
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
103 |
57866
|
alessia.ba
|
if (is2xx(urlConn.getResponseCode())) {
|
104 |
29169
|
andrea.man
|
input = urlConn.getInputStream();
|
105 |
51955
|
miriam.bag
|
responseType = urlConn.getContentType();
|
106 |
53932
|
alessia.ba
|
return input;
|
107 |
29020
|
michele.ar
|
}
|
108 |
57866
|
alessia.ba
|
if (is3xx(urlConn.getResponseCode())) {
|
109 |
|
|
//REDIRECTS
|
110 |
|
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
111 |
|
|
log.debug(String.format("The requested url %s has been moved to %s", requestUrl, newUrl));
|
112 |
|
|
errorList.add(String.format("%s %s %s. Moved to: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
113 |
|
|
urlConn.disconnect();
|
114 |
|
|
if (retryAfter > 0) Thread.sleep(retryAfter * 1000);
|
115 |
|
|
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
116 |
|
|
}
|
117 |
|
|
if (is4xx(urlConn.getResponseCode())) {
|
118 |
|
|
//CLIENT ERROR, DO NOT RETRY
|
119 |
|
|
errorList.add(String.format("%s error %s: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
120 |
|
|
throw new CollectorServiceException("4xx error: request will not be repeated." + errorList);
|
121 |
|
|
}
|
122 |
|
|
if (is5xx(urlConn.getResponseCode())) {
|
123 |
|
|
//SERVER SIDE ERRORS RETRY ONLY on 503
|
124 |
|
|
switch (urlConn.getResponseCode()) {
|
125 |
|
|
case HttpURLConnection.HTTP_UNAVAILABLE:
|
126 |
|
|
if (retryAfter > 0) {
|
127 |
|
|
log.warn(requestUrl+" - waiting and repeating request after suggested retry-after" + retryAfter + " sec.");
|
128 |
|
|
Thread.sleep(retryAfter * 1000);
|
129 |
|
|
}
|
130 |
|
|
else {
|
131 |
|
|
log.warn(requestUrl+" - waiting and repeating request after default delay of " + defaultDelay + " sec.");
|
132 |
|
|
Thread.sleep(defaultDelay * 1000);
|
133 |
|
|
}
|
134 |
|
|
errorList.add(requestUrl+" 503 Service Unavailable");
|
135 |
|
|
urlConn.disconnect();
|
136 |
|
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
137 |
|
|
default:
|
138 |
|
|
errorList.add(String.format("%s Error %s: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
139 |
|
|
throw new CollectorServiceException(urlConn.getResponseCode() + "error" + errorList);
|
140 |
|
|
}
|
141 |
|
|
}
|
142 |
|
|
throw new CollectorServiceException("Unexpected status code: " + urlConn.getResponseCode() + "error" + errorList);
|
143 |
|
|
}catch(MalformedURLException | NoRouteToHostException e){
|
144 |
|
|
errorList.add(String.format("Error: %s for request url: %s", e.getCause(), requestUrl));
|
145 |
|
|
throw new CollectorServiceException(e+ "error "+errorList);
|
146 |
29020
|
michele.ar
|
} catch (IOException e) {
|
147 |
31724
|
michele.ar
|
Thread.sleep(defaultDelay * 1000);
|
148 |
57866
|
alessia.ba
|
errorList.add(requestUrl+ " "+e.getMessage());
|
149 |
38175
|
andrea.man
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
150 |
29018
|
michele.ar
|
}
|
151 |
|
|
} catch (InterruptedException e) {
|
152 |
|
|
throw new CollectorServiceException(e);
|
153 |
28959
|
michele.ar
|
}
|
154 |
|
|
}
|
155 |
|
|
|
156 |
29018
|
michele.ar
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
157 |
|
|
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
158 |
29020
|
michele.ar
|
|
159 |
34867
|
andrea.man
|
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
160 |
29018
|
michele.ar
|
if (e.getKey() != null) {
|
161 |
|
|
for (String v : e.getValue()) {
|
162 |
29020
|
michele.ar
|
log.debug(" key: " + e.getKey() + " - value: " + v);
|
163 |
29018
|
michele.ar
|
}
|
164 |
|
|
}
|
165 |
|
|
}
|
166 |
28959
|
michele.ar
|
}
|
167 |
29020
|
michele.ar
|
|
168 |
29018
|
michele.ar
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
169 |
34867
|
andrea.man
|
for (String key : headerMap.keySet()) {
|
170 |
53932
|
alessia.ba
|
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
171 |
|
|
return Integer
|
172 |
|
|
.parseInt(headerMap.get(key).get(0)) + 10;
|
173 |
|
|
}
|
174 |
28959
|
michele.ar
|
}
|
175 |
29018
|
michele.ar
|
return -1;
|
176 |
28959
|
michele.ar
|
}
|
177 |
29020
|
michele.ar
|
|
178 |
29018
|
michele.ar
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
179 |
34867
|
andrea.man
|
for (String key : headerMap.keySet()) {
|
180 |
38105
|
andrea.man
|
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
|
181 |
29018
|
michele.ar
|
}
|
182 |
|
|
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
183 |
|
|
}
|
184 |
28959
|
michele.ar
|
|
185 |
|
|
/**
|
186 |
|
|
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
187 |
|
|
*/
|
188 |
31724
|
michele.ar
|
public void initTrustManager() {
|
189 |
28959
|
michele.ar
|
final X509TrustManager tm = new X509TrustManager() {
|
190 |
34867
|
andrea.man
|
|
191 |
|
|
@Override
|
192 |
53932
|
alessia.ba
|
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
193 |
|
|
}
|
194 |
34867
|
andrea.man
|
|
195 |
|
|
@Override
|
196 |
53932
|
alessia.ba
|
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
197 |
|
|
}
|
198 |
34867
|
andrea.man
|
|
199 |
|
|
@Override
|
200 |
|
|
public X509Certificate[] getAcceptedIssuers() {
|
201 |
28959
|
michele.ar
|
return null;
|
202 |
|
|
}
|
203 |
|
|
};
|
204 |
|
|
try {
|
205 |
|
|
final SSLContext ctx = SSLContext.getInstance("TLS");
|
206 |
34867
|
andrea.man
|
ctx.init(null, new TrustManager[] { tm }, null);
|
207 |
28959
|
michele.ar
|
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
208 |
|
|
} catch (GeneralSecurityException e) {
|
209 |
|
|
log.fatal(e);
|
210 |
|
|
throw new IllegalStateException(e);
|
211 |
|
|
}
|
212 |
|
|
}
|
213 |
31724
|
michele.ar
|
|
214 |
57866
|
alessia.ba
|
private boolean is2xx(final int statusCode) {
|
215 |
|
|
return statusCode >= 200 && statusCode <=299;
|
216 |
|
|
}
|
217 |
|
|
|
218 |
|
|
private boolean is4xx(final int statusCode) {
|
219 |
|
|
return statusCode >= 400 && statusCode <=499;
|
220 |
|
|
}
|
221 |
|
|
|
222 |
|
|
private boolean is3xx(final int statusCode) {
|
223 |
|
|
return statusCode >= 300 && statusCode <=399;
|
224 |
|
|
}
|
225 |
|
|
|
226 |
|
|
private boolean is5xx(final int statusCode) {
|
227 |
|
|
return statusCode >= 500 && statusCode <=599;
|
228 |
|
|
}
|
229 |
|
|
|
230 |
|
|
|
231 |
31724
|
michele.ar
|
public int getMaxNumberOfRetry() {
|
232 |
|
|
return maxNumberOfRetry;
|
233 |
|
|
}
|
234 |
|
|
|
235 |
34867
|
andrea.man
|
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
236 |
31724
|
michele.ar
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
237 |
|
|
}
|
238 |
|
|
|
239 |
|
|
public int getDefaultDelay() {
|
240 |
|
|
return defaultDelay;
|
241 |
|
|
}
|
242 |
|
|
|
243 |
34867
|
andrea.man
|
public void setDefaultDelay(final int defaultDelay) {
|
244 |
31724
|
michele.ar
|
this.defaultDelay = defaultDelay;
|
245 |
|
|
}
|
246 |
|
|
|
247 |
|
|
public int getReadTimeOut() {
|
248 |
|
|
return readTimeOut;
|
249 |
|
|
}
|
250 |
|
|
|
251 |
34867
|
andrea.man
|
public void setReadTimeOut(final int readTimeOut) {
|
252 |
31724
|
michele.ar
|
this.readTimeOut = readTimeOut;
|
253 |
|
|
}
|
254 |
29169
|
andrea.man
|
|
255 |
53932
|
alessia.ba
|
public String getResponseType() {
|
256 |
|
|
return responseType;
|
257 |
|
|
}
|
258 |
51955
|
miriam.bag
|
|
259 |
34867
|
andrea.man
|
}
|