1
|
package eu.dnetlib.data.collector.plugins;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.net.*;
|
6
|
import java.security.GeneralSecurityException;
|
7
|
import java.security.cert.X509Certificate;
|
8
|
import java.util.List;
|
9
|
import java.util.Map;
|
10
|
import javax.net.ssl.HttpsURLConnection;
|
11
|
import javax.net.ssl.SSLContext;
|
12
|
import javax.net.ssl.TrustManager;
|
13
|
import javax.net.ssl.X509TrustManager;
|
14
|
|
15
|
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
|
16
|
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
17
|
|
18
|
import org.apache.commons.io.IOUtils;
|
19
|
import org.apache.commons.lang3.math.NumberUtils;
|
20
|
import org.apache.commons.logging.Log;
|
21
|
import org.apache.commons.logging.LogFactory;
|
22
|
|
23
|
/**
|
24
|
* @author jochen, michele, andrea, alessia
|
25
|
*/
|
26
|
public class HttpConnector {
|
27
|
|
28
|
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
29
|
|
30
|
private int maxNumberOfRetry = 6;
|
31
|
private int defaultDelay = 120; // seconds
|
32
|
private int readTimeOut = 120; // seconds
|
33
|
|
34
|
|
35
|
private String responseType = null;
|
36
|
|
37
|
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
38
|
|
39
|
// IETF Draft and used by Repositories like ZENODO , not included in APACHE HTTP java packages
|
40
|
// see https://ietf-wg-httpapi.github.io/ratelimit-headers/draft-ietf-httpapi-ratelimit-headers.html
|
41
|
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT = "X-RateLimit-Limit";
|
42
|
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING = "X-RateLimit-Remaining";
|
43
|
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_RESET = "X-RateLimit-Reset";
|
44
|
|
45
|
public HttpConnector() {
|
46
|
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
47
|
}
|
48
|
|
49
|
/**
|
50
|
* Given the URL returns the content via HTTP GET
|
51
|
*
|
52
|
* @param requestUrl the URL
|
53
|
* @return the content of the downloaded resource
|
54
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
55
|
*/
|
56
|
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
57
|
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
58
|
}
|
59
|
|
60
|
/**
|
61
|
* Given the URL returns the content as a stream via HTTP GET
|
62
|
*
|
63
|
* @param requestUrl the URL
|
64
|
* @return the content of the downloaded resource as InputStream
|
65
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
66
|
*/
|
67
|
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
|
68
|
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
69
|
}
|
70
|
|
71
|
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
72
|
throws CollectorServiceException {
|
73
|
try {
|
74
|
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
75
|
try {
|
76
|
return IOUtils.toString(s);
|
77
|
} catch (IOException e) {
|
78
|
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
79
|
Thread.sleep(defaultDelay * 1000);
|
80
|
errorList.add(e.getMessage());
|
81
|
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
82
|
}
|
83
|
finally{
|
84
|
IOUtils.closeQuietly(s);
|
85
|
}
|
86
|
} catch (InterruptedException e) {
|
87
|
throw new CollectorServiceException(e);
|
88
|
}
|
89
|
}
|
90
|
|
91
|
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
92
|
throws CollectorServiceException {
|
93
|
|
94
|
return attemptDownload(requestUrl, retryNumber, errorList, null);
|
95
|
}
|
96
|
|
97
|
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList, final Map<String, String> requestHeader)
|
98
|
throws CollectorServiceException {
|
99
|
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
|
100
|
|
101
|
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
102
|
try {
|
103
|
InputStream input = null;
|
104
|
|
105
|
try {
|
106
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
107
|
this.setRequestHeader(urlConn, requestHeader);
|
108
|
urlConn.setInstanceFollowRedirects(false);
|
109
|
urlConn.setReadTimeout(readTimeOut * 1000);
|
110
|
urlConn.addRequestProperty("User-Agent", userAgent);
|
111
|
|
112
|
if (log.isDebugEnabled()) {
|
113
|
logHeaderFields(urlConn);
|
114
|
}
|
115
|
|
116
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
117
|
String rateLimit = urlConn.getHeaderField(HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT);
|
118
|
String rateRemaining = urlConn.getHeaderField(HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING);
|
119
|
|
120
|
if ((rateLimit != null) && (rateRemaining != null) && (Integer.parseInt(rateRemaining) < 9)) {
|
121
|
if (retryAfter > 0) {
|
122
|
backoffAndSleep(retryAfter);
|
123
|
} else {
|
124
|
backoffAndSleep(2000);
|
125
|
}
|
126
|
}
|
127
|
|
128
|
if (is2xx(urlConn.getResponseCode())) {
|
129
|
input = urlConn.getInputStream();
|
130
|
responseType = urlConn.getContentType();
|
131
|
return input;
|
132
|
}
|
133
|
if (is3xx(urlConn.getResponseCode())) {
|
134
|
//REDIRECTS
|
135
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
136
|
log.debug(String.format("The requested url %s has been moved to %s", requestUrl, newUrl));
|
137
|
errorList.add(String.format("%s %s %s. Moved to: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
138
|
urlConn.disconnect();
|
139
|
if (retryAfter > 0) Thread.sleep(retryAfter * 1000);
|
140
|
return attemptDownload(newUrl, retryNumber + 1, errorList, requestHeader);
|
141
|
}
|
142
|
if (is4xx(urlConn.getResponseCode())) {
|
143
|
//CLIENT ERROR, DO NOT RETRY
|
144
|
errorList.add(String.format("%s error %s: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
145
|
throw new CollectorServiceException("4xx error: request will not be repeated. " + errorList);
|
146
|
}
|
147
|
if (is5xx(urlConn.getResponseCode())) {
|
148
|
//SERVER SIDE ERRORS RETRY ONLY on 503
|
149
|
switch (urlConn.getResponseCode()) {
|
150
|
case HttpURLConnection.HTTP_UNAVAILABLE:
|
151
|
if (retryAfter > 0) {
|
152
|
log.warn(requestUrl+" - waiting and repeating request after suggested retry-after " + retryAfter + " sec.");
|
153
|
Thread.sleep(retryAfter * 1000);
|
154
|
}
|
155
|
else {
|
156
|
log.warn(requestUrl+" - waiting and repeating request after default delay of " + defaultDelay + " sec.");
|
157
|
Thread.sleep(defaultDelay * 1000);
|
158
|
}
|
159
|
errorList.add(requestUrl+" 503 Service Unavailable");
|
160
|
urlConn.disconnect();
|
161
|
return attemptDownload(requestUrl, retryNumber + 1, errorList, requestHeader);
|
162
|
default:
|
163
|
errorList.add(String.format("%s Error %s: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
164
|
throw new CollectorServiceException(urlConn.getResponseCode() + " error " + errorList);
|
165
|
}
|
166
|
}
|
167
|
throw new CollectorServiceException(String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), errorList));
|
168
|
}catch(MalformedURLException | NoRouteToHostException | CollectorServiceException e){
|
169
|
errorList.add(String.format("Error: %s for request url: %s", e.getCause(), requestUrl));
|
170
|
throw new CollectorServiceException(e+ "error "+errorList);
|
171
|
} catch (IOException e) {
|
172
|
Thread.sleep(defaultDelay * 1000);
|
173
|
errorList.add(requestUrl+ " "+e.getMessage());
|
174
|
return attemptDownload(requestUrl, retryNumber + 1, errorList, requestHeader);
|
175
|
}
|
176
|
} catch (InterruptedException e) {
|
177
|
throw new CollectorServiceException(e);
|
178
|
}
|
179
|
}
|
180
|
|
181
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
182
|
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
183
|
|
184
|
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
185
|
if (e.getKey() != null) {
|
186
|
for (String v : e.getValue()) {
|
187
|
log.debug(" key: " + e.getKey() + " - value: " + v);
|
188
|
}
|
189
|
}
|
190
|
}
|
191
|
}
|
192
|
|
193
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
194
|
for (String key : headerMap.keySet()) {
|
195
|
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
196
|
return Integer
|
197
|
.parseInt(headerMap.get(key).get(0)) + 10;
|
198
|
}
|
199
|
}
|
200
|
return -1;
|
201
|
}
|
202
|
|
203
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
204
|
for (String key : headerMap.keySet()) {
|
205
|
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
|
206
|
}
|
207
|
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
208
|
}
|
209
|
|
210
|
/**
|
211
|
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
212
|
*/
|
213
|
public void initTrustManager() {
|
214
|
final X509TrustManager tm = new X509TrustManager() {
|
215
|
|
216
|
@Override
|
217
|
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
218
|
}
|
219
|
|
220
|
@Override
|
221
|
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
222
|
}
|
223
|
|
224
|
@Override
|
225
|
public X509Certificate[] getAcceptedIssuers() {
|
226
|
return null;
|
227
|
}
|
228
|
};
|
229
|
try {
|
230
|
final SSLContext ctx = SSLContext.getInstance("TLS");
|
231
|
ctx.init(null, new TrustManager[] { tm }, null);
|
232
|
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
233
|
} catch (GeneralSecurityException e) {
|
234
|
log.fatal(e);
|
235
|
throw new IllegalStateException(e);
|
236
|
}
|
237
|
}
|
238
|
|
239
|
private boolean is2xx(final int statusCode) {
|
240
|
return statusCode >= 200 && statusCode <=299;
|
241
|
}
|
242
|
|
243
|
private boolean is4xx(final int statusCode) {
|
244
|
return statusCode >= 400 && statusCode <=499;
|
245
|
}
|
246
|
|
247
|
private boolean is3xx(final int statusCode) {
|
248
|
return statusCode >= 300 && statusCode <=399;
|
249
|
}
|
250
|
|
251
|
private boolean is5xx(final int statusCode) {
|
252
|
return statusCode >= 500 && statusCode <=599;
|
253
|
}
|
254
|
|
255
|
|
256
|
public int getMaxNumberOfRetry() {
|
257
|
return maxNumberOfRetry;
|
258
|
}
|
259
|
|
260
|
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
261
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
262
|
}
|
263
|
|
264
|
public int getDefaultDelay() {
|
265
|
return defaultDelay;
|
266
|
}
|
267
|
|
268
|
public void setDefaultDelay(final int defaultDelay) {
|
269
|
this.defaultDelay = defaultDelay;
|
270
|
}
|
271
|
|
272
|
public int getReadTimeOut() {
|
273
|
return readTimeOut;
|
274
|
}
|
275
|
|
276
|
public void setReadTimeOut(final int readTimeOut) {
|
277
|
this.readTimeOut = readTimeOut;
|
278
|
}
|
279
|
|
280
|
public String getResponseType() {
|
281
|
return responseType;
|
282
|
}
|
283
|
|
284
|
/**
|
285
|
* setRequestHeader
|
286
|
*
|
287
|
* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
|
288
|
* @param conn
|
289
|
*/
|
290
|
private void setRequestHeader(HttpURLConnection conn, Map<String, String> requestHeader) {
|
291
|
if (requestHeader != null) {
|
292
|
for (String key : requestHeader.keySet()) {
|
293
|
conn.setRequestProperty(key, requestHeader.get(key));
|
294
|
}
|
295
|
log.debug("Set Request Header with: " + requestHeader);
|
296
|
}
|
297
|
|
298
|
}
|
299
|
|
300
|
private void backoffAndSleep(int sleepTimeMs) {
|
301
|
log.info("I'm going to sleep for {}ms", sleepTimeMs);
|
302
|
try {
|
303
|
Thread.sleep(sleepTimeMs);
|
304
|
} catch (InterruptedException e) {
|
305
|
log.error(e.getMessage(), e);
|
306
|
}
|
307
|
}
|
308
|
|
309
|
}
|