1
|
package eu.dnetlib.data.collector.plugins;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.net.*;
|
6
|
import java.security.GeneralSecurityException;
|
7
|
import java.security.cert.CertificateException;
|
8
|
import java.security.cert.X509Certificate;
|
9
|
import java.util.List;
|
10
|
import java.util.Map;
|
11
|
import javax.net.ssl.HttpsURLConnection;
|
12
|
import javax.net.ssl.SSLContext;
|
13
|
import javax.net.ssl.TrustManager;
|
14
|
import javax.net.ssl.X509TrustManager;
|
15
|
|
16
|
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
|
17
|
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
18
|
import org.apache.commons.io.IOUtils;
|
19
|
import org.apache.commons.lang3.math.NumberUtils;
|
20
|
import org.apache.commons.logging.Log;
|
21
|
import org.apache.commons.logging.LogFactory;
|
22
|
|
23
|
/**
|
24
|
* @author jochen, michele, andrea
|
25
|
*
|
26
|
*/
|
27
|
public class HttpConnector {
|
28
|
|
29
|
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
30
|
|
31
|
private int maxNumberOfRetry = 6;
|
32
|
private int defaultDelay = 120; // seconds
|
33
|
private int readTimeOut = 120; // seconds
|
34
|
|
35
|
private String responseType=null;
|
36
|
|
37
|
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
38
|
|
39
|
public HttpConnector(){
|
40
|
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
41
|
}
|
42
|
|
43
|
/**
|
44
|
* Given the URL returns the content via HTTP GET
|
45
|
*
|
46
|
* @param requestUrl the URL
|
47
|
* @return the content of the downloaded resource
|
48
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
49
|
*/
|
50
|
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
51
|
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
52
|
}
|
53
|
|
54
|
|
55
|
|
56
|
private String attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
57
|
throws CollectorServiceException {
|
58
|
|
59
|
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
|
60
|
|
61
|
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
62
|
try {
|
63
|
InputStream input = null;
|
64
|
|
65
|
try {
|
66
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
67
|
urlConn.setInstanceFollowRedirects(false);
|
68
|
urlConn.setReadTimeout(readTimeOut * 1000);
|
69
|
urlConn.addRequestProperty("User-Agent", userAgent);
|
70
|
|
71
|
if (log.isDebugEnabled()) {
|
72
|
logHeaderFields(urlConn);
|
73
|
}
|
74
|
|
75
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
76
|
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
77
|
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
78
|
Thread.sleep(retryAfter * 1000);
|
79
|
errorList.add("503 Service Unavailable");
|
80
|
urlConn.disconnect();
|
81
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
82
|
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) || (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
|
83
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
84
|
log.debug("The requested url has been moved to " + newUrl);
|
85
|
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
86
|
urlConn.disconnect();
|
87
|
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
88
|
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
89
|
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
90
|
Thread.sleep(defaultDelay * 1000);
|
91
|
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
92
|
urlConn.disconnect();
|
93
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
94
|
} else {
|
95
|
input = urlConn.getInputStream();
|
96
|
responseType = urlConn.getContentType();
|
97
|
|
98
|
return IOUtils.toString(input);
|
99
|
}
|
100
|
} catch (IOException e) {
|
101
|
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
102
|
Thread.sleep(defaultDelay * 1000);
|
103
|
errorList.add(e.getMessage());
|
104
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
105
|
} finally {
|
106
|
IOUtils.closeQuietly(input);
|
107
|
}
|
108
|
} catch (InterruptedException e) {
|
109
|
throw new CollectorServiceException(e);
|
110
|
}
|
111
|
}
|
112
|
|
113
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
114
|
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
115
|
|
116
|
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
117
|
if (e.getKey() != null) {
|
118
|
for (String v : e.getValue()) {
|
119
|
log.debug(" key: " + e.getKey() + " - value: " + v);
|
120
|
}
|
121
|
}
|
122
|
}
|
123
|
}
|
124
|
|
125
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
126
|
for (String key : headerMap.keySet()) {
|
127
|
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer
|
128
|
.parseInt(headerMap.get(key).get(0)) + 10; }
|
129
|
}
|
130
|
return -1;
|
131
|
}
|
132
|
|
133
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
134
|
for (String key : headerMap.keySet()) {
|
135
|
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
|
136
|
}
|
137
|
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
138
|
}
|
139
|
|
140
|
/**
|
141
|
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
142
|
*/
|
143
|
public void initTrustManager() {
|
144
|
final X509TrustManager tm = new X509TrustManager() {
|
145
|
|
146
|
@Override
|
147
|
public void checkClientTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {}
|
148
|
|
149
|
@Override
|
150
|
public void checkServerTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {}
|
151
|
|
152
|
@Override
|
153
|
public X509Certificate[] getAcceptedIssuers() {
|
154
|
return null;
|
155
|
}
|
156
|
};
|
157
|
try {
|
158
|
final SSLContext ctx = SSLContext.getInstance("TLS");
|
159
|
ctx.init(null, new TrustManager[] { tm }, null);
|
160
|
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
161
|
} catch (GeneralSecurityException e) {
|
162
|
log.fatal(e);
|
163
|
throw new IllegalStateException(e);
|
164
|
}
|
165
|
}
|
166
|
|
167
|
|
168
|
public int getMaxNumberOfRetry() {
|
169
|
return maxNumberOfRetry;
|
170
|
}
|
171
|
|
172
|
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
173
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
174
|
}
|
175
|
|
176
|
public int getDefaultDelay() {
|
177
|
return defaultDelay;
|
178
|
}
|
179
|
|
180
|
public void setDefaultDelay(final int defaultDelay) {
|
181
|
this.defaultDelay = defaultDelay;
|
182
|
}
|
183
|
|
184
|
public int getReadTimeOut() {
|
185
|
return readTimeOut;
|
186
|
}
|
187
|
|
188
|
public void setReadTimeOut(final int readTimeOut) {
|
189
|
this.readTimeOut = readTimeOut;
|
190
|
}
|
191
|
|
192
|
public String getResponseType() {return responseType;}
|
193
|
|
194
|
}
|