1
|
package eu.dnetlib.data.collector.plugins;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.net.*;
|
6
|
import java.security.GeneralSecurityException;
|
7
|
import java.security.cert.X509Certificate;
|
8
|
import java.util.List;
|
9
|
import java.util.Map;
|
10
|
import javax.net.ssl.HttpsURLConnection;
|
11
|
import javax.net.ssl.SSLContext;
|
12
|
import javax.net.ssl.TrustManager;
|
13
|
import javax.net.ssl.X509TrustManager;
|
14
|
|
15
|
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
|
16
|
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
17
|
import org.apache.commons.io.IOUtils;
|
18
|
import org.apache.commons.lang3.math.NumberUtils;
|
19
|
import org.apache.commons.logging.Log;
|
20
|
import org.apache.commons.logging.LogFactory;
|
21
|
|
22
|
/**
|
23
|
* @author jochen, michele, andrea
|
24
|
*/
|
25
|
public class HttpConnector {
|
26
|
|
27
|
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
28
|
|
29
|
private int maxNumberOfRetry = 6;
|
30
|
private int defaultDelay = 120; // seconds
|
31
|
private int readTimeOut = 120; // seconds
|
32
|
|
33
|
private String responseType = null;
|
34
|
|
35
|
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
36
|
|
37
|
public HttpConnector() {
|
38
|
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
39
|
}
|
40
|
|
41
|
/**
|
42
|
* Given the URL returns the content via HTTP GET
|
43
|
*
|
44
|
* @param requestUrl the URL
|
45
|
* @return the content of the downloaded resource
|
46
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
47
|
*/
|
48
|
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
49
|
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
50
|
}
|
51
|
|
52
|
/**
|
53
|
* Given the URL returns the content as a stream via HTTP GET
|
54
|
*
|
55
|
* @param requestUrl the URL
|
56
|
* @return the content of the downloaded resource as InputStream
|
57
|
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
58
|
*/
|
59
|
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
|
60
|
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
61
|
}
|
62
|
|
63
|
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
64
|
throws CollectorServiceException {
|
65
|
try {
|
66
|
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
67
|
try {
|
68
|
return IOUtils.toString(s);
|
69
|
} catch (IOException e) {
|
70
|
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
71
|
Thread.sleep(defaultDelay * 1000);
|
72
|
errorList.add(e.getMessage());
|
73
|
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
74
|
}
|
75
|
finally{
|
76
|
IOUtils.closeQuietly(s);
|
77
|
}
|
78
|
} catch (InterruptedException e) {
|
79
|
throw new CollectorServiceException(e);
|
80
|
}
|
81
|
}
|
82
|
|
83
|
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
84
|
throws CollectorServiceException {
|
85
|
|
86
|
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
|
87
|
|
88
|
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
89
|
try {
|
90
|
InputStream input = null;
|
91
|
|
92
|
try {
|
93
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
94
|
urlConn.setInstanceFollowRedirects(false);
|
95
|
urlConn.setReadTimeout(readTimeOut * 1000);
|
96
|
urlConn.addRequestProperty("User-Agent", userAgent);
|
97
|
|
98
|
if (log.isDebugEnabled()) {
|
99
|
logHeaderFields(urlConn);
|
100
|
}
|
101
|
|
102
|
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
103
|
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
104
|
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
105
|
Thread.sleep(retryAfter * 1000);
|
106
|
errorList.add("503 Service Unavailable");
|
107
|
urlConn.disconnect();
|
108
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
109
|
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) || (urlConn.getResponseCode()
|
110
|
== HttpURLConnection.HTTP_MOVED_TEMP)) {
|
111
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
112
|
log.debug("The requested url has been moved to " + newUrl);
|
113
|
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
114
|
urlConn.disconnect();
|
115
|
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
116
|
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
117
|
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
118
|
Thread.sleep(defaultDelay * 1000);
|
119
|
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
120
|
urlConn.disconnect();
|
121
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
122
|
} else {
|
123
|
input = urlConn.getInputStream();
|
124
|
responseType = urlConn.getContentType();
|
125
|
return input;
|
126
|
}
|
127
|
} catch (IOException e) {
|
128
|
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
129
|
Thread.sleep(defaultDelay * 1000);
|
130
|
errorList.add(e.getMessage());
|
131
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
132
|
}
|
133
|
} catch (InterruptedException e) {
|
134
|
throw new CollectorServiceException(e);
|
135
|
}
|
136
|
}
|
137
|
|
138
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
139
|
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
140
|
|
141
|
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
142
|
if (e.getKey() != null) {
|
143
|
for (String v : e.getValue()) {
|
144
|
log.debug(" key: " + e.getKey() + " - value: " + v);
|
145
|
}
|
146
|
}
|
147
|
}
|
148
|
}
|
149
|
|
150
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
151
|
for (String key : headerMap.keySet()) {
|
152
|
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
153
|
return Integer
|
154
|
.parseInt(headerMap.get(key).get(0)) + 10;
|
155
|
}
|
156
|
}
|
157
|
return -1;
|
158
|
}
|
159
|
|
160
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
161
|
for (String key : headerMap.keySet()) {
|
162
|
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
|
163
|
}
|
164
|
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
165
|
}
|
166
|
|
167
|
/**
|
168
|
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
169
|
*/
|
170
|
public void initTrustManager() {
|
171
|
final X509TrustManager tm = new X509TrustManager() {
|
172
|
|
173
|
@Override
|
174
|
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
175
|
}
|
176
|
|
177
|
@Override
|
178
|
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
179
|
}
|
180
|
|
181
|
@Override
|
182
|
public X509Certificate[] getAcceptedIssuers() {
|
183
|
return null;
|
184
|
}
|
185
|
};
|
186
|
try {
|
187
|
final SSLContext ctx = SSLContext.getInstance("TLS");
|
188
|
ctx.init(null, new TrustManager[] { tm }, null);
|
189
|
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
190
|
} catch (GeneralSecurityException e) {
|
191
|
log.fatal(e);
|
192
|
throw new IllegalStateException(e);
|
193
|
}
|
194
|
}
|
195
|
|
196
|
public int getMaxNumberOfRetry() {
|
197
|
return maxNumberOfRetry;
|
198
|
}
|
199
|
|
200
|
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
201
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
202
|
}
|
203
|
|
204
|
public int getDefaultDelay() {
|
205
|
return defaultDelay;
|
206
|
}
|
207
|
|
208
|
public void setDefaultDelay(final int defaultDelay) {
|
209
|
this.defaultDelay = defaultDelay;
|
210
|
}
|
211
|
|
212
|
public int getReadTimeOut() {
|
213
|
return readTimeOut;
|
214
|
}
|
215
|
|
216
|
public void setReadTimeOut(final int readTimeOut) {
|
217
|
this.readTimeOut = readTimeOut;
|
218
|
}
|
219
|
|
220
|
public String getResponseType() {
|
221
|
return responseType;
|
222
|
}
|
223
|
|
224
|
}
|