1
|
package eu.dnetlib.msro.workers.aggregation.collect.plugins.oai.engine;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.net.CookieHandler;
|
6
|
import java.net.CookieManager;
|
7
|
import java.net.CookiePolicy;
|
8
|
import java.net.HttpURLConnection;
|
9
|
import java.net.URL;
|
10
|
import java.security.GeneralSecurityException;
|
11
|
import java.security.KeyManagementException;
|
12
|
import java.security.NoSuchAlgorithmException;
|
13
|
import java.security.cert.CertificateException;
|
14
|
import java.security.cert.X509Certificate;
|
15
|
import java.util.List;
|
16
|
import java.util.Map;
|
17
|
|
18
|
import javax.net.ssl.HttpsURLConnection;
|
19
|
import javax.net.ssl.SSLContext;
|
20
|
import javax.net.ssl.TrustManager;
|
21
|
import javax.net.ssl.X509TrustManager;
|
22
|
|
23
|
import org.apache.commons.io.IOUtils;
|
24
|
import org.apache.commons.lang3.math.NumberUtils;
|
25
|
import org.apache.commons.logging.Log;
|
26
|
import org.apache.commons.logging.LogFactory;
|
27
|
import org.springframework.stereotype.Component;
|
28
|
|
29
|
import eu.dnetlib.msro.workflows.nodes.collect.CollectException;
|
30
|
|
31
|
/**
|
32
|
* @author jochen, michele, andrea
|
33
|
*/
|
34
|
@Component
|
35
|
public class HttpConnector {
|
36
|
|
37
|
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
38
|
|
39
|
private int maxNumberOfRetry = 6;
|
40
|
private int defaultDelay = 120; // seconds
|
41
|
private int readTimeOut = 120; // seconds
|
42
|
private String userAgent = "Mozilla/5.0 (compatible; OAI-Harvester; +http://www.openaire.eu)";
|
43
|
|
44
|
public HttpConnector() {
|
45
|
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
46
|
}
|
47
|
|
48
|
/**
|
49
|
* @param requestUrl
|
50
|
* @return the content of the downloaded resource
|
51
|
* @throws CollectException
|
52
|
*/
|
53
|
public String getInputSource(final String requestUrl) throws CollectException {
|
54
|
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
55
|
}
|
56
|
|
57
|
private String attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
58
|
throws CollectException {
|
59
|
|
60
|
if (retryNumber > maxNumberOfRetry) { throw new CollectException("Max number of retries exceeded. Cause: \n " + errorList); }
|
61
|
|
62
|
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
63
|
try {
|
64
|
InputStream input = null;
|
65
|
|
66
|
try {
|
67
|
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
68
|
urlConn.setInstanceFollowRedirects(false);
|
69
|
urlConn.setReadTimeout(readTimeOut * 1000);
|
70
|
urlConn.addRequestProperty("User-Agent", userAgent);
|
71
|
|
72
|
if (log.isDebugEnabled()) {
|
73
|
logHeaderFields(urlConn);
|
74
|
}
|
75
|
|
76
|
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
77
|
if (retryAfter > 0) {
|
78
|
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
79
|
Thread.sleep(retryAfter * 1000);
|
80
|
errorList.add("503 Service Unavailable");
|
81
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
82
|
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
|
83
|
|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
|
84
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
85
|
log.info("The requested url has been moved to " + newUrl);
|
86
|
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
87
|
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
88
|
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
89
|
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
90
|
Thread.sleep(defaultDelay * 1000);
|
91
|
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
92
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
93
|
} else {
|
94
|
input = urlConn.getInputStream();
|
95
|
return IOUtils.toString(input);
|
96
|
}
|
97
|
} catch (final IOException e) {
|
98
|
log.error("error while retrieving from http-connection occured: " + e, e);
|
99
|
Thread.sleep(defaultDelay * 1000);
|
100
|
errorList.add(e.getMessage());
|
101
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
102
|
} finally {
|
103
|
IOUtils.closeQuietly(input);
|
104
|
}
|
105
|
} catch (final InterruptedException e) {
|
106
|
throw new CollectException("Error fetching url: " + requestUrl, e);
|
107
|
}
|
108
|
}
|
109
|
|
110
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
111
|
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
112
|
|
113
|
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
114
|
if (e.getKey() != null) {
|
115
|
for (final String v : e.getValue()) {
|
116
|
log.debug(" key: " + e.getKey() + " - value: " + v);
|
117
|
}
|
118
|
}
|
119
|
}
|
120
|
}
|
121
|
|
122
|
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
123
|
for (final String key : headerMap.keySet()) {
|
124
|
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils
|
125
|
.isNumber(headerMap.get(key).get(0))) { return Integer
|
126
|
.parseInt(headerMap.get(key).get(0)) + 10; }
|
127
|
}
|
128
|
return -1;
|
129
|
}
|
130
|
|
131
|
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectException {
|
132
|
for (final String key : headerMap.keySet()) {
|
133
|
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
|
134
|
}
|
135
|
throw new CollectException("The requested url has been MOVED, but 'location' param is MISSING");
|
136
|
}
|
137
|
|
138
|
/**
|
139
|
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
140
|
*
|
141
|
* @throws NoSuchAlgorithmException
|
142
|
* @throws KeyManagementException
|
143
|
*/
|
144
|
public void initTrustManager() {
|
145
|
final X509TrustManager tm = new X509TrustManager() {
|
146
|
|
147
|
@Override
|
148
|
public void checkClientTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {}
|
149
|
|
150
|
@Override
|
151
|
public void checkServerTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {}
|
152
|
|
153
|
@Override
|
154
|
public X509Certificate[] getAcceptedIssuers() {
|
155
|
return null;
|
156
|
}
|
157
|
};
|
158
|
try {
|
159
|
final SSLContext ctx = SSLContext.getInstance("TLS");
|
160
|
ctx.init(null, new TrustManager[] { tm }, null);
|
161
|
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
162
|
} catch (final GeneralSecurityException e) {
|
163
|
log.fatal(e);
|
164
|
throw new IllegalStateException(e);
|
165
|
}
|
166
|
}
|
167
|
|
168
|
public int getMaxNumberOfRetry() {
|
169
|
return maxNumberOfRetry;
|
170
|
}
|
171
|
|
172
|
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
173
|
this.maxNumberOfRetry = maxNumberOfRetry;
|
174
|
}
|
175
|
|
176
|
public int getDefaultDelay() {
|
177
|
return defaultDelay;
|
178
|
}
|
179
|
|
180
|
public void setDefaultDelay(final int defaultDelay) {
|
181
|
this.defaultDelay = defaultDelay;
|
182
|
}
|
183
|
|
184
|
public int getReadTimeOut() {
|
185
|
return readTimeOut;
|
186
|
}
|
187
|
|
188
|
public void setReadTimeOut(final int readTimeOut) {
|
189
|
this.readTimeOut = readTimeOut;
|
190
|
}
|
191
|
|
192
|
}
|