Project

General

Profile

1
package eu.dnetlib.data.collector.plugins;
2

    
3
import java.io.IOException;
4
import java.io.InputStream;
5
import java.net.*;
6
import java.security.GeneralSecurityException;
7
import java.security.cert.X509Certificate;
8
import java.util.List;
9
import java.util.Map;
10
import javax.net.ssl.HttpsURLConnection;
11
import javax.net.ssl.SSLContext;
12
import javax.net.ssl.TrustManager;
13
import javax.net.ssl.X509TrustManager;
14

    
15
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
16
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
17
import org.apache.commons.io.IOUtils;
18
import org.apache.commons.lang3.math.NumberUtils;
19
import org.apache.commons.logging.Log;
20
import org.apache.commons.logging.LogFactory;
21

    
22
/**
23
 * @author jochen, michele, andrea, alessia
24
 */
25
public class HttpConnector {
26

    
27
	private static final Log log = LogFactory.getLog(HttpConnector.class);
28

    
29
	private int maxNumberOfRetry = 6;
30
	private int defaultDelay = 120; // seconds
31
	private int readTimeOut = 120; // seconds
32

    
33
	private String responseType = null;
34

    
35
	private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
36

    
37
	public HttpConnector() {
38
		CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
39
	}
40

    
41
	/**
42
	 * Given the URL returns the content via HTTP GET
43
	 *
44
	 * @param requestUrl the URL
45
	 * @return the content of the downloaded resource
46
	 * @throws CollectorServiceException when retrying more than maxNumberOfRetry times
47
	 */
48
	public String getInputSource(final String requestUrl) throws CollectorServiceException {
49
		return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
50
	}
51

    
52
	/**
53
	 * Given the URL returns the content as a stream via HTTP GET
54
	 *
55
	 * @param requestUrl the URL
56
	 * @return the content of the downloaded resource as InputStream
57
	 * @throws CollectorServiceException when retrying more than maxNumberOfRetry times
58
	 */
59
	public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
60
		return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
61
	}
62

    
63
	private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
64
			throws CollectorServiceException {
65
		try {
66
			InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
67
			try {
68
				return IOUtils.toString(s);
69
			} catch (IOException e) {
70
				log.error("error while retrieving from http-connection occured: " + requestUrl, e);
71
				Thread.sleep(defaultDelay * 1000);
72
				errorList.add(e.getMessage());
73
				return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
74
			}
75
			finally{
76
				IOUtils.closeQuietly(s);
77
			}
78
		} catch (InterruptedException e) {
79
			throw new CollectorServiceException(e);
80
		}
81
	}
82

    
83
	private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
84
			throws CollectorServiceException {
85

    
86
		if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
87

    
88
		log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
89
		try {
90
			InputStream input = null;
91

    
92
			try {
93
				final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
94
				urlConn.setInstanceFollowRedirects(false);
95
				urlConn.setReadTimeout(readTimeOut * 1000);
96
				urlConn.addRequestProperty("User-Agent", userAgent);
97

    
98
				if (log.isDebugEnabled()) {
99
					logHeaderFields(urlConn);
100
				}
101

    
102
				int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
103
				if (is2xx(urlConn.getResponseCode())) {
104
					input = urlConn.getInputStream();
105
					responseType = urlConn.getContentType();
106
					return input;
107
				}
108
				if (is3xx(urlConn.getResponseCode())) {
109
					//REDIRECTS
110
					final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
111
					log.debug(String.format("The requested url %s has been moved to %s", requestUrl, newUrl));
112
					errorList.add(String.format("%s %s %s. Moved to: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
113
					urlConn.disconnect();
114
					if (retryAfter > 0) Thread.sleep(retryAfter * 1000);
115
					return attemptDownload(newUrl, retryNumber + 1, errorList);
116
				}
117
				if (is4xx(urlConn.getResponseCode())) {
118
					//CLIENT ERROR, DO NOT RETRY
119
					errorList.add(String.format("%s error %s: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage()));
120
					throw new CollectorServiceException("4xx error: request will not be repeated. " + errorList);
121
				}
122
				if (is5xx(urlConn.getResponseCode())) {
123
					//SERVER SIDE ERRORS RETRY ONLY on 503
124
					switch (urlConn.getResponseCode()) {
125
						case HttpURLConnection.HTTP_UNAVAILABLE:
126
							if (retryAfter > 0) {
127
								log.warn(requestUrl+" - waiting and repeating request after suggested retry-after " + retryAfter + " sec.");
128
								Thread.sleep(retryAfter * 1000);
129
							}
130
							else {
131
								log.warn(requestUrl+" - waiting and repeating request after default delay of " + defaultDelay + " sec.");
132
								Thread.sleep(defaultDelay * 1000);
133
							}
134
							errorList.add(requestUrl+" 503 Service Unavailable");
135
							urlConn.disconnect();
136
							return attemptDownload(requestUrl, retryNumber + 1, errorList);
137
						default:
138
							errorList.add(String.format("%s Error %s: %s", requestUrl, urlConn.getResponseCode(), urlConn.getResponseMessage()));
139
							throw new CollectorServiceException(urlConn.getResponseCode() + " error " + errorList);
140
					}
141
				}
142
				throw new CollectorServiceException(String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), errorList));
143
			}catch(MalformedURLException | NoRouteToHostException e){
144
				errorList.add(String.format("Error: %s for request url: %s", e.getCause(), requestUrl));
145
				throw new CollectorServiceException(e+ "error "+errorList);
146
			} catch (IOException e) {
147
				Thread.sleep(defaultDelay * 1000);
148
				errorList.add(requestUrl+ " "+e.getMessage());
149
				return attemptDownload(requestUrl, retryNumber + 1, errorList);
150
			}
151
		} catch (InterruptedException e) {
152
			throw new CollectorServiceException(e);
153
		}
154
	}
155

    
156
	private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
157
		log.debug("StatusCode: " + urlConn.getResponseMessage());
158

    
159
		for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
160
			if (e.getKey() != null) {
161
				for (String v : e.getValue()) {
162
					log.debug("  key: " + e.getKey() + " - value: " + v);
163
				}
164
			}
165
		}
166
	}
167

    
168
	private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
169
		for (String key : headerMap.keySet()) {
170
			if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) {
171
				return Integer
172
						.parseInt(headerMap.get(key).get(0)) + 10;
173
			}
174
		}
175
		return -1;
176
	}
177

    
178
	private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
179
		for (String key : headerMap.keySet()) {
180
			if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
181
		}
182
		throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
183
	}
184

    
185
	/**
186
	 * register for https scheme; this is a workaround and not intended for the use in trusted environments
187
	 */
188
	public void initTrustManager() {
189
		final X509TrustManager tm = new X509TrustManager() {
190

    
191
			@Override
192
			public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
193
			}
194

    
195
			@Override
196
			public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
197
			}
198

    
199
			@Override
200
			public X509Certificate[] getAcceptedIssuers() {
201
				return null;
202
			}
203
		};
204
		try {
205
			final SSLContext ctx = SSLContext.getInstance("TLS");
206
			ctx.init(null, new TrustManager[] { tm }, null);
207
			HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
208
		} catch (GeneralSecurityException e) {
209
			log.fatal(e);
210
			throw new IllegalStateException(e);
211
		}
212
	}
213

    
214
	private boolean is2xx(final int statusCode) {
215
		return statusCode >= 200 && statusCode <=299;
216
	}
217

    
218
	private boolean is4xx(final int statusCode) {
219
		return statusCode >= 400 && statusCode <=499;
220
	}
221

    
222
	private boolean is3xx(final int statusCode) {
223
		return statusCode >= 300 && statusCode <=399;
224
	}
225

    
226
	private boolean is5xx(final int statusCode) {
227
		return statusCode >= 500 && statusCode <=599;
228
	}
229

    
230

    
231
	public int getMaxNumberOfRetry() {
232
		return maxNumberOfRetry;
233
	}
234

    
235
	public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
236
		this.maxNumberOfRetry = maxNumberOfRetry;
237
	}
238

    
239
	public int getDefaultDelay() {
240
		return defaultDelay;
241
	}
242

    
243
	public void setDefaultDelay(final int defaultDelay) {
244
		this.defaultDelay = defaultDelay;
245
	}
246

    
247
	public int getReadTimeOut() {
248
		return readTimeOut;
249
	}
250

    
251
	public void setReadTimeOut(final int readTimeOut) {
252
		this.readTimeOut = readTimeOut;
253
	}
254

    
255
	public String getResponseType() {
256
		return responseType;
257
	}
258

    
259
}
(8-8/8)