Project

General

Profile

1
package eu.dnetlib.msro.workers.aggregation.collect.plugins.oai.engine;
2

    
3
import java.io.IOException;
4
import java.io.InputStream;
5
import java.net.CookieHandler;
6
import java.net.CookieManager;
7
import java.net.CookiePolicy;
8
import java.net.HttpURLConnection;
9
import java.net.URL;
10
import java.security.GeneralSecurityException;
11
import java.security.KeyManagementException;
12
import java.security.NoSuchAlgorithmException;
13
import java.security.cert.CertificateException;
14
import java.security.cert.X509Certificate;
15
import java.util.List;
16
import java.util.Map;
17

    
18
import javax.net.ssl.HttpsURLConnection;
19
import javax.net.ssl.SSLContext;
20
import javax.net.ssl.TrustManager;
21
import javax.net.ssl.X509TrustManager;
22

    
23
import org.apache.commons.io.IOUtils;
24
import org.apache.commons.lang3.math.NumberUtils;
25
import org.apache.commons.logging.Log;
26
import org.apache.commons.logging.LogFactory;
27
import org.springframework.stereotype.Component;
28

    
29
import eu.dnetlib.msro.workflows.nodes.collect.CollectException;
30

    
31
/**
32
 * @author jochen, michele, andrea
33
 */
34
@Component
35
public class HttpConnector {
36

    
37
	private static final Log log = LogFactory.getLog(HttpConnector.class);
38

    
39
	private int maxNumberOfRetry = 6;
40
	private int defaultDelay = 120; // seconds
41
	private int readTimeOut = 120; // seconds
42
	private String userAgent = "Mozilla/5.0 (compatible; OAI-Harvester; +http://www.openaire.eu)";
43

    
44
	public HttpConnector() {
45
		CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
46
	}
47

    
48
	/**
49
	 * @param requestUrl
50
	 * @return the content of the downloaded resource
51
	 * @throws CollectException
52
	 */
53
	public String getInputSource(final String requestUrl) throws CollectException {
54
		return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
55
	}
56

    
57
	private String attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
58
			throws CollectException {
59

    
60
		if (retryNumber > maxNumberOfRetry) { throw new CollectException("Max number of retries exceeded. Cause: \n " + errorList); }
61

    
62
		log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
63
		try {
64
			InputStream input = null;
65

    
66
			try {
67
				final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
68
				urlConn.setInstanceFollowRedirects(false);
69
				urlConn.setReadTimeout(readTimeOut * 1000);
70
				urlConn.addRequestProperty("User-Agent", userAgent);
71

    
72
				if (log.isDebugEnabled()) {
73
					logHeaderFields(urlConn);
74
				}
75

    
76
				final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
77
				if (retryAfter > 0) {
78
					log.warn("waiting and repeating request after " + retryAfter + " sec.");
79
					Thread.sleep(retryAfter * 1000);
80
					errorList.add("503 Service Unavailable");
81
					return attemptDownload(requestUrl, retryNumber + 1, errorList);
82
				} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
83
						|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
84
					final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
85
					log.info("The requested url has been moved to " + newUrl);
86
					errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
87
					return attemptDownload(newUrl, retryNumber + 1, errorList);
88
				} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
89
					log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
90
					Thread.sleep(defaultDelay * 1000);
91
					errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
92
					return attemptDownload(requestUrl, retryNumber + 1, errorList);
93
				} else {
94
					input = urlConn.getInputStream();
95
					return IOUtils.toString(input);
96
				}
97
			} catch (final IOException e) {
98
				log.error("error while retrieving from http-connection occured: " + e, e);
99
				Thread.sleep(defaultDelay * 1000);
100
				errorList.add(e.getMessage());
101
				return attemptDownload(requestUrl, retryNumber + 1, errorList);
102
			} finally {
103
				IOUtils.closeQuietly(input);
104
			}
105
		} catch (final InterruptedException e) {
106
			throw new CollectException("Error fetching url: " + requestUrl, e);
107
		}
108
	}
109

    
110
	private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
111
		log.debug("StatusCode: " + urlConn.getResponseMessage());
112

    
113
		for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
114
			if (e.getKey() != null) {
115
				for (final String v : e.getValue()) {
116
					log.debug("  key: " + e.getKey() + " - value: " + v);
117
				}
118
			}
119
		}
120
	}
121

    
122
	private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
123
		for (final String key : headerMap.keySet()) {
124
			if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils
125
					.isNumber(headerMap.get(key).get(0))) { return Integer
126
							.parseInt(headerMap.get(key).get(0)) + 10; }
127
		}
128
		return -1;
129
	}
130

    
131
	private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectException {
132
		for (final String key : headerMap.keySet()) {
133
			if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
134
		}
135
		throw new CollectException("The requested url has been MOVED, but 'location' param is MISSING");
136
	}
137

    
138
	/**
139
	 * register for https scheme; this is a workaround and not intended for the use in trusted environments
140
	 *
141
	 * @throws NoSuchAlgorithmException
142
	 * @throws KeyManagementException
143
	 */
144
	public void initTrustManager() {
145
		final X509TrustManager tm = new X509TrustManager() {
146

    
147
			@Override
148
			public void checkClientTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {}
149

    
150
			@Override
151
			public void checkServerTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {}
152

    
153
			@Override
154
			public X509Certificate[] getAcceptedIssuers() {
155
				return null;
156
			}
157
		};
158
		try {
159
			final SSLContext ctx = SSLContext.getInstance("TLS");
160
			ctx.init(null, new TrustManager[] { tm }, null);
161
			HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
162
		} catch (final GeneralSecurityException e) {
163
			log.fatal(e);
164
			throw new IllegalStateException(e);
165
		}
166
	}
167

    
168
	public int getMaxNumberOfRetry() {
169
		return maxNumberOfRetry;
170
	}
171

    
172
	public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
173
		this.maxNumberOfRetry = maxNumberOfRetry;
174
	}
175

    
176
	public int getDefaultDelay() {
177
		return defaultDelay;
178
	}
179

    
180
	public void setDefaultDelay(final int defaultDelay) {
181
		this.defaultDelay = defaultDelay;
182
	}
183

    
184
	public int getReadTimeOut() {
185
		return readTimeOut;
186
	}
187

    
188
	public void setReadTimeOut(final int readTimeOut) {
189
		this.readTimeOut = readTimeOut;
190
	}
191

    
192
}
(2-2/3)