Project

General

Profile

1
package eu.dnetlib.download.plugin;
2

    
3
import java.io.IOException;
4
import java.net.HttpURLConnection;
5
import java.net.URL;
6
import java.util.ArrayList;
7
import java.util.List;
8

    
9
import com.google.common.collect.Iterables;
10
import com.google.gson.Gson;
11
import eu.dnetlib.data.download.rmi.AbstractDownloadPlugin;
12
import eu.dnetlib.data.download.rmi.DownloadItem;
13
import eu.dnetlib.data.download.rmi.DownloadPlugin;
14
import eu.dnetlib.data.download.rmi.DownloadPluginException;
15
import org.apache.commons.logging.Log;
16
import org.apache.commons.logging.LogFactory;
17
import org.jsoup.Jsoup;
18
import org.jsoup.nodes.Document;
19
import org.jsoup.nodes.Element;
20
import org.jsoup.select.Elements;
21

    
22
public class DSpacePDFLinkPlugins extends AbstractDownloadPlugin implements DownloadPlugin {
23

    
24
    /**
25
     * The Constant log.
26
     */
27
    private static final Log log = LogFactory.getLog(DSpacePDFLinkPlugins.class);
28

    
29
    private final static int maxNumberJump = 10;
30

    
31
    private final static int maxNumberConnectRetries = 5;
32

    
33
    /**
34
     * Milliseconds used to backoff in case of connection errors.
35
     */
36
    private final static int BACKOFF_FACTOR = 100;
37

    
38
    private String getHTTPRedirectedURL(final String mainURL) throws Exception {
39
        URL startURL = new URL(mainURL);
40
        HttpURLConnection conn = (HttpURLConnection) startURL.openConnection();
41

    
42
        conn.setConnectTimeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT);
43

    
44
        conn.setInstanceFollowRedirects(true);  // you still need to handle redirect manually.
45
        HttpURLConnection.setFollowRedirects(true);
46
        String location = mainURL;
47

    
48
        int numJump = 1;
49

    
50
        int responseCode = conn.getResponseCode();
51

    
52
        while ((responseCode >= 300) && (responseCode < 400) && (numJump++ < maxNumberJump)) {
53
            location = conn.getHeaderFields().get("Location").get(0);
54
            conn.disconnect();
55
            startURL = new URL(location);
56
            conn = (HttpURLConnection) startURL.openConnection();
57
            conn.setConnectTimeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT);
58
            conn.setInstanceFollowRedirects(true);  // you still need to handle redirect manually.
59
            HttpURLConnection.setFollowRedirects(true);
60
            responseCode = conn.getResponseCode();
61
        }
62
        conn.disconnect();
63
        if (!((responseCode >= 200) && (responseCode < 300)))
64
            return null;
65
        return location;
66
    }
67

    
68
    /**
69
     * Extract url.
70
     *
71
     * @param url the url
72
     * @return the string
73
     */
74
    @Override
75
    public String extractURL(final String url) throws DownloadPluginException {
76
        try {
77
            final String location = getHTTPRedirectedURL(url);
78

    
79
            if (location == null) {
80
                return null;
81
            }
82

    
83
            Document doc = null;
84
            int retries = 0;
85
            boolean success = false;
86

    
87
            while(retries < maxNumberConnectRetries) {
88
                try {
89
                    doc = Jsoup.connect(location).timeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT).get();
90
                    success = true;
91
                    break;
92
                } catch (IOException e) {
93
                    final int millis = BACKOFF_FACTOR * (retries + 1);
94
                    log.debug(String.format("backoff for %s ms before retrying on %s", millis, location));
95
                    Thread.sleep(millis);
96
                }
97
                retries++;
98
            }
99

    
100
            if (!success) {
101
                throw new DownloadPluginException("reached max number of connect retries for URL: " + location);
102
            }
103

    
104
            final Elements links = doc.select("meta[content][name=citation_pdf_url]");
105

    
106
            for (Element link : links) {
107
                String linkValue = link.attr("content");
108
                if (regularExpression != null) {
109
                    for (String regex : regularExpression) {
110
                        if (linkValue.matches(regex)) {
111
                            return linkValue;
112
                        }
113
                    }
114
                } else {
115
                    //if(linkValue.matches("^http.*pdf$")){
116
                        return linkValue;
117
                    //}
118
                }
119

    
120
            }
121
            return null;
122
        } catch (Throwable e) {
123
	        throw new DownloadPluginException("Error on extract URL", e);
124
        }
125
    }
126

    
127
    @Override
128
    public Iterable<DownloadItem> retrieveUrls(final Iterable<DownloadItem> urls) {
129
        return Iterables.transform(urls, input -> retrieveUrl(input));
130
    }
131

    
132
    /*
133
     * (non-Javadoc)
134
     *
135
     * @see eu.dnetlib.data.download.rmi.DownloadPlugin#getPluginName()
136
     */
137
    @Override
138
    public String getPluginName() {
139
        return "DSpacePDFLinkPlugins";
140
    }
141

    
142
    /*
143
     * (non-Javadoc)
144
     *
145
     * @see eu.dnetlib.data.download.rmi.DownloadPlugin#retrieveUrl(eu.dnetlib.data.download.rmi.DownloadItem)
146
     */
147
    @Override
148
    public DownloadItem retrieveUrl(final DownloadItem input) {
149
        if (checkOpenAccess(input) == null) return null;
150
        String url = input.getOriginalUrl();
151

    
152
        if ((url == null) || (url.trim().length() == 0)) return input;
153
        @SuppressWarnings("unchecked")
154
        List<String> urls = new Gson().fromJson(url, ArrayList.class);
155
        if ((urls == null) || (urls.size() == 0)) return input;
156
        if (checkUrlsNotNull(input, urls))
157
            return input;
158
        input.setOriginalUrl(null);
159
        input.setUrl(null);
160
        return input;
161
    }
162

    
163
    @Override
164
    public void setBasePath(final String basePath) {
165
        // TODO Auto-generated method stub
166

    
167
    }
168

    
169
}
(4-4/12)