Project

General

Profile

1
package eu.dnetlib.download.plugin;
2

    
3
import com.google.common.base.Function;
4
import com.google.common.collect.Iterables;
5
import com.google.gson.Gson;
6
import eu.dnetlib.data.download.rmi.AbstractDownloadPlugin;
7
import eu.dnetlib.data.download.rmi.DownloadItem;
8
import eu.dnetlib.data.download.rmi.DownloadPlugin;
9
import eu.dnetlib.data.download.rmi.DownloadPluginException;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12
import org.jsoup.Jsoup;
13
import org.jsoup.nodes.Document;
14
import org.jsoup.nodes.Element;
15
import org.jsoup.select.Elements;
16

    
17
import java.net.HttpURLConnection;
18
import java.net.URL;
19
import java.util.ArrayList;
20
import java.util.List;
21

    
22
public class DSpacePDFLinkPlugins extends AbstractDownloadPlugin implements DownloadPlugin {
23

    
24
    /**
25
     * The Constant log.
26
     */
27
    private static final Log log = LogFactory.getLog(DSpacePDFLinkPlugins.class);
28

    
29
    private final static int maxNumberJump = 10;
30

    
31

    
32

    
33
    private String getHTTPRedirectedURL(final String mainURL) throws Exception {
34
        URL startURL = new URL(mainURL);
35
        HttpURLConnection conn = (HttpURLConnection) startURL.openConnection();
36

    
37
        conn.setConnectTimeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT);
38

    
39
        conn.setInstanceFollowRedirects(true);  // you still need to handle redirect manully.
40
        HttpURLConnection.setFollowRedirects(true);
41
        String location = mainURL;
42

    
43
        int numJump =1;
44

    
45
        int responseCode = conn.getResponseCode();
46

    
47
        while ((responseCode >= 300) && (responseCode < 400) &&  (numJump++ < maxNumberJump)) {
48
            location = conn.getHeaderFields().get("Location").get(0);
49
            conn.disconnect();
50
            startURL = new URL(location);
51
            conn = (HttpURLConnection) startURL.openConnection();
52
            conn.setConnectTimeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT);
53
            conn.setInstanceFollowRedirects(true);  // you still need to handle redirect manully.
54
            HttpURLConnection.setFollowRedirects(true);
55
            responseCode = conn.getResponseCode();
56
        }
57
        conn.disconnect();
58
        if (!((responseCode >= 200) && (responseCode < 300)))
59
            return null;
60
        return location;
61
    }
62

    
63

    
64
    /**
65
     * Extract url.
66
     *
67
     * @param url the url
68
     * @return the string
69
     */
70
    @Override
71
    public String extractURL(final String url) throws DownloadPluginException {
72
        try {
73

    
74
            final String location = getHTTPRedirectedURL(url);
75

    
76
            if (location == null)
77
                return null;
78
            Document doc = Jsoup.connect(location).get();
79
            Elements links = doc.select("meta[content$=.pdf]");
80

    
81
            for (Element link : links) {
82
                String linkValue = link.attr("content");
83
                if (regularExpression != null) {
84
                    for (String regex : regularExpression) {
85
                        if (linkValue.matches(regex))
86
                            return linkValue;
87
                    }
88
                } else
89
                    return linkValue;
90
            }
91
            return null;
92
        } catch (Throwable e) {
93
	        throw new DownloadPluginException("Error on extract URL", e);
94
        }
95

    
96
    }
97

    
98
    @Override
99
    public Iterable<DownloadItem> retrieveUrls(final Iterable<DownloadItem> urls) {
100
        return Iterables.transform(urls, input -> retrieveUrl(input));
101
    }
102

    
103
    /*
104
     * (non-Javadoc)
105
     *
106
     * @see eu.dnetlib.data.download.rmi.DownloadPlugin#getPluginName()
107
     */
108
    @Override
109
    public String getPluginName() {
110
        return "DSpacePDFLinkPlugins";
111
    }
112

    
113
    /*
114
     * (non-Javadoc)
115
     *
116
     * @see eu.dnetlib.data.download.rmi.DownloadPlugin#retrieveUrl(eu.dnetlib.data.download.rmi.DownloadItem)
117
     */
118
    @Override
119
    public DownloadItem retrieveUrl(final DownloadItem input) {
120
        if (checkOpenAccess(input) == null) return null;
121
        String url = input.getOriginalUrl();
122

    
123
        if ((url == null) || (url.trim().length() == 0)) return input;
124
        @SuppressWarnings("unchecked")
125
        List<String> urls = new Gson().fromJson(url, ArrayList.class);
126
        if ((urls == null) || (urls.size() == 0)) return input;
127
        if (checkUrlsNotNull(input, urls))
128
            return input;
129
        input.setOriginalUrl(null);
130
        input.setUrl(null);
131
        return input;
132
    }
133

    
134
    @Override
135
    public void setBasePath(final String basePath) {
136
        // TODO Auto-generated method stub
137

    
138
    }
139

    
140
}
(4-4/12)