|
1 |
package eu.dnetlib.download.plugin;
|
|
2 |
|
|
3 |
import com.google.common.base.Function;
|
|
4 |
import com.google.common.collect.Iterables;
|
|
5 |
import com.google.gson.Gson;
|
|
6 |
import eu.dnetlib.data.download.rmi.AbstractDownloadPlugin;
|
|
7 |
import eu.dnetlib.data.download.rmi.DownloadItem;
|
|
8 |
import eu.dnetlib.data.download.rmi.DownloadPlugin;
|
|
9 |
import eu.dnetlib.data.download.rmi.DownloadPluginException;
|
|
10 |
import org.apache.commons.logging.Log;
|
|
11 |
import org.apache.commons.logging.LogFactory;
|
|
12 |
import org.jsoup.Jsoup;
|
|
13 |
import org.jsoup.nodes.Document;
|
|
14 |
import org.jsoup.nodes.Element;
|
|
15 |
import org.jsoup.select.Elements;
|
|
16 |
|
|
17 |
import java.net.HttpURLConnection;
|
|
18 |
import java.net.URL;
|
|
19 |
import java.util.ArrayList;
|
|
20 |
import java.util.List;
|
|
21 |
|
|
22 |
public class DSpacePDFLinkPlugins extends AbstractDownloadPlugin implements DownloadPlugin {
|
|
23 |
|
|
24 |
/**
|
|
25 |
* The Constant log.
|
|
26 |
*/
|
|
27 |
private static final Log log = LogFactory.getLog(DSpacePDFLinkPlugins.class);
|
|
28 |
|
|
29 |
private final static int maxNumberJump = 10;
|
|
30 |
|
|
31 |
|
|
32 |
|
|
33 |
private String getHTTPRedirectedURL(final String mainURL) throws Exception {
|
|
34 |
URL startURL = new URL(mainURL);
|
|
35 |
HttpURLConnection conn = (HttpURLConnection) startURL.openConnection();
|
|
36 |
|
|
37 |
conn.setConnectTimeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT);
|
|
38 |
|
|
39 |
conn.setInstanceFollowRedirects(true); // you still need to handle redirect manully.
|
|
40 |
HttpURLConnection.setFollowRedirects(true);
|
|
41 |
String location = mainURL;
|
|
42 |
|
|
43 |
int numJump =1;
|
|
44 |
|
|
45 |
int responseCode = conn.getResponseCode();
|
|
46 |
|
|
47 |
while ((responseCode >= 300) && (responseCode < 400) && (numJump++ < maxNumberJump)) {
|
|
48 |
location = conn.getHeaderFields().get("Location").get(0);
|
|
49 |
conn.disconnect();
|
|
50 |
startURL = new URL(location);
|
|
51 |
conn = (HttpURLConnection) startURL.openConnection();
|
|
52 |
conn.setConnectTimeout(AbstractDownloadPlugin.DEFAULT_TIMEOUT);
|
|
53 |
conn.setInstanceFollowRedirects(true); // you still need to handle redirect manully.
|
|
54 |
HttpURLConnection.setFollowRedirects(true);
|
|
55 |
responseCode = conn.getResponseCode();
|
|
56 |
}
|
|
57 |
conn.disconnect();
|
|
58 |
if (!((responseCode >= 200) && (responseCode < 300)))
|
|
59 |
return null;
|
|
60 |
return location;
|
|
61 |
}
|
|
62 |
|
|
63 |
|
|
64 |
/**
|
|
65 |
* Extract url.
|
|
66 |
*
|
|
67 |
* @param url the url
|
|
68 |
* @return the string
|
|
69 |
*/
|
|
70 |
@Override
|
|
71 |
public String extractURL(final String url) throws DownloadPluginException {
|
|
72 |
try {
|
|
73 |
|
|
74 |
final String location = getHTTPRedirectedURL(url);
|
|
75 |
|
|
76 |
if (location == null)
|
|
77 |
return null;
|
|
78 |
Document doc = Jsoup.connect(location).get();
|
|
79 |
Elements links = doc.select("meta[content$=.pdf]");
|
|
80 |
|
|
81 |
for (Element link : links) {
|
|
82 |
String linkValue = link.attr("content");
|
|
83 |
if (regularExpression != null) {
|
|
84 |
for (String regex : regularExpression) {
|
|
85 |
if (linkValue.matches(regex))
|
|
86 |
return linkValue;
|
|
87 |
}
|
|
88 |
} else
|
|
89 |
return linkValue;
|
|
90 |
}
|
|
91 |
return null;
|
|
92 |
} catch (Throwable e) {
|
|
93 |
throw new DownloadPluginException("Error on extract URL", e);
|
|
94 |
}
|
|
95 |
|
|
96 |
}
|
|
97 |
|
|
98 |
@Override
|
|
99 |
public Iterable<DownloadItem> retrieveUrls(final Iterable<DownloadItem> urls) {
|
|
100 |
return Iterables.transform(urls, input -> retrieveUrl(input));
|
|
101 |
}
|
|
102 |
|
|
103 |
/*
|
|
104 |
* (non-Javadoc)
|
|
105 |
*
|
|
106 |
* @see eu.dnetlib.data.download.rmi.DownloadPlugin#getPluginName()
|
|
107 |
*/
|
|
108 |
@Override
|
|
109 |
public String getPluginName() {
|
|
110 |
return "DSpacePDFLinkPlugins";
|
|
111 |
}
|
|
112 |
|
|
113 |
/*
|
|
114 |
* (non-Javadoc)
|
|
115 |
*
|
|
116 |
* @see eu.dnetlib.data.download.rmi.DownloadPlugin#retrieveUrl(eu.dnetlib.data.download.rmi.DownloadItem)
|
|
117 |
*/
|
|
118 |
@Override
|
|
119 |
public DownloadItem retrieveUrl(final DownloadItem input) {
|
|
120 |
if (checkOpenAccess(input) == null) return null;
|
|
121 |
String url = input.getOriginalUrl();
|
|
122 |
|
|
123 |
if ((url == null) || (url.trim().length() == 0)) return input;
|
|
124 |
@SuppressWarnings("unchecked")
|
|
125 |
List<String> urls = new Gson().fromJson(url, ArrayList.class);
|
|
126 |
if ((urls == null) || (urls.size() == 0)) return input;
|
|
127 |
if (checkUrlsNotNull(input, urls))
|
|
128 |
return input;
|
|
129 |
input.setOriginalUrl(null);
|
|
130 |
input.setUrl(null);
|
|
131 |
return input;
|
|
132 |
}
|
|
133 |
|
|
134 |
@Override
|
|
135 |
public void setBasePath(final String basePath) {
|
|
136 |
// TODO Auto-generated method stub
|
|
137 |
|
|
138 |
}
|
|
139 |
|
|
140 |
}
|
added new Plugin for DSpace