Revision 31619
Added by Claudio Atzori about 10 years ago

[maven-release-plugin] copy for tag dnet-resource-discovery-2.0.0
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/pom.xml

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<parent>
		<groupId>eu.dnetlib</groupId>
		<artifactId>dnet-parent</artifactId>
		<version>1.0.0</version>
	</parent>
	<modelVersion>4.0.0</modelVersion>
	<groupId>eu.dnetlib</groupId>
	<artifactId>dnet-resource-discovery</artifactId>
	<packaging>jar</packaging>
	<version>2.0.0</version>
	<scm>
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0</developerConnection>
	</scm>
	<dependencies>
		<dependency>
			<groupId>apache</groupId>
			<artifactId>commons-logging</artifactId>
			<version>[1.0.0,1.0.1)</version>
		</dependency>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>${junit.version}</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.w3c</groupId>
			<artifactId>tidy</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>net.matuschek</groupId>
			<artifactId>jobo</artifactId>
			<version>[1.4,2.0)</version>
		</dependency>
		<dependency>
			<groupId>DLS</groupId>
			<artifactId>jOAI</artifactId>
			<version>[2.0.9.3,2.0.10.0)</version>
		</dependency>
		<dependency>
			<groupId>com.thoughtworks</groupId>
			<artifactId>xstream</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.nekohtml</groupId>
			<artifactId>nekohtml</artifactId>
			<version>1.9.16</version>
			<exclusions>
				<exclusion>
					<artifactId>xercesImpl</artifactId>
					<groupId>xerces</groupId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>com.jira</groupId>
			<artifactId>heritrix-commons</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>com.jira</groupId>
			<artifactId>heritrix-modules</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>com.googlecode</groupId>
			<artifactId>kryo</artifactId>
			<version>1.04</version>
			<exclusions>
				<exclusion>
					<groupId>com.googlecode</groupId>
					<artifactId>minlog</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>edu.indiana</groupId>
			<artifactId>xpp3</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>xerces</groupId>
			<artifactId>xercesImpl</artifactId>
			<version>2.11.0</version>
			<scope>provided</scope>
		</dependency>
	</dependencies>
</project>
```
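Several of the dependencies above are declared with Maven version ranges: `[1.4,2.0)` matches any available version from 1.4 inclusive up to (but excluding) 2.0, and `[0.0.0,)` matches any version at all. Such open ranges are resolved against whatever the repository offers at build time, so builds using them are not fully reproducible.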
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/deploy.info

```json
{
	"type_source": "SVN",
	"goal": "package -U -T 4C source:jar",
	"url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-resource-discovery/trunk/",
	"deploy_repository": "dnet4-snapshots",
	"version": "4",
	"mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it",
	"deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots",
	"name": "dnet-resource-discovery"
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/test/java/eu/dnetlib/testWebCrawl/testCrawl.java

```java
package eu.dnetlib.testWebCrawl;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.MethodProviderFileStorageImpl;
import gr.uoa.di.resourcediscovery.UnknownMethodException;
import gr.uoa.di.resourcediscovery.methods.XPathAndCrawl;

import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;
import org.xml.sax.SAXException;

public class testCrawl {

	@Test
	public void test() throws MalformedConfigurationException, UnknownMethodException, IOException, SAXException {

		long starttime = System.currentTimeMillis();
		String fileName = "/tmp/method-map.xml";
		List<String> mimeTypes = Arrays.asList(new String[] { "application/pdf" });
		MethodProvider provider = new MethodProviderFileStorageImpl(fileName);
		URL conUrl = new URL("http://arxiv.org/abs/0908.4286.pdf");
		XPathAndCrawl xpath = new XPathAndCrawl(mimeTypes, null);
		List<String> resources = xpath.getResources(conUrl, provider);
		Assert.assertTrue("The length should be > 0", resources.size() > 0);
		long endtime = System.currentTimeMillis();
		System.out.println((endtime - starttime) / 1000);
	}
}
```
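Note that this test is effectively an integration test: it issues live HTTP requests against arxiv.org and persists the learned method map to `/tmp/method-map.xml`, so it requires network access and leaves state behind between runs.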
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/XPathAndCrawl.java

```java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.Toolkit;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.modules.net.RobotsDirectives;
import org.archive.modules.net.Robotstxt;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

public class XPathAndCrawl implements ResourceDiscoveryMethod {

	private static final Log logger = LogFactory.getLog(XPathAndCrawl.class);

	private boolean resolveFrames = true;
	private boolean skipFirstPage = false;
	private long sleepMillis = 100;
	private boolean ignoreRobotsTxt = false;
	private String agentName = "OpenAIRE_Harvester";
	private List<String> mimeTypes = new ArrayList<String>();
	private boolean fallback = true;
	private String robotstxtUrl = null;

	transient private Robotstxt robot = null;
	transient private RobotsDirectives directives = null;

	private List<String> xpaths = new ArrayList<String>();

	public XPathAndCrawl() {
		this.ignoreRobotsTxt = true;
	}

	// you need one per repository!
	public XPathAndCrawl(List<String> mimeTypes, String robotstxtUrl) throws FileNotFoundException, IOException {
		this.mimeTypes.addAll(mimeTypes);

		if (robotstxtUrl != null) {
			URL url = new URL(robotstxtUrl);
			try {
				BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
				this.robot = new Robotstxt(in);
				this.directives = this.robot.getDirectivesFor(agentName);
			} catch (FileNotFoundException ex) {
				logger.debug("Robots.txt was not found at " + robotstxtUrl);
				ignoreRobotsTxt = true;
			}
		} else {
			ignoreRobotsTxt = true;
		}
	}

	public void setRobotstxt(String robotstxtUrl) throws FileNotFoundException, IOException {
		this.robotstxtUrl = robotstxtUrl;
		if (robotstxtUrl != null) {
			URL url = new URL(robotstxtUrl);
			try {
				BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
				this.robot = new Robotstxt(in);
				this.directives = this.robot.getDirectivesFor(agentName);
			} catch (FileNotFoundException ex) {
				logger.debug("Robots.txt was not found at " + robotstxtUrl);
				ignoreRobotsTxt = true;
			}
		} else {
			ignoreRobotsTxt = true;
		}
	}

	public String getRobotstxtUrl() {
		return robotstxtUrl;
	}

	@Override
	public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException {

		String pageUrl = upageUrl.toString();

		logger.debug("Known xpaths: " + this.xpaths);

		pageUrl = Toolkit.getRedirectedUrl(pageUrl, this.sleepMillis);

		logger.debug("Resolved possible redirections. Url: " + pageUrl);

		List<String> ret = new ArrayList<String>();
		List<String> urls = new ArrayList<String>();
		urls.add(pageUrl);

		// check if url is a redirection

		if (this.mimeTypes.contains(Toolkit.getMimeType(pageUrl, this.sleepMillis))) {
			ret.add(Toolkit.makeAbsolute(pageUrl, new URL(pageUrl)));
			return ret;
		}

		if (this.resolveFrames) {
			DOMParser parser = new DOMParser();
			parser.parse(pageUrl);
			Document doc = parser.getDocument();
			urls.addAll(resolveFrames(doc, new URL(pageUrl)));
			logger.debug("urls after resolving frames: " + urls);
		}

		if (this.skipFirstPage) {
			List<String> addme = new ArrayList<String>();
			for (String url : urls) {
				DOMParser parser = new DOMParser();
				parser.parse(url);
				Document doc = parser.getDocument();
				addme.addAll(oneDepthDown(doc, new URL(url)));
			}

			urls.remove(pageUrl);

			if (this.resolveFrames) {
				for (String url : urls) {
					DOMParser parser = new DOMParser();
					parser.parse(url);
					Document doc = parser.getDocument();
					addme.addAll(resolveFrames(doc, new URL(url)));
				}
			}

			urls.addAll(addme);
			logger.debug("urls after skipping 1st page and resolving frames: " + urls);
		}

		for (String url : urls) {
			logger.debug("looking for resource in: " + url);
			try {
				url = Toolkit.makeAbsolute(url, new URL(pageUrl));
			} catch (Exception e) {
				e.printStackTrace();
				continue;
			}
			URL startingUrl = new URL(url);

			if (!this.ignoreRobotsTxt)
				if (!this.directives.allows(Toolkit.makeRelative(startingUrl))) {
					logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
					continue;
				}

			if (this.xpaths.size() == 0) {
				logger.debug("No xpath information, crawling");
				// this for the first time
				DOMParser parser = new DOMParser();
				parser.parse(startingUrl.toString());
				Document doc = parser.getDocument();

				List<Node> resourceNodes = findNodesWithResource(doc, startingUrl);

				for (Node resourceNode : resourceNodes) {
					String xp = getXpathToRoot(resourceNode);
					xpaths.add(xp);
					logger.debug(xp);
				}

				try {
					URL methodUrl = new URL(pageUrl);
					provider.setMethod(new URL(methodUrl.getProtocol() + "://" + methodUrl.getHost()), this);
				} catch (MalformedConfigurationException e) {
					logger.error("Error updating xpath information", e);
				}

				for (String xp : xpaths) {
					String resourceUrl = getResourceUrl(xp, doc, startingUrl);
					if (resourceUrl != null) {
						logger.debug(resourceUrl);
						ret.add(resourceUrl);
					}
				}
			} else {
				// this is for the rest of the pages of the repo
				DOMParser parser = new DOMParser();
				parser.parse(startingUrl.toString());
				Document doc = parser.getDocument();

				for (String xp : xpaths) {
					String resourceUrl = getResourceUrl(xp, doc, startingUrl);
					if (resourceUrl != null) {
						logger.debug(resourceUrl);
						ret.add(resourceUrl);
					}
				}
			}
		}

		if (ret.size() == 0 && this.fallback) {
			// if no xpath contained the resource, try to find it and add
			// all the xpaths
			for (String url : urls) {
				logger.debug("looking for resource in (not found in xpath): " + url);

				try {
					url = Toolkit.makeAbsolute(url, new URL(pageUrl));
				} catch (Exception e) {
					e.printStackTrace();
					continue;
				}
				URL startingUrl = new URL(url);

				if (!this.ignoreRobotsTxt)
					if (!this.directives.allows(Toolkit.makeRelative(startingUrl))) {
						logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
						continue;
					}

				DOMParser parser = new DOMParser();
				parser.parse(startingUrl.toString());
				Document doc = parser.getDocument();
				List<Node> resourceNodes = findNodesWithResource(doc, startingUrl);
				for (Node resourceNode : resourceNodes) {
					String xp = getXpathToRoot(resourceNode);
					xpaths.add(xp);
					logger.debug(xp);
				}

				try {
					URL methodUrl = new URL(pageUrl);
					provider.setMethod(new URL(methodUrl.getProtocol() + "://" + methodUrl.getHost()), this);
				} catch (MalformedConfigurationException e) {
					logger.error("Error updating xpath information", e);
				}

				for (String xp : xpaths) {
					String resourceUrl = getResourceUrl(xp, doc, startingUrl);
					if (resourceUrl != null) {
						logger.debug(resourceUrl);
						ret.add(resourceUrl);
					}
				}
			}
		}

		return ret;
	}

	private List<String> resolveFrames(Document doc, URL connectionUrl) {
		List<String> ret = new ArrayList<String>();

		DocumentTraversal traversal = (DocumentTraversal) doc;

		NodeIterator iterator = null;
		try {
			iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			e.printStackTrace();
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (n.getNodeName().equals("FRAME") || n.getNodeName().equals("IFRAME")) {
				String url = n.getAttributes().getNamedItem("src").getNodeValue();
				try {
					url = Toolkit.makeAbsolute(url, connectionUrl);
					ret.add(url);
				} catch (MalformedURLException ex) {
					continue;
				}
			}
		}
		return ret;
	}

	private List<String> oneDepthDown(Document doc, URL connectionUrl) throws IOException {
		List<String> ret = new ArrayList<String>();

		DocumentTraversal traversal = (DocumentTraversal) doc;

		NodeIterator iterator = null;
		try {
			iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			e.printStackTrace();
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (n.getNodeName().equals("A")) {
				String url = n.getAttributes().getNamedItem("href").getNodeValue();
				try {
					url = Toolkit.makeAbsolute(url, connectionUrl);
					if (Toolkit.getMimeType(url, this.sleepMillis).trim().contains("text/html"))
						ret.add(url);
				} catch (MalformedURLException ex) {
					continue;
				}
			}
		}
		return ret;
	}

	private String getXpathToRoot(Node node) {
		String xpath = "";
		do {
			if (node.getNodeName().equals("HTML")) {
				int before = 1;
				while ((node = node.getPreviousSibling()) != null)
					before++;
				return "/HTML[" + before + "]" + xpath;
			}
			int before = 0;
			Node current = node;
			while ((current = current.getPreviousSibling()) != null)
				if (current.getNodeName().equals(node.getNodeName()))
					before++;
			xpath = "/" + node.getNodeName() + "[" + (before + 1) + "]" + xpath;
		} while ((node = node.getParentNode()) != null);
		return xpath;
	}

	private List<Node> findNodesWithResource(Document doc, URL connectionUrl) throws IOException {
		List<Node> ret = new ArrayList<Node>();

		DocumentTraversal traversal = (DocumentTraversal) doc;

		NodeIterator iterator = null;
		try {
			iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			e.printStackTrace();
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (n.getNodeName().equals("A")) {
				String url = null;
				try {
					url = n.getAttributes().getNamedItem("href").getNodeValue();
				} catch (NullPointerException e) {
					// anchor without href
					continue;
				}
				if (url == null)
					continue;
				try {
					url = Toolkit.makeAbsolute(url, connectionUrl);
					if (this.mimeTypes.contains(Toolkit.getMimeType(url, this.sleepMillis).trim()))
						ret.add(n);
				} catch (MalformedURLException ex) {
					continue;
				}
			}
		}
		return ret;
	}

	private String getResourceUrl(String xpath, Document doc, URL url) throws MalformedURLException {
		try {
			Node current = doc.getFirstChild();
			String[] elements = xpath.split("/");
			for (String element : elements) {
				if (element.trim().equals(""))
					continue;
				int position = Integer.parseInt(element.substring(element.indexOf('[')).replaceAll("\\[", "").replaceAll("\\]", ""));
				String name = element.substring(0, element.indexOf('['));
				int found = 0;
				do {
					if (current.getNodeName().equals(name)) {
						found++;
						if (found == position) {
							current = current.getFirstChild();
							break;
						}
					}
				} while ((current = current.getNextSibling()) != null);

			}
			String ret = current.getParentNode().getAttributes().getNamedItem("href").getNodeValue();
			return Toolkit.makeAbsolute(ret, url);
		} catch (Exception e) {
			return null;
		}
	}

	private Object readResolve() throws IOException {
		if (robotstxtUrl != null) {
			URL url = new URL(robotstxtUrl);
			BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
			this.robot = new Robotstxt(in);
			this.directives = this.robot.getDirectivesFor(agentName);
		} else {
			ignoreRobotsTxt = true;
		}

		return this;
	}

	public boolean isResolveFrames() {
		return resolveFrames;
	}

	public void setResolveFrames(boolean resolveFrames) {
		this.resolveFrames = resolveFrames;
	}

	public boolean isSkipFirstPage() {
		return skipFirstPage;
	}

	public void setSkipFirstPage(boolean skipFirstPage) {
		this.skipFirstPage = skipFirstPage;
	}

	public long getSleepMillis() {
		return sleepMillis;
	}

	public void setSleepMillis(long sleepMillis) {
		this.sleepMillis = sleepMillis;
	}

	public List<String> getMimeTypes() {
		return mimeTypes;
	}

	public void setMimeTypes(List<String> mimeTypes) {
		this.mimeTypes = mimeTypes;
	}

	public List<String> getXpaths() {
		return xpaths;
	}

	public void setXpaths(List<String> xpaths) {
		this.xpaths = xpaths;
	}

	public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
		this.ignoreRobotsTxt = ignoreRobotsTxt;
	}

	public boolean isIgnoreRobotsTxt() {
		return ignoreRobotsTxt;
	}

	public void setAgentName(String agentName) {
		this.agentName = agentName;
		// guard: robot is null when no robots.txt was loaded, which would NPE here
		if (this.robot != null)
			this.directives = this.robot.getDirectivesFor(agentName);
	}

	public String getAgentName() {
		return agentName;
	}

	public void setFallback(boolean fallback) {
		this.fallback = fallback;
	}

	public boolean isFallback() {
		return fallback;
	}

}
```
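For orientation, here is a minimal usage sketch of `XPathAndCrawl` together with the file-backed `MethodProvider`, mirroring the unit test above; the repository URLs are illustrative placeholders:

```java
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.MethodProviderFileStorageImpl;
import gr.uoa.di.resourcediscovery.methods.XPathAndCrawl;

import java.net.URL;
import java.util.Arrays;
import java.util.List;

public class XPathAndCrawlSketch {

	public static void main(String[] args) throws Exception {
		// the provider persists the xpaths learned per repository host
		MethodProvider provider = new MethodProviderFileStorageImpl("/tmp/method-map.xml");

		// one instance per repository; a non-null robots.txt URL enables robots.txt checking
		List<String> mimeTypes = Arrays.asList("application/pdf");
		XPathAndCrawl method = new XPathAndCrawl(mimeTypes, "http://repository.example.org/robots.txt");
		method.setSleepMillis(500); // pause between HTTP requests, to be polite

		// the first page crawled learns the xpaths (and stores them via the provider);
		// subsequent pages of the same repository reuse them
		List<String> resources = method.getResources(new URL("http://repository.example.org/record/1"), provider);
		for (String resource : resources)
			System.out.println(resource);
	}
}
```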
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/ResourceDiscoveryMethod.java

```java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MethodProvider;

import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.xml.sax.SAXException;

public interface ResourceDiscoveryMethod {

	public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException;
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/URLTransformation.java

```java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MethodProvider;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class URLTransformation implements ResourceDiscoveryMethod {

	private String regex = null, replacement = "";
	private String addToEnd = "";

	@Override
	public List<String> getResources(URL upageUrl, MethodProvider provider) {
		String pageUrl = upageUrl.toString();
		String trsf = pageUrl;
		if (regex != null && !regex.trim().equals(""))
			trsf = pageUrl.replaceAll(regex, replacement);

		trsf = trsf + addToEnd;

		List<String> ret = new ArrayList<String>();
		ret.add(trsf);

		return ret;
	}

	public String getRegex() {
		return regex;
	}

	public void setRegex(String regex) {
		this.regex = regex;
	}

	public String getAddToEnd() {
		return addToEnd;
	}

	public void setAddToEnd(String addToEnd) {
		this.addToEnd = addToEnd;
	}

	public String getReplacement() {
		return replacement;
	}

	public void setReplacement(String replacement) {
		this.replacement = replacement;
	}

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/Toolkit.java

```java
package gr.uoa.di.resourcediscovery;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class Toolkit {

	private static final Log logger = LogFactory.getLog(Toolkit.class);
	static int timeout = 10000;

	static public String makeAbsolute(String url, URL connectionUrl) throws MalformedURLException {
		return new URL(connectionUrl, url).toString();
	}

	static public String makeRelative(URL connectionUrl) throws MalformedURLException {
		return connectionUrl.getPath();
	}

	static public String getRedirectedUrl(String resourceURL, long sleepMillis) throws IOException, MalformedURLException {
		URL url = null;

		try {
			url = new URL(resourceURL);
		} catch (MalformedURLException mue) {
			logger.error("Error opening first url", mue);
			throw mue;
		}

		HttpURLConnection.setFollowRedirects(false);

		HttpURLConnection conn = null;
		try {
			Thread.sleep(sleepMillis);
			conn = (HttpURLConnection) url.openConnection();
			conn.setConnectTimeout(timeout);
			conn.setReadTimeout(timeout);
			conn.setAllowUserInteraction(false);
			conn.setDoOutput(true);
		} catch (ClassCastException ex) {
			throw new MalformedURLException();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}

		conn.setRequestMethod("HEAD");

		try {
			conn = openConnectionCheckRedirects(conn, sleepMillis);
		} catch (Exception ex) {
			throw new MalformedURLException();
		}

		try {
			Thread.sleep(sleepMillis);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		int statusCode = conn.getResponseCode();
		if (statusCode == 503) {
			logger.error("Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
			conn.disconnect();

			throw new IOException("Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
		} else if (conn.getResponseCode() >= 400) {
			// Client or server error received
			logger.error("Url " + conn.getURL() + " seems to be unreachable (response code:" + statusCode + "). If this url is not of importance you can ignore this error.");
			conn.disconnect();

			throw new IOException("Url " + conn.getURL() + " seems to be unreachable (response code:" + statusCode + "). If this url is not of importance you can ignore this error.");
		} else {
			return conn.getURL().toString();
		}
	}

	static public String getMimeType(String resourceURL, long sleepMillis) throws IOException, MalformedURLException {
		URL url = null;

		try {
			url = new URL(resourceURL);
		} catch (MalformedURLException mue) {
			logger.debug("Error getting mime type" + mue);
			throw mue;
		}

		HttpURLConnection.setFollowRedirects(false);

		HttpURLConnection conn = null;
		try {
			Thread.sleep(sleepMillis);
			conn = (HttpURLConnection) url.openConnection();
			conn.setConnectTimeout(timeout);
			conn.setReadTimeout(timeout);
			conn.setAllowUserInteraction(false);
			conn.setDoOutput(true);
		} catch (ClassCastException ex) {
			throw new MalformedURLException();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}

		conn.setRequestMethod("HEAD");

		try {
			conn = openConnectionCheckRedirects(conn, sleepMillis);
		} catch (Exception ex) {
			throw new MalformedURLException();
		}

		try {
			Thread.sleep(sleepMillis);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		int statusCode = conn.getResponseCode();
		if (statusCode == 503) {
			logger.error("WARNING: Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
			conn.disconnect();

			return "unknown";
		} else if (conn.getResponseCode() >= 400) {
			// Client or server error received
			logger.error("WARNING: Url " + conn.getURL() + " seems to be unreachable (response code:" + statusCode + "). If this url is not of importance you can ignore this error.");
			conn.disconnect();

			return "unknown";
		} else {
			String mimeType = conn.getContentType();

			logger.debug("mime type for " + conn.getURL() + ": " + mimeType);
			logger.debug("response code was: " + statusCode);
			conn.disconnect();
			if (mimeType == null)
				mimeType = "unknown";
			return mimeType.replaceAll(";.*", "").trim();
		}
	}

	static public HttpURLConnection openConnectionCheckRedirects(URLConnection c, long sleepMillis) throws IOException {
		boolean redir;
		int redirects = 0;

		do {
			redir = false;
			if (c instanceof HttpURLConnection) {
				HttpURLConnection http = (HttpURLConnection) c;
				try {
					Thread.sleep(sleepMillis);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
				int stat = http.getResponseCode();

				if (stat >= 300 && stat <= 307 && stat != 306 && stat != HttpURLConnection.HTTP_NOT_MODIFIED) {
					URL base = http.getURL();
					String loc = http.getHeaderField("Location");
					URL target = null;
					if (loc != null) {
						target = new URL(base, loc);
					}
					http.disconnect();
					// Redirection should be allowed only for HTTP and HTTPS
					// and should be limited to 5 redirections at most.
					if (target == null || !(target.getProtocol().equals("http") || target.getProtocol().equals("https")) || redirects >= 5) {
						throw new IOException("Redirection should be allowed only for HTTP and HTTPS and should be limited to 5 redirections at most.");
					}
					redir = true;
					try {
						Thread.sleep(sleepMillis);
					} catch (InterruptedException e) {
						e.printStackTrace();
					}
					c = target.openConnection();
					c.setConnectTimeout(timeout);
					c.setReadTimeout(timeout);
					c.setAllowUserInteraction(false);
					c.setDoOutput(true);
					redirects++;
				}
			}
		} while (redir);

		return (HttpURLConnection) c;
	}
}
```
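A short sketch of how the `Toolkit` helpers combine (the URL is an illustrative placeholder): both calls issue HEAD requests, follow at most five HTTP/HTTPS redirects via `openConnectionCheckRedirects`, and sleep for `sleepMillis` between requests to throttle the crawl:

```java
import gr.uoa.di.resourcediscovery.Toolkit;

public class ToolkitSketch {

	public static void main(String[] args) throws Exception {
		long sleepMillis = 500;
		// resolve redirections first, then ask the final target for its content type
		String resolved = Toolkit.getRedirectedUrl("http://repository.example.org/record/1", sleepMillis);
		String mimeType = Toolkit.getMimeType(resolved, sleepMillis); // e.g. "application/pdf", or "unknown"
		System.out.println(resolved + " -> " + mimeType);
	}
}
```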
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/UnknownMethodException.java

```java
package gr.uoa.di.resourcediscovery;

public class UnknownMethodException extends Exception {
	private static final long serialVersionUID = 760327436365242998L;

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MethodProviderFileStorageImpl.java

```java
package gr.uoa.di.resourcediscovery;

import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;

import com.thoughtworks.xstream.XStream;

public class MethodProviderFileStorageImpl implements MethodProvider {

	private String pathToFile = null;

	HashMap<URL, ResourceDiscoveryMethod> map = new HashMap<URL, ResourceDiscoveryMethod>();

	public MethodProviderFileStorageImpl() {

	}

	@SuppressWarnings("unchecked")
	public MethodProviderFileStorageImpl(String pathToFile) throws FileNotFoundException {
		XStream xstream = new XStream();
		if (!(new File(pathToFile).exists()))
			map = new HashMap<URL, ResourceDiscoveryMethod>();
		else
			map = (HashMap<URL, ResourceDiscoveryMethod>) xstream.fromXML(new FileReader(new File(pathToFile)));
		this.pathToFile = pathToFile;
	}

	@Override
	public ResourceDiscoveryMethod getMethod(URL baseUrl) throws MalformedConfigurationException, UnknownMethodException, IOException {
		baseUrl = new URL(Toolkit.getRedirectedUrl(baseUrl.toString(), 500));
		ResourceDiscoveryMethod ret = map.get(new URL(baseUrl.getProtocol() + "://" + baseUrl.getHost()));
		return ret;
	}

	@Override
	public void setMethod(URL baseUrl, ResourceDiscoveryMethod method) {
		map.put(baseUrl, method);
		try {
			store();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public String getPathToFile() {
		return pathToFile;
	}

	public void setPathToFile(String pathToFile) {
		this.pathToFile = pathToFile;
	}

	public void store() throws IOException {
		XStream xstream = new XStream();
		xstream.toXML(map, new FileWriter(new File(pathToFile)));
	}

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MethodProvider.java

```java
package gr.uoa.di.resourcediscovery;

import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

public interface MethodProvider {

	public ResourceDiscoveryMethod getMethod(URL baseUrl) throws MalformedConfigurationException, UnknownMethodException, MalformedURLException, IOException;

	public void setMethod(URL baseUrl, ResourceDiscoveryMethod method) throws MalformedConfigurationException;
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MalformedConfigurationException.java

```java
package gr.uoa.di.resourcediscovery;

public class MalformedConfigurationException extends Exception {

	private static final long serialVersionUID = 8557374776080985539L;

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/Crawler.java

```java
package eu.dnetlib.data.utility.resource_discovery.crawler;

import java.io.IOException;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Vector;

import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.dnetlib.data.utility.resource_discovery.crawler.config.Configs;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

public class Crawler {
	private static final Log logger = LogFactory.getLog(Crawler.class);
	private WebRobot crawler;

	public Crawler() throws IOException, HttpException {
		crawler = new WebRobot();
		Configs.configureCrawler(crawler);
	}

	public Crawler(boolean isValidator) throws IOException, HttpException {
		crawler = new WebRobot();
		if (isValidator)
			Configs.configureCrawlerForValidation(crawler);
		else
			Configs.configureCrawler(crawler);
	}

	public void reconfigureForRetry() {
		crawler.setMaxDepth(2);
	}

	public Vector<String> getLinks(String url) throws MalformedURLException, IOException, InterruptedException {
		logger.debug("Retrieving links from url " + url);
		crawler.setStartURL(new URL(UrlFilter.resolveRedirections(url)));
		StringWriter sw = new StringWriter();
		URLLogger log = new URLLogger(sw);
		crawler.setDocManager(log);

		crawler.run();

		String[] links = sw.getBuffer().toString().split("\n");
		Vector<String> linksV = new Vector<String>();
		for (int i = 0; i < links.length; i++)
			linksV.add(links[i]);
		return linksV;
	}

	/*public Vector<String> getLinksFaster(String url) throws ParserException, IOException, InterruptedException {
		Thread.sleep(Configs.sleepTime);
		HttpURLConnection.setFollowRedirects(true);
		URL URL = new URL(url);
		HttpURLConnection conn = (HttpURLConnection) URL.openConnection();
		Parser parser = new Parser(conn);

		NodeList list = parser.parse(new TagNameFilter("A"));
		Vector<String> links = new Vector<String>();
		for(int i=0; i<list.size(); i++) {
			LinkTag n = (LinkTag) list.elementAt(i);
			links.add(n.extractLink());
		}

		return links;
	}*/

	/**
	 * Only for testing purposes, not supposed to be called
	 */
	public static void main(String[] args) {
		Crawler c;
		try {
			c = new Crawler();
			System.out.println(c.crawler.getAllowWholeHost() + " " + c.crawler.getAllowWholeDomain());
		} catch (Exception e) {
			System.err.println("FATAL ERROR: Crawler could not be configured. Please check your robot.xml parameters and try again.");
			System.err.println(e.getLocalizedMessage());
			e.printStackTrace();
			return;
		}
		String url = "http://www.di.uoa.gr/gr";
		try {
			System.out.println(c.getLinks(url));
		} catch (Exception e) {
			System.err.println("ERROR: Crawler could not retrieve links from url " + url);
			System.err.println(e.getLocalizedMessage());
			e.printStackTrace();
		}
	}

	public WebRobot getCrawler() {
		return this.crawler;
	}
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/ResourceExtractor.java

```java
package eu.dnetlib.data.utility.resource_discovery.crawler;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.IOException;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class ResourceExtractor {
	private static final Log logger = LogFactory.getLog(ResourceExtractor.class);
	private Vector<String> filter;
	private Vector<String> latest;
	private int runned;

	public ResourceExtractor() {
		runned = 0;
		filter = new Vector<String>();
		latest = new Vector<String>();
	}

	public Vector<String> extractResource(Vector<String> urls) throws IOException, InterruptedException {
		logger.debug("Extracting resources from links " + urls);
		runned++;
		Vector<String> ret = new Vector<String>();
		if (runned == 1) {
			filter.addAll(urls);
			for (String url : urls) {
				if (UrlFilter.checkExtension(url) || UrlFilter.checkMimeType(url))
					ret.add(url);
			}
			return ret;
		}
		for (String url : urls) {
			if (!latest.contains(url) && !filter.contains(url) && (UrlFilter.checkExtension(url) || UrlFilter.checkMimeType(url)))
				ret.add(url);
		}
		latest.clear();
		latest.addAll(ret);
		return ret;
	}

}
```
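The `runned` counter makes `extractResource` stateful in a way that is easy to miss: the first call permanently remembers every link of the first page as a baseline filter, and each later call returns only matching links that appeared neither on the first page nor in the previous result. A sketch of the intended call pattern (page URLs are illustrative):

```java
import java.util.Vector;

import eu.dnetlib.data.utility.resource_discovery.crawler.Crawler;
import eu.dnetlib.data.utility.resource_discovery.crawler.ResourceExtractor;

public class ResourceExtractorSketch {

	public static void main(String[] args) throws Exception {
		Crawler crawler = new Crawler();
		ResourceExtractor extractor = new ResourceExtractor();

		// first call: seeds the baseline filter and returns the first page's matching links
		Vector<String> first = extractor.extractResource(crawler.getLinks("http://repository.example.org/page1"));
		// later calls: only links not seen on the first page or in the previous result
		Vector<String> next = extractor.extractResource(crawler.getLinks("http://repository.example.org/page2"));

		System.out.println(first);
		System.out.println(next);
	}
}
```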
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/config/Configs.java

```java
package eu.dnetlib.data.utility.resource_discovery.crawler.config;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.Collection;
import java.util.Vector;

import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

public class Configs {
	private static final Log logger = LogFactory.getLog(Configs.class);
	static public String agentName = "JoBo";
	static public boolean ignoreRobotsTxt = false;
	static public int sleepTime = 3000;
	static public int maxDepth = 1;

	static public boolean walkToOtherHosts = false;
	static public boolean allowWholeHost = false;
	static public boolean allowWholeDomain = false;
	static public boolean flexibleHostCheck = true;
	static public boolean localizeLinks = false;
	static public boolean enableCookies = false;

	static public String startReferer = null;
	static public int maxDocumentAge = -1;
	static public String[] allowedUrl = null;
	static public String[] visitMany = null;
	static public String proxy = null;
	static public int bandwidth = -1;

	private static String readXMLDoc(String filename) throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(Configs.class.getResourceAsStream(filename)));
		String strLine = null;
		StringBuilder builder = new StringBuilder();

		try {
			while ((strLine = br.readLine()) != null)
				builder.append(strLine);

		} finally {
			br.close();
		}

		return builder.toString();
	}

	static {
		try {
			// XMLDoc xd = new XMLDoc("configs/robot.xml", true, true, true);
			logger.debug("Reading configuration file for crawler");
			XMLDoc xd = new XMLDoc();
			//xd.useXmlString(readXMLDoc("/eu/dnetlib/functionality/validator/robot.xml"), true, true, true);
			xd.useXmlString(readXMLDoc("/eu/dnetlib/data/utility/resource_discovery/robot.xml"), true, true, true);

			String[] ret1 = xd.getXmlFields(0, 1, "AgentName");
			String[] ret2 = xd.getXmlFields(0, 1, "IgnoreRobotsTxt");
			String[] ret3 = xd.getXmlFields(0, 1, "SleepTime");
			String[] ret4 = xd.getXmlFields(0, 1, "MaxDepth");
			String[] ret5 = xd.getXmlFields(0, 1, "WalkToOtherHosts");
			String[] ret6 = xd.getXmlFields(0, 1, "AllowWholeHost");
			String[] ret7 = xd.getXmlFields(0, 1, "AllowWholeDomain");
			String[] ret8 = xd.getXmlFields(0, 1, "FlexibleHostCheck");
			String[] ret9 = xd.getXmlFields(0, 1, "LocalizeLinks");
			String[] ret10 = xd.getXmlFields(0, 1, "EnableCookies");
			String[] ret11 = xd.getXmlFields(0, 1, "StartReferer");
			String[] ret12 = xd.getXmlFields(0, 1, "MaxDocumentAge");
			String[] ret13 = xd.getXmlFields(0, 0, "AllowedUrl");
			String[] ret14 = xd.getXmlFields(0, 0, "VisitMany");
			String[] ret15 = xd.getXmlFields(0, 1, "Proxy");
			String[] ret16 = xd.getXmlFields(0, 1, "Bandwidth");
			if (ret1.length > 0)
				agentName = ret1[0];
			if (ret2.length > 0)
				ignoreRobotsTxt = Boolean.parseBoolean(ret2[0]);
			if (ret3.length > 0)
				sleepTime = Integer.parseInt(ret3[0]) * 1000;
			if (ret4.length > 0)
				maxDepth = Integer.parseInt(ret4[0]);
			if (ret5.length > 0)
				walkToOtherHosts = Boolean.parseBoolean(ret5[0]);
			if (ret6.length > 0)
				allowWholeHost = Boolean.parseBoolean(ret6[0]);
			if (ret7.length > 0)
				allowWholeDomain = Boolean.parseBoolean(ret7[0]);
			if (ret8.length > 0)
				flexibleHostCheck = Boolean.parseBoolean(ret8[0]);
			if (ret9.length > 0)
				localizeLinks = Boolean.parseBoolean(ret9[0]);
			if (ret10.length > 0)
				enableCookies = Boolean.parseBoolean(ret10[0]);
			if (ret11.length > 0)
				startReferer = ret11[0];
			if (ret12.length > 0)
				maxDocumentAge = Integer.parseInt(ret12[0]);
			if (ret13.length > 0)
				allowedUrl = ret13;
			if (ret14.length > 0)
				visitMany = ret14;
			if (ret15.length > 0)
				proxy = ret15[0];
			if (ret16.length > 0)
				bandwidth = Integer.parseInt(ret16[0]);
		} catch (IOException e) {
			logger.debug("Error reading robot.xml", e);
		} catch (XMLException e) {
			logger.debug("WARNING: The file robot.xml seems to be malformed. The default settings will be used for the crawler.", e);
		} catch (NumberFormatException e) {
			logger.debug("WARNING: The file robot.xml seems to be malformed (an integer doesn't seem to be of type integer). The default settings will be used for the crawler.", e);
		} catch (Exception e) {
			logger.error("Error configuring", e);
		}
	}

	public static void configureCrawlerForValidation(WebRobot crawler) throws IOException {
		logger.debug("Configuring crawler for validation");
		crawler.setAgentName("Validator");
		crawler.setIgnoreRobotsTxt(false);
		crawler.setSleepTime(1);
		crawler.setMaxDepth(1);
		crawler.setWalkToOtherHosts(false);
		crawler.setAllowWholeHost(true);
		crawler.setAllowWholeDomain(true);
		crawler.setFlexibleHostCheck(true);
		crawler.setEnableCookies(true);

		DownloadRuleSet rules = new DownloadRuleSet();
		int minSize = 1, maxSize = 104857600;
		rules.addRule("text", "html", minSize, maxSize, true);
		Collection<String> mimeTypes = UrlFilter.getRequestedMimeTypes();
		for (String mimeType : mimeTypes) {
			String[] parts = mimeType.split("/");
			if (parts.length < 2) {
				logger.debug("WARNING: Requested mimetype " + mimeType + " seems to be malformed");
				throw new IOException();
			}
			rules.addRule(parts[0], parts[1], minSize, maxSize, true);
		}
		rules.addRule("*", "*", minSize, maxSize, false);
		crawler.setDownloadRuleSet(rules);

	}

	public static void configureCrawler(WebRobot crawler) throws IOException, HttpException {
		logger.debug("Configuring crawler using configuration file parameters");
		crawler.setAgentName(agentName);
		crawler.setIgnoreRobotsTxt(ignoreRobotsTxt);
		crawler.setSleepTime(sleepTime / 1000);
		crawler.setMaxDepth(maxDepth);
		crawler.setWalkToOtherHosts(walkToOtherHosts);
		crawler.setAllowWholeHost(allowWholeHost);
		crawler.setAllowWholeDomain(allowWholeDomain);
		crawler.setFlexibleHostCheck(flexibleHostCheck);
		crawler.setEnableCookies(enableCookies);

		if (startReferer != null)
			crawler.setStartReferer(startReferer);
		if (maxDocumentAge > 0)
			crawler.setMaxDocumentAge(maxDocumentAge);
		if (allowedUrl != null) {
			Vector<String> urls = new Vector<String>();
			for (int i = 0; i < allowedUrl.length; i++)
				urls.add(allowedUrl[i]);
			crawler.setAllowedURLs(urls);
		}
		if (visitMany != null) {
			Vector<String> urls = new Vector<String>();
			for (int i = 0; i < visitMany.length; i++)
				urls.add(visitMany[i]);
			crawler.setVisitMany(urls);
		}
		if (proxy != null)
			crawler.setProxy(proxy);
		if (bandwidth > 0)
			crawler.setBandwidth(bandwidth);

		DownloadRuleSet rules = new DownloadRuleSet();
		int minSize = 1, maxSize = 104857600;
		rules.addRule("text", "html", minSize, maxSize, true);
		Collection<String> mimeTypes = UrlFilter.getRequestedMimeTypes();
		for (String mimeType : mimeTypes) {
			String[] parts = mimeType.split("/");
			if (parts.length < 2) {
				logger.debug("WARNING: Requested mimetype " + mimeType + " seems to be malformed");
				throw new IOException();
			}
			rules.addRule(parts[0], parts[1], minSize, maxSize, true);
		}
		rules.addRule("*", "*", minSize, maxSize, false);
		crawler.setDownloadRuleSet(rules);

	}

	public static void main(String[] args) {
		WebRobot robby = new WebRobot();
		try {
			configureCrawler(robby);
			StringWriter sw = new StringWriter();
			URLLogger log = new URLLogger(sw);
			robby.setDocManager(log);

			robby.run();

			logger.debug(sw.getBuffer().toString());
		} catch (Exception e) {
			logger.debug(e.getLocalizedMessage());
		}
	}
}
```
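Configuration is loaded once, in the static initializer, from `/eu/dnetlib/data/utility/resource_discovery/robot.xml` on the classpath. The recognized fields are AgentName, IgnoreRobotsTxt, SleepTime (given in seconds, stored internally in milliseconds), MaxDepth, WalkToOtherHosts, AllowWholeHost, AllowWholeDomain, FlexibleHostCheck, LocalizeLinks, EnableCookies, StartReferer, MaxDocumentAge, AllowedUrl, VisitMany, Proxy and Bandwidth; missing fields keep the compiled-in defaults above, and a malformed robot.xml makes the loader fall back to those defaults.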
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/harvester/ResourceHarvester.java

```java
package eu.dnetlib.data.utility.resource_discovery.harvester;

import eu.dnetlib.data.utility.resource_discovery.crawler.Crawler;
import eu.dnetlib.data.utility.resource_discovery.crawler.ResourceExtractor;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.IOException;
import java.util.Date;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.oai.harvester.Harvester;
import org.dlese.dpc.oai.harvester.Hexception;
import org.dlese.dpc.oai.harvester.OAIErrorException;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

public class ResourceHarvester {

	private static final Log logger = LogFactory.getLog(ResourceHarvester.class);

	static public String[][] getRecordsFromRepository(String baseUrl) throws Hexception, OAIErrorException {
		return Harvester.harvest(baseUrl, "oai_dc", null, null, null, null, true);
	}

	static public String[][] getRecordsFromRepository(String baseUrl, String set, Date from, Date until) throws Hexception, OAIErrorException {
		return Harvester.harvest(baseUrl, "oai_dc", set, from, until, null, true);
	}

	static public Vector<Vector<String>> getResourceAndLinks(String header, String oaiDcRecord, Crawler crawler, ResourceExtractor extractor) throws IOException, InterruptedException {
		Vector<Vector<String>> retrievedAndExtracted = new Vector<Vector<String>>();
		String id = getDcIdentifier(oaiDcRecord, header);
		if (id != null) {
			String idUrl = UrlFilter.resolveRedirections(id);
			Vector<String> urls;
			urls = crawler.getLinks(idUrl);
			retrievedAndExtracted.add(urls);
			retrievedAndExtracted.add(extractor.extractResource(urls));
			return retrievedAndExtracted;
		}
		return null;
	}

	static public String getIdentifier(String oaiDcRecord, String identifier) throws IOException {
		XMLDoc xd = new XMLDoc();
		try {
			xd.useXmlString(oaiDcRecord, true, true, true);
		} catch (XMLException e) {
			logger.debug("WARNING: The record " + identifier + " seems to be malformed (deleted maybe?)");
			return null;
		}
		try {
			String[] fields = xd.getXmlFields(1, 0, "dc:identifier");
			Vector<String> urls = new Vector<String>();
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			fields = xd.getXmlFields(0, 0, "dc:source");
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			fields = xd.getXmlFields(0, 0, "dc:relation");
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			if (urls.size() == 0) {
				logger.debug("WARNING: The record " + identifier + " does not seem to have a field that is a url");
				return null;
			}
			if (urls.size() == 1)
				return urls.elementAt(0);
			for (String url : urls) {
				if (UrlFilter.checkExtension(url))
					return url;
			}
			logger.debug("WARNING: The record " + identifier + " has multiple fields with valid urls and there is no way to choose one. The first one will be used");
			return urls.elementAt(0);
		} catch (XMLException e) {
			logger.debug("WARNING: The record " + identifier + " does not seem to have a dc:identifier field");
			return null;
		}
	}

	static public String getDcIdentifier(String oaiDcRecord, String identifier) throws IOException {
		XMLDoc xd = new XMLDoc();
		try {
			xd.useXmlString(oaiDcRecord, true, true, true);
		} catch (XMLException e) {
			logger.debug("WARNING: The record " + identifier + " seems to be malformed (deleted maybe?)");
			return null;
		}
		try {
			String[] fields = xd.getXmlFields(1, 0, "dc:identifier");
			Vector<String> urls = new Vector<String>();
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			if (urls.size() == 0) {
				logger.debug("WARNING: The record " + identifier + " does not seem to have a field that is a url");
				return null;
			}
			if (urls.size() == 1)
```