Revision 31619

[maven-release-plugin] copy for tag dnet-resource-discovery-2.0.0

modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <groupId>eu.dnetlib</groupId>
        <artifactId>dnet-parent</artifactId>
        <version>1.0.0</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <groupId>eu.dnetlib</groupId>
    <artifactId>dnet-resource-discovery</artifactId>
    <packaging>jar</packaging>
    <version>2.0.0</version>
    <scm>
        <developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0</developerConnection>
    </scm>
    <dependencies>
        <dependency>
            <groupId>apache</groupId>
            <artifactId>commons-logging</artifactId>
            <version>[1.0.0,1.0.1)</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.w3c</groupId>
            <artifactId>tidy</artifactId>
            <version>[0.0.0,)</version>
        </dependency>
        <dependency>
            <groupId>net.matuschek</groupId>
            <artifactId>jobo</artifactId>
            <version>[1.4,2.0)</version>
        </dependency>
        <dependency>
            <groupId>DLS</groupId>
            <artifactId>jOAI</artifactId>
            <version>[2.0.9.3,2.0.10.0)</version>
        </dependency>
        <dependency>
            <groupId>com.thoughtworks</groupId>
            <artifactId>xstream</artifactId>
            <version>[0.0.0,)</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.nekohtml</groupId>
            <artifactId>nekohtml</artifactId>
            <version>1.9.16</version>
            <exclusions>
                <exclusion>
                    <artifactId>xercesImpl</artifactId>
                    <groupId>xerces</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.jira</groupId>
            <artifactId>heritrix-commons</artifactId>
            <version>[0.0.0,)</version>
        </dependency>
        <dependency>
            <groupId>com.jira</groupId>
            <artifactId>heritrix-modules</artifactId>
            <version>[0.0.0,)</version>
        </dependency>
        <dependency>
            <groupId>com.googlecode</groupId>
            <artifactId>kryo</artifactId>
            <version>1.04</version>
            <exclusions>
                <exclusion>
                    <groupId>com.googlecode</groupId>
                    <artifactId>minlog</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>edu.indiana</groupId>
            <artifactId>xpp3</artifactId>
            <version>[0.0.0,)</version>
        </dependency>
        <dependency>
            <groupId>xerces</groupId>
            <artifactId>xercesImpl</artifactId>
            <version>2.11.0</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>
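Note on the version elements above: Maven treats bracketed versions as ranges, so "[1.4,2.0)" resolves to any available jobo release at or above 1.4 and below 2.0, and "[0.0.0,)" matches any version at all. A pinned equivalent (hypothetical, for illustration only) would be:

    <dependency>
        <groupId>net.matuschek</groupId>
        <artifactId>jobo</artifactId>
        <version>1.4</version>
    </dependency>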
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/deploy.info
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-resource-discovery/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-resource-discovery"}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/test/java/eu/dnetlib/testWebCrawl/testCrawl.java
package eu.dnetlib.testWebCrawl;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.MethodProviderFileStorageImpl;
import gr.uoa.di.resourcediscovery.UnknownMethodException;
import gr.uoa.di.resourcediscovery.methods.XPathAndCrawl;

import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;
import org.xml.sax.SAXException;

public class testCrawl {

    @Test
    public void test() throws MalformedConfigurationException, UnknownMethodException, IOException, SAXException {

        long starttime = System.currentTimeMillis();
        String fileName = "/tmp/method-map.xml";
        List<String> mimeTypes = Arrays.asList(new String[] { "application/pdf" });
        MethodProvider provider = new MethodProviderFileStorageImpl(fileName);
        URL conUrl = new URL("http://arxiv.org/abs/0908.4286.pdf");
        XPathAndCrawl xpath = new XPathAndCrawl(mimeTypes, null);
        List<String> resources = xpath.getResources(conUrl, provider);
        Assert.assertTrue("The length should be > 0", resources.size() > 0);
        long endtime = System.currentTimeMillis();
        System.out.println((endtime - starttime) / 1000);
    }
}
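A minimal sketch of the intended caching flow behind this test, assuming the module above is on the classpath (the file path and record URL are hypothetical; all types are declared later in this diff):

    import gr.uoa.di.resourcediscovery.MethodProvider;
    import gr.uoa.di.resourcediscovery.MethodProviderFileStorageImpl;
    import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;
    import gr.uoa.di.resourcediscovery.methods.XPathAndCrawl;

    import java.net.URL;
    import java.util.Arrays;
    import java.util.List;

    public class MethodCacheSketch {
        public static void main(String[] args) throws Exception {
            MethodProvider provider = new MethodProviderFileStorageImpl("/tmp/method-map.xml");
            URL page = new URL("http://repository.example.org/record/42");

            // getMethod() resolves redirects and keys the lookup on protocol://host.
            ResourceDiscoveryMethod method = provider.getMethod(page);
            if (method == null) {
                // First contact with this host: XPathAndCrawl learns the xpaths and
                // registers itself via provider.setMethod() inside getResources().
                method = new XPathAndCrawl(Arrays.asList("application/pdf"), null);
            }
            List<String> resources = method.getResources(page, provider);
            System.out.println(resources);
        }
    }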
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/XPathAndCrawl.java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.Toolkit;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.modules.net.RobotsDirectives;
import org.archive.modules.net.Robotstxt;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

public class XPathAndCrawl implements ResourceDiscoveryMethod {

    private static final Log logger = LogFactory.getLog(XPathAndCrawl.class);

    private boolean resolveFrames = true;
    private boolean skipFirstPage = false;
    private long sleepMillis = 100;
    private boolean ignoreRobotsTxt = false;
    private String agentName = "OpenAIRE_Harvester";
    private List<String> mimeTypes = new ArrayList<String>();
    private boolean fallback = true;
    private String robotstxtUrl = null;

    transient private Robotstxt robot = null;
    transient private RobotsDirectives directives = null;

    private List<String> xpaths = new ArrayList<String>();

    public XPathAndCrawl() {
        this.ignoreRobotsTxt = true;
    }

    // you need one per repository!
    public XPathAndCrawl(List<String> mimeTypes, String robotstxtUrl) throws FileNotFoundException, IOException {
        this.mimeTypes.addAll(mimeTypes);

        if (robotstxtUrl != null) {
            URL url = new URL(robotstxtUrl);
            try {
                BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
                this.robot = new Robotstxt(in);
                this.directives = this.robot.getDirectivesFor(agentName);
            } catch (FileNotFoundException ex) {
                logger.debug("Robots.txt was not found at " + robotstxtUrl);
                ignoreRobotsTxt = true;
            }
        } else {
            ignoreRobotsTxt = true;
        }
    }

    public void setRobotstxt(String robotstxtUrl) throws FileNotFoundException, IOException {
        this.robotstxtUrl = robotstxtUrl;
        if (robotstxtUrl != null) {
            URL url = new URL(robotstxtUrl);
            try {
                BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
                this.robot = new Robotstxt(in);
                this.directives = this.robot.getDirectivesFor(agentName);
            } catch (FileNotFoundException ex) {
                logger.debug("Robots.txt was not found at " + robotstxtUrl);
                ignoreRobotsTxt = true;
            }
        } else {
            ignoreRobotsTxt = true;
        }
    }

    public String getRobotstxtUrl() {
        return robotstxtUrl;
    }

    @Override
    public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException {

        String pageUrl = upageUrl.toString();

        logger.debug("Known xpaths: " + this.xpaths);

        pageUrl = Toolkit.getRedirectedUrl(pageUrl, this.sleepMillis);

        logger.debug("Resolved possible redirections. Url: " + pageUrl);

        List<String> ret = new ArrayList<String>();
        List<String> urls = new ArrayList<String>();
        urls.add(pageUrl);

        // check if url is a redirection
        if (this.mimeTypes.contains(Toolkit.getMimeType(pageUrl, this.sleepMillis))) {
            ret.add(Toolkit.makeAbsolute(pageUrl, new URL(pageUrl)));
            return ret;
        }

        if (this.resolveFrames) {
            DOMParser parser = new DOMParser();
            parser.parse(pageUrl);
            Document doc = parser.getDocument();
            urls.addAll(resolveFrames(doc, new URL(pageUrl)));
            logger.debug("urls after resolving frames: " + urls);
        }

        if (this.skipFirstPage) {
            List<String> addme = new ArrayList<String>();
            for (String url : urls) {
                DOMParser parser = new DOMParser();
                parser.parse(url);
                Document doc = parser.getDocument();
                addme.addAll(oneDepthDown(doc, new URL(url)));
            }

            urls.remove(pageUrl);

            if (this.resolveFrames) {
                for (String url : urls) {
                    DOMParser parser = new DOMParser();
                    parser.parse(url);
                    Document doc = parser.getDocument();
                    addme.addAll(resolveFrames(doc, new URL(url)));
                }
            }

            urls.addAll(addme);
            logger.debug("urls after skipping 1st page and resolving frames: " + urls);
        }

        for (String url : urls) {
            logger.debug("looking for resource in: " + url);
            try {
                url = Toolkit.makeAbsolute(url, new URL(pageUrl));
            } catch (Exception e) {
                e.printStackTrace();
                continue;
            }
            URL startingUrl = new URL(url);

            if (!this.ignoreRobotsTxt)
                if (!this.directives.allows(Toolkit.makeRelative(startingUrl))) {
                    logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
                    continue;
                }

            if (this.xpaths.size() == 0) {
                logger.debug("No xpath information, crawling");
                // this for the first time
                DOMParser parser = new DOMParser();
                parser.parse(startingUrl.toString());
                Document doc = parser.getDocument();

                List<Node> resourceNodes = findNodesWithResource(doc, startingUrl);

                for (Node resourceNode : resourceNodes) {
                    String xp = getXpathToRoot(resourceNode);
                    xpaths.add(xp);
                    logger.debug(xp);
                }

                try {
                    URL methodUrl = new URL(pageUrl);
                    provider.setMethod(new URL(methodUrl.getProtocol() + "://" + methodUrl.getHost()), this);
                } catch (MalformedConfigurationException e) {
                    logger.error("Error updating xpath information", e);
                }

                for (String xp : xpaths) {
                    String resourceUrl = getResourceUrl(xp, doc, startingUrl);
                    if (resourceUrl != null) {
                        logger.debug(resourceUrl);
                        ret.add(resourceUrl);
                    }
                }
            } else {
                // this is for the rest of the pages of the repo
                DOMParser parser = new DOMParser();
                parser.parse(startingUrl.toString());
                Document doc = parser.getDocument();

                for (String xp : xpaths) {
                    String resourceUrl = getResourceUrl(xp, doc, startingUrl);
                    if (resourceUrl != null) {
                        logger.debug(resourceUrl);
                        ret.add(resourceUrl);
                    }
                }
            }
        }

        if (ret.size() == 0 && this.fallback) {
            // if no xpath contained the resource, try to find it and add
            // all the xpaths
            for (String url : urls) {
                logger.debug("looking for resource in (not found in xpath): " + url);

                try {
                    url = Toolkit.makeAbsolute(url, new URL(pageUrl));
                } catch (Exception e) {
                    e.printStackTrace();
                    continue;
                }
                URL startingUrl = new URL(url);

                if (!this.ignoreRobotsTxt)
                    if (!this.directives.allows(Toolkit.makeRelative(startingUrl))) {
                        logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
                        continue;
                    }

                DOMParser parser = new DOMParser();
                parser.parse(startingUrl.toString());
                Document doc = parser.getDocument();
                List<Node> resourceNodes = findNodesWithResource(doc, startingUrl);
                for (Node resourceNode : resourceNodes) {
                    String xp = getXpathToRoot(resourceNode);
                    xpaths.add(xp);
                    logger.debug(xp);
                }

                try {
                    URL methodUrl = new URL(pageUrl);
                    provider.setMethod(new URL(methodUrl.getProtocol() + "://" + methodUrl.getHost()), this);
                } catch (MalformedConfigurationException e) {
                    logger.error("Error updating xpath information", e);
                }

                for (String xp : xpaths) {
                    String resourceUrl = getResourceUrl(xp, doc, startingUrl);
                    if (resourceUrl != null) {
                        logger.debug(resourceUrl);
                        ret.add(resourceUrl);
                    }
                }
            }
        }

        return ret;
    }

    private List<String> resolveFrames(Document doc, URL connectionUrl) {
        List<String> ret = new ArrayList<String>();

        DocumentTraversal traversal = (DocumentTraversal) doc;

        NodeIterator iterator = null;
        try {
            iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
        } catch (Exception e) {
            e.printStackTrace();
            return ret;
        }

        for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
            if (n.getNodeName().equals("FRAME") || n.getNodeName().equals("IFRAME")) {
                String url = n.getAttributes().getNamedItem("src").getNodeValue();
                try {
                    url = Toolkit.makeAbsolute(url, connectionUrl);
                    ret.add(url);
                } catch (MalformedURLException ex) {
                    continue;
                }
            }
        }
        return ret;
    }

    private List<String> oneDepthDown(Document doc, URL connectionUrl) throws IOException {
        List<String> ret = new ArrayList<String>();

        DocumentTraversal traversal = (DocumentTraversal) doc;

        NodeIterator iterator = null;
        try {
            iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
        } catch (Exception e) {
            e.printStackTrace();
            return ret;
        }

        for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
            if (n.getNodeName().equals("A")) {
                String url = n.getAttributes().getNamedItem("href").getNodeValue();
                try {
                    url = Toolkit.makeAbsolute(url, connectionUrl);
                    if (Toolkit.getMimeType(url, this.sleepMillis).trim().contains("text/html"))
                        ret.add(url);
                } catch (MalformedURLException ex) {
                    continue;
                }
            }
        }
        return ret;
    }

    private String getXpathToRoot(Node node) {
        String xpath = "";
        do {
            if (node.getNodeName().equals("HTML")) {
                int before = 1;
                while ((node = node.getPreviousSibling()) != null)
                    before++;
                return "/HTML[" + before + "]" + xpath;
            }
            int before = 0;
            Node current = node;
            while ((current = current.getPreviousSibling()) != null)
                if (current.getNodeName().equals(node.getNodeName()))
                    before++;
            xpath = "/" + node.getNodeName() + "[" + (before + 1) + "]" + xpath;
        } while ((node = node.getParentNode()) != null);
        return xpath;
    }

    private List<Node> findNodesWithResource(Document doc, URL connectionUrl) throws IOException {
        List<Node> ret = new ArrayList<Node>();

        DocumentTraversal traversal = (DocumentTraversal) doc;

        NodeIterator iterator = null;
        try {
            iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
        } catch (Exception e) {
            e.printStackTrace();
            return ret;
        }

        for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
            if (n.getNodeName().equals("A")) {
                String url = null;
                try {
                    url = n.getAttributes().getNamedItem("href").getNodeValue();
                } catch (NullPointerException e) {
                    // anchor without href
                    continue;
                }
                if (url == null)
                    continue;
                try {
                    url = Toolkit.makeAbsolute(url, connectionUrl);
                    if (this.mimeTypes.contains(Toolkit.getMimeType(url, this.sleepMillis).trim()))
                        ret.add(n);
                } catch (MalformedURLException ex) {
                    continue;
                }
            }
        }
        return ret;
    }

    private String getResourceUrl(String xpath, Document doc, URL url) throws MalformedURLException {
        try {
            Node current = doc.getFirstChild();
            String[] elements = xpath.split("/");
            for (String element : elements) {
                if (element.trim().equals(""))
                    continue;
                int position = Integer.parseInt(element.substring(element.indexOf('[')).replaceAll("\\[", "").replaceAll("\\]", ""));
                String name = element.substring(0, element.indexOf('['));
                int found = 0;
                do {
                    if (current.getNodeName().equals(name)) {
                        found++;
                        if (found == position) {
                            current = current.getFirstChild();
                            break;
                        }
                    }
                } while ((current = current.getNextSibling()) != null);
            }
            String ret = current.getParentNode().getAttributes().getNamedItem("href").getNodeValue();
            return Toolkit.makeAbsolute(ret, url);
        } catch (Exception e) {
            return null;
        }
    }

    private Object readResolve() throws IOException {
        if (robotstxtUrl != null) {
            URL url = new URL(robotstxtUrl);
            BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
            this.robot = new Robotstxt(in);
            this.directives = this.robot.getDirectivesFor(agentName);
        } else {
            ignoreRobotsTxt = true;
        }

        return this;
    }

    public boolean isResolveFrames() {
        return resolveFrames;
    }

    public void setResolveFrames(boolean resolveFrames) {
        this.resolveFrames = resolveFrames;
    }

    public boolean isSkipFirstPage() {
        return skipFirstPage;
    }

    public void setSkipFirstPage(boolean skipFirstPage) {
        this.skipFirstPage = skipFirstPage;
    }

    public long getSleepMillis() {
        return sleepMillis;
    }

    public void setSleepMillis(long sleepMillis) {
        this.sleepMillis = sleepMillis;
    }

    public List<String> getMimeTypes() {
        return mimeTypes;
    }

    public void setMimeTypes(List<String> mimeTypes) {
        this.mimeTypes = mimeTypes;
    }

    public List<String> getXpaths() {
        return xpaths;
    }

    public void setXpaths(List<String> xpaths) {
        this.xpaths = xpaths;
    }

    public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
        this.ignoreRobotsTxt = ignoreRobotsTxt;
    }

    public boolean isIgnoreRobotsTxt() {
        return ignoreRobotsTxt;
    }

    public void setAgentName(String agentName) {
        this.agentName = agentName;
        this.directives = this.robot.getDirectivesFor(agentName);
    }

    public String getAgentName() {
        return agentName;
    }

    public void setFallback(boolean fallback) {
        this.fallback = fallback;
    }

    public boolean isFallback() {
        return fallback;
    }

}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/ResourceDiscoveryMethod.java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MethodProvider;

import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.xml.sax.SAXException;

public interface ResourceDiscoveryMethod {

    public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException;
}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/URLTransformation.java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MethodProvider;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class URLTransformation implements ResourceDiscoveryMethod {

    private String regex = null, replacement = "";
    private String addToEnd = "";

    @Override
    public List<String> getResources(URL upageUrl, MethodProvider provider) {
        String pageUrl = upageUrl.toString();
        String trsf = pageUrl;
        if (regex != null && !regex.trim().equals(""))
            trsf = pageUrl.replaceAll(regex, replacement);

        trsf = trsf + addToEnd;

        List<String> ret = new ArrayList<String>();
        ret.add(trsf);

        return ret;
    }

    public String getRegex() {
        return regex;
    }

    public void setRegex(String regex) {
        this.regex = regex;
    }

    public String getAddToEnd() {
        return addToEnd;
    }

    public void setAddToEnd(String addToEnd) {
        this.addToEnd = addToEnd;
    }

    public String getReplacement() {
        return replacement;
    }

    public void setReplacement(String replacement) {
        this.replacement = replacement;
    }

}
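A hedged usage sketch for URLTransformation (the regex and URLs are hypothetical examples; the setters and getResources() are exactly those declared above), deriving a direct PDF link from an abstract page by regex rewrite:

    // Inside any method that declares throws MalformedURLException.
    URLTransformation t = new URLTransformation();
    t.setRegex("/abs/");        // hypothetical pattern for one repository
    t.setReplacement("/pdf/");
    t.setAddToEnd(".pdf");
    // The provider argument is unused by this implementation, so null is acceptable here.
    List<String> out = t.getResources(new URL("http://arxiv.org/abs/0908.4286"), null);
    // out contains a single entry: http://arxiv.org/pdf/0908.4286.pdf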
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/Toolkit.java
package gr.uoa.di.resourcediscovery;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class Toolkit {

    private static final Log logger = LogFactory.getLog(Toolkit.class);
    static int timeout = 10000;

    static public String makeAbsolute(String url, URL connectionUrl) throws MalformedURLException {
        return new URL(connectionUrl, url).toString();
    }

    static public String makeRelative(URL connectionUrl) throws MalformedURLException {
        return connectionUrl.getPath();
    }

    static public String getRedirectedUrl(String resourceURL, long sleepMillis) throws IOException, MalformedURLException {
        URL url = null;

        try {
            url = new URL(resourceURL);
        } catch (MalformedURLException mue) {
            logger.error("Error opening first url", mue);
            throw mue;
        }

        HttpURLConnection.setFollowRedirects(false);

        HttpURLConnection conn = null;
        try {
            Thread.sleep(sleepMillis);
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(timeout);
            conn.setReadTimeout(timeout);
            conn.setAllowUserInteraction(false);
            conn.setDoOutput(true);
        } catch (ClassCastException ex) {
            throw new MalformedURLException();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        conn.setRequestMethod("HEAD");

        try {
            conn = openConnectionCheckRedirects(conn, sleepMillis);
        } catch (Exception ex) {
            throw new MalformedURLException();
        }

        try {
            Thread.sleep(sleepMillis);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        int statusCode = conn.getResponseCode();
        if (statusCode == 503) {
            logger.error("Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
            conn.disconnect();

            throw new IOException("Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
        } else if (conn.getResponseCode() >= 400) {
            // Client or server error received
            logger.error("Url " + conn.getURL() + " seems to be unreachable (response code: " + statusCode + "). If this url is not of importance you can ignore this error.");
            conn.disconnect();

            throw new IOException("Url " + conn.getURL() + " seems to be unreachable (response code: " + statusCode + "). If this url is not of importance you can ignore this error.");
        } else {
            return conn.getURL().toString();
        }
    }

    static public String getMimeType(String resourceURL, long sleepMillis) throws IOException, MalformedURLException {
        URL url = null;

        try {
            url = new URL(resourceURL);
        } catch (MalformedURLException mue) {
            logger.debug("Error getting mime type", mue);
            throw mue;
        }

        HttpURLConnection.setFollowRedirects(false);

        HttpURLConnection conn = null;
        try {
            Thread.sleep(sleepMillis);
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(timeout);
            conn.setReadTimeout(timeout);
            conn.setAllowUserInteraction(false);
            conn.setDoOutput(true);
        } catch (ClassCastException ex) {
            throw new MalformedURLException();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        conn.setRequestMethod("HEAD");

        try {
            conn = openConnectionCheckRedirects(conn, sleepMillis);
        } catch (Exception ex) {
            throw new MalformedURLException();
        }

        try {
            Thread.sleep(sleepMillis);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        int statusCode = conn.getResponseCode();
        if (statusCode == 503) {
            logger.error("WARNING: Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
            conn.disconnect();

            return "unknown";
        } else if (conn.getResponseCode() >= 400) {
            // Client or server error received
            logger.error("WARNING: Url " + conn.getURL() + " seems to be unreachable (response code: " + statusCode + "). If this url is not of importance you can ignore this error.");
            conn.disconnect();

            return "unknown";
        } else {
            String mimeType = conn.getContentType();

            logger.debug("mime type for " + conn.getURL() + ": " + mimeType);
            logger.debug("response code was: " + statusCode);
            conn.disconnect();
            if (mimeType == null)
                mimeType = "unknown";
            return mimeType.replaceAll(";.*", "").trim();
        }
    }

    static public HttpURLConnection openConnectionCheckRedirects(URLConnection c, long sleepMillis) throws IOException {
        boolean redir;
        int redirects = 0;

        do {
            redir = false;
            if (c instanceof HttpURLConnection) {
                HttpURLConnection http = (HttpURLConnection) c;
                try {
                    Thread.sleep(sleepMillis);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
                int stat = http.getResponseCode();

                if (stat >= 300 && stat <= 307 && stat != 306 && stat != HttpURLConnection.HTTP_NOT_MODIFIED) {
                    URL base = http.getURL();
                    String loc = http.getHeaderField("Location");
                    URL target = null;
                    if (loc != null) {
                        target = new URL(base, loc);
                    }
                    http.disconnect();
                    // Redirection should be allowed only for HTTP and HTTPS
                    // and should be limited to 5 redirections at most.
                    if (target == null || !(target.getProtocol().equals("http") || target.getProtocol().equals("https")) || redirects >= 5) {
                        throw new IOException("Redirection should be allowed only for HTTP and HTTPS and should be limited to 5 redirections at most.");
                    }
                    redir = true;
                    try {
                        Thread.sleep(sleepMillis);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    c = target.openConnection();
                    c.setConnectTimeout(timeout);
                    c.setReadTimeout(timeout);
                    c.setAllowUserInteraction(false);
                    c.setDoOutput(true);
                    redirects++;
                }
            }
        } while (redir);

        return (HttpURLConnection) c;
    }
}
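A short sketch (the URL is a hypothetical example) of the two static helpers above, which the discovery methods use for polite HEAD requests between sleeps:

    // Inside any method that declares throws IOException.
    String resolved = Toolkit.getRedirectedUrl("http://example.org/record/1", 100);  // follows up to 5 redirects
    String mime = Toolkit.getMimeType(resolved, 100);  // e.g. "application/pdf", or "unknown" on 4xx/503
    if (mime.equals("application/pdf")) {
        // the resolved URL points directly at the resource
    }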
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/UnknownMethodException.java
package gr.uoa.di.resourcediscovery;

public class UnknownMethodException extends Exception {

    private static final long serialVersionUID = 760327436365242998L;

}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MethodProviderFileStorageImpl.java
package gr.uoa.di.resourcediscovery;

import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;

import com.thoughtworks.xstream.XStream;

public class MethodProviderFileStorageImpl implements MethodProvider {

    private String pathToFile = null;

    HashMap<URL, ResourceDiscoveryMethod> map = new HashMap<URL, ResourceDiscoveryMethod>();

    public MethodProviderFileStorageImpl() {

    }

    @SuppressWarnings("unchecked")
    public MethodProviderFileStorageImpl(String pathToFile) throws FileNotFoundException {
        XStream xstream = new XStream();
        if (!(new File(pathToFile).exists()))
            map = new HashMap<URL, ResourceDiscoveryMethod>();
        else
            map = (HashMap<URL, ResourceDiscoveryMethod>) xstream.fromXML(new FileReader(new File(pathToFile)));
        this.pathToFile = pathToFile;
    }

    @Override
    public ResourceDiscoveryMethod getMethod(URL baseUrl) throws MalformedConfigurationException, UnknownMethodException, IOException {
        baseUrl = new URL(Toolkit.getRedirectedUrl(baseUrl.toString(), 500));
        ResourceDiscoveryMethod ret = map.get(new URL(baseUrl.getProtocol() + "://" + baseUrl.getHost()));
        return ret;
    }

    @Override
    public void setMethod(URL baseUrl, ResourceDiscoveryMethod method) {
        map.put(baseUrl, method);
        try {
            store();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public String getPathToFile() {
        return pathToFile;
    }

    public void setPathToFile(String pathToFile) {
        this.pathToFile = pathToFile;
    }

    public void store() throws IOException {
        XStream xstream = new XStream();
        xstream.toXML(map, new FileWriter(new File(pathToFile)));
    }

}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MethodProvider.java
package gr.uoa.di.resourcediscovery;

import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

public interface MethodProvider {

    public ResourceDiscoveryMethod getMethod(URL baseUrl) throws MalformedConfigurationException, UnknownMethodException, MalformedURLException, IOException;

    public void setMethod(URL baseUrl, ResourceDiscoveryMethod method) throws MalformedConfigurationException;
}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MalformedConfigurationException.java
package gr.uoa.di.resourcediscovery;

public class MalformedConfigurationException extends Exception {

    private static final long serialVersionUID = 8557374776080985539L;

}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/Crawler.java
package eu.dnetlib.data.utility.resource_discovery.crawler;

import java.io.IOException;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Vector;

import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.dnetlib.data.utility.resource_discovery.crawler.config.Configs;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

public class Crawler {
    private static final Log logger = LogFactory.getLog(Crawler.class);
    private WebRobot crawler;

    public Crawler() throws IOException, HttpException {
        crawler = new WebRobot();
        Configs.configureCrawler(crawler);
    }

    public Crawler(boolean isValidator) throws IOException, HttpException {
        crawler = new WebRobot();
        if (isValidator)
            Configs.configureCrawlerForValidation(crawler);
        else
            Configs.configureCrawler(crawler);
    }

    public void reconfigureForRetry() {
        crawler.setMaxDepth(2);
    }

    public Vector<String> getLinks(String url) throws MalformedURLException, IOException, InterruptedException {
        logger.debug("Retrieving links from url " + url);
        crawler.setStartURL(new URL(UrlFilter.resolveRedirections(url)));
        StringWriter sw = new StringWriter();
        URLLogger log = new URLLogger(sw);
        crawler.setDocManager(log);

        crawler.run();

        String[] links = sw.getBuffer().toString().split("\n");
        Vector<String> linksV = new Vector<String>();
        for (int i = 0; i < links.length; i++)
            linksV.add(links[i]);
        return linksV;
    }

    /*public Vector<String> getLinksFaster(String url) throws ParserException, IOException, InterruptedException {
        Thread.sleep(Configs.sleepTime);
        HttpURLConnection.setFollowRedirects(true);
        URL URL = new URL(url);
        HttpURLConnection conn = (HttpURLConnection) URL.openConnection();
        Parser parser = new Parser(conn);

        NodeList list = parser.parse(new TagNameFilter("A"));
        Vector<String> links = new Vector<String>();
        for (int i = 0; i < list.size(); i++) {
            LinkTag n = (LinkTag) list.elementAt(i);
            links.add(n.extractLink());
        }

        return links;
    }*/

    /**
     * Only for testing purposes, not supposed to be called
     */
    public static void main(String[] args) {
        Crawler c;
        try {
            c = new Crawler();
            System.out.println(c.crawler.getAllowWholeHost() + " " + c.crawler.getAllowWholeDomain());
        } catch (Exception e) {
            System.err.println("FATAL ERROR: Crawler could not be configured. Please check your robot.xml parameters and try again.");
            System.err.println(e.getLocalizedMessage());
            e.printStackTrace();
            return;
        }
        String url = "http://www.di.uoa.gr/gr";
        try {
            System.out.println(c.getLinks(url));
        } catch (Exception e) {
            System.err.println("ERROR: Crawler could not retrieve links from url " + url);
            System.err.println(e.getLocalizedMessage());
            e.printStackTrace();
        }
    }

    public WebRobot getCrawler() {
        return this.crawler;
    }
}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/ResourceExtractor.java
package eu.dnetlib.data.utility.resource_discovery.crawler;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.IOException;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class ResourceExtractor {
    private static final Log logger = LogFactory.getLog(ResourceExtractor.class);
    private Vector<String> filter;
    private Vector<String> latest;
    private int runned;

    public ResourceExtractor() {
        runned = 0;
        filter = new Vector<String>();
        latest = new Vector<String>();
    }

    public Vector<String> extractResource(Vector<String> urls) throws IOException, InterruptedException {
        logger.debug("Extracting resources from links " + urls);
        runned++;
        Vector<String> ret = new Vector<String>();
        if (runned == 1) {
            filter.addAll(urls);
            for (String url : urls) {
                if (UrlFilter.checkExtension(url) || UrlFilter.checkMimeType(url))
                    ret.add(url);
            }
            return ret;
        }
        for (String url : urls) {
            if (!latest.contains(url) && !filter.contains(url) && (UrlFilter.checkExtension(url) || UrlFilter.checkMimeType(url)))
                ret.add(url);
        }
        latest.clear();
        latest.addAll(ret);
        return ret;
    }

}
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/config/Configs.java
package eu.dnetlib.data.utility.resource_discovery.crawler.config;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.Collection;
import java.util.Vector;

import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

public class Configs {
    private static final Log logger = LogFactory.getLog(Configs.class);
    static public String agentName = "JoBo";
    static public boolean ignoreRobotsTxt = false;
    static public int sleepTime = 3000;
    static public int maxDepth = 1;

    static public boolean walkToOtherHosts = false;
    static public boolean allowWholeHost = false;
    static public boolean allowWholeDomain = false;
    static public boolean flexibleHostCheck = true;
    static public boolean localizeLinks = false;
    static public boolean enableCookies = false;

    static public String startReferer = null;
    static public int maxDocumentAge = -1;
    static public String[] allowedUrl = null;
    static public String[] visitMany = null;
    static public String proxy = null;
    static public int bandwidth = -1;

    private static String readXMLDoc(String filename) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(Configs.class.getResourceAsStream(filename)));
        String strLine = null;
        StringBuilder builder = new StringBuilder();

        try {
            while ((strLine = br.readLine()) != null)
                builder.append(strLine);
        } finally {
            br.close();
        }

        return builder.toString();
    }

    static {
        try {
//          XMLDoc xd = new XMLDoc("configs/robot.xml", true, true, true);
            logger.debug("Reading configuration file for crawler");
            XMLDoc xd = new XMLDoc();
            //xd.useXmlString(readXMLDoc("/eu/dnetlib/functionality/validator/robot.xml"), true, true, true);
            xd.useXmlString(readXMLDoc("/eu/dnetlib/data/utility/resource_discovery/robot.xml"), true, true, true);

            String[] ret1 = xd.getXmlFields(0, 1, "AgentName");
            String[] ret2 = xd.getXmlFields(0, 1, "IgnoreRobotsTxt");
            String[] ret3 = xd.getXmlFields(0, 1, "SleepTime");
            String[] ret4 = xd.getXmlFields(0, 1, "MaxDepth");
            String[] ret5 = xd.getXmlFields(0, 1, "WalkToOtherHosts");
            String[] ret6 = xd.getXmlFields(0, 1, "AllowWholeHost");
            String[] ret7 = xd.getXmlFields(0, 1, "AllowWholeDomain");
            String[] ret8 = xd.getXmlFields(0, 1, "FlexibleHostCheck");
            String[] ret9 = xd.getXmlFields(0, 1, "LocalizeLinks");
            String[] ret10 = xd.getXmlFields(0, 1, "EnableCookies");
            String[] ret11 = xd.getXmlFields(0, 1, "StartReferer");
            String[] ret12 = xd.getXmlFields(0, 1, "MaxDocumentAge");
            String[] ret13 = xd.getXmlFields(0, 0, "AllowedUrl");
            String[] ret14 = xd.getXmlFields(0, 0, "VisitMany");
            String[] ret15 = xd.getXmlFields(0, 1, "Proxy");
            String[] ret16 = xd.getXmlFields(0, 1, "Bandwidth");
            if (ret1.length > 0)
                agentName = ret1[0];
            if (ret2.length > 0)
                ignoreRobotsTxt = Boolean.parseBoolean(ret2[0]);
            if (ret3.length > 0)
                sleepTime = Integer.parseInt(ret3[0]) * 1000;
            if (ret4.length > 0)
                maxDepth = Integer.parseInt(ret4[0]);
            if (ret5.length > 0)
                walkToOtherHosts = Boolean.parseBoolean(ret5[0]);
            if (ret6.length > 0)
                allowWholeHost = Boolean.parseBoolean(ret6[0]);
            if (ret7.length > 0)
                allowWholeDomain = Boolean.parseBoolean(ret7[0]);
            if (ret8.length > 0)
                flexibleHostCheck = Boolean.parseBoolean(ret8[0]);
            if (ret9.length > 0)
                localizeLinks = Boolean.parseBoolean(ret9[0]);
            if (ret10.length > 0)
                enableCookies = Boolean.parseBoolean(ret10[0]);
            if (ret11.length > 0)
                startReferer = ret11[0];
            if (ret12.length > 0)
                maxDocumentAge = Integer.parseInt(ret12[0]);
            if (ret13.length > 0)
                allowedUrl = ret13;
            if (ret14.length > 0)
                visitMany = ret14;
            if (ret15.length > 0)
                proxy = ret15[0];
            if (ret16.length > 0)
                bandwidth = Integer.parseInt(ret16[0]);
        } catch (IOException e) {
            logger.debug("Error reading robot.xml", e);
        } catch (XMLException e) {
            logger.debug("WARNING: The file robot.xml seems to be malformed. The default settings will be used for the crawler.", e);
        } catch (NumberFormatException e) {
            logger.debug("WARNING: The file robot.xml seems to be malformed (an integer doesn't seem to be of type integer). The default settings will be used for the crawler.", e);
        } catch (Exception e) {
            logger.error("Error configuring", e);
        }
    }

    public static void configureCrawlerForValidation(WebRobot crawler) throws IOException {
        logger.debug("Configuring crawler for validation");
        crawler.setAgentName("Validator");
        crawler.setIgnoreRobotsTxt(false);
        crawler.setSleepTime(1);
        crawler.setMaxDepth(1);
        crawler.setWalkToOtherHosts(false);
        crawler.setAllowWholeHost(true);
        crawler.setAllowWholeDomain(true);
        crawler.setFlexibleHostCheck(true);
        crawler.setEnableCookies(true);

        DownloadRuleSet rules = new DownloadRuleSet();
        int minSize = 1, maxSize = 104857600;
        rules.addRule("text", "html", minSize, maxSize, true);
        Collection<String> mimeTypes = UrlFilter.getRequestedMimeTypes();
        for (String mimeType : mimeTypes) {
            String[] parts = mimeType.split("/");
            if (parts.length < 2) {
                logger.debug("WARNING: Requested mimetype " + mimeType + " seems to be malformed");
                throw new IOException();
            }
            rules.addRule(parts[0], parts[1], minSize, maxSize, true);
        }
        rules.addRule("*", "*", minSize, maxSize, false);
        crawler.setDownloadRuleSet(rules);
    }

    public static void configureCrawler(WebRobot crawler) throws IOException, HttpException {
        logger.debug("Configuring crawler using configuration file parameters");
        crawler.setAgentName(agentName);
        crawler.setIgnoreRobotsTxt(ignoreRobotsTxt);
        crawler.setSleepTime(sleepTime / 1000);
        crawler.setMaxDepth(maxDepth);
        crawler.setWalkToOtherHosts(walkToOtherHosts);
        crawler.setAllowWholeHost(allowWholeHost);
        crawler.setAllowWholeDomain(allowWholeDomain);
        crawler.setFlexibleHostCheck(flexibleHostCheck);
        crawler.setEnableCookies(enableCookies);

        if (startReferer != null)
            crawler.setStartReferer(startReferer);
        if (maxDocumentAge > 0)
            crawler.setMaxDocumentAge(maxDocumentAge);
        if (allowedUrl != null) {
            Vector<String> urls = new Vector<String>();
            for (int i = 0; i < allowedUrl.length; i++)
                urls.add(allowedUrl[i]);
            crawler.setAllowedURLs(urls);
        }
        if (visitMany != null) {
            Vector<String> urls = new Vector<String>();
            for (int i = 0; i < visitMany.length; i++)
                urls.add(visitMany[i]);
            crawler.setVisitMany(urls);
        }
        if (proxy != null)
            crawler.setProxy(proxy);
        if (bandwidth > 0)
            crawler.setBandwidth(bandwidth);

        DownloadRuleSet rules = new DownloadRuleSet();
        int minSize = 1, maxSize = 104857600;
        rules.addRule("text", "html", minSize, maxSize, true);
        Collection<String> mimeTypes = UrlFilter.getRequestedMimeTypes();
        for (String mimeType : mimeTypes) {
            String[] parts = mimeType.split("/");
            if (parts.length < 2) {
                logger.debug("WARNING: Requested mimetype " + mimeType + " seems to be malformed");
                throw new IOException();
            }
            rules.addRule(parts[0], parts[1], minSize, maxSize, true);
        }
        rules.addRule("*", "*", minSize, maxSize, false);
        crawler.setDownloadRuleSet(rules);
    }

    public static void main(String[] args) {
        WebRobot robby = new WebRobot();
        try {
            configureCrawler(robby);
            StringWriter sw = new StringWriter();
            URLLogger log = new URLLogger(sw);
            robby.setDocManager(log);

            robby.run();

            logger.debug(sw.getBuffer().toString());
        } catch (Exception e) {
            logger.debug(e.getLocalizedMessage());
        }
    }
}
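The static initializer above loads /eu/dnetlib/data/utility/resource_discovery/robot.xml from the classpath. A hypothetical sketch of that file, using the element names the code queries (the root element and exact nesting are assumptions, not taken from this diff):

    <robot>
        <AgentName>JoBo</AgentName>
        <IgnoreRobotsTxt>false</IgnoreRobotsTxt>
        <SleepTime>3</SleepTime> <!-- seconds; the code multiplies by 1000 -->
        <MaxDepth>1</MaxDepth>
        <WalkToOtherHosts>false</WalkToOtherHosts>
        <AllowWholeHost>false</AllowWholeHost>
        <AllowWholeDomain>false</AllowWholeDomain>
        <FlexibleHostCheck>true</FlexibleHostCheck>
    </robot>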
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/harvester/ResourceHarvester.java
package eu.dnetlib.data.utility.resource_discovery.harvester;

import eu.dnetlib.data.utility.resource_discovery.crawler.Crawler;
import eu.dnetlib.data.utility.resource_discovery.crawler.ResourceExtractor;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.IOException;
import java.util.Date;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.oai.harvester.Harvester;
import org.dlese.dpc.oai.harvester.Hexception;
import org.dlese.dpc.oai.harvester.OAIErrorException;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

public class ResourceHarvester {

    private static final Log logger = LogFactory.getLog(ResourceHarvester.class);

    static public String[][] getRecordsFromRepository(String baseUrl) throws Hexception, OAIErrorException {
        return Harvester.harvest(baseUrl, "oai_dc", null, null, null, null, true);
    }

    static public String[][] getRecordsFromRepository(String baseUrl, String set, Date from, Date until) throws Hexception, OAIErrorException {
        return Harvester.harvest(baseUrl, "oai_dc", set, from, until, null, true);
    }

    static public Vector<Vector<String>> getResourceAndLinks(String header, String oaiDcRecord, Crawler crawler, ResourceExtractor extractor) throws IOException, InterruptedException {
        Vector<Vector<String>> retrievedAndExtracted = new Vector<Vector<String>>();
        String id = getDcIdentifier(oaiDcRecord, header);
        if (id != null) {
            String idUrl = UrlFilter.resolveRedirections(id);
            Vector<String> urls;
            urls = crawler.getLinks(idUrl);
            retrievedAndExtracted.add(urls);
            retrievedAndExtracted.add(extractor.extractResource(urls));
            return retrievedAndExtracted;
        }
        return null;
    }

    static public String getIdentifier(String oaiDcRecord, String identifier) throws IOException {
        XMLDoc xd = new XMLDoc();
        try {
            xd.useXmlString(oaiDcRecord, true, true, true);
        } catch (XMLException e) {
            logger.debug("WARNING: The record " + identifier + " seems to be malformed (deleted maybe?)");
            return null;
        }
        try {
            String[] fields = xd.getXmlFields(1, 0, "dc:identifier");
            Vector<String> urls = new Vector<String>();
            for (String field : fields) {
                if (UrlFilter.isUrl(field))
                    urls.add(field);
            }
            fields = xd.getXmlFields(0, 0, "dc:source");
            for (String field : fields) {
                if (UrlFilter.isUrl(field))
                    urls.add(field);
            }
            fields = xd.getXmlFields(0, 0, "dc:relation");
            for (String field : fields) {
                if (UrlFilter.isUrl(field))
                    urls.add(field);
            }
            if (urls.size() == 0) {
                logger.debug("WARNING: The record " + identifier + " does not seem to have a field that is a url");
                return null;
            }
            if (urls.size() == 1)
                return urls.elementAt(0);
            for (String url : urls) {
                if (UrlFilter.checkExtension(url))
                    return url;
            }
            logger.debug("WARNING: The record " + identifier + " has multiple fields with valid urls and there is no way to choose one. The first one will be used");
            return urls.elementAt(0);
        } catch (XMLException e) {
            logger.debug("WARNING: The record " + identifier + " does not seem to have a dc:identifier field");
            return null;
        }
    }

    static public String getDcIdentifier(String oaiDcRecord, String identifier) throws IOException {
        XMLDoc xd = new XMLDoc();
        try {
            xd.useXmlString(oaiDcRecord, true, true, true);
        } catch (XMLException e) {
            logger.debug("WARNING: The record " + identifier + " seems to be malformed (deleted maybe?)");
            return null;
        }
        try {
            String[] fields = xd.getXmlFields(1, 0, "dc:identifier");
            Vector<String> urls = new Vector<String>();
            for (String field : fields) {
                if (UrlFilter.isUrl(field))
                    urls.add(field);
            }
            if (urls.size() == 0) {
                logger.debug("WARNING: The record " + identifier + " does not seem to have a field that is a url");
                return null;
            }
            if (urls.size() == 1)
... This diff was truncated because it exceeds the maximum size that can be displayed.
