Revision 31619
Added by Claudio Atzori about 10 years ago

[maven-release-plugin] copy for tag dnet-resource-discovery-2.0.0
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/pom.xml

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<parent>
		<groupId>eu.dnetlib</groupId>
		<artifactId>dnet-parent</artifactId>
		<version>1.0.0</version>
	</parent>
	<modelVersion>4.0.0</modelVersion>
	<groupId>eu.dnetlib</groupId>
	<artifactId>dnet-resource-discovery</artifactId>
	<packaging>jar</packaging>
	<version>2.0.0</version>
	<scm>
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0</developerConnection>
	</scm>
	<dependencies>
		<dependency>
			<groupId>apache</groupId>
			<artifactId>commons-logging</artifactId>
			<version>[1.0.0,1.0.1)</version>
		</dependency>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>${junit.version}</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.w3c</groupId>
			<artifactId>tidy</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>net.matuschek</groupId>
			<artifactId>jobo</artifactId>
			<version>[1.4,2.0)</version>
		</dependency>
		<dependency>
			<groupId>DLS</groupId>
			<artifactId>jOAI</artifactId>
			<version>[2.0.9.3,2.0.10.0)</version>
		</dependency>
		<dependency>
			<groupId>com.thoughtworks</groupId>
			<artifactId>xstream</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.nekohtml</groupId>
			<artifactId>nekohtml</artifactId>
			<version>1.9.16</version>
			<exclusions>
				<exclusion>
					<artifactId>xercesImpl</artifactId>
					<groupId>xerces</groupId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>com.jira</groupId>
			<artifactId>heritrix-commons</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>com.jira</groupId>
			<artifactId>heritrix-modules</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>com.googlecode</groupId>
			<artifactId>kryo</artifactId>
			<version>1.04</version>
			<exclusions>
				<exclusion>
					<groupId>com.googlecode</groupId>
					<artifactId>minlog</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>edu.indiana</groupId>
			<artifactId>xpp3</artifactId>
			<version>[0.0.0,)</version>
		</dependency>
		<dependency>
			<groupId>xerces</groupId>
			<artifactId>xercesImpl</artifactId>
			<version>2.11.0</version>
			<scope>provided</scope>
		</dependency>
	</dependencies>
</project>
```
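Several of the dependencies above are declared with Maven version ranges: `[1.4,2.0)` matches any available version from 1.4 inclusive up to (but excluding) 2.0, and `[0.0.0,)` matches any version at all. Such open ranges are resolved against whatever the repository offers at build time, so builds using them are not fully reproducible.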
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/deploy.info

```json
{
	"type_source": "SVN",
	"goal": "package -U -T 4C source:jar",
	"url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-resource-discovery/trunk/",
	"deploy_repository": "dnet4-snapshots",
	"version": "4",
	"mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it",
	"deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots",
	"name": "dnet-resource-discovery"
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/test/java/eu/dnetlib/testWebCrawl/testCrawl.java

```java
package eu.dnetlib.testWebCrawl;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.MethodProviderFileStorageImpl;
import gr.uoa.di.resourcediscovery.UnknownMethodException;
import gr.uoa.di.resourcediscovery.methods.XPathAndCrawl;

import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;
import org.xml.sax.SAXException;

public class testCrawl {

	@Test
	public void test() throws MalformedConfigurationException, UnknownMethodException, IOException, SAXException {

		long starttime = System.currentTimeMillis();
		String fileName = "/tmp/method-map.xml";
		List<String> mimeTypes = Arrays.asList(new String[] { "application/pdf" });
		MethodProvider provider = new MethodProviderFileStorageImpl(fileName);
		URL conUrl = new URL("http://arxiv.org/abs/0908.4286.pdf");
		XPathAndCrawl xpath = new XPathAndCrawl(mimeTypes, null);
		List<String> resources = xpath.getResources(conUrl, provider);
		Assert.assertTrue("The length should be > 0", resources.size() > 0);
		long endtime = System.currentTimeMillis();
		System.out.println((endtime - starttime) / 1000);
	}
}
```
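Note that this test is effectively an integration test: it issues live HTTP requests against arxiv.org and persists the learned method map to `/tmp/method-map.xml`, so it requires network access and leaves state behind between runs.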
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/XPathAndCrawl.java

```java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.Toolkit;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.modules.net.RobotsDirectives;
import org.archive.modules.net.Robotstxt;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

public class XPathAndCrawl implements ResourceDiscoveryMethod {

	private static final Log logger = LogFactory.getLog(XPathAndCrawl.class);

	private boolean resolveFrames = true;
	private boolean skipFirstPage = false;
	private long sleepMillis = 100;
	private boolean ignoreRobotsTxt = false;
	private String agentName = "OpenAIRE_Harvester";
	private List<String> mimeTypes = new ArrayList<String>();
	private boolean fallback = true;
	private String robotstxtUrl = null;

	transient private Robotstxt robot = null;
	transient private RobotsDirectives directives = null;

	private List<String> xpaths = new ArrayList<String>();

	public XPathAndCrawl() {
		this.ignoreRobotsTxt = true;
	}

	// you need one per repository!
	public XPathAndCrawl(List<String> mimeTypes, String robotstxtUrl) throws FileNotFoundException, IOException {
		this.mimeTypes.addAll(mimeTypes);

		if (robotstxtUrl != null) {
			URL url = new URL(robotstxtUrl);
			try {
				BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
				this.robot = new Robotstxt(in);
				this.directives = this.robot.getDirectivesFor(agentName);
			} catch (FileNotFoundException ex) {
				logger.debug("Robots.txt was not found at " + robotstxtUrl);
				ignoreRobotsTxt = true;
			}
		} else {
			ignoreRobotsTxt = true;
		}
	}

	public void setRobotstxt(String robotstxtUrl) throws FileNotFoundException, IOException {
		this.robotstxtUrl = robotstxtUrl;
		if (robotstxtUrl != null) {
			URL url = new URL(robotstxtUrl);
			try {
				BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
				this.robot = new Robotstxt(in);
				this.directives = this.robot.getDirectivesFor(agentName);
			} catch (FileNotFoundException ex) {
				logger.debug("Robots.txt was not found at " + robotstxtUrl);
				ignoreRobotsTxt = true;
			}
		} else {
			ignoreRobotsTxt = true;
		}
	}

	public String getRobotstxtUrl() {
		return robotstxtUrl;
	}

	@Override
	public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException {

		String pageUrl = upageUrl.toString();

		logger.debug("Known xpaths: " + this.xpaths);

		pageUrl = Toolkit.getRedirectedUrl(pageUrl, this.sleepMillis);

		logger.debug("Resolved possible redirections. Url: " + pageUrl);

		List<String> ret = new ArrayList<String>();
		List<String> urls = new ArrayList<String>();
		urls.add(pageUrl);

		// check if url is a redirection

		if (this.mimeTypes.contains(Toolkit.getMimeType(pageUrl, this.sleepMillis))) {
			ret.add(Toolkit.makeAbsolute(pageUrl, new URL(pageUrl)));
			return ret;
		}

		if (this.resolveFrames) {
			DOMParser parser = new DOMParser();
			parser.parse(pageUrl);
			Document doc = parser.getDocument();
			urls.addAll(resolveFrames(doc, new URL(pageUrl)));
			logger.debug("urls after resolving frames: " + urls);
		}

		if (this.skipFirstPage) {
			List<String> addme = new ArrayList<String>();
			for (String url : urls) {
				DOMParser parser = new DOMParser();
				parser.parse(url);
				Document doc = parser.getDocument();
				addme.addAll(oneDepthDown(doc, new URL(url)));
			}

			urls.remove(pageUrl);

			if (this.resolveFrames) {
				for (String url : urls) {
					DOMParser parser = new DOMParser();
					parser.parse(url);
					Document doc = parser.getDocument();
					addme.addAll(resolveFrames(doc, new URL(url)));
				}
			}

			urls.addAll(addme);
			logger.debug("urls after skipping 1st page and resolving frames: " + urls);
		}

		for (String url : urls) {
			logger.debug("looking for resource in: " + url);
			try {
				url = Toolkit.makeAbsolute(url, new URL(pageUrl));
			} catch (Exception e) {
				e.printStackTrace();
				continue;
			}
			URL startingUrl = new URL(url);

			if (!this.ignoreRobotsTxt)
				if (!this.directives.allows(Toolkit.makeRelative(startingUrl))) {
					logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
					continue;
				}

			if (this.xpaths.size() == 0) {
				logger.debug("No xpath information, crawling");
				// this for the first time
				DOMParser parser = new DOMParser();
				parser.parse(startingUrl.toString());
				Document doc = parser.getDocument();

				List<Node> resourceNodes = findNodesWithResource(doc, startingUrl);

				for (Node resourceNode : resourceNodes) {
					String xp = getXpathToRoot(resourceNode);
					xpaths.add(xp);
					logger.debug(xp);
				}

				try {
					URL methodUrl = new URL(pageUrl);
					provider.setMethod(new URL(methodUrl.getProtocol() + "://" + methodUrl.getHost()), this);
				} catch (MalformedConfigurationException e) {
					logger.error("Error updating xpath information", e);
				}

				for (String xp : xpaths) {
					String resourceUrl = getResourceUrl(xp, doc, startingUrl);
					if (resourceUrl != null) {
						logger.debug(resourceUrl);
						ret.add(resourceUrl);
					}
				}
			} else {
				// this is for the rest of the pages of the repo
				DOMParser parser = new DOMParser();
				parser.parse(startingUrl.toString());
				Document doc = parser.getDocument();

				for (String xp : xpaths) {
					String resourceUrl = getResourceUrl(xp, doc, startingUrl);
					if (resourceUrl != null) {
						logger.debug(resourceUrl);
						ret.add(resourceUrl);
					}
				}
			}
		}

		if (ret.size() == 0 && this.fallback) {
			// if no xpath contained the resource, try to find it and add
			// all the xpaths
			for (String url : urls) {
				logger.debug("looking for resource in (not found in xpath): " + url);

				try {
					url = Toolkit.makeAbsolute(url, new URL(pageUrl));
				} catch (Exception e) {
					e.printStackTrace();
					continue;
				}
				URL startingUrl = new URL(url);

				if (!this.ignoreRobotsTxt)
					if (!this.directives.allows(Toolkit.makeRelative(startingUrl))) {
						logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
						continue;
					}

				DOMParser parser = new DOMParser();
				parser.parse(startingUrl.toString());
				Document doc = parser.getDocument();
				List<Node> resourceNodes = findNodesWithResource(doc, startingUrl);
				for (Node resourceNode : resourceNodes) {
					String xp = getXpathToRoot(resourceNode);
					xpaths.add(xp);
					logger.debug(xp);
				}

				try {
					URL methodUrl = new URL(pageUrl);
					provider.setMethod(new URL(methodUrl.getProtocol() + "://" + methodUrl.getHost()), this);
				} catch (MalformedConfigurationException e) {
					logger.error("Error updating xpath information", e);
				}

				for (String xp : xpaths) {
					String resourceUrl = getResourceUrl(xp, doc, startingUrl);
					if (resourceUrl != null) {
						logger.debug(resourceUrl);
						ret.add(resourceUrl);
					}
				}
			}
		}

		return ret;
	}

	private List<String> resolveFrames(Document doc, URL connectionUrl) {
		List<String> ret = new ArrayList<String>();

		DocumentTraversal traversal = (DocumentTraversal) doc;

		NodeIterator iterator = null;
		try {
			iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			e.printStackTrace();
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (n.getNodeName().equals("FRAME") || n.getNodeName().equals("IFRAME")) {
				String url = n.getAttributes().getNamedItem("src").getNodeValue();
				try {
					url = Toolkit.makeAbsolute(url, connectionUrl);
					ret.add(url);
				} catch (MalformedURLException ex) {
					continue;
				}
			}
		}
		return ret;
	}

	private List<String> oneDepthDown(Document doc, URL connectionUrl) throws IOException {
		List<String> ret = new ArrayList<String>();

		DocumentTraversal traversal = (DocumentTraversal) doc;

		NodeIterator iterator = null;
		try {
			iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			e.printStackTrace();
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (n.getNodeName().equals("A")) {
				String url = n.getAttributes().getNamedItem("href").getNodeValue();
				try {
					url = Toolkit.makeAbsolute(url, connectionUrl);
					if (Toolkit.getMimeType(url, this.sleepMillis).trim().contains("text/html"))
						ret.add(url);
				} catch (MalformedURLException ex) {
					continue;
				}
			}
		}
		return ret;
	}

	private String getXpathToRoot(Node node) {
		String xpath = "";
		do {
			if (node.getNodeName().equals("HTML")) {
				int before = 1;
				while ((node = node.getPreviousSibling()) != null)
					before++;
				return "/HTML[" + before + "]" + xpath;
			}
			int before = 0;
			Node current = node;
			while ((current = current.getPreviousSibling()) != null)
				if (current.getNodeName().equals(node.getNodeName()))
					before++;
			xpath = "/" + node.getNodeName() + "[" + (before + 1) + "]" + xpath;
		} while ((node = node.getParentNode()) != null);
		return xpath;
	}

	private List<Node> findNodesWithResource(Document doc, URL connectionUrl) throws IOException {
		List<Node> ret = new ArrayList<Node>();

		DocumentTraversal traversal = (DocumentTraversal) doc;

		NodeIterator iterator = null;
		try {
			iterator = traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			e.printStackTrace();
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (n.getNodeName().equals("A")) {
				String url = null;
				try {
					url = n.getAttributes().getNamedItem("href").getNodeValue();
				} catch (NullPointerException e) {
					// anchor without href
					continue;
				}
				if (url == null)
					continue;
				try {
					url = Toolkit.makeAbsolute(url, connectionUrl);
					if (this.mimeTypes.contains(Toolkit.getMimeType(url, this.sleepMillis).trim()))
						ret.add(n);
				} catch (MalformedURLException ex) {
					continue;
				}
			}
		}
		return ret;
	}

	private String getResourceUrl(String xpath, Document doc, URL url) throws MalformedURLException {
		try {
			Node current = doc.getFirstChild();
			String[] elements = xpath.split("/");
			for (String element : elements) {
				if (element.trim().equals(""))
					continue;
				int position = Integer.parseInt(element.substring(element.indexOf('[')).replaceAll("\\[", "").replaceAll("\\]", ""));
				String name = element.substring(0, element.indexOf('['));
				int found = 0;
				do {
					if (current.getNodeName().equals(name)) {
						found++;
						if (found == position) {
							current = current.getFirstChild();
							break;
						}
					}
				} while ((current = current.getNextSibling()) != null);

			}
			String ret = current.getParentNode().getAttributes().getNamedItem("href").getNodeValue();
			return Toolkit.makeAbsolute(ret, url);
		} catch (Exception e) {
			return null;
		}
	}

	private Object readResolve() throws IOException {
		if (robotstxtUrl != null) {
			URL url = new URL(robotstxtUrl);
			BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
			this.robot = new Robotstxt(in);
			this.directives = this.robot.getDirectivesFor(agentName);
		} else {
			ignoreRobotsTxt = true;
		}

		return this;
	}

	public boolean isResolveFrames() {
		return resolveFrames;
	}

	public void setResolveFrames(boolean resolveFrames) {
		this.resolveFrames = resolveFrames;
	}

	public boolean isSkipFirstPage() {
		return skipFirstPage;
	}

	public void setSkipFirstPage(boolean skipFirstPage) {
		this.skipFirstPage = skipFirstPage;
	}

	public long getSleepMillis() {
		return sleepMillis;
	}

	public void setSleepMillis(long sleepMillis) {
		this.sleepMillis = sleepMillis;
	}

	public List<String> getMimeTypes() {
		return mimeTypes;
	}

	public void setMimeTypes(List<String> mimeTypes) {
		this.mimeTypes = mimeTypes;
	}

	public List<String> getXpaths() {
		return xpaths;
	}

	public void setXpaths(List<String> xpaths) {
		this.xpaths = xpaths;
	}

	public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
		this.ignoreRobotsTxt = ignoreRobotsTxt;
	}

	public boolean isIgnoreRobotsTxt() {
		return ignoreRobotsTxt;
	}

	public void setAgentName(String agentName) {
		this.agentName = agentName;
		// guard: robot is null when no robots.txt was loaded, which would NPE here
		if (this.robot != null)
			this.directives = this.robot.getDirectivesFor(agentName);
	}

	public String getAgentName() {
		return agentName;
	}

	public void setFallback(boolean fallback) {
		this.fallback = fallback;
	}

	public boolean isFallback() {
		return fallback;
	}

}
```
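For orientation, here is a minimal usage sketch of `XPathAndCrawl` together with the file-backed `MethodProvider`, mirroring the unit test above; the repository URLs are illustrative placeholders:

```java
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.MethodProviderFileStorageImpl;
import gr.uoa.di.resourcediscovery.methods.XPathAndCrawl;

import java.net.URL;
import java.util.Arrays;
import java.util.List;

public class XPathAndCrawlSketch {

	public static void main(String[] args) throws Exception {
		// the provider persists the xpaths learned per repository host
		MethodProvider provider = new MethodProviderFileStorageImpl("/tmp/method-map.xml");

		// one instance per repository; a non-null robots.txt URL enables robots.txt checking
		List<String> mimeTypes = Arrays.asList("application/pdf");
		XPathAndCrawl method = new XPathAndCrawl(mimeTypes, "http://repository.example.org/robots.txt");
		method.setSleepMillis(500); // pause between HTTP requests, to be polite

		// the first page crawled learns the xpaths (and stores them via the provider);
		// subsequent pages of the same repository reuse them
		List<String> resources = method.getResources(new URL("http://repository.example.org/record/1"), provider);
		for (String resource : resources)
			System.out.println(resource);
	}
}
```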
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/ResourceDiscoveryMethod.java

```java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MethodProvider;

import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.xml.sax.SAXException;

public interface ResourceDiscoveryMethod {

	public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException;
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/methods/URLTransformation.java

```java
package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MethodProvider;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class URLTransformation implements ResourceDiscoveryMethod {

	private String regex = null, replacement = "";
	private String addToEnd = "";

	@Override
	public List<String> getResources(URL upageUrl, MethodProvider provider) {
		String pageUrl = upageUrl.toString();
		String trsf = pageUrl;
		if (regex != null && !regex.trim().equals(""))
			trsf = pageUrl.replaceAll(regex, replacement);

		trsf = trsf + addToEnd;

		List<String> ret = new ArrayList<String>();
		ret.add(trsf);

		return ret;
	}

	public String getRegex() {
		return regex;
	}

	public void setRegex(String regex) {
		this.regex = regex;
	}

	public String getAddToEnd() {
		return addToEnd;
	}

	public void setAddToEnd(String addToEnd) {
		this.addToEnd = addToEnd;
	}

	public String getReplacement() {
		return replacement;
	}

	public void setReplacement(String replacement) {
		this.replacement = replacement;
	}

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/Toolkit.java

```java
package gr.uoa.di.resourcediscovery;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class Toolkit {

	private static final Log logger = LogFactory.getLog(Toolkit.class);
	static int timeout = 10000;

	static public String makeAbsolute(String url, URL connectionUrl) throws MalformedURLException {
		return new URL(connectionUrl, url).toString();
	}

	static public String makeRelative(URL connectionUrl) throws MalformedURLException {
		return connectionUrl.getPath();
	}

	static public String getRedirectedUrl(String resourceURL, long sleepMillis) throws IOException, MalformedURLException {
		URL url = null;

		try {
			url = new URL(resourceURL);
		} catch (MalformedURLException mue) {
			logger.error("Error opening first url", mue);
			throw mue;
		}

		HttpURLConnection.setFollowRedirects(false);

		HttpURLConnection conn = null;
		try {
			Thread.sleep(sleepMillis);
			conn = (HttpURLConnection) url.openConnection();
			conn.setConnectTimeout(timeout);
			conn.setReadTimeout(timeout);
			conn.setAllowUserInteraction(false);
			conn.setDoOutput(true);
		} catch (ClassCastException ex) {
			throw new MalformedURLException();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}

		conn.setRequestMethod("HEAD");

		try {
			conn = openConnectionCheckRedirects(conn, sleepMillis);
		} catch (Exception ex) {
			throw new MalformedURLException();
		}

		try {
			Thread.sleep(sleepMillis);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		int statusCode = conn.getResponseCode();
		if (statusCode == 503) {
			logger.error("Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
			conn.disconnect();

			throw new IOException("Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
		} else if (conn.getResponseCode() >= 400) {
			// Client or server error received
			logger.error("Url " + conn.getURL() + " seems to be unreachable (response code:" + statusCode + "). If this url is not of importance you can ignore this error.");
			conn.disconnect();

			throw new IOException("Url " + conn.getURL() + " seems to be unreachable (response code:" + statusCode + "). If this url is not of importance you can ignore this error.");
		} else {
			return conn.getURL().toString();
		}
	}

	static public String getMimeType(String resourceURL, long sleepMillis) throws IOException, MalformedURLException {
		URL url = null;

		try {
			url = new URL(resourceURL);
		} catch (MalformedURLException mue) {
			logger.debug("Error getting mime type" + mue);
			throw mue;
		}

		HttpURLConnection.setFollowRedirects(false);

		HttpURLConnection conn = null;
		try {
			Thread.sleep(sleepMillis);
			conn = (HttpURLConnection) url.openConnection();
			conn.setConnectTimeout(timeout);
			conn.setReadTimeout(timeout);
			conn.setAllowUserInteraction(false);
			conn.setDoOutput(true);
		} catch (ClassCastException ex) {
			throw new MalformedURLException();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}

		conn.setRequestMethod("HEAD");

		try {
			conn = openConnectionCheckRedirects(conn, sleepMillis);
		} catch (Exception ex) {
			throw new MalformedURLException();
		}

		try {
			Thread.sleep(sleepMillis);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		int statusCode = conn.getResponseCode();
		if (statusCode == 503) {
			logger.error("WARNING: Url " + conn.getURL() + " reported status code 503. Please increase the crawler's sleep time.");
			conn.disconnect();

			return "unknown";
		} else if (conn.getResponseCode() >= 400) {
			// Client or server error received
			logger.error("WARNING: Url " + conn.getURL() + " seems to be unreachable (response code:" + statusCode + "). If this url is not of importance you can ignore this error.");
			conn.disconnect();

			return "unknown";
		} else {
			String mimeType = conn.getContentType();

			logger.debug("mime type for " + conn.getURL() + ": " + mimeType);
			logger.debug("response code was: " + statusCode);
			conn.disconnect();
			if (mimeType == null)
				mimeType = "unknown";
			return mimeType.replaceAll(";.*", "").trim();
		}
	}

	static public HttpURLConnection openConnectionCheckRedirects(URLConnection c, long sleepMillis) throws IOException {
		boolean redir;
		int redirects = 0;

		do {
			redir = false;
			if (c instanceof HttpURLConnection) {
				HttpURLConnection http = (HttpURLConnection) c;
				try {
					Thread.sleep(sleepMillis);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
				int stat = http.getResponseCode();

				if (stat >= 300 && stat <= 307 && stat != 306 && stat != HttpURLConnection.HTTP_NOT_MODIFIED) {
					URL base = http.getURL();
					String loc = http.getHeaderField("Location");
					URL target = null;
					if (loc != null) {
						target = new URL(base, loc);
					}
					http.disconnect();
					// Redirection should be allowed only for HTTP and HTTPS
					// and should be limited to 5 redirections at most.
					if (target == null || !(target.getProtocol().equals("http") || target.getProtocol().equals("https")) || redirects >= 5) {
						throw new IOException("Redirection should be allowed only for HTTP and HTTPS and should be limited to 5 redirections at most.");
					}
					redir = true;
					try {
						Thread.sleep(sleepMillis);
					} catch (InterruptedException e) {
						e.printStackTrace();
					}
					c = target.openConnection();
					c.setConnectTimeout(timeout);
					c.setReadTimeout(timeout);
					c.setAllowUserInteraction(false);
					c.setDoOutput(true);
					redirects++;
				}
			}
		} while (redir);

		return (HttpURLConnection) c;
	}
}
```
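A short sketch of how the `Toolkit` helpers combine (the URL is an illustrative placeholder): both calls issue HEAD requests, follow at most five HTTP/HTTPS redirects via `openConnectionCheckRedirects`, and sleep for `sleepMillis` between requests to throttle the crawl:

```java
import gr.uoa.di.resourcediscovery.Toolkit;

public class ToolkitSketch {

	public static void main(String[] args) throws Exception {
		long sleepMillis = 500;
		// resolve redirections first, then ask the final target for its content type
		String resolved = Toolkit.getRedirectedUrl("http://repository.example.org/record/1", sleepMillis);
		String mimeType = Toolkit.getMimeType(resolved, sleepMillis); // e.g. "application/pdf", or "unknown"
		System.out.println(resolved + " -> " + mimeType);
	}
}
```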
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/UnknownMethodException.java

```java
package gr.uoa.di.resourcediscovery;

public class UnknownMethodException extends Exception {
	private static final long serialVersionUID = 760327436365242998L;

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MethodProviderFileStorageImpl.java

```java
package gr.uoa.di.resourcediscovery;

import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;

import com.thoughtworks.xstream.XStream;

public class MethodProviderFileStorageImpl implements MethodProvider {

	private String pathToFile = null;

	HashMap<URL, ResourceDiscoveryMethod> map = new HashMap<URL, ResourceDiscoveryMethod>();

	public MethodProviderFileStorageImpl() {

	}

	@SuppressWarnings("unchecked")
	public MethodProviderFileStorageImpl(String pathToFile) throws FileNotFoundException {
		XStream xstream = new XStream();
		if (!(new File(pathToFile).exists()))
			map = new HashMap<URL, ResourceDiscoveryMethod>();
		else
			map = (HashMap<URL, ResourceDiscoveryMethod>) xstream.fromXML(new FileReader(new File(pathToFile)));
		this.pathToFile = pathToFile;
	}

	@Override
	public ResourceDiscoveryMethod getMethod(URL baseUrl) throws MalformedConfigurationException, UnknownMethodException, IOException {
		baseUrl = new URL(Toolkit.getRedirectedUrl(baseUrl.toString(), 500));
		ResourceDiscoveryMethod ret = map.get(new URL(baseUrl.getProtocol() + "://" + baseUrl.getHost()));
		return ret;
	}

	@Override
	public void setMethod(URL baseUrl, ResourceDiscoveryMethod method) {
		map.put(baseUrl, method);
		try {
			store();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public String getPathToFile() {
		return pathToFile;
	}

	public void setPathToFile(String pathToFile) {
		this.pathToFile = pathToFile;
	}

	public void store() throws IOException {
		XStream xstream = new XStream();
		xstream.toXML(map, new FileWriter(new File(pathToFile)));
	}

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MethodProvider.java

```java
package gr.uoa.di.resourcediscovery;

import gr.uoa.di.resourcediscovery.methods.ResourceDiscoveryMethod;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

public interface MethodProvider {

	public ResourceDiscoveryMethod getMethod(URL baseUrl) throws MalformedConfigurationException, UnknownMethodException, MalformedURLException, IOException;

	public void setMethod(URL baseUrl, ResourceDiscoveryMethod method) throws MalformedConfigurationException;
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/gr/uoa/di/resourcediscovery/MalformedConfigurationException.java

```java
package gr.uoa.di.resourcediscovery;

public class MalformedConfigurationException extends Exception {

	private static final long serialVersionUID = 8557374776080985539L;

}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/Crawler.java

```java
package eu.dnetlib.data.utility.resource_discovery.crawler;

import java.io.IOException;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Vector;

import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.dnetlib.data.utility.resource_discovery.crawler.config.Configs;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

public class Crawler {
	private static final Log logger = LogFactory.getLog(Crawler.class);
	private WebRobot crawler;

	public Crawler() throws IOException, HttpException {
		crawler = new WebRobot();
		Configs.configureCrawler(crawler);
	}

	public Crawler(boolean isValidator) throws IOException, HttpException {
		crawler = new WebRobot();
		if (isValidator)
			Configs.configureCrawlerForValidation(crawler);
		else
			Configs.configureCrawler(crawler);
	}

	public void reconfigureForRetry() {
		crawler.setMaxDepth(2);
	}

	public Vector<String> getLinks(String url) throws MalformedURLException, IOException, InterruptedException {
		logger.debug("Retrieving links from url " + url);
		crawler.setStartURL(new URL(UrlFilter.resolveRedirections(url)));
		StringWriter sw = new StringWriter();
		URLLogger log = new URLLogger(sw);
		crawler.setDocManager(log);

		crawler.run();

		String[] links = sw.getBuffer().toString().split("\n");
		Vector<String> linksV = new Vector<String>();
		for (int i = 0; i < links.length; i++)
			linksV.add(links[i]);
		return linksV;
	}

	/*public Vector<String> getLinksFaster(String url) throws ParserException, IOException, InterruptedException {
		Thread.sleep(Configs.sleepTime);
		HttpURLConnection.setFollowRedirects(true);
		URL URL = new URL(url);
		HttpURLConnection conn = (HttpURLConnection) URL.openConnection();
		Parser parser = new Parser(conn);

		NodeList list = parser.parse(new TagNameFilter("A"));
		Vector<String> links = new Vector<String>();
		for(int i=0; i<list.size(); i++) {
			LinkTag n = (LinkTag) list.elementAt(i);
			links.add(n.extractLink());
		}

		return links;
	}*/

	/**
	 * Only for testing purposes, not supposed to be called
	 */
	public static void main(String[] args) {
		Crawler c;
		try {
			c = new Crawler();
			System.out.println(c.crawler.getAllowWholeHost() + " " + c.crawler.getAllowWholeDomain());
		} catch (Exception e) {
			System.err.println("FATAL ERROR: Crawler could not be configured. Please check your robot.xml parameters and try again.");
			System.err.println(e.getLocalizedMessage());
			e.printStackTrace();
			return;
		}
		String url = "http://www.di.uoa.gr/gr";
		try {
			System.out.println(c.getLinks(url));
		} catch (Exception e) {
			System.err.println("ERROR: Crawler could not retrieve links from url " + url);
			System.err.println(e.getLocalizedMessage());
			e.printStackTrace();
		}
	}

	public WebRobot getCrawler() {
		return this.crawler;
	}
}
```
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/ResourceExtractor.java

```java
package eu.dnetlib.data.utility.resource_discovery.crawler;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.IOException;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class ResourceExtractor {
	private static final Log logger = LogFactory.getLog(ResourceExtractor.class);
	private Vector<String> filter;
	private Vector<String> latest;
	private int runned;

	public ResourceExtractor() {
		runned = 0;
		filter = new Vector<String>();
		latest = new Vector<String>();
	}

	public Vector<String> extractResource(Vector<String> urls) throws IOException, InterruptedException {
		logger.debug("Extracting resources from links " + urls);
		runned++;
		Vector<String> ret = new Vector<String>();
		if (runned == 1) {
			filter.addAll(urls);
			for (String url : urls) {
				if (UrlFilter.checkExtension(url) || UrlFilter.checkMimeType(url))
					ret.add(url);
			}
			return ret;
		}
		for (String url : urls) {
			if (!latest.contains(url) && !filter.contains(url) && (UrlFilter.checkExtension(url) || UrlFilter.checkMimeType(url)))
				ret.add(url);
		}
		latest.clear();
		latest.addAll(ret);
		return ret;
	}

}
```
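The `runned` counter makes `extractResource` stateful in a way that is easy to miss: the first call permanently remembers every link of the first page as a baseline filter, and each later call returns only matching links that appeared neither on the first page nor in the previous result. A sketch of the intended call pattern (page URLs are illustrative):

```java
import java.util.Vector;

import eu.dnetlib.data.utility.resource_discovery.crawler.Crawler;
import eu.dnetlib.data.utility.resource_discovery.crawler.ResourceExtractor;

public class ResourceExtractorSketch {

	public static void main(String[] args) throws Exception {
		Crawler crawler = new Crawler();
		ResourceExtractor extractor = new ResourceExtractor();

		// first call: seeds the baseline filter and returns the first page's matching links
		Vector<String> first = extractor.extractResource(crawler.getLinks("http://repository.example.org/page1"));
		// later calls: only links not seen on the first page or in the previous result
		Vector<String> next = extractor.extractResource(crawler.getLinks("http://repository.example.org/page2"));

		System.out.println(first);
		System.out.println(next);
	}
}
```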
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/crawler/config/Configs.java

```java
package eu.dnetlib.data.utility.resource_discovery.crawler.config;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.Collection;
import java.util.Vector;

import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

public class Configs {
	private static final Log logger = LogFactory.getLog(Configs.class);
	static public String agentName = "JoBo";
	static public boolean ignoreRobotsTxt = false;
	static public int sleepTime = 3000;
	static public int maxDepth = 1;

	static public boolean walkToOtherHosts = false;
	static public boolean allowWholeHost = false;
	static public boolean allowWholeDomain = false;
	static public boolean flexibleHostCheck = true;
	static public boolean localizeLinks = false;
	static public boolean enableCookies = false;

	static public String startReferer = null;
	static public int maxDocumentAge = -1;
	static public String[] allowedUrl = null;
	static public String[] visitMany = null;
	static public String proxy = null;
	static public int bandwidth = -1;

	private static String readXMLDoc(String filename) throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(Configs.class.getResourceAsStream(filename)));
		String strLine = null;
		StringBuilder builder = new StringBuilder();

		try {
			while ((strLine = br.readLine()) != null)
				builder.append(strLine);

		} finally {
			br.close();
		}

		return builder.toString();
	}

	static {
		try {
			// XMLDoc xd = new XMLDoc("configs/robot.xml", true, true, true);
			logger.debug("Reading configuration file for crawler");
			XMLDoc xd = new XMLDoc();
			//xd.useXmlString(readXMLDoc("/eu/dnetlib/functionality/validator/robot.xml"), true, true, true);
			xd.useXmlString(readXMLDoc("/eu/dnetlib/data/utility/resource_discovery/robot.xml"), true, true, true);

			String[] ret1 = xd.getXmlFields(0, 1, "AgentName");
			String[] ret2 = xd.getXmlFields(0, 1, "IgnoreRobotsTxt");
			String[] ret3 = xd.getXmlFields(0, 1, "SleepTime");
			String[] ret4 = xd.getXmlFields(0, 1, "MaxDepth");
			String[] ret5 = xd.getXmlFields(0, 1, "WalkToOtherHosts");
			String[] ret6 = xd.getXmlFields(0, 1, "AllowWholeHost");
			String[] ret7 = xd.getXmlFields(0, 1, "AllowWholeDomain");
			String[] ret8 = xd.getXmlFields(0, 1, "FlexibleHostCheck");
			String[] ret9 = xd.getXmlFields(0, 1, "LocalizeLinks");
			String[] ret10 = xd.getXmlFields(0, 1, "EnableCookies");
			String[] ret11 = xd.getXmlFields(0, 1, "StartReferer");
			String[] ret12 = xd.getXmlFields(0, 1, "MaxDocumentAge");
			String[] ret13 = xd.getXmlFields(0, 0, "AllowedUrl");
			String[] ret14 = xd.getXmlFields(0, 0, "VisitMany");
			String[] ret15 = xd.getXmlFields(0, 1, "Proxy");
			String[] ret16 = xd.getXmlFields(0, 1, "Bandwidth");
			if (ret1.length > 0)
				agentName = ret1[0];
			if (ret2.length > 0)
				ignoreRobotsTxt = Boolean.parseBoolean(ret2[0]);
			if (ret3.length > 0)
				sleepTime = Integer.parseInt(ret3[0]) * 1000;
			if (ret4.length > 0)
				maxDepth = Integer.parseInt(ret4[0]);
			if (ret5.length > 0)
				walkToOtherHosts = Boolean.parseBoolean(ret5[0]);
			if (ret6.length > 0)
				allowWholeHost = Boolean.parseBoolean(ret6[0]);
			if (ret7.length > 0)
				allowWholeDomain = Boolean.parseBoolean(ret7[0]);
			if (ret8.length > 0)
				flexibleHostCheck = Boolean.parseBoolean(ret8[0]);
			if (ret9.length > 0)
				localizeLinks = Boolean.parseBoolean(ret9[0]);
			if (ret10.length > 0)
				enableCookies = Boolean.parseBoolean(ret10[0]);
			if (ret11.length > 0)
				startReferer = ret11[0];
			if (ret12.length > 0)
				maxDocumentAge = Integer.parseInt(ret12[0]);
			if (ret13.length > 0)
				allowedUrl = ret13;
			if (ret14.length > 0)
				visitMany = ret14;
			if (ret15.length > 0)
				proxy = ret15[0];
			if (ret16.length > 0)
				bandwidth = Integer.parseInt(ret16[0]);
		} catch (IOException e) {
			logger.debug("Error reading robot.xml", e);
		} catch (XMLException e) {
			logger.debug("WARNING: The file robot.xml seems to be malformed. The default settings will be used for the crawler.", e);
		} catch (NumberFormatException e) {
			logger.debug("WARNING: The file robot.xml seems to be malformed (an integer doesn't seem to be of type integer). The default settings will be used for the crawler.", e);
		} catch (Exception e) {
			logger.error("Error configuring", e);
		}
	}

	public static void configureCrawlerForValidation(WebRobot crawler) throws IOException {
		logger.debug("Configuring crawler for validation");
		crawler.setAgentName("Validator");
		crawler.setIgnoreRobotsTxt(false);
		crawler.setSleepTime(1);
		crawler.setMaxDepth(1);
		crawler.setWalkToOtherHosts(false);
		crawler.setAllowWholeHost(true);
		crawler.setAllowWholeDomain(true);
		crawler.setFlexibleHostCheck(true);
		crawler.setEnableCookies(true);

		DownloadRuleSet rules = new DownloadRuleSet();
		int minSize = 1, maxSize = 104857600;
		rules.addRule("text", "html", minSize, maxSize, true);
		Collection<String> mimeTypes = UrlFilter.getRequestedMimeTypes();
		for (String mimeType : mimeTypes) {
			String[] parts = mimeType.split("/");
			if (parts.length < 2) {
				logger.debug("WARNING: Requested mimetype " + mimeType + " seems to be malformed");
				throw new IOException();
			}
			rules.addRule(parts[0], parts[1], minSize, maxSize, true);
		}
		rules.addRule("*", "*", minSize, maxSize, false);
		crawler.setDownloadRuleSet(rules);

	}

	public static void configureCrawler(WebRobot crawler) throws IOException, HttpException {
		logger.debug("Configuring crawler using configuration file parameters");
		crawler.setAgentName(agentName);
		crawler.setIgnoreRobotsTxt(ignoreRobotsTxt);
		crawler.setSleepTime(sleepTime / 1000);
		crawler.setMaxDepth(maxDepth);
		crawler.setWalkToOtherHosts(walkToOtherHosts);
		crawler.setAllowWholeHost(allowWholeHost);
		crawler.setAllowWholeDomain(allowWholeDomain);
		crawler.setFlexibleHostCheck(flexibleHostCheck);
		crawler.setEnableCookies(enableCookies);

		if (startReferer != null)
			crawler.setStartReferer(startReferer);
		if (maxDocumentAge > 0)
			crawler.setMaxDocumentAge(maxDocumentAge);
		if (allowedUrl != null) {
			Vector<String> urls = new Vector<String>();
			for (int i = 0; i < allowedUrl.length; i++)
				urls.add(allowedUrl[i]);
			crawler.setAllowedURLs(urls);
		}
		if (visitMany != null) {
			Vector<String> urls = new Vector<String>();
			for (int i = 0; i < visitMany.length; i++)
				urls.add(visitMany[i]);
			crawler.setVisitMany(urls);
		}
		if (proxy != null)
			crawler.setProxy(proxy);
		if (bandwidth > 0)
			crawler.setBandwidth(bandwidth);

		DownloadRuleSet rules = new DownloadRuleSet();
		int minSize = 1, maxSize = 104857600;
		rules.addRule("text", "html", minSize, maxSize, true);
		Collection<String> mimeTypes = UrlFilter.getRequestedMimeTypes();
		for (String mimeType : mimeTypes) {
			String[] parts = mimeType.split("/");
			if (parts.length < 2) {
				logger.debug("WARNING: Requested mimetype " + mimeType + " seems to be malformed");
				throw new IOException();
			}
			rules.addRule(parts[0], parts[1], minSize, maxSize, true);
		}
		rules.addRule("*", "*", minSize, maxSize, false);
		crawler.setDownloadRuleSet(rules);

	}

	public static void main(String[] args) {
		WebRobot robby = new WebRobot();
		try {
			configureCrawler(robby);
			StringWriter sw = new StringWriter();
			URLLogger log = new URLLogger(sw);
			robby.setDocManager(log);

			robby.run();

			logger.debug(sw.getBuffer().toString());
		} catch (Exception e) {
			logger.debug(e.getLocalizedMessage());
		}
	}
}
```
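Configuration is loaded once, in the static initializer, from `/eu/dnetlib/data/utility/resource_discovery/robot.xml` on the classpath. The recognized fields are AgentName, IgnoreRobotsTxt, SleepTime (given in seconds, stored internally in milliseconds), MaxDepth, WalkToOtherHosts, AllowWholeHost, AllowWholeDomain, FlexibleHostCheck, LocalizeLinks, EnableCookies, StartReferer, MaxDocumentAge, AllowedUrl, VisitMany, Proxy and Bandwidth; missing fields keep the compiled-in defaults above, and a malformed robot.xml makes the loader fall back to those defaults.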
modules/dnet-resource-discovery/tags/dnet-resource-discovery-2.0.0/src/main/java/eu/dnetlib/data/utility/resource_discovery/harvester/ResourceHarvester.java

```java
package eu.dnetlib.data.utility.resource_discovery.harvester;

import eu.dnetlib.data.utility.resource_discovery.crawler.Crawler;
import eu.dnetlib.data.utility.resource_discovery.crawler.ResourceExtractor;
import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;

import java.io.IOException;
import java.util.Date;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.oai.harvester.Harvester;
import org.dlese.dpc.oai.harvester.Hexception;
import org.dlese.dpc.oai.harvester.OAIErrorException;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

public class ResourceHarvester {

	private static final Log logger = LogFactory.getLog(ResourceHarvester.class);

	static public String[][] getRecordsFromRepository(String baseUrl) throws Hexception, OAIErrorException {
		return Harvester.harvest(baseUrl, "oai_dc", null, null, null, null, true);
	}

	static public String[][] getRecordsFromRepository(String baseUrl, String set, Date from, Date until) throws Hexception, OAIErrorException {
		return Harvester.harvest(baseUrl, "oai_dc", set, from, until, null, true);
	}

	static public Vector<Vector<String>> getResourceAndLinks(String header, String oaiDcRecord, Crawler crawler, ResourceExtractor extractor) throws IOException, InterruptedException {
		Vector<Vector<String>> retrievedAndExtracted = new Vector<Vector<String>>();
		String id = getDcIdentifier(oaiDcRecord, header);
		if (id != null) {
			String idUrl = UrlFilter.resolveRedirections(id);
			Vector<String> urls;
			urls = crawler.getLinks(idUrl);
			retrievedAndExtracted.add(urls);
			retrievedAndExtracted.add(extractor.extractResource(urls));
			return retrievedAndExtracted;
		}
		return null;
	}

	static public String getIdentifier(String oaiDcRecord, String identifier) throws IOException {
		XMLDoc xd = new XMLDoc();
		try {
			xd.useXmlString(oaiDcRecord, true, true, true);
		} catch (XMLException e) {
			logger.debug("WARNING: The record " + identifier + " seems to be malformed (deleted maybe?)");
			return null;
		}
		try {
			String[] fields = xd.getXmlFields(1, 0, "dc:identifier");
			Vector<String> urls = new Vector<String>();
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			fields = xd.getXmlFields(0, 0, "dc:source");
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			fields = xd.getXmlFields(0, 0, "dc:relation");
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			if (urls.size() == 0) {
				logger.debug("WARNING: The record " + identifier + " does not seem to have a field that is a url");
				return null;
			}
			if (urls.size() == 1)
				return urls.elementAt(0);
			for (String url : urls) {
				if (UrlFilter.checkExtension(url))
					return url;
			}
			logger.debug("WARNING: The record " + identifier + " has multiple fields with valid urls and there is no way to choose one. The first one will be used");
			return urls.elementAt(0);
		} catch (XMLException e) {
			logger.debug("WARNING: The record " + identifier + " does not seem to have a dc:identifier field");
			return null;
		}
	}

	static public String getDcIdentifier(String oaiDcRecord, String identifier) throws IOException {
		XMLDoc xd = new XMLDoc();
		try {
			xd.useXmlString(oaiDcRecord, true, true, true);
		} catch (XMLException e) {
			logger.debug("WARNING: The record " + identifier + " seems to be malformed (deleted maybe?)");
			return null;
		}
		try {
			String[] fields = xd.getXmlFields(1, 0, "dc:identifier");
			Vector<String> urls = new Vector<String>();
			for (String field : fields) {
				if (UrlFilter.isUrl(field))
					urls.add(field);
			}
			if (urls.size() == 0) {
				logger.debug("WARNING: The record " + identifier + " does not seem to have a field that is a url");
				return null;
			}
			if (urls.size() == 1)
```