Project

General

Profile

« Previous | Next » 

Revision 44089

WT plugin renamed to GRIST plugin: we can import projects of different funders from the GRIST API of ePMC

View differences:

modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/projects/wt/WTProjectsIterableTest.java
1
package eu.dnetlib.data.collector.plugins.projects.wt;
2

  
3
import java.util.Iterator;
4
import java.util.List;
5
import java.util.Set;
6

  
7
import com.google.common.collect.Lists;
8
import com.google.common.collect.Sets;
9
import org.apache.commons.io.IOUtils;
10
import org.dom4j.Document;
11
import org.dom4j.DocumentException;
12
import org.dom4j.io.SAXReader;
13
import org.junit.Before;
14
import org.junit.Ignore;
15
import org.junit.Test;
16

  
17
import static org.junit.Assert.assertNotNull;
18
import static org.junit.Assert.assertTrue;
19

  
20
/**
21
 * WTProjectsIterable Tester.
22
 *
23
 * @author alessia
24
 * @version 1.0
25
 * @since <pre>Apr 22, 2016</pre>
26
 */
27
@Ignore
28
public class WTProjectsIterableTest {
29

  
30
	private String baseUrl = "http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core";
31
	private WTProjectsIterable iterable;
32
	private Iterator<String> it;
33
	private SAXReader reader;
34

  
35
	@Before
36
	public void before() throws Exception {
37
		iterable = new WTProjectsIterable(baseUrl);
38
		it = iterable.iterator();
39
		reader = new SAXReader();
40
	}
41

  
42
	/**
43
	 * Method: hasNext()
44
	 */
45
	@Test
46
	public void testHasNext() throws Exception {
47
		assertTrue(it.hasNext());
48
	}
49

  
50
	/**
51
	 * Method: next()
52
	 */
53
	@Test
54
	public void testNext() throws Exception {
55
		assertNotNull(it.next());
56
	}
57

  
58
	/**
59
	 * Method: remove()
60
	 */
61
	@Test(expected = UnsupportedOperationException.class)
62
	public void testRemove() throws Exception {
63
		it.remove();
64
	}
65

  
66
	@Test
67
	public void iterateToNextPage() {
68
		for (int maxInPage = 25; maxInPage > 0; maxInPage--) {
69
			it.next();
70
		}
71
		if (it.hasNext()) {
72
			System.out.println(it.next());
73
		}
74
	}
75

  
76
	@Test
77
	public void checkProjectIdentifiers() throws DocumentException {
78
		List<String> identifiers = Lists.newArrayList();
79
		Iterator<String> it2 = iterable.iterator();
80
		while (it2.hasNext()) {
81
			String id = parseId(it2.next());
82
			if (identifiers.contains(id)) {
83
				System.out.println("Found duplicate identifier: " + id);
84
			}
85
			identifiers.add(id);
86
		}
87
		int listSize = identifiers.size();
88
		System.out.println("List of ids has " + listSize + " identifiers");
89
		System.out.println("Now putting ids into a set to delete duplicates");
90
		Set<String> set = Sets.newHashSet(identifiers);
91
		int setSize = set.size();
92
		System.out.println("Set of ids has " + setSize + " identifiers");
93
		System.out.println();
94
	}
95

  
96
	private String parseId(String record) throws DocumentException {
97
		Document doc = reader.read(IOUtils.toInputStream(record));
98
		return doc.selectSingleNode("//Grant/Id").getText();
99
	}
100

  
101
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/projects/wt/WTCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.projects.wt;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6

  
7
/**
8
 * Plugin to collect metadata record about Wellcome Trust projects and fundings via the europePMC GRIST API.
9
 * <p>
10
 * Documentation on GRIST API: http://europepmc.org/GristAPI.
11
 * </p>
12
 * <p>
13
 * BaseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:"Wellcome Trust"&resultType=core
14
 * where resultType=core asks for the complete information (including abstracts).
15
 * The results returned by the API are XMLs.
16
 * </p>
17
 * <p>
18
 * Pagination: use parameter 'page'. When the response contains empty 'RecordList', it means we reached the end.
19
 * </p>
20
 *
21
 * @author alessia
22
 */
23
public class WTCollectorPlugin extends AbstractCollectorPlugin {
24

  
25
	@Override
26
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
27
			throws CollectorServiceException {
28
		//baseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core
29
		return new WTProjectsIterable(interfaceDescriptor.getBaseUrl());
30
	}
31

  
32
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/projects/wt/WTProjectsIterable.java
1
package eu.dnetlib.data.collector.plugins.projects.wt;
2

  
3
import java.io.IOException;
4
import java.net.MalformedURLException;
5
import java.net.URL;
6
import java.util.Iterator;
7
import java.util.List;
8
import java.util.Queue;
9
import java.util.concurrent.PriorityBlockingQueue;
10

  
11
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
12
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
13
import eu.dnetlib.enabling.resultset.SizedIterable;
14
import org.apache.commons.io.IOUtils;
15
import org.apache.commons.logging.Log;
16
import org.apache.commons.logging.LogFactory;
17
import org.dom4j.Document;
18
import org.dom4j.DocumentException;
19
import org.dom4j.Element;
20
import org.dom4j.io.SAXReader;
21

  
22
public class WTProjectsIterable implements SizedIterable<String> {
23

  
24
	private static final Log log = LogFactory.getLog(WTProjectsIterable.class); // NOPMD by marko on 11/24/08 5:02 PM
25

  
26
	private String queryURL;
27
	private int total;
28
	private SAXReader reader;
29

  
30
	public WTProjectsIterable(String baseURL) throws CollectorServiceException {
31
		queryURL = baseURL;
32
		reader = new SAXReader();
33
		total = getTotalCount();
34
	}
35

  
36
	@Override
37
	public int getNumberOfElements() {
38
		return total;
39
	}
40

  
41
	private int getTotalCount() throws CollectorServiceException {
42
		try {
43
			URL pageUrl = new URL(queryURL);
44
			log.debug("Getting hit count from: " + pageUrl.toString());
45
			String resultPage = IOUtils.toString(pageUrl);
46
			Document doc = reader.read(IOUtils.toInputStream(resultPage));
47
			String hitCount = doc.selectSingleNode("/Response/HitCount").getText();
48
			return Integer.parseInt(hitCount);
49
		} catch (NumberFormatException e) {
50
			log.warn("Cannot set the total count from '/Response/HitCount'");
51
		} catch (DocumentException e) {
52
			throw new CollectorServiceException(e);
53
		} catch (MalformedURLException e) {
54
			throw new CollectorServiceException(e);
55
		} catch (IOException e) {
56
			throw new CollectorServiceException(e);
57
		}
58
		return -1;
59
	}
60

  
61
	@Override
62
	public Iterator<String> iterator() {
63
		return new Iterator<String>() {
64

  
65
			private Queue<String> projects = new PriorityBlockingQueue<String>();
66
			private boolean morePages = true;
67
			private int pageNumber = 0;
68
			private SAXReader reader = new SAXReader();
69
			//The following is for debug only
70
			private int nextCounter = 0;
71

  
72
			@Override
73
			public boolean hasNext() {
74
				try {
75
					fillProjectListIfNeeded();
76
				} catch (CollectorServiceException e) {
77
					throw new CollectorServiceRuntimeException(e);
78
				}
79
				return !projects.isEmpty();
80
			}
81

  
82
			@Override
83
			public String next() {
84
				nextCounter++;
85
				log.debug(String.format("Calling next %s times. projects queue has %s elements", nextCounter, projects.size()));
86
				try {
87
					fillProjectListIfNeeded();
88
					return projects.poll();
89
				} catch (CollectorServiceException e) {
90
					throw new CollectorServiceRuntimeException(e);
91
				}
92
			}
93

  
94
			@Override
95
			public void remove() {
96
				throw new UnsupportedOperationException();
97
			}
98

  
99
			private boolean fillProjectListIfNeeded() throws CollectorServiceException {
100
				if (morePages && projects.isEmpty()) {
101
					String resultPage = getNextPage();
102
					Document doc = null;
103
					try {
104
						doc = reader.read(IOUtils.toInputStream(resultPage));
105
						List<Element> records = doc.selectNodes("//RecordList/Record");
106
						if (records != null && !records.isEmpty()) {
107
							for (Element p : records) {
108

  
109
								projects.add(p.asXML());
110
							}
111
							return true;
112
						} else {
113
							log.info("No more projects to read at page nr. " + pageNumber);
114
							morePages = false;
115
							return false;
116
						}
117
					} catch (DocumentException e) {
118
						throw new CollectorServiceException(e);
119
					}
120
				} else return false;
121
			}
122

  
123
			private String getNextPage() {
124
				pageNumber++;
125
				try {
126
					URL pageUrl = new URL(queryURL + "&page=" + pageNumber);
127
					log.debug("Getting page at: " + pageUrl.toString());
128
					return IOUtils.toString(pageUrl);
129
				} catch (Exception e) {
130
					throw new CollectorServiceRuntimeException("Error on page " + pageNumber, e);
131
				}
132
			}
133
		};
134
	}
135

  
136
}
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/projects/grist/GristProjectsIterableTest.java
1
package eu.dnetlib.data.collector.plugins.projects.grist;
2

  
3
import java.util.Iterator;
4
import java.util.List;
5
import java.util.Set;
6

  
7
import com.google.common.collect.Lists;
8
import com.google.common.collect.Sets;
9
import org.apache.commons.io.IOUtils;
10
import org.dom4j.Document;
11
import org.dom4j.DocumentException;
12
import org.dom4j.io.SAXReader;
13
import org.junit.Before;
14
import org.junit.Ignore;
15
import org.junit.Test;
16

  
17
import static org.junit.Assert.assertNotNull;
18
import static org.junit.Assert.assertTrue;
19

  
20
/**
21
 * GristProjectsIterable Tester.
22
 *
23
 * @author alessia
24
 * @version 1.0
25
 * @since <pre>Apr 22, 2016</pre>
26
 */
27
@Ignore
28
public class GristProjectsIterableTest {
29

  
30
	private String baseUrl = "http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core";
31
	private GristProjectsIterable iterable;
32
	private Iterator<String> it;
33
	private SAXReader reader;
34

  
35
	@Before
36
	public void before() throws Exception {
37
		iterable = new GristProjectsIterable(baseUrl);
38
		it = iterable.iterator();
39
		reader = new SAXReader();
40
	}
41

  
42
	/**
43
	 * Method: hasNext()
44
	 */
45
	@Test
46
	public void testHasNext() throws Exception {
47
		assertTrue(it.hasNext());
48
	}
49

  
50
	/**
51
	 * Method: next()
52
	 */
53
	@Test
54
	public void testNext() throws Exception {
55
		assertNotNull(it.next());
56
	}
57

  
58
	/**
59
	 * Method: remove()
60
	 */
61
	@Test(expected = UnsupportedOperationException.class)
62
	public void testRemove() throws Exception {
63
		it.remove();
64
	}
65

  
66
	@Test
67
	public void iterateToNextPage() {
68
		for (int maxInPage = 25; maxInPage > 0; maxInPage--) {
69
			it.next();
70
		}
71
		if (it.hasNext()) {
72
			System.out.println(it.next());
73
		}
74
	}
75

  
76
	@Test
77
	public void checkProjectIdentifiers() throws DocumentException {
78
		List<String> identifiers = Lists.newArrayList();
79
		Iterator<String> it2 = iterable.iterator();
80
		while (it2.hasNext()) {
81
			String id = parseId(it2.next());
82
			if (identifiers.contains(id)) {
83
				System.out.println("Found duplicate identifier: " + id);
84
			}
85
			identifiers.add(id);
86
		}
87
		int listSize = identifiers.size();
88
		System.out.println("List of ids has " + listSize + " identifiers");
89
		System.out.println("Now putting ids into a set to delete duplicates");
90
		Set<String> set = Sets.newHashSet(identifiers);
91
		int setSize = set.size();
92
		System.out.println("Set of ids has " + setSize + " identifiers");
93
		System.out.println();
94
	}
95

  
96
	private String parseId(String record) throws DocumentException {
97
		Document doc = reader.read(IOUtils.toInputStream(record));
98
		return doc.selectSingleNode("//Grant/Id").getText();
99
	}
100

  
101
}
modules/dnet-collector-plugins/trunk/src/test/resources/log4j.properties
8 8
### Application Level ###
9 9
log4j.logger.eu.dnetlib=INFO
10 10
log4j.logger.eu.dnetlib.data.collector=INFO
11
log4j.logger.eu.dnetlib.data.collector.plugins.projects.wt=DEBUG
11
log4j.logger.eu.dnetlib.data.collector.plugins.projects.grist=DEBUG
12 12

  
13 13

  
14 14

  
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/projects/grist/GristProjectsIterable.java
1
package eu.dnetlib.data.collector.plugins.projects.grist;
2

  
3
import java.io.IOException;
4
import java.net.MalformedURLException;
5
import java.net.URL;
6
import java.util.Iterator;
7
import java.util.List;
8
import java.util.Queue;
9
import java.util.concurrent.PriorityBlockingQueue;
10

  
11
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
12
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
13
import eu.dnetlib.enabling.resultset.SizedIterable;
14
import org.apache.commons.io.IOUtils;
15
import org.apache.commons.logging.Log;
16
import org.apache.commons.logging.LogFactory;
17
import org.dom4j.Document;
18
import org.dom4j.DocumentException;
19
import org.dom4j.Element;
20
import org.dom4j.io.SAXReader;
21

  
22
public class GristProjectsIterable implements SizedIterable<String> {
23

  
24
	private static final Log log = LogFactory.getLog(GristProjectsIterable.class); // NOPMD by marko on 11/24/08 5:02 PM
25

  
26
	private String queryURL;
27
	private int total;
28
	private SAXReader reader;
29

  
30
	public GristProjectsIterable(String baseURL) throws CollectorServiceException {
31
		queryURL = baseURL;
32
		reader = new SAXReader();
33
		total = getTotalCount();
34
	}
35

  
36
	@Override
37
	public int getNumberOfElements() {
38
		return total;
39
	}
40

  
41
	private int getTotalCount() throws CollectorServiceException {
42
		try {
43
			URL pageUrl = new URL(queryURL);
44
			log.debug("Getting hit count from: " + pageUrl.toString());
45
			String resultPage = IOUtils.toString(pageUrl);
46
			Document doc = reader.read(IOUtils.toInputStream(resultPage));
47
			String hitCount = doc.selectSingleNode("/Response/HitCount").getText();
48
			return Integer.parseInt(hitCount);
49
		} catch (NumberFormatException e) {
50
			log.warn("Cannot set the total count from '/Response/HitCount'");
51
		} catch (DocumentException e) {
52
			throw new CollectorServiceException(e);
53
		} catch (MalformedURLException e) {
54
			throw new CollectorServiceException(e);
55
		} catch (IOException e) {
56
			throw new CollectorServiceException(e);
57
		}
58
		return -1;
59
	}
60

  
61
	@Override
62
	public Iterator<String> iterator() {
63
		return new Iterator<String>() {
64

  
65
			private Queue<String> projects = new PriorityBlockingQueue<String>();
66
			private boolean morePages = true;
67
			private int pageNumber = 0;
68
			private SAXReader reader = new SAXReader();
69
			//The following is for debug only
70
			private int nextCounter = 0;
71

  
72
			@Override
73
			public boolean hasNext() {
74
				try {
75
					fillProjectListIfNeeded();
76
				} catch (CollectorServiceException e) {
77
					throw new CollectorServiceRuntimeException(e);
78
				}
79
				return !projects.isEmpty();
80
			}
81

  
82
			@Override
83
			public String next() {
84
				nextCounter++;
85
				log.debug(String.format("Calling next %s times. projects queue has %s elements", nextCounter, projects.size()));
86
				try {
87
					fillProjectListIfNeeded();
88
					return projects.poll();
89
				} catch (CollectorServiceException e) {
90
					throw new CollectorServiceRuntimeException(e);
91
				}
92
			}
93

  
94
			@Override
95
			public void remove() {
96
				throw new UnsupportedOperationException();
97
			}
98

  
99
			private boolean fillProjectListIfNeeded() throws CollectorServiceException {
100
				if (morePages && projects.isEmpty()) {
101
					String resultPage = getNextPage();
102
					Document doc = null;
103
					try {
104
						doc = reader.read(IOUtils.toInputStream(resultPage));
105
						List<Element> records = doc.selectNodes("//RecordList/Record");
106
						if (records != null && !records.isEmpty()) {
107
							for (Element p : records) {
108

  
109
								projects.add(p.asXML());
110
							}
111
							return true;
112
						} else {
113
							log.info("No more projects to read at page nr. " + pageNumber);
114
							morePages = false;
115
							return false;
116
						}
117
					} catch (DocumentException e) {
118
						throw new CollectorServiceException(e);
119
					}
120
				} else return false;
121
			}
122

  
123
			private String getNextPage() {
124
				pageNumber++;
125
				try {
126
					URL pageUrl = new URL(queryURL + "&page=" + pageNumber);
127
					log.debug("Getting page at: " + pageUrl.toString());
128
					return IOUtils.toString(pageUrl);
129
				} catch (Exception e) {
130
					throw new CollectorServiceRuntimeException("Error on page " + pageNumber, e);
131
				}
132
			}
133
		};
134
	}
135

  
136
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/projects/grist/GristCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.projects.grist;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6

  
7
/**
8
 * Plugin to collect metadata record about projects and fundings via the europePMC GRIST API (e.g. WT projects).
9
 * <p>
10
 * Documentation on GRIST API: http://europepmc.org/GristAPI.
11
 * </p>
12
 * <p>
13
 * BaseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:"Wellcome Trust"&resultType=core
14
 * where resultType=core asks for the complete information (including abstracts).
15
 * The results returned by the API are XMLs.
16
 * </p>
17
 * <p>
18
 * Pagination: use parameter 'page'. When the response contains empty 'RecordList', it means we reached the end.
19
 * </p>
20
 *
21
 * @author alessia
22
 */
23
public class GristCollectorPlugin extends AbstractCollectorPlugin {
24

  
25
	@Override
26
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
27
			throws CollectorServiceException {
28
		//baseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core
29
		return new GristProjectsIterable(interfaceDescriptor.getBaseUrl());
30
	}
31

  
32
}
modules/dnet-collector-plugins/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml
20 20
		</property>
21 21
	</bean>
22 22

  
23
	<bean id="wtPlugin" class="eu.dnetlib.data.collector.plugins.projects.wt.WTCollectorPlugin">
23
	<bean id="gristPlugin" class="eu.dnetlib.data.collector.plugins.projects.grist.GristCollectorPlugin">
24 24
		<property name="protocolDescriptor">
25
			<bean class="eu.dnetlib.data.collector.rmi.ProtocolDescriptor" p:name="wtProjects"/>
25
			<bean class="eu.dnetlib.data.collector.rmi.ProtocolDescriptor" p:name="gristProjects"/>
26 26
		</property>
27 27
	</bean>
28 28
</beans>

Also available in: Unified diff