Project

General

Profile

« Previous | Next » 

Revision 63300

first implementation

View differences:

modules/dnet-collector-plugins/branches/gtr2_michele/src/test/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2Test.java
1
package eu.dnetlib.data.collector.plugins.projects.gtr2;
2

  
3
import static org.junit.Assert.assertEquals;
4
import static org.junit.Assert.assertNotNull;
5

  
6
import java.util.Iterator;
7

  
8
import org.junit.Before;
9
import org.junit.Ignore;
10
import org.junit.Test;
11

  
12
import com.ximpleware.VTDGen;
13

  
14
import eu.dnetlib.data.collector.plugins.HttpConnector;
15
import eu.dnetlib.miscutils.functional.xml.TryIndentXmlString;
16

  
17
@Ignore
18
public class Gtr2Test {
19

  
20
	private final String baseURL = "https://gtr.ukri.org/gtr/api";
21
	private Gtr2Helper helper;
22
	private Gtr2ProjectsIterator iterator;
23
	private HttpConnector connector;
24

  
25
	@Before
26
	public void prepare() {
27
		helper = new Gtr2Helper();
28
		// System.setProperty("jsse.enableSNIExtension","false");
29
	}
30

  
31
	@Test
32
	public void testOne() throws Exception {
33
		System.out.println("one project");
34
		final String url = "http://gtr.ukri.org/gtr/api/projects/0AE039A7-9A84-4943-AA36-001DB5763245";
35
		final VTDGen vg_tmp = new VTDGen();
36
		connector = new HttpConnector();
37
		final String tmp = connector.getInputSource(url);
38
		final byte[] bytes = tmp.getBytes("UTF-8");
39
		vg_tmp.setDoc(bytes);
40
		vg_tmp.parse(false);
41
		final String s = helper.processProject(vg_tmp.getNav(), "xmlns:ns=\"http:///afgshs\"", url);
42
		System.out.println(s);
43
	}
44

  
45
	@Test
46
	public void testPaging() throws Exception {
47
		iterator = new Gtr2ProjectsIterator(baseURL, null, 2, 2);
48
		final TryIndentXmlString indenter = new TryIndentXmlString();
49

  
50
		while (iterator.hasNext()) {
51
			Thread.sleep(300);
52
			final String res = iterator.next();
53
			assertNotNull(res);
54
			indenter.evaluate(res);
55
			System.out.println(res);
56
		}
57
	}
58

  
59
	@Test
60
	public void testOnePage() throws Exception {
61
		iterator = new Gtr2ProjectsIterator(baseURL, null, 12, 12);
62
		final int count = iterateAndCount(iterator);
63
		assertEquals(21, count);
64
	}
65

  
66
	@Test
67
	public void testIncrementalHarvestingNoRecords() throws Exception {
68
		System.out.println("incremental Harvesting");
69
		iterator = new Gtr2ProjectsIterator(baseURL, "2050-12-12", 11, 13);
70
		final int count = iterateAndCount(iterator);
71
		assertEquals(1, count);
72
	}
73

  
74
	@Test
75
	public void testIncrementalHarvesting() throws Exception {
76
		System.out.println("incremental Harvesting");
77
		iterator = new Gtr2ProjectsIterator(baseURL, "2016-11-30", 11, 11);
78
		final int count = iterateAndCount(iterator);
79
		assertEquals(21, count);
80
	}
81

  
82
	@Test
83
	@Ignore
84
	public void testCompleteHarvesting() throws Exception {
85
		System.out.println("testing complete harvesting");
86
		iterator = new Gtr2ProjectsIterator(baseURL, null);
87
		// TryIndentXmlString indenter = new TryIndentXmlString();
88
		// it.setEndAtPage(3);
89

  
90
		while (iterator.hasNext()) {
91
			final String res = iterator.next();
92
			assertNotNull(res);
93
			// System.out.println(res);
94
			// Scanner keyboard = new Scanner(System.in);
95
			// System.out.println("press enter for next record");
96
			// keyboard.nextLine();
97

  
98
		}
99
	}
100

  
101
	private int iterateAndCount(final Iterator<String> iterator) throws Exception {
102
		int i = 0;
103
		while (iterator.hasNext()) {
104
			assertNotNull(iterator.next());
105
			i++;
106
		}
107
		System.out.println("Got " + i + " projects");
108
		return i;
109
	}
110
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2Helper.java
1 1
package eu.dnetlib.data.collector.plugins.projects.gtr2;
2 2

  
3
import java.io.ByteArrayOutputStream;
4
import java.io.StringWriter;
5

  
6
import org.apache.commons.lang3.StringEscapeUtils;
7 3
import org.apache.commons.logging.Log;
8 4
import org.apache.commons.logging.LogFactory;
5
import org.dom4j.Document;
6
import org.dom4j.DocumentHelper;
7
import org.joda.time.DateTime;
8
import org.joda.time.format.DateTimeFormat;
9
import org.joda.time.format.DateTimeFormatter;
9 10

  
10
import com.ximpleware.AutoPilot;
11
import com.ximpleware.VTDGen;
12
import com.ximpleware.VTDNav;
13

  
14 11
import eu.dnetlib.data.collector.plugins.HttpConnector;
12
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
15 13

  
16 14
public class Gtr2Helper {
17 15

  
18 16
	private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM
19 17

  
20
	private VTDNav mainVTDNav;
21
	private AutoPilot mainAutoPilot;
22
	private StringWriter writer;
23
	private HttpConnector connector;
24
	// private BlockingQueue<String> fragment = new ArrayBlockingQueue<String>(20);
18
	private static final HttpConnector connector = new HttpConnector();
19
	private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
25 20

  
21
	private static final int MAX_ATTEMPTS = 3;
22

  
26 23
	public static String cleanURL(final String url) {
27 24
		String cleaned = url;
28 25
		if (cleaned.contains("gtr.gtr")) {
......
34 31
		return cleaned;
35 32
	}
36 33

  
37
	public String processProject(final VTDNav vn, final String namespaces, final String projectUrl) throws Exception {
38
		writer = new StringWriter();
39
		mainVTDNav = vn;
40
		mainAutoPilot = new AutoPilot(mainVTDNav);
41
		writer.write("<doc " + namespaces + ">");
42
		writeFragment(mainVTDNav);
43

  
44
		mainAutoPilot.selectXPath("//link[@rel='FUND']");
45

  
46
		while (mainAutoPilot.evalXPath() != -1) {
47
			processFunder(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), projectUrl);
34
	public static Document loadURL(final String url) {
35
		try {
36
			final String cleanUrl = cleanURL(url);
37
			final String xml = loadURL(cleanUrl, 0);
38
			return DocumentHelper.parseText(xml);
39
		} catch (final Exception e) {
40
			log.error("Error parsing xml", e);
41
			throw new CollectorServiceRuntimeException("Error parsing xml", e);
48 42
		}
49

  
50
		mainAutoPilot.resetXPath();
51
		mainAutoPilot.selectXPath(".//link[@rel='LEAD_ORG']");
52
		while (mainAutoPilot.evalXPath() != -1) {
53
			processOrg(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), new String[] {
54
				"<ld-org>", "</ld-org>"
55
			}, projectUrl);
56
		}
57
		mainAutoPilot.resetXPath();
58
		mainAutoPilot.selectXPath(".//link[@rel='PP_ORG']");
59
		while (mainAutoPilot.evalXPath() != -1) {
60
			processOrg(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), new String[] {
61
				"<pp-org>", "</pp-org>"
62
			}, projectUrl);
63
		}
64

  
65
		// mainAutoPilot.resetXPath();
66
		// mainAutoPilot.selectXPath(".//link[@rel='PARTICIPANT_ORG']");
67
		// while (mainAutoPilot.evalXPath() != -1) {
68
		// processOrg(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
69
		// new String[]{"<pp-org>", "</pp-org>"}, projectUrl);
70
		// }
71

  
72
		mainAutoPilot.resetXPath();
73
		mainAutoPilot.selectXPath(".//link[@rel='PI_PER']");
74
		while (mainAutoPilot.evalXPath() != -1) {
75
			processPerson(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), projectUrl);
76
		}
77
		writer.write("</doc>");
78
		writer.close();
79

  
80
		return writer.toString();
81 43
	}
82 44

  
83
	private VTDNav setNavigator(final String httpUrl) {
84
		final VTDGen vg_tmp = new VTDGen();
85
		connector = new HttpConnector();
45
	private static String loadURL(final String cleanUrl, final int attempt) {
86 46
		try {
87
			final byte[] bytes = connector.getInputSource(cleanURL(httpUrl)).getBytes("UTF-8");
88
			vg_tmp.setDoc(bytes);
89
			vg_tmp.parse(false);
90
			// vg_tmp.parseHttpUrl(httpUrl, false);
91
			return vg_tmp.getNav();
92
		} catch (final Throwable e) {
93
			return null;
94
		}
95
	}
96

  
97
	private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception {
98

  
99
		final AutoPilot ap_tmp = new AutoPilot(fragmentVTDNav);
100
		ap_tmp.selectXPath(xPath);
101
		return ap_tmp.evalXPath();
102
	}
103

  
104
	private void writeFragment(final VTDNav nav) throws Exception {
105
		final ByteArrayOutputStream b = new ByteArrayOutputStream();
106
		nav.dumpFragment(b);
107
		final String ret = b.toString();
108
		b.reset();
109
		writer.write(ret);
110
	}
111

  
112
	private void writeNewTagAndInfo(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag, final String attrName)
113
		throws Exception {
114

  
115
		final int nav_res = evalXpath(vn, xPath);
116
		if (nav_res != -1) {
117
			String tmp = xmlOpenTag;
118
			if (attrName != null) {
119
				tmp += vn.toNormalizedString(vn.getAttrVal(attrName));
47
			log.debug("  * Downloading Url: " + cleanUrl);
48
			final byte[] bytes = connector.getInputSource(cleanUrl).getBytes("UTF-8");
49
			// Decode with the same charset that getBytes(...) used above: the single-argument
			// String constructor falls back to the platform default charset and can corrupt
			// non-ASCII characters on JVMs whose default is not UTF-8.
			return new String(bytes, "UTF-8");
50
		} catch (final Exception e) {
51
			log.error("Error dowloading url: " + cleanUrl, e);
52
			if (attempt < MAX_ATTEMPTS) {
53
				return loadURL(cleanUrl, attempt + 1);
120 54
			} else {
121
				tmp += StringEscapeUtils.escapeXml11(vn.toNormalizedString(vn.getText()));
55
				throw new CollectorServiceRuntimeException("Error dowloading url: " + cleanUrl, e);
122 56
			}
123
			tmp += xmlCloseTag;
124
			writer.write(tmp);
125 57
		}
126 58
	}
127 59

  
128
	private void processPerson(final String httpUrl, final String projectUrl) {
129
		log.debug(String.format("Getting person %s for project %s", httpUrl, projectUrl));
130
		final VTDNav vn = setNavigator(cleanURL(httpUrl));
131
		try {
132
			writeFragment(vn);
133
		} catch (final Throwable e) {
134
			log.debug(String.format("Exception in processPerson from %s \n Error message: \n %s", httpUrl, e.getMessage()));
135
		}
136

  
60
	public static DateTime parseDate(final String s) {
61
		// I expect dates in the format 'yyyy-MM-dd'. See class
62
		// eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode
63
		return DateTime.parse(s.substring(0, s.indexOf("T")), simpleDateTimeFormatter);
137 64
	}
138 65

  
139
	private void processOrg(final String httpUrl, final String[] tags, final String projectUrl) {
140
		log.debug(String.format("Getting org %s for project %s", httpUrl, projectUrl));
141
		final VTDNav vn = setNavigator(cleanURL(httpUrl));
142
		try {
143
			writeNewTagAndInfo(vn, "//name", tags[0] + "<name>", "</name>", null);
144
			vn.toElement(VTDNav.ROOT);
145
			writeNewTagAndInfo(vn, "//country", "<country>", "</country>", null);
146
			vn.toElement(VTDNav.ROOT);
147
			writeNewTagAndInfo(vn, ".", "<id>", "</id>" + tags[1], "id");
148
		} catch (final Throwable e) {
149
			log.debug(String.format("Exception in processOrg from %s \n Error message: \n %s", httpUrl, e.getMessage()));
150
		}
66
	public static boolean isAfter(final String d, final DateTime fromDate) {
67
		return Gtr2Helper.parseDate(d).isAfter(fromDate);
151 68
	}
152

  
153
	private void processFunder(final String httpUrl, final String projectUrl) {
154
		log.debug(String.format("Getting funder %s for project %s", httpUrl, projectUrl));
155
		final VTDNav vn = setNavigator(cleanURL(httpUrl));
156
		try {
157
			final AutoPilot ap = new AutoPilot(vn);
158
			writeFragment(vn);
159
			ap.selectXPath(".//link[@rel='FUNDER']");
160
			VTDNav tmp_vn;
161
			while (ap.evalXPath() != -1) {
162
				tmp_vn = setNavigator(vn.toNormalizedString(vn.getAttrVal("href")));
163
				writeNewTagAndInfo(tmp_vn, "//name", "<funder> <name>", "</name></funder>", null);
164
			}
165
		} catch (final Throwable e) {
166
			log.debug(String.format("Exception in processFunder from %s \n Error message: \n %s", httpUrl, e.getMessage()));
167
		}
168
	}
169 69
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2ProjectsIterator.java
1 1
package eu.dnetlib.data.collector.plugins.projects.gtr2;
2 2

  
3
import java.util.ArrayList;
4
import java.util.HashMap;
3 5
import java.util.Iterator;
4
import java.util.concurrent.ArrayBlockingQueue;
5
import java.util.concurrent.TimeUnit;
6
import java.util.LinkedList;
7
import java.util.List;
8
import java.util.Map;
9
import java.util.Queue;
10
import java.util.function.Function;
6 11

  
12
import org.apache.commons.lang.math.NumberUtils;
7 13
import org.apache.commons.lang3.StringUtils;
8 14
import org.apache.commons.logging.Log;
9 15
import org.apache.commons.logging.LogFactory;
16
import org.dom4j.Document;
17
import org.dom4j.DocumentException;
18
import org.dom4j.DocumentHelper;
19
import org.dom4j.Element;
20
import org.dom4j.Node;
10 21
import org.joda.time.DateTime;
11
import org.joda.time.format.DateTimeFormat;
12
import org.joda.time.format.DateTimeFormatter;
13 22

  
14
import com.ximpleware.AutoPilot;
15
import com.ximpleware.VTDGen;
16
import com.ximpleware.VTDNav;
17

  
18
import eu.dnetlib.data.collector.plugins.HttpConnector;
19 23
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
20 24
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
21 25

  
......
24 28
 */
25 29
public class Gtr2ProjectsIterator implements Iterator<String> {
26 30

  
27
	public static final String TERMINATOR = "ARNOLD";
28
	public static final int WAIT_END_SECONDS = 600;
29
	public static final int PAGE_SZIE = 20;
31
	public static final int PAGE_SIZE = 20;
30 32

  
31 33
	private static final Log log = LogFactory.getLog(Gtr2ProjectsIterator.class);
32 34

  
33
	private String queryURL;
34
	private int total = -1;
35
	private int startFromPage = 1;
36
	private int endAtPage;
37
	private VTDGen vg;
38
	private VTDNav vn;
39
	private AutoPilot ap;
40
	private String namespaces;
35
	private final String baseUrl;
36
	private int currPage;
37
	private int endPage;
41 38
	private boolean incremental = false;
42 39
	private DateTime fromDate;
43
	private final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
44
	private final ArrayBlockingQueue<String> projects = new ArrayBlockingQueue<>(200);
45
	// private boolean finished = false;
46
	private String nextElement = "<doc></doc>";
47
	private HttpConnector connector;
48 40

  
41
	private final Map<String, String> cache = new HashMap<>();
42

  
43
	private final Queue<String> queue = new LinkedList<>();
44

  
45
	private String nextElement;
46

  
47
	public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final String startPage, final String endPage)
48
		throws CollectorServiceException {
49

  
50
		this.baseUrl = baseUrl;
51
		this.currPage = NumberUtils.toInt(startPage, 1);
52
		this.endPage = NumberUtils.toInt(endPage, Integer.MAX_VALUE);
53
		this.incremental = StringUtils.isNotBlank(fromDate);
54

  
55
		if (this.incremental) {
56
			this.fromDate = Gtr2Helper.parseDate(fromDate);
57
		}
58

  
59
		prepareNextElement();
60
	}
61

  
49 62
	@Override
50 63
	public boolean hasNext() {
51
		return !TERMINATOR.equals(nextElement);
64
		return nextElement != null;
52 65
	}
53 66

  
54 67
	@Override
......
56 69
		try {
57 70
			return nextElement;
58 71
		} finally {
59
			try {
60
				nextElement = projects.poll(WAIT_END_SECONDS, TimeUnit.SECONDS);
61
			} catch (final InterruptedException e) {
62
				throw new RuntimeException(e);
63
			}
72
			prepareNextElement();
64 73
		}
65

  
66 74
	}
67 75

  
68 76
	@Override
......
70 78
		throw new UnsupportedOperationException();
71 79
	}
72 80

  
73
	public Gtr2ProjectsIterator(final String baseUrl, final String fromDate) throws CollectorServiceException {
74
		prepare(baseUrl, fromDate);
75
		fillInfo(true);
76
	}
77

  
78
	public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final int startFromPage, final int endAtPage) throws CollectorServiceException {
79
		prepare(baseUrl, fromDate);
80
		this.setStartFromPage(startFromPage);
81
		this.setEndAtPage(endAtPage);
82
		fillInfo(false);
83
	}
84

  
85
	public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final String startFromPage, final String endAtPage)
86
		throws CollectorServiceException {
87
		prepare(baseUrl, fromDate);
88
		if (StringUtils.isNotBlank(startFromPage)) {
89
			this.setStartFromPage(Integer.parseInt(startFromPage));
81
	private void prepareNextElement() {
82
		while (this.currPage <= this.endPage && queue.isEmpty()) {
83
			log.debug("FETCHING PAGE + " + currPage + "/" + endPage);
84
			this.queue.addAll(fetchPage(currPage++));
90 85
		}
91
		if (StringUtils.isNotBlank(endAtPage)) {
92
			this.setEndAtPage(Integer.parseInt(endAtPage));
93
		}
94
		fillInfo(false);
86
		this.nextElement = this.queue.poll();
95 87
	}
96 88

  
97
	private void prepare(final String baseUrl, final String fromDate) {
98
		connector = new HttpConnector();
99
		queryURL = baseUrl + "/projects";
100
		vg = new VTDGen();
101
		this.incremental = StringUtils.isNotBlank(fromDate);
102
		if (incremental) {
103
			// I expect fromDate in the format 'yyyy-MM-dd'. See class
104
			// eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode
105
			this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
106
			log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
107
		}
108
	}
89
	private List<String> fetchPage(final int pageNumber) {
109 90

  
110
	private void fillInfo(final boolean all) throws CollectorServiceException {
91
		final List<String> res = new ArrayList<>();
111 92
		try {
112
			// log.debug("Getting hit count from: " + queryURL);
113
			final byte[] bytes = connector.getInputSource(queryURL).getBytes("UTF-8");
114
			vg.setDoc(bytes);
115
			vg.parse(false);
116
			// vg.parseHttpUrl(queryURL, false);
117
			initParser();
118
			final String hitCount = vn.toNormalizedString(vn.getAttrVal("totalSize"));
119
			final String totalPages = vn.toNormalizedString(vn.getAttrVal("totalPages"));
120
			namespaces = "xmlns:ns1=\"" + vn.toNormalizedString(vn.getAttrVal("ns1")) + "\" ";
121
			namespaces += "xmlns:ns2=\"" + vn.toNormalizedString(vn.getAttrVal("ns2")) + "\" ";
122
			namespaces += "xmlns:ns3=\"" + vn.toNormalizedString(vn.getAttrVal("ns3")) + "\" ";
123
			namespaces += "xmlns:ns4=\"" + vn.toNormalizedString(vn.getAttrVal("ns4")) + "\" ";
124
			namespaces += "xmlns:ns5=\"" + vn.toNormalizedString(vn.getAttrVal("ns5")) + "\" ";
125
			namespaces += "xmlns:ns6=\"" + vn.toNormalizedString(vn.getAttrVal("ns6")) + "\" ";
126
			if (all) {
127
				setEndAtPage(Integer.parseInt(totalPages));
128
				total = Integer.parseInt(hitCount);
93
			final Document doc = Gtr2Helper.loadURL(baseUrl + "/projects?p=" + pageNumber);
94

  
95
			if (endPage == Integer.MAX_VALUE) {
96
				this.endPage = NumberUtils.toInt(doc.valueOf("/*[local-name()='projects']/@*[local-name() = 'totalPages']"));
129 97
			}
130
			final Thread ft = new Thread(new FillProjectList());
131
			ft.start();
132
			log.debug("Expected number of pages: " + (endAtPage - startFromPage + 1));
133
		} catch (final NumberFormatException e) {
134
			log.error("Cannot set the total count or the number of pages");
135
			throw new CollectorServiceException(e);
136
		} catch (final Throwable e) {
137
			throw new CollectorServiceException(e);
138
		}
139
	}
140 98

  
141
	private void initParser() {
142
		vn = vg.getNav();
143
		ap = new AutoPilot(vn);
144
	}
99
			for (final Object po : doc.selectNodes("//*[local-name() = 'project']")) {
100
				final Element master = (Element) po;
145 101

  
146
	public String getQueryURL() {
147
		return queryURL;
148
	}
102
				final String href = master.valueOf("@*[local-name() = 'href']");
149 103

  
150
	public void setQueryURL(final String queryURL) {
151
		this.queryURL = queryURL;
152
	}
104
				if (filterIncremental(master)) {
105
					res.add(expandProject(href, master).asXML());
106
				} else {
107
					log.debug("Skipped project: " + href);
108
				}
153 109

  
154
	public int getTotal() {
155
		return total;
156
	}
110
			}
111
		} catch (final Throwable e) {
112
			log.error("Exception fetching page " + pageNumber, e);
113
			throw new CollectorServiceRuntimeException("Exception fetching page " + pageNumber, e);
114
		}
115
		return res;
157 116

  
158
	public void setTotal(final int total) {
159
		this.total = total;
160 117
	}
161 118

  
162
	public int getEndAtPage() {
163
		return endAtPage;
164
	}
119
	private Document expandProject(final String href, final Element masterProject) {
120
		log.debug("Expanding project: " + href);
165 121

  
166
	public void setEndAtPage(final int endAtPage) {
167
		this.endAtPage = endAtPage;
168
		log.debug("Overriding endAtPage to " + endAtPage);
169
	}
122
		final Element newRoot = DocumentHelper.createElement("doc");
170 123

  
171
	public VTDGen getVg() {
172
		return vg;
173
	}
124
		newRoot.add(Gtr2Helper.loadURL(href).getRootElement());
174 125

  
175
	public void setVg(final VTDGen vg) {
176
		this.vg = vg;
177
	}
126
		addFragments(masterProject, "FUND", newRoot, o -> {
127
			final Element fundNode = o.getRootElement();
178 128

  
179
	public VTDNav getVn() {
180
		return vn;
181
	}
129
			final Node funderLinkNode = fundNode.selectSingleNode("//*[local-name()='link' and @*[local-name()='rel']='FUNDER']");
182 130

  
183
	public void setVn(final VTDNav vn) {
184
		this.vn = vn;
185
	}
131
			if (funderLinkNode != null) {
132
				final String funderHref = funderLinkNode.valueOf("@*[local-name() = 'href']");
133
				final Element funderNode = Gtr2Helper.loadURL(funderHref).getRootElement();
186 134

  
187
	public AutoPilot getAp() {
188
		return ap;
189
	}
135
				final Element newFunderNode = DocumentHelper.createElement("funder");
136
				newFunderNode.addElement("name").setText(funderNode.valueOf("//*[local-name()='name']"));
190 137

  
191
	public void setAp(final AutoPilot ap) {
192
		this.ap = ap;
193
	}
194

  
195
	public String getNamespaces() {
196
		return namespaces;
197
	}
198

  
199
	public void setNamespaces(final String namespaces) {
200
		this.namespaces = namespaces;
201
	}
202

  
203
	public int getStartFromPage() {
204
		return startFromPage;
205
	}
206

  
207
	public void setStartFromPage(final int startFromPage) {
208
		this.startFromPage = startFromPage;
209
		log.debug("Overriding startFromPage to " + startFromPage);
210
	}
211

  
212
	private class FillProjectList implements Runnable {
213

  
214
		private boolean morePages = true;
215
		private int pageNumber = startFromPage;
216

  
217
		@Override
218
		public void run() {
219
			String resultPageUrl = "";
220
			try {
221
				do {
222
					resultPageUrl = getNextPageUrl();
223
					log.debug("Page: " + resultPageUrl);
224
					// clear VGen before processing the next file
225
					vg.clear();
226
					final byte[] bytes = connector.getInputSource(resultPageUrl).getBytes("UTF-8");
227
					vg.setDoc(bytes);
228
					vg.parse(false);
229
					// vg.parseHttpUrl(resultPageUrl, false);
230
					initParser();
231
					ap.selectXPath("//project");
232
					while (ap.evalXPath() != -1) {
233
						final String projectHref = vn.toNormalizedString(vn.getAttrVal("href"));
234
						final ParseProject p = new ParseProject(projectHref);
235
						p.execute();
236
					}
237
					ap.resetXPath();
238

  
239
				} while (morePages);
240
				projects.put(TERMINATOR);
241

  
242
			} catch (final Throwable e) {
243
				log.error("Exception processing " + resultPageUrl + "\n" + e.getMessage());
138
				return newFunderNode;
139
			} else {
140
				return null;
244 141
			}
245
		}
142
		});
143
		addFragments(masterProject, "LEAD_ORG", newRoot, o -> asOrgElement("ld-org", o));
144
		addFragments(masterProject, "PP_ORG", newRoot, o -> asOrgElement("pp-org", o));
145
		addFragments(masterProject, "PI_PER", newRoot, Document::getRootElement);
246 146

  
247
		private String getNextPageUrl() {
248
			final String url = queryURL + "?p=" + pageNumber;
249
			if (pageNumber == endAtPage) {
250
				morePages = false;
251
			}
252
			pageNumber++;
253
			return url;
254
		}
255

  
147
		return DocumentHelper.createDocument(newRoot);
256 148
	}
257 149

  
258
	private class ParseProject {
150
	private void addFragments(final Element master, final String relType, final Element newRoot, final Function<Document, Element> mapper) {
259 151

  
260
		VTDNav vn1;
261
		VTDGen vg1;
262
		private final String projectRef;
152
		for (final Object o : master.selectNodes(".//*[local-name()='link']")) {
153
			final String rel = ((Element) o).valueOf("@*[local-name()='rel']");
154
			final String href = ((Element) o).valueOf("@*[local-name()='href']");
263 155

  
264
		public ParseProject(final String projectHref) {
265
			projectRef = Gtr2Helper.cleanURL(projectHref);
266
			vg1 = new VTDGen();
267
			try {
268
				final byte[] bytes = connector.getInputSource(projectRef).getBytes("UTF-8");
269
				vg1.setDoc(bytes);
270
				vg1.parse(false);
271
				vn1 = vg1.getNav();
272
			} catch (final Throwable e) {
273
				log.error("Exception processing " + projectRef + "\n" + e.getMessage());
274
			}
275
		}
276

  
277
		private int projectsUpdate(final String attr) throws CollectorServiceException {
278
			try {
279
				final int index = vn1.getAttrVal(attr);
280
				if (index != -1) {
281
					final String d = vn1.toNormalizedString(index);
282
					final DateTime recordDate = DateTime.parse(d.substring(0, d.indexOf("T")), simpleDateTimeFormatter);
283
					// updated or created after the last time it was collected
284
					if (recordDate.isAfter(fromDate)) {
285
						log.debug("New project to collect");
286
						return index;
156
			if (relType.equals(rel) && StringUtils.isNotBlank(href)) {
157
				final String cacheKey = relType + "#" + href;
158
				if (cache.containsKey(cacheKey)) {
159
					try {
160
						log.debug(" * from cache (" + relType + "): " + href);
161
						newRoot.add(DocumentHelper.parseText(cache.get(cacheKey)).getRootElement());
162
					} catch (final DocumentException e) {
163
						log.error("Error retrieving cache element: " + cacheKey, e);
164
						throw new CollectorServiceRuntimeException("Error retrieving cache element: " + cacheKey, e);
287 165
					}
288
					return -1;
166
				} else {
167
					final Document doc = Gtr2Helper.loadURL(href);
					final Element elem = mapper.apply(doc);
					// A mapper may return null when the linked document has nothing to contribute
					// (e.g. a fund without a funder link): in that case there is nothing to add or cache.
					if (elem != null) {
						newRoot.add(elem);
						cache.put(cacheKey, elem.asXML());
					}
289 171
				}
290
				return index;
291
			} catch (final Throwable e) {
292
				throw new CollectorServiceException(e);
293
			}
294
		}
295 172

  
296
		private String collectProject() throws CollectorServiceException {
297
			try {
298
				final int p = vn1.getAttrVal("href");
299
				final String projectHref = vn1.toNormalizedString(p);
300
				log.debug("Collecting project at " + projectHref);
301
				final Gtr2Helper gtr2Helper = new Gtr2Helper();
302
				return gtr2Helper.processProject(vn1, namespaces, projectHref);
303
			} catch (final Throwable e) {
304
				throw new CollectorServiceException(e);
305 173
			}
306 174
		}
175
	}
307 176

  
308
		private boolean add(final String attr) throws CollectorServiceException {
309
			return projectsUpdate(attr) != -1;
310
		}
177
	private Element asOrgElement(final String nodeName, final Document doc) {
178
		final Element newOrg = DocumentHelper.createElement(nodeName);
179
		newOrg.addElement("name").setText(doc.valueOf("//*[local-name()='name']"));
180
		newOrg.addElement("country").setText(doc.valueOf("//*[local-name()='country']"));
181
		newOrg.addElement("id").setText(doc.valueOf("/*[local-name()='organisation']/@*[local-name()='id']"));
182
		return newOrg;
183
	}
311 184

  
312
		public void execute() {
313
			try {
314
				if (!incremental || incremental && (add("created") || add("updated"))) {
315
					projects.put(collectProject());
316
				}
317
			} catch (final Throwable e) {
318
				log.error("Error on ParseProject " + e.getMessage());
319
				throw new CollectorServiceRuntimeException(e);
320
			}
185
	private boolean filterIncremental(final Element e) {
186
		if (!incremental) {
187
			return true;
188
		} else if (Gtr2Helper.isAfter(e.valueOf("@*[local-name() = 'created']"), fromDate)) {
189
			return true;
190
		} else if (Gtr2Helper.isAfter(e.valueOf("@*[local-name() = 'updated']"), fromDate)) {
191
			return true;
192
		} else {
193
			return false;
321 194
		}
322

  
323 195
	}
324 196

  
325 197
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/test/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2ProjectsIteratorTest.java
1
package eu.dnetlib.data.collector.plugins.projects.gtr2;
2

  
3
import static org.junit.Assert.assertEquals;
4
import static org.junit.Assert.assertNotNull;
5

  
6
import java.util.Iterator;
7

  
8
import org.junit.Ignore;
9
import org.junit.Test;
10

  
11
import eu.dnetlib.miscutils.functional.xml.TryIndentXmlString;
12

  
13
public class Gtr2ProjectsIteratorTest {
14

  
15
	private static final String baseURL = "https://gtr.ukri.org/gtr/api";
16

  
17
	private Gtr2ProjectsIterator iterator;
18

  
19
	@Test
20
	@Ignore
21
	public void testOne() throws Exception {
22
		System.out.println("one project");
23

  
24
		iterator = new Gtr2ProjectsIterator(baseURL, null, null, null);
25

  
26
		while (iterator.hasNext()) {
27
			final String res = iterator.next();
28
			assertNotNull(res);
29
			System.out.println(res);
30
			return;
31
		}
32
	}
33

  
34
	@Test
35
	@Ignore
36
	public void testPaging() throws Exception {
37
		iterator = new Gtr2ProjectsIterator(baseURL, null, "2", "2");
38
		final TryIndentXmlString indenter = new TryIndentXmlString();
39

  
40
		while (iterator.hasNext()) {
41
			Thread.sleep(300);
42
			final String res = iterator.next();
43
			assertNotNull(res);
44
			indenter.evaluate(res);
45
			System.out.println(res);
46
		}
47
	}
48

  
49
	@Test
50
	@Ignore
51
	public void testOnePage() throws Exception {
52
		iterator = new Gtr2ProjectsIterator(baseURL, null, "12", "12");
53
		final int count = iterateAndCount(iterator);
54
		assertEquals(21, count);
55
	}
56

  
57
	@Test
58
	@Ignore
59
	public void testIncrementalHarvestingNoRecords() throws Exception {
60
		System.out.println("incremental Harvesting");
61
		iterator = new Gtr2ProjectsIterator(baseURL, "2050-12-12", "11", "13");
62
		final int count = iterateAndCount(iterator);
63
		assertEquals(1, count);
64
	}
65

  
66
	@Test
67
	@Ignore
68
	public void testIncrementalHarvesting() throws Exception {
69
		System.out.println("incremental Harvesting");
70
		iterator = new Gtr2ProjectsIterator(baseURL, "2016-11-30", "11", "11");
71
		final int count = iterateAndCount(iterator);
72
		assertEquals(21, count);
73
	}
74

  
75
	@Test
76
	@Ignore
77
	public void testCompleteHarvesting() throws Exception {
78
		System.out.println("testing complete harvesting");
79
		iterator = new Gtr2ProjectsIterator(baseURL, null, null, null);
80
		// TryIndentXmlString indenter = new TryIndentXmlString();
81
		// it.setEndAtPage(3);
82

  
83
		while (iterator.hasNext()) {
84
			final String res = iterator.next();
85
			assertNotNull(res);
86
			// System.out.println(res);
87
			// Scanner keyboard = new Scanner(System.in);
88
			// System.out.println("press enter for next record");
89
			// keyboard.nextLine();
90

  
91
		}
92
	}
93

  
94
	private int iterateAndCount(final Iterator<String> iterator) throws Exception {
95
		int i = 0;
96
		while (iterator.hasNext()) {
97
			assertNotNull(iterator.next());
98
			i++;
99
		}
100
		System.out.println("Got " + i + " projects");
101
		return i;
102
	}
103
}

Also available in: Unified diff