Revision 63300
Added by Michele Artini about 2 months ago
modules/dnet-collector-plugins/branches/gtr2_michele/src/test/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2Test.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.projects.gtr2; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
|
|
6 |
import java.util.Iterator; |
|
7 |
|
|
8 |
import org.junit.Before; |
|
9 |
import org.junit.Ignore; |
|
10 |
import org.junit.Test; |
|
11 |
|
|
12 |
import com.ximpleware.VTDGen; |
|
13 |
|
|
14 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
15 |
import eu.dnetlib.miscutils.functional.xml.TryIndentXmlString; |
|
16 |
|
|
17 |
@Ignore |
|
18 |
public class Gtr2Test { |
|
19 |
|
|
20 |
private final String baseURL = "https://gtr.ukri.org/gtr/api"; |
|
21 |
private Gtr2Helper helper; |
|
22 |
private Gtr2ProjectsIterator iterator; |
|
23 |
private HttpConnector connector; |
|
24 |
|
|
25 |
@Before |
|
26 |
public void prepare() { |
|
27 |
helper = new Gtr2Helper(); |
|
28 |
// System.setProperty("jsse.enableSNIExtension","false"); |
|
29 |
} |
|
30 |
|
|
31 |
@Test |
|
32 |
public void testOne() throws Exception { |
|
33 |
System.out.println("one project"); |
|
34 |
final String url = "http://gtr.ukri.org/gtr/api/projects/0AE039A7-9A84-4943-AA36-001DB5763245"; |
|
35 |
final VTDGen vg_tmp = new VTDGen(); |
|
36 |
connector = new HttpConnector(); |
|
37 |
final String tmp = connector.getInputSource(url); |
|
38 |
final byte[] bytes = tmp.getBytes("UTF-8"); |
|
39 |
vg_tmp.setDoc(bytes); |
|
40 |
vg_tmp.parse(false); |
|
41 |
final String s = helper.processProject(vg_tmp.getNav(), "xmlns:ns=\"http:///afgshs\"", url); |
|
42 |
System.out.println(s); |
|
43 |
} |
|
44 |
|
|
45 |
@Test |
|
46 |
public void testPaging() throws Exception { |
|
47 |
iterator = new Gtr2ProjectsIterator(baseURL, null, 2, 2); |
|
48 |
final TryIndentXmlString indenter = new TryIndentXmlString(); |
|
49 |
|
|
50 |
while (iterator.hasNext()) { |
|
51 |
Thread.sleep(300); |
|
52 |
final String res = iterator.next(); |
|
53 |
assertNotNull(res); |
|
54 |
indenter.evaluate(res); |
|
55 |
System.out.println(res); |
|
56 |
} |
|
57 |
} |
|
58 |
|
|
59 |
@Test |
|
60 |
public void testOnePage() throws Exception { |
|
61 |
iterator = new Gtr2ProjectsIterator(baseURL, null, 12, 12); |
|
62 |
final int count = iterateAndCount(iterator); |
|
63 |
assertEquals(21, count); |
|
64 |
} |
|
65 |
|
|
66 |
@Test |
|
67 |
public void testIncrementalHarvestingNoRecords() throws Exception { |
|
68 |
System.out.println("incremental Harvesting"); |
|
69 |
iterator = new Gtr2ProjectsIterator(baseURL, "2050-12-12", 11, 13); |
|
70 |
final int count = iterateAndCount(iterator); |
|
71 |
assertEquals(1, count); |
|
72 |
} |
|
73 |
|
|
74 |
@Test |
|
75 |
public void testIncrementalHarvesting() throws Exception { |
|
76 |
System.out.println("incremental Harvesting"); |
|
77 |
iterator = new Gtr2ProjectsIterator(baseURL, "2016-11-30", 11, 11); |
|
78 |
final int count = iterateAndCount(iterator); |
|
79 |
assertEquals(21, count); |
|
80 |
} |
|
81 |
|
|
82 |
@Test |
|
83 |
@Ignore |
|
84 |
public void testCompleteHarvesting() throws Exception { |
|
85 |
System.out.println("testing complete harvesting"); |
|
86 |
iterator = new Gtr2ProjectsIterator(baseURL, null); |
|
87 |
// TryIndentXmlString indenter = new TryIndentXmlString(); |
|
88 |
// it.setEndAtPage(3); |
|
89 |
|
|
90 |
while (iterator.hasNext()) { |
|
91 |
final String res = iterator.next(); |
|
92 |
assertNotNull(res); |
|
93 |
// System.out.println(res); |
|
94 |
// Scanner keyboard = new Scanner(System.in); |
|
95 |
// System.out.println("press enter for next record"); |
|
96 |
// keyboard.nextLine(); |
|
97 |
|
|
98 |
} |
|
99 |
} |
|
100 |
|
|
101 |
private int iterateAndCount(final Iterator<String> iterator) throws Exception { |
|
102 |
int i = 0; |
|
103 |
while (iterator.hasNext()) { |
|
104 |
assertNotNull(iterator.next()); |
|
105 |
i++; |
|
106 |
} |
|
107 |
System.out.println("Got " + i + " projects"); |
|
108 |
return i; |
|
109 |
} |
|
110 |
} |
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2Helper.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.collector.plugins.projects.gtr2; |
2 | 2 |
|
3 |
import java.io.ByteArrayOutputStream; |
|
4 |
import java.io.StringWriter; |
|
5 |
|
|
6 |
import org.apache.commons.lang3.StringEscapeUtils; |
|
7 | 3 |
import org.apache.commons.logging.Log; |
8 | 4 |
import org.apache.commons.logging.LogFactory; |
5 |
import org.dom4j.Document; |
|
6 |
import org.dom4j.DocumentHelper; |
|
7 |
import org.joda.time.DateTime; |
|
8 |
import org.joda.time.format.DateTimeFormat; |
|
9 |
import org.joda.time.format.DateTimeFormatter; |
|
9 | 10 |
|
10 |
import com.ximpleware.AutoPilot; |
|
11 |
import com.ximpleware.VTDGen; |
|
12 |
import com.ximpleware.VTDNav; |
|
13 |
|
|
14 | 11 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
12 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
|
15 | 13 |
|
16 | 14 |
public class Gtr2Helper { |
17 | 15 |
|
18 | 16 |
private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM |
19 | 17 |
|
20 |
private VTDNav mainVTDNav; |
|
21 |
private AutoPilot mainAutoPilot; |
|
22 |
private StringWriter writer; |
|
23 |
private HttpConnector connector; |
|
24 |
// private BlockingQueue<String> fragment = new ArrayBlockingQueue<String>(20); |
|
18 |
private static final HttpConnector connector = new HttpConnector(); |
|
19 |
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); |
|
25 | 20 |
|
21 |
private static final int MAX_ATTEMPTS = 3; |
|
22 |
|
|
26 | 23 |
public static String cleanURL(final String url) { |
27 | 24 |
String cleaned = url; |
28 | 25 |
if (cleaned.contains("gtr.gtr")) { |
... | ... | |
34 | 31 |
return cleaned; |
35 | 32 |
} |
36 | 33 |
|
37 |
public String processProject(final VTDNav vn, final String namespaces, final String projectUrl) throws Exception { |
|
38 |
writer = new StringWriter(); |
|
39 |
mainVTDNav = vn; |
|
40 |
mainAutoPilot = new AutoPilot(mainVTDNav); |
|
41 |
writer.write("<doc " + namespaces + ">"); |
|
42 |
writeFragment(mainVTDNav); |
|
43 |
|
|
44 |
mainAutoPilot.selectXPath("//link[@rel='FUND']"); |
|
45 |
|
|
46 |
while (mainAutoPilot.evalXPath() != -1) { |
|
47 |
processFunder(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), projectUrl); |
|
34 |
public static Document loadURL(final String url) { |
|
35 |
try { |
|
36 |
final String cleanUrl = cleanURL(url); |
|
37 |
final String xml = loadURL(cleanUrl, 0); |
|
38 |
return DocumentHelper.parseText(xml); |
|
39 |
} catch (final Exception e) { |
|
40 |
log.error("Error parsing xml", e); |
|
41 |
throw new CollectorServiceRuntimeException("Error parsing xml", e); |
|
48 | 42 |
} |
49 |
|
|
50 |
mainAutoPilot.resetXPath(); |
|
51 |
mainAutoPilot.selectXPath(".//link[@rel='LEAD_ORG']"); |
|
52 |
while (mainAutoPilot.evalXPath() != -1) { |
|
53 |
processOrg(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), new String[] { |
|
54 |
"<ld-org>", "</ld-org>" |
|
55 |
}, projectUrl); |
|
56 |
} |
|
57 |
mainAutoPilot.resetXPath(); |
|
58 |
mainAutoPilot.selectXPath(".//link[@rel='PP_ORG']"); |
|
59 |
while (mainAutoPilot.evalXPath() != -1) { |
|
60 |
processOrg(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), new String[] { |
|
61 |
"<pp-org>", "</pp-org>" |
|
62 |
}, projectUrl); |
|
63 |
} |
|
64 |
|
|
65 |
// mainAutoPilot.resetXPath(); |
|
66 |
// mainAutoPilot.selectXPath(".//link[@rel='PARTICIPANT_ORG']"); |
|
67 |
// while (mainAutoPilot.evalXPath() != -1) { |
|
68 |
// processOrg(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), |
|
69 |
// new String[]{"<pp-org>", "</pp-org>"}, projectUrl); |
|
70 |
// } |
|
71 |
|
|
72 |
mainAutoPilot.resetXPath(); |
|
73 |
mainAutoPilot.selectXPath(".//link[@rel='PI_PER']"); |
|
74 |
while (mainAutoPilot.evalXPath() != -1) { |
|
75 |
processPerson(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")), projectUrl); |
|
76 |
} |
|
77 |
writer.write("</doc>"); |
|
78 |
writer.close(); |
|
79 |
|
|
80 |
return writer.toString(); |
|
81 | 43 |
} |
82 | 44 |
|
83 |
private VTDNav setNavigator(final String httpUrl) { |
|
84 |
final VTDGen vg_tmp = new VTDGen(); |
|
85 |
connector = new HttpConnector(); |
|
45 |
private static String loadURL(final String cleanUrl, final int attempt) { |
|
86 | 46 |
try { |
87 |
final byte[] bytes = connector.getInputSource(cleanURL(httpUrl)).getBytes("UTF-8"); |
|
88 |
vg_tmp.setDoc(bytes); |
|
89 |
vg_tmp.parse(false); |
|
90 |
// vg_tmp.parseHttpUrl(httpUrl, false); |
|
91 |
return vg_tmp.getNav(); |
|
92 |
} catch (final Throwable e) { |
|
93 |
return null; |
|
94 |
} |
|
95 |
} |
|
96 |
|
|
97 |
private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception { |
|
98 |
|
|
99 |
final AutoPilot ap_tmp = new AutoPilot(fragmentVTDNav); |
|
100 |
ap_tmp.selectXPath(xPath); |
|
101 |
return ap_tmp.evalXPath(); |
|
102 |
} |
|
103 |
|
|
104 |
private void writeFragment(final VTDNav nav) throws Exception { |
|
105 |
final ByteArrayOutputStream b = new ByteArrayOutputStream(); |
|
106 |
nav.dumpFragment(b); |
|
107 |
final String ret = b.toString(); |
|
108 |
b.reset(); |
|
109 |
writer.write(ret); |
|
110 |
} |
|
111 |
|
|
112 |
private void writeNewTagAndInfo(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag, final String attrName) |
|
113 |
throws Exception { |
|
114 |
|
|
115 |
final int nav_res = evalXpath(vn, xPath); |
|
116 |
if (nav_res != -1) { |
|
117 |
String tmp = xmlOpenTag; |
|
118 |
if (attrName != null) { |
|
119 |
tmp += vn.toNormalizedString(vn.getAttrVal(attrName)); |
|
47 |
log.debug(" * Downloading Url: " + cleanUrl); |
|
48 |
final byte[] bytes = connector.getInputSource(cleanUrl).getBytes("UTF-8"); |
|
49 |
return new String(bytes); |
|
50 |
} catch (final Exception e) { |
|
51 |
log.error("Error dowloading url: " + cleanUrl, e); |
|
52 |
if (attempt < MAX_ATTEMPTS) { |
|
53 |
return loadURL(cleanUrl, attempt + 1); |
|
120 | 54 |
} else { |
121 |
tmp += StringEscapeUtils.escapeXml11(vn.toNormalizedString(vn.getText()));
|
|
55 |
throw new CollectorServiceRuntimeException("Error dowloading url: " + cleanUrl, e);
|
|
122 | 56 |
} |
123 |
tmp += xmlCloseTag; |
|
124 |
writer.write(tmp); |
|
125 | 57 |
} |
126 | 58 |
} |
127 | 59 |
|
128 |
private void processPerson(final String httpUrl, final String projectUrl) { |
|
129 |
log.debug(String.format("Getting person %s for project %s", httpUrl, projectUrl)); |
|
130 |
final VTDNav vn = setNavigator(cleanURL(httpUrl)); |
|
131 |
try { |
|
132 |
writeFragment(vn); |
|
133 |
} catch (final Throwable e) { |
|
134 |
log.debug(String.format("Exception in processPerson from %s \n Error message: \n %s", httpUrl, e.getMessage())); |
|
135 |
} |
|
136 |
|
|
60 |
public static DateTime parseDate(final String s) { |
|
61 |
// I expect dates in the format 'yyyy-MM-dd'. See class |
|
62 |
// eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode |
|
63 |
return DateTime.parse(s.substring(0, s.indexOf("T")), simpleDateTimeFormatter); |
|
137 | 64 |
} |
138 | 65 |
|
139 |
private void processOrg(final String httpUrl, final String[] tags, final String projectUrl) { |
|
140 |
log.debug(String.format("Getting org %s for project %s", httpUrl, projectUrl)); |
|
141 |
final VTDNav vn = setNavigator(cleanURL(httpUrl)); |
|
142 |
try { |
|
143 |
writeNewTagAndInfo(vn, "//name", tags[0] + "<name>", "</name>", null); |
|
144 |
vn.toElement(VTDNav.ROOT); |
|
145 |
writeNewTagAndInfo(vn, "//country", "<country>", "</country>", null); |
|
146 |
vn.toElement(VTDNav.ROOT); |
|
147 |
writeNewTagAndInfo(vn, ".", "<id>", "</id>" + tags[1], "id"); |
|
148 |
} catch (final Throwable e) { |
|
149 |
log.debug(String.format("Exception in processOrg from %s \n Error message: \n %s", httpUrl, e.getMessage())); |
|
150 |
} |
|
66 |
public static boolean isAfter(final String d, final DateTime fromDate) { |
|
67 |
return Gtr2Helper.parseDate(d).isAfter(fromDate); |
|
151 | 68 |
} |
152 |
|
|
153 |
private void processFunder(final String httpUrl, final String projectUrl) { |
|
154 |
log.debug(String.format("Getting funder %s for project %s", httpUrl, projectUrl)); |
|
155 |
final VTDNav vn = setNavigator(cleanURL(httpUrl)); |
|
156 |
try { |
|
157 |
final AutoPilot ap = new AutoPilot(vn); |
|
158 |
writeFragment(vn); |
|
159 |
ap.selectXPath(".//link[@rel='FUNDER']"); |
|
160 |
VTDNav tmp_vn; |
|
161 |
while (ap.evalXPath() != -1) { |
|
162 |
tmp_vn = setNavigator(vn.toNormalizedString(vn.getAttrVal("href"))); |
|
163 |
writeNewTagAndInfo(tmp_vn, "//name", "<funder> <name>", "</name></funder>", null); |
|
164 |
} |
|
165 |
} catch (final Throwable e) { |
|
166 |
log.debug(String.format("Exception in processFunder from %s \n Error message: \n %s", httpUrl, e.getMessage())); |
|
167 |
} |
|
168 |
} |
|
169 | 69 |
} |
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2ProjectsIterator.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.collector.plugins.projects.gtr2; |
2 | 2 |
|
3 |
import java.util.ArrayList; |
|
4 |
import java.util.HashMap; |
|
3 | 5 |
import java.util.Iterator; |
4 |
import java.util.concurrent.ArrayBlockingQueue; |
|
5 |
import java.util.concurrent.TimeUnit; |
|
6 |
import java.util.LinkedList; |
|
7 |
import java.util.List; |
|
8 |
import java.util.Map; |
|
9 |
import java.util.Queue; |
|
10 |
import java.util.function.Function; |
|
6 | 11 |
|
12 |
import org.apache.commons.lang.math.NumberUtils; |
|
7 | 13 |
import org.apache.commons.lang3.StringUtils; |
8 | 14 |
import org.apache.commons.logging.Log; |
9 | 15 |
import org.apache.commons.logging.LogFactory; |
16 |
import org.dom4j.Document; |
|
17 |
import org.dom4j.DocumentException; |
|
18 |
import org.dom4j.DocumentHelper; |
|
19 |
import org.dom4j.Element; |
|
20 |
import org.dom4j.Node; |
|
10 | 21 |
import org.joda.time.DateTime; |
11 |
import org.joda.time.format.DateTimeFormat; |
|
12 |
import org.joda.time.format.DateTimeFormatter; |
|
13 | 22 |
|
14 |
import com.ximpleware.AutoPilot; |
|
15 |
import com.ximpleware.VTDGen; |
|
16 |
import com.ximpleware.VTDNav; |
|
17 |
|
|
18 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
19 | 23 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
20 | 24 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
21 | 25 |
|
... | ... | |
24 | 28 |
*/ |
25 | 29 |
public class Gtr2ProjectsIterator implements Iterator<String> { |
26 | 30 |
|
27 |
public static final String TERMINATOR = "ARNOLD"; |
|
28 |
public static final int WAIT_END_SECONDS = 600; |
|
29 |
public static final int PAGE_SZIE = 20; |
|
31 |
public static final int PAGE_SIZE = 20; |
|
30 | 32 |
|
31 | 33 |
private static final Log log = LogFactory.getLog(Gtr2ProjectsIterator.class); |
32 | 34 |
|
33 |
private String queryURL; |
|
34 |
private int total = -1; |
|
35 |
private int startFromPage = 1; |
|
36 |
private int endAtPage; |
|
37 |
private VTDGen vg; |
|
38 |
private VTDNav vn; |
|
39 |
private AutoPilot ap; |
|
40 |
private String namespaces; |
|
35 |
private final String baseUrl; |
|
36 |
private int currPage; |
|
37 |
private int endPage; |
|
41 | 38 |
private boolean incremental = false; |
42 | 39 |
private DateTime fromDate; |
43 |
private final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); |
|
44 |
private final ArrayBlockingQueue<String> projects = new ArrayBlockingQueue<>(200); |
|
45 |
// private boolean finished = false; |
|
46 |
private String nextElement = "<doc></doc>"; |
|
47 |
private HttpConnector connector; |
|
48 | 40 |
|
41 |
private final Map<String, String> cache = new HashMap<>(); |
|
42 |
|
|
43 |
private final Queue<String> queue = new LinkedList<>(); |
|
44 |
|
|
45 |
private String nextElement; |
|
46 |
|
|
47 |
public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final String startPage, final String endPage) |
|
48 |
throws CollectorServiceException { |
|
49 |
|
|
50 |
this.baseUrl = baseUrl; |
|
51 |
this.currPage = NumberUtils.toInt(startPage, 1); |
|
52 |
this.endPage = NumberUtils.toInt(endPage, Integer.MAX_VALUE); |
|
53 |
this.incremental = StringUtils.isNotBlank(fromDate); |
|
54 |
|
|
55 |
if (this.incremental) { |
|
56 |
this.fromDate = Gtr2Helper.parseDate(fromDate); |
|
57 |
} |
|
58 |
|
|
59 |
prepareNextElement(); |
|
60 |
} |
|
61 |
|
|
49 | 62 |
@Override |
50 | 63 |
public boolean hasNext() { |
51 |
return !TERMINATOR.equals(nextElement);
|
|
64 |
return nextElement != null;
|
|
52 | 65 |
} |
53 | 66 |
|
54 | 67 |
@Override |
... | ... | |
56 | 69 |
try { |
57 | 70 |
return nextElement; |
58 | 71 |
} finally { |
59 |
try { |
|
60 |
nextElement = projects.poll(WAIT_END_SECONDS, TimeUnit.SECONDS); |
|
61 |
} catch (final InterruptedException e) { |
|
62 |
throw new RuntimeException(e); |
|
63 |
} |
|
72 |
prepareNextElement(); |
|
64 | 73 |
} |
65 |
|
|
66 | 74 |
} |
67 | 75 |
|
68 | 76 |
@Override |
... | ... | |
70 | 78 |
throw new UnsupportedOperationException(); |
71 | 79 |
} |
72 | 80 |
|
73 |
public Gtr2ProjectsIterator(final String baseUrl, final String fromDate) throws CollectorServiceException { |
|
74 |
prepare(baseUrl, fromDate); |
|
75 |
fillInfo(true); |
|
76 |
} |
|
77 |
|
|
78 |
public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final int startFromPage, final int endAtPage) throws CollectorServiceException { |
|
79 |
prepare(baseUrl, fromDate); |
|
80 |
this.setStartFromPage(startFromPage); |
|
81 |
this.setEndAtPage(endAtPage); |
|
82 |
fillInfo(false); |
|
83 |
} |
|
84 |
|
|
85 |
public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final String startFromPage, final String endAtPage) |
|
86 |
throws CollectorServiceException { |
|
87 |
prepare(baseUrl, fromDate); |
|
88 |
if (StringUtils.isNotBlank(startFromPage)) { |
|
89 |
this.setStartFromPage(Integer.parseInt(startFromPage)); |
|
81 |
private void prepareNextElement() { |
|
82 |
while (this.currPage <= this.endPage && queue.isEmpty()) { |
|
83 |
log.debug("FETCHING PAGE + " + currPage + "/" + endPage); |
|
84 |
this.queue.addAll(fetchPage(currPage++)); |
|
90 | 85 |
} |
91 |
if (StringUtils.isNotBlank(endAtPage)) { |
|
92 |
this.setEndAtPage(Integer.parseInt(endAtPage)); |
|
93 |
} |
|
94 |
fillInfo(false); |
|
86 |
this.nextElement = this.queue.poll(); |
|
95 | 87 |
} |
96 | 88 |
|
97 |
private void prepare(final String baseUrl, final String fromDate) { |
|
98 |
connector = new HttpConnector(); |
|
99 |
queryURL = baseUrl + "/projects"; |
|
100 |
vg = new VTDGen(); |
|
101 |
this.incremental = StringUtils.isNotBlank(fromDate); |
|
102 |
if (incremental) { |
|
103 |
// I expect fromDate in the format 'yyyy-MM-dd'. See class |
|
104 |
// eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode |
|
105 |
this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter); |
|
106 |
log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString()); |
|
107 |
} |
|
108 |
} |
|
89 |
private List<String> fetchPage(final int pageNumber) { |
|
109 | 90 |
|
110 |
private void fillInfo(final boolean all) throws CollectorServiceException {
|
|
91 |
final List<String> res = new ArrayList<>();
|
|
111 | 92 |
try { |
112 |
// log.debug("Getting hit count from: " + queryURL); |
|
113 |
final byte[] bytes = connector.getInputSource(queryURL).getBytes("UTF-8"); |
|
114 |
vg.setDoc(bytes); |
|
115 |
vg.parse(false); |
|
116 |
// vg.parseHttpUrl(queryURL, false); |
|
117 |
initParser(); |
|
118 |
final String hitCount = vn.toNormalizedString(vn.getAttrVal("totalSize")); |
|
119 |
final String totalPages = vn.toNormalizedString(vn.getAttrVal("totalPages")); |
|
120 |
namespaces = "xmlns:ns1=\"" + vn.toNormalizedString(vn.getAttrVal("ns1")) + "\" "; |
|
121 |
namespaces += "xmlns:ns2=\"" + vn.toNormalizedString(vn.getAttrVal("ns2")) + "\" "; |
|
122 |
namespaces += "xmlns:ns3=\"" + vn.toNormalizedString(vn.getAttrVal("ns3")) + "\" "; |
|
123 |
namespaces += "xmlns:ns4=\"" + vn.toNormalizedString(vn.getAttrVal("ns4")) + "\" "; |
|
124 |
namespaces += "xmlns:ns5=\"" + vn.toNormalizedString(vn.getAttrVal("ns5")) + "\" "; |
|
125 |
namespaces += "xmlns:ns6=\"" + vn.toNormalizedString(vn.getAttrVal("ns6")) + "\" "; |
|
126 |
if (all) { |
|
127 |
setEndAtPage(Integer.parseInt(totalPages)); |
|
128 |
total = Integer.parseInt(hitCount); |
|
93 |
final Document doc = Gtr2Helper.loadURL(baseUrl + "/projects?p=" + pageNumber); |
|
94 |
|
|
95 |
if (endPage == Integer.MAX_VALUE) { |
|
96 |
this.endPage = NumberUtils.toInt(doc.valueOf("/*[local-name()='projects']/@*[local-name() = 'totalPages']")); |
|
129 | 97 |
} |
130 |
final Thread ft = new Thread(new FillProjectList()); |
|
131 |
ft.start(); |
|
132 |
log.debug("Expected number of pages: " + (endAtPage - startFromPage + 1)); |
|
133 |
} catch (final NumberFormatException e) { |
|
134 |
log.error("Cannot set the total count or the number of pages"); |
|
135 |
throw new CollectorServiceException(e); |
|
136 |
} catch (final Throwable e) { |
|
137 |
throw new CollectorServiceException(e); |
|
138 |
} |
|
139 |
} |
|
140 | 98 |
|
141 |
private void initParser() { |
|
142 |
vn = vg.getNav(); |
|
143 |
ap = new AutoPilot(vn); |
|
144 |
} |
|
99 |
for (final Object po : doc.selectNodes("//*[local-name() = 'project']")) { |
|
100 |
final Element master = (Element) po; |
|
145 | 101 |
|
146 |
public String getQueryURL() { |
|
147 |
return queryURL; |
|
148 |
} |
|
102 |
final String href = master.valueOf("@*[local-name() = 'href']"); |
|
149 | 103 |
|
150 |
public void setQueryURL(final String queryURL) { |
|
151 |
this.queryURL = queryURL; |
|
152 |
} |
|
104 |
if (filterIncremental(master)) { |
|
105 |
res.add(expandProject(href, master).asXML()); |
|
106 |
} else { |
|
107 |
log.debug("Skipped project: " + href); |
|
108 |
} |
|
153 | 109 |
|
154 |
public int getTotal() { |
|
155 |
return total; |
|
156 |
} |
|
110 |
} |
|
111 |
} catch (final Throwable e) { |
|
112 |
log.error("Exception fetching page " + pageNumber, e); |
|
113 |
throw new CollectorServiceRuntimeException("Exception fetching page " + pageNumber, e); |
|
114 |
} |
|
115 |
return res; |
|
157 | 116 |
|
158 |
public void setTotal(final int total) { |
|
159 |
this.total = total; |
|
160 | 117 |
} |
161 | 118 |
|
162 |
public int getEndAtPage() { |
|
163 |
return endAtPage; |
|
164 |
} |
|
119 |
private Document expandProject(final String href, final Element masterProject) { |
|
120 |
log.debug("Expanding project: " + href); |
|
165 | 121 |
|
166 |
public void setEndAtPage(final int endAtPage) { |
|
167 |
this.endAtPage = endAtPage; |
|
168 |
log.debug("Overriding endAtPage to " + endAtPage); |
|
169 |
} |
|
122 |
final Element newRoot = DocumentHelper.createElement("doc"); |
|
170 | 123 |
|
171 |
public VTDGen getVg() { |
|
172 |
return vg; |
|
173 |
} |
|
124 |
newRoot.add(Gtr2Helper.loadURL(href).getRootElement()); |
|
174 | 125 |
|
175 |
public void setVg(final VTDGen vg) { |
|
176 |
this.vg = vg; |
|
177 |
} |
|
126 |
addFragments(masterProject, "FUND", newRoot, o -> { |
|
127 |
final Element fundNode = o.getRootElement(); |
|
178 | 128 |
|
179 |
public VTDNav getVn() { |
|
180 |
return vn; |
|
181 |
} |
|
129 |
final Node funderLinkNode = fundNode.selectSingleNode("//*[local-name()='link' and @*[local-name()='rel']='FUNDER']"); |
|
182 | 130 |
|
183 |
public void setVn(final VTDNav vn) {
|
|
184 |
this.vn = vn;
|
|
185 |
}
|
|
131 |
if (funderLinkNode != null) {
|
|
132 |
final String funderHref = funderLinkNode.valueOf("@*[local-name() = 'href']");
|
|
133 |
final Element funderNode = Gtr2Helper.loadURL(funderHref).getRootElement();
|
|
186 | 134 |
|
187 |
public AutoPilot getAp() { |
|
188 |
return ap; |
|
189 |
} |
|
135 |
final Element newFunderNode = DocumentHelper.createElement("funder"); |
|
136 |
newFunderNode.addElement("name").setText(funderNode.valueOf("//*[local-name()='name']")); |
|
190 | 137 |
|
191 |
public void setAp(final AutoPilot ap) { |
|
192 |
this.ap = ap; |
|
193 |
} |
|
194 |
|
|
195 |
public String getNamespaces() { |
|
196 |
return namespaces; |
|
197 |
} |
|
198 |
|
|
199 |
public void setNamespaces(final String namespaces) { |
|
200 |
this.namespaces = namespaces; |
|
201 |
} |
|
202 |
|
|
203 |
public int getStartFromPage() { |
|
204 |
return startFromPage; |
|
205 |
} |
|
206 |
|
|
207 |
public void setStartFromPage(final int startFromPage) { |
|
208 |
this.startFromPage = startFromPage; |
|
209 |
log.debug("Overriding startFromPage to " + startFromPage); |
|
210 |
} |
|
211 |
|
|
212 |
private class FillProjectList implements Runnable { |
|
213 |
|
|
214 |
private boolean morePages = true; |
|
215 |
private int pageNumber = startFromPage; |
|
216 |
|
|
217 |
@Override |
|
218 |
public void run() { |
|
219 |
String resultPageUrl = ""; |
|
220 |
try { |
|
221 |
do { |
|
222 |
resultPageUrl = getNextPageUrl(); |
|
223 |
log.debug("Page: " + resultPageUrl); |
|
224 |
// clear VGen before processing the next file |
|
225 |
vg.clear(); |
|
226 |
final byte[] bytes = connector.getInputSource(resultPageUrl).getBytes("UTF-8"); |
|
227 |
vg.setDoc(bytes); |
|
228 |
vg.parse(false); |
|
229 |
// vg.parseHttpUrl(resultPageUrl, false); |
|
230 |
initParser(); |
|
231 |
ap.selectXPath("//project"); |
|
232 |
while (ap.evalXPath() != -1) { |
|
233 |
final String projectHref = vn.toNormalizedString(vn.getAttrVal("href")); |
|
234 |
final ParseProject p = new ParseProject(projectHref); |
|
235 |
p.execute(); |
|
236 |
} |
|
237 |
ap.resetXPath(); |
|
238 |
|
|
239 |
} while (morePages); |
|
240 |
projects.put(TERMINATOR); |
|
241 |
|
|
242 |
} catch (final Throwable e) { |
|
243 |
log.error("Exception processing " + resultPageUrl + "\n" + e.getMessage()); |
|
138 |
return newFunderNode; |
|
139 |
} else { |
|
140 |
return null; |
|
244 | 141 |
} |
245 |
} |
|
142 |
}); |
|
143 |
addFragments(masterProject, "LEAD_ORG", newRoot, o -> asOrgElement("ld-org", o)); |
|
144 |
addFragments(masterProject, "PP_ORG", newRoot, o -> asOrgElement("pp-org", o)); |
|
145 |
addFragments(masterProject, "PI_PER", newRoot, Document::getRootElement); |
|
246 | 146 |
|
247 |
private String getNextPageUrl() { |
|
248 |
final String url = queryURL + "?p=" + pageNumber; |
|
249 |
if (pageNumber == endAtPage) { |
|
250 |
morePages = false; |
|
251 |
} |
|
252 |
pageNumber++; |
|
253 |
return url; |
|
254 |
} |
|
255 |
|
|
147 |
return DocumentHelper.createDocument(newRoot); |
|
256 | 148 |
} |
257 | 149 |
|
258 |
private class ParseProject {
|
|
150 |
private void addFragments(final Element master, final String relType, final Element newRoot, final Function<Document, Element> mapper) {
|
|
259 | 151 |
|
260 |
VTDNav vn1;
|
|
261 |
VTDGen vg1;
|
|
262 |
private final String projectRef;
|
|
152 |
for (final Object o : master.selectNodes(".//*[local-name()='link']")) {
|
|
153 |
final String rel = ((Element) o).valueOf("@*[local-name()='rel']");
|
|
154 |
final String href = ((Element) o).valueOf("@*[local-name()='href']");
|
|
263 | 155 |
|
264 |
public ParseProject(final String projectHref) { |
|
265 |
projectRef = Gtr2Helper.cleanURL(projectHref); |
|
266 |
vg1 = new VTDGen(); |
|
267 |
try { |
|
268 |
final byte[] bytes = connector.getInputSource(projectRef).getBytes("UTF-8"); |
|
269 |
vg1.setDoc(bytes); |
|
270 |
vg1.parse(false); |
|
271 |
vn1 = vg1.getNav(); |
|
272 |
} catch (final Throwable e) { |
|
273 |
log.error("Exception processing " + projectRef + "\n" + e.getMessage()); |
|
274 |
} |
|
275 |
} |
|
276 |
|
|
277 |
private int projectsUpdate(final String attr) throws CollectorServiceException { |
|
278 |
try { |
|
279 |
final int index = vn1.getAttrVal(attr); |
|
280 |
if (index != -1) { |
|
281 |
final String d = vn1.toNormalizedString(index); |
|
282 |
final DateTime recordDate = DateTime.parse(d.substring(0, d.indexOf("T")), simpleDateTimeFormatter); |
|
283 |
// updated or created after the last time it was collected |
|
284 |
if (recordDate.isAfter(fromDate)) { |
|
285 |
log.debug("New project to collect"); |
|
286 |
return index; |
|
156 |
if (relType.equals(rel) && StringUtils.isNotBlank(href)) { |
|
157 |
final String cacheKey = relType + "#" + href; |
|
158 |
if (cache.containsKey(cacheKey)) { |
|
159 |
try { |
|
160 |
log.debug(" * from cache (" + relType + "): " + href); |
|
161 |
newRoot.add(DocumentHelper.parseText(cache.get(cacheKey)).getRootElement()); |
|
162 |
} catch (final DocumentException e) { |
|
163 |
log.error("Error retrieving cache element: " + cacheKey, e); |
|
164 |
throw new CollectorServiceRuntimeException("Error retrieving cache element: " + cacheKey, e); |
|
287 | 165 |
} |
288 |
return -1; |
|
166 |
} else { |
|
167 |
final Document doc = Gtr2Helper.loadURL(href); |
|
168 |
final Element elem = mapper.apply(doc); |
|
169 |
newRoot.add(elem); |
|
170 |
cache.put(cacheKey, elem.asXML()); |
|
289 | 171 |
} |
290 |
return index; |
|
291 |
} catch (final Throwable e) { |
|
292 |
throw new CollectorServiceException(e); |
|
293 |
} |
|
294 |
} |
|
295 | 172 |
|
296 |
private String collectProject() throws CollectorServiceException { |
|
297 |
try { |
|
298 |
final int p = vn1.getAttrVal("href"); |
|
299 |
final String projectHref = vn1.toNormalizedString(p); |
|
300 |
log.debug("Collecting project at " + projectHref); |
|
301 |
final Gtr2Helper gtr2Helper = new Gtr2Helper(); |
|
302 |
return gtr2Helper.processProject(vn1, namespaces, projectHref); |
|
303 |
} catch (final Throwable e) { |
|
304 |
throw new CollectorServiceException(e); |
|
305 | 173 |
} |
306 | 174 |
} |
175 |
} |
|
307 | 176 |
|
308 |
private boolean add(final String attr) throws CollectorServiceException { |
|
309 |
return projectsUpdate(attr) != -1; |
|
310 |
} |
|
177 |
private Element asOrgElement(final String nodeName, final Document doc) { |
|
178 |
final Element newOrg = DocumentHelper.createElement(nodeName); |
|
179 |
newOrg.addElement("name").setText(doc.valueOf("//*[local-name()='name']")); |
|
180 |
newOrg.addElement("country").setText(doc.valueOf("//*[local-name()='country']")); |
|
181 |
newOrg.addElement("id").setText(doc.valueOf("/*[local-name()='organisation']/@*[local-name()='id']")); |
|
182 |
return newOrg; |
|
183 |
} |
|
311 | 184 |
|
312 |
public void execute() {
|
|
313 |
try {
|
|
314 |
if (!incremental || incremental && (add("created") || add("updated"))) {
|
|
315 |
projects.put(collectProject());
|
|
316 |
}
|
|
317 |
} catch (final Throwable e) {
|
|
318 |
log.error("Error on ParseProject " + e.getMessage());
|
|
319 |
throw new CollectorServiceRuntimeException(e);
|
|
320 |
}
|
|
185 |
private boolean filterIncremental(final Element e) {
|
|
186 |
if (!incremental) {
|
|
187 |
return true;
|
|
188 |
} else if (Gtr2Helper.isAfter(e.valueOf("@*[local-name() = 'created']"), fromDate)) {
|
|
189 |
return true;
|
|
190 |
} else if (Gtr2Helper.isAfter(e.valueOf("@*[local-name() = 'updated']"), fromDate)) {
|
|
191 |
return true;
|
|
192 |
} else {
|
|
193 |
return false;
|
|
321 | 194 |
} |
322 |
|
|
323 | 195 |
} |
324 | 196 |
|
325 | 197 |
} |
modules/dnet-collector-plugins/branches/gtr2_michele/src/test/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2ProjectsIteratorTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.projects.gtr2; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
|
|
6 |
import java.util.Iterator; |
|
7 |
|
|
8 |
import org.junit.Ignore; |
|
9 |
import org.junit.Test; |
|
10 |
|
|
11 |
import eu.dnetlib.miscutils.functional.xml.TryIndentXmlString; |
|
12 |
|
|
13 |
public class Gtr2ProjectsIteratorTest { |
|
14 |
|
|
15 |
private static final String baseURL = "https://gtr.ukri.org/gtr/api"; |
|
16 |
|
|
17 |
private Gtr2ProjectsIterator iterator; |
|
18 |
|
|
19 |
@Test |
|
20 |
@Ignore |
|
21 |
public void testOne() throws Exception { |
|
22 |
System.out.println("one project"); |
|
23 |
|
|
24 |
iterator = new Gtr2ProjectsIterator(baseURL, null, null, null); |
|
25 |
|
|
26 |
while (iterator.hasNext()) { |
|
27 |
final String res = iterator.next(); |
|
28 |
assertNotNull(res); |
|
29 |
System.out.println(res); |
|
30 |
return; |
|
31 |
} |
|
32 |
} |
|
33 |
|
|
34 |
@Test |
|
35 |
@Ignore |
|
36 |
public void testPaging() throws Exception { |
|
37 |
iterator = new Gtr2ProjectsIterator(baseURL, null, "2", "2"); |
|
38 |
final TryIndentXmlString indenter = new TryIndentXmlString(); |
|
39 |
|
|
40 |
while (iterator.hasNext()) { |
|
41 |
Thread.sleep(300); |
|
42 |
final String res = iterator.next(); |
|
43 |
assertNotNull(res); |
|
44 |
indenter.evaluate(res); |
|
45 |
System.out.println(res); |
|
46 |
} |
|
47 |
} |
|
48 |
|
|
49 |
@Test |
|
50 |
@Ignore |
|
51 |
public void testOnePage() throws Exception { |
|
52 |
iterator = new Gtr2ProjectsIterator(baseURL, null, "12", "12"); |
|
53 |
final int count = iterateAndCount(iterator); |
|
54 |
assertEquals(21, count); |
|
55 |
} |
|
56 |
|
|
57 |
@Test |
|
58 |
@Ignore |
|
59 |
public void testIncrementalHarvestingNoRecords() throws Exception { |
|
60 |
System.out.println("incremental Harvesting"); |
|
61 |
iterator = new Gtr2ProjectsIterator(baseURL, "2050-12-12", "11", "13"); |
|
62 |
final int count = iterateAndCount(iterator); |
|
63 |
assertEquals(1, count); |
|
64 |
} |
|
65 |
|
|
66 |
@Test |
|
67 |
@Ignore |
|
68 |
public void testIncrementalHarvesting() throws Exception { |
|
69 |
System.out.println("incremental Harvesting"); |
|
70 |
iterator = new Gtr2ProjectsIterator(baseURL, "2016-11-30", "11", "11"); |
|
71 |
final int count = iterateAndCount(iterator); |
|
72 |
assertEquals(21, count); |
|
73 |
} |
|
74 |
|
|
75 |
@Test |
|
76 |
@Ignore |
|
77 |
public void testCompleteHarvesting() throws Exception { |
|
78 |
System.out.println("testing complete harvesting"); |
|
79 |
iterator = new Gtr2ProjectsIterator(baseURL, null, null, null); |
|
80 |
// TryIndentXmlString indenter = new TryIndentXmlString(); |
|
81 |
// it.setEndAtPage(3); |
|
82 |
|
|
83 |
while (iterator.hasNext()) { |
|
84 |
final String res = iterator.next(); |
|
85 |
assertNotNull(res); |
|
86 |
// System.out.println(res); |
|
87 |
// Scanner keyboard = new Scanner(System.in); |
|
88 |
// System.out.println("press enter for next record"); |
|
89 |
// keyboard.nextLine(); |
|
90 |
|
|
91 |
} |
|
92 |
} |
|
93 |
|
|
94 |
private int iterateAndCount(final Iterator<String> iterator) throws Exception { |
|
95 |
int i = 0; |
|
96 |
while (iterator.hasNext()) { |
|
97 |
assertNotNull(iterator.next()); |
|
98 |
i++; |
|
99 |
} |
|
100 |
System.out.println("Got " + i + " projects"); |
|
101 |
return i; |
|
102 |
} |
|
103 |
} |
Also available in: Unified diff
first implementation