Revision 63262
Added by Michele Artini 5 months ago
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2ProjectsIterator.java

package eu.dnetlib.data.collector.plugins.projects.gtr2;

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import eu.dnetlib.enabling.resultset.SizedIterable;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import eu.dnetlib.data.collector.plugins.HttpConnector;

/**
 * Created by alessia on 28/11/16.
 */
public class Gtr2ProjectsIterator implements Iterator<String> {

    public static final String TERMINATOR = "ARNOLD";
    public static final int WAIT_END_SECONDS = 600;
    public static final int PAGE_SZIE = 20;

    private static final Log log = LogFactory.getLog(Gtr2ProjectsIterator.class);

    private String queryURL;
    private int total = -1;
    private int startFromPage = 1;
    private int endAtPage;
    private VTDGen vg;
    private VTDNav vn;
    private AutoPilot ap;
    private String namespaces;
    private boolean incremental = false;
    private DateTime fromDate;
    private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
    private ArrayBlockingQueue<String> projects = new ArrayBlockingQueue<String>(20);
    //private boolean finished = false;
    private final ExecutorService es = Executors.newFixedThreadPool(PAGE_SZIE);
    private String nextElement = "<doc></doc>";
    private HttpConnector connector;

    public boolean hasNext() {
        return !nextElement.equals(TERMINATOR);
    }

    @Override
    public String next() {
        try {
            return nextElement;
        } finally {
            try {
                nextElement = projects.poll(WAIT_END_SECONDS, TimeUnit.SECONDS);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    public Gtr2ProjectsIterator(final String baseUrl, final String fromDate) throws CollectorServiceException {
        prepare(baseUrl, fromDate);
        fillInfo(true);
    }

    public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final int startFromPage, final int endAtPage) throws CollectorServiceException {
        prepare(baseUrl, fromDate);
        this.setStartFromPage(startFromPage);
        this.setEndAtPage(endAtPage);
        fillInfo(false);
    }

    private void prepare(final String baseUrl, final String fromDate) {
        connector = new HttpConnector();
        queryURL = baseUrl + "/projects";
        vg = new VTDGen();
        this.incremental = StringUtils.isNotBlank(fromDate);
        if (incremental) {
            // I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode
            this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
            log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
        }
    }

    private void fillInfo(final boolean all) throws CollectorServiceException {
        try {
            // log.debug("Getting hit count from: " + queryURL);
            byte[] bytes = connector.getInputSource(queryURL).getBytes("UTF-8");
            vg.setDoc(bytes);
            vg.parse(false);
            //vg.parseHttpUrl(queryURL, false);
            initParser();
            String hitCount = vn.toNormalizedString(vn.getAttrVal("totalSize"));
            String totalPages = vn.toNormalizedString(vn.getAttrVal("totalPages"));
            namespaces = "xmlns:ns1=\"" + vn.toNormalizedString(vn.getAttrVal("ns1")) + "\" ";
            namespaces += "xmlns:ns2=\"" + vn.toNormalizedString(vn.getAttrVal("ns2")) + "\" ";
            namespaces += "xmlns:ns3=\"" + vn.toNormalizedString(vn.getAttrVal("ns3")) + "\" ";
            namespaces += "xmlns:ns4=\"" + vn.toNormalizedString(vn.getAttrVal("ns4")) + "\" ";
            namespaces += "xmlns:ns5=\"" + vn.toNormalizedString(vn.getAttrVal("ns5")) + "\" ";
            namespaces += "xmlns:ns6=\"" + vn.toNormalizedString(vn.getAttrVal("ns6")) + "\" ";
            if (all) {
                setEndAtPage(Integer.parseInt(totalPages));
                total = Integer.parseInt(hitCount);
            }
            Thread ft = new Thread(new FillProjectList());
            ft.start();
            log.debug("Expected number of pages: " + (endAtPage - startFromPage + 1));
        } catch (NumberFormatException e) {
            log.error("Cannot set the total count or the number of pages");
            throw new CollectorServiceException(e);
        } catch (Throwable e) {
            throw new CollectorServiceException(e);
        }
    }

    private void initParser() {
        vn = vg.getNav();
        ap = new AutoPilot(vn);
    }

    public String getQueryURL() {
        return queryURL;
    }

    public void setQueryURL(final String queryURL) {
        this.queryURL = queryURL;
    }

    public int getTotal() {
        return total;
    }

    public void setTotal(final int total) {
        this.total = total;
    }

    public int getEndAtPage() {
        return endAtPage;
    }

    public void setEndAtPage(final int endAtPage) {
        this.endAtPage = endAtPage;
        log.debug("Overriding endAtPage to " + endAtPage);
    }

    public VTDGen getVg() {
        return vg;
    }

    public void setVg(final VTDGen vg) {
        this.vg = vg;
    }

    public VTDNav getVn() {
        return vn;
    }

    public void setVn(final VTDNav vn) {
        this.vn = vn;
    }

    public AutoPilot getAp() {
        return ap;
    }

    public void setAp(final AutoPilot ap) {
        this.ap = ap;
    }

    public String getNamespaces() {
        return namespaces;
    }

    public void setNamespaces(final String namespaces) {
        this.namespaces = namespaces;
    }

    public int getStartFromPage() {
        return startFromPage;
    }

    public void setStartFromPage(final int startFromPage) {
        this.startFromPage = startFromPage;
        log.debug("Overriding startFromPage to " + startFromPage);
    }

    private class FillProjectList implements Runnable {

        private boolean morePages = true;
        private int pageNumber = startFromPage;

        @Override
        public void run() {
            String resultPageUrl = "";
            try {
                do {
                    resultPageUrl = getNextPageUrl();
                    log.debug("Page: " + resultPageUrl);
                    // clear VGen before processing the next file
                    vg.clear();
                    byte[] bytes = connector.getInputSource(resultPageUrl).getBytes("UTF-8");
                    vg.setDoc(bytes);
                    vg.parse(false);
                    //vg.parseHttpUrl(resultPageUrl, false);
                    initParser();
                    ap.selectXPath("//project");
                    int res;

                    while ((res = ap.evalXPath()) != -1) {
                        final String projectHref = vn.toNormalizedString(vn.getAttrVal("href"));
                        Thread t = new Thread(new ParseProject(projectHref));
                        t.setName("Thread for " + res);
                        es.execute(t);
                    }
                    ap.resetXPath();

                } while (morePages);
                es.shutdown();
                es.awaitTermination(WAIT_END_SECONDS, TimeUnit.SECONDS);
                projects.put(TERMINATOR);

            } catch (Throwable e) {
                log.error("Exception processing " + resultPageUrl + "\n" + e.getMessage());
            }
        }

        private String getNextPageUrl() {
            String url = queryURL + "?p=" + pageNumber;
            if (pageNumber == endAtPage) {
                morePages = false;
            }
            pageNumber++;
            return url;
        }

    }

    private class ParseProject implements Runnable {

        VTDNav vn1;
        VTDGen vg1;
        private String projectRef;

        public ParseProject(String projectHref) {
            if (projectHref.contains("gtr.gtr")) {
                projectHref = projectHref.replace("gtr.gtr", "gtr");
            }
            projectRef = projectHref;
            log.debug("strat " + projectRef);
            vg1 = new VTDGen();
            try {
                byte[] bytes = connector.getInputSource(projectRef).getBytes("UTF-8");
                vg1.setDoc(bytes);
                vg1.parse(false);
                //vg1.parseHttpUrl(projectRef, false);
                vn1 = vg1.getNav();
            } catch (Throwable e) {
                log.error("Exception processing " + projectRef + "\n" + e.getMessage());
            }
            log.debug("end " + projectRef);
        }

        private int projectsUpdate(String attr) throws CollectorServiceException {
            try {
                int index = vn1.getAttrVal(attr);
                if (index != -1) {
                    String d = vn1.toNormalizedString(index);
                    DateTime recordDate = DateTime.parse(d.substring(0, d.indexOf("T")), simpleDateTimeFormatter);
                    // updated or created after the last time it was collected
                    if (recordDate.isAfter(fromDate)) {
                        log.debug("New project to collect");
                        return index;
                    }
                    return -1;
                }
                return index;
            } catch (Throwable e) {
                throw new CollectorServiceException(e);
            }
        }

        private String collectProject() throws CollectorServiceException {
            try {
                int p = vn1.getAttrVal("href");
                final String projectHref = vn1.toNormalizedString(p);
                log.debug("collecting project at " + projectHref);

                Gtr2Helper gtr2Helper = new Gtr2Helper();
                String projectPackage = gtr2Helper.processProject(vn1, namespaces);

                return projectPackage;
            } catch (Throwable e) {
                throw new CollectorServiceException(e);
            }
        }

        private boolean add(String attr) throws CollectorServiceException {
            return projectsUpdate(attr) != -1;
        }

        @Override
        public void run() {
            log.debug("Getting project info from " + projectRef);
            try {
                if (!incremental || (incremental && (add("created") || add("updated")))) {
                    projects.put(collectProject());
                    log.debug("Project enqueued " + projectRef);
                }
            } catch (Throwable e) {
                log.error("Error on ParseProject " + e.getMessage());
                throw new CollectorServiceRuntimeException(e);
            }
        }

    }

}
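The iterator above decouples page fetching from consumption: a background FillProjectList thread walks the paginated /projects endpoint (?p=N), ParseProject workers fetch and transform each project record, results land in a bounded ArrayBlockingQueue, and the TERMINATOR sentinel ("ARNOLD") marks the end. A minimal consumption sketch, assuming it runs in (or imports from) the same package; the base URL and class name of the driver are illustrative, not part of this revision:

// Hypothetical consumption sketch for Gtr2ProjectsIterator; the base URL is an example value.
public class Gtr2IteratorUsageSketch {
    public static void main(String[] args) throws Exception {
        // pass null fromDate for a full harvest, or a 'yyyy-MM-dd' date for an incremental one
        Gtr2ProjectsIterator it = new Gtr2ProjectsIterator("http://gtr.rcuk.ac.uk/gtr/api", null);
        while (it.hasNext()) {                       // false once the TERMINATOR sentinel is reached
            String projectRecord = it.next();        // blocks up to WAIT_END_SECONDS on the internal queue
            System.out.println(projectRecord.substring(0, Math.min(80, projectRecord.length())));
        }
    }
}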
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/projects/grist/GristCollectorPlugin.java

package eu.dnetlib.data.collector.plugins.projects.grist;

import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;

/**
 * Plugin to collect metadata record about projects and fundings via the europePMC GRIST API (e.g. WT projects).
 * <p>
 * Documentation on GRIST API: http://europepmc.org/GristAPI.
 * </p>
 * <p>
 * BaseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:"Wellcome Trust"&resultType=core
 * where resultType=core asks for the complete information (including abstracts).
 * The results returned by the API are XMLs.
 * </p>
 * <p>
 * Pagination: use parameter 'page'. When the response contains empty 'RecordList', it means we reached the end.
 * </p>
 *
 * @author alessia
 */
public class GristCollectorPlugin extends AbstractCollectorPlugin {

    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        //baseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core
        return new GristProjectsIterable(interfaceDescriptor.getBaseUrl());
    }

}
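The javadoc describes the stop condition for GRIST paging: keep requesting pages until the response contains an empty RecordList. The actual logic lives in GristProjectsIterable (not shown in this revision); the following is only an illustrative sketch of that rule, assuming the page parameter is appended to the base URL and using commons-io for the HTTP fetch:

// Illustrative sketch of the GRIST pagination rule; not the GristProjectsIterable implementation.
import java.net.URL;
import org.apache.commons.io.IOUtils;

public class GristPaginationSketch {
    public static void main(String[] args) throws Exception {
        // example base URL taken from the javadoc above
        String baseUrl = "http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core";
        for (int page = 1; ; page++) {
            String xml = IOUtils.toString(new URL(baseUrl + "&page=" + page), "UTF-8");
            // an empty RecordList (self-closing or empty container) signals the last page
            if (xml.contains("<RecordList/>") || xml.matches("(?s).*<RecordList>\\s*</RecordList>.*")) {
                break;
            }
            System.out.println("Fetched page " + page + " (" + xml.length() + " chars)");
        }
    }
}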
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>eu.dnetlib</groupId>
        <artifactId>dnet45-parent</artifactId>
        <version>1.0.0</version>
    </parent>
    <groupId>eu.dnetlib</groupId>
    <artifactId>dnet-collector-plugins</artifactId>
    <version>1.7.8</version>
    <scm>
        <developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8</developerConnection>
    </scm>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>dnet-modular-collector-service-rmi</artifactId>
            <version>[1.3.0,2.0.0)</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>dnet-modular-collector-service</artifactId>
            <version>[3.3.26,4.0.0)</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>${google.gson.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>${commons.io.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5</version>
        </dependency>
        <dependency>
            <groupId>eu.dnetlib</groupId>
            <artifactId>cnr-resultset-service</artifactId>
            <version>[2.0.0, 3.0.0)</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.ximpleware</groupId>
            <artifactId>vtd-xml</artifactId>
            <version>[2.12, 3.0.0)</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.9.2</version>
        </dependency>

        <dependency>
            <groupId>org.json</groupId>
            <artifactId>json</artifactId>
            <version>20180813</version>
            <type>jar</type>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.16</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.mockito</groupId>
            <artifactId>mockito-core</artifactId>
            <version>3.3.3</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestCollectorPlugin.java

/**
 *
 */
package eu.dnetlib.data.collector.plugins.rest;

import com.google.gson.Gson;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.lang3.StringUtils;
import org.json.JSONObject;

import java.util.Map;

/**
 * @author js, Andreas Czerniak
 * @date 2020-04-09
 *
 */
public class RestCollectorPlugin extends AbstractCollectorPlugin {

    @Override
    public Iterable<String> collect(InterfaceDescriptor ifDescriptor, String arg1, String arg2)
            throws CollectorServiceException {
        final String baseUrl = ifDescriptor.getBaseUrl();
        final String resumptionType = ifDescriptor.getParams().get("resumptionType");
        final String resumptionParam = ifDescriptor.getParams().get("resumptionParam");
        final String resumptionXpath = ifDescriptor.getParams().get("resumptionXpath");
        final String resultTotalXpath = ifDescriptor.getParams().get("resultTotalXpath");
        final String resultFormatParam = ifDescriptor.getParams().get("resultFormatParam");
        final String resultFormatValue = ifDescriptor.getParams().get("resultFormatValue");
        final String resultSizeParam = ifDescriptor.getParams().get("resultSizeParam");
        final String resultSizeValue = (StringUtils.isBlank(ifDescriptor.getParams().get("resultSizeValue"))) ? "100" : ifDescriptor.getParams().get("resultSizeValue");
        final String queryParams = ifDescriptor.getParams().get("queryParams");
        final String entityXpath = ifDescriptor.getParams().get("entityXpath");
        final String authMethod = ifDescriptor.getParams().get("authMethod");
        final String authToken = ifDescriptor.getParams().get("authToken");
        final String requestHeaderMap = ifDescriptor.getParams().get("requestHeaderMap");
        Gson gson = new Gson();
        Map<String, String> requestHeaders = gson.fromJson(requestHeaderMap, Map.class);

        if (StringUtils.isBlank(baseUrl)) {throw new CollectorServiceException("Param 'baseUrl' is null or empty");}
        if (StringUtils.isBlank(resumptionType)) {throw new CollectorServiceException("Param 'resumptionType' is null or empty");}
        if (StringUtils.isBlank(resumptionParam)) {throw new CollectorServiceException("Param 'resumptionParam' is null or empty");}
        // if (StringUtils.isBlank(resumptionXpath)) {throw new CollectorServiceException("Param 'resumptionXpath' is null or empty");}
        // if (StringUtils.isBlank(resultTotalXpath)) {throw new CollectorServiceException("Param 'resultTotalXpath' is null or empty");}
        // resultFormatParam can be emtpy because some Rest-APIs doesn't like this argument in the query
        //if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null, empty or whitespace");}
        if (StringUtils.isBlank(resultFormatValue)) {throw new CollectorServiceException("Param 'resultFormatValue' is null or empty");}
        // if (StringUtils.isBlank(resultSizeParam)) {throw new CollectorServiceException("Param 'resultSizeParam' is null or empty");}
        // prevent resumptionType: discover -- if (Integer.valueOf(resultSizeValue) <= 1) {throw new CollectorServiceException("Param 'resultSizeValue' is less than 2");}

        // queryParams could be empty like for DRIS+ API from euroCRIS
        //if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
        if (StringUtils.isBlank(entityXpath)) {throw new CollectorServiceException("Param 'entityXpath' is null or empty");}

        String resFormat = ifDescriptor.getParams().get("resultOutputFormat");
        final String resultOutputFormat = StringUtils.isNotBlank(resFormat) ? resFormat.toLowerCase() : resultFormatValue.toLowerCase();

        return () -> new RestIterator(
                baseUrl,
                resumptionType,
                resumptionParam,
                resumptionXpath,
                resultTotalXpath,
                resultFormatParam,
                resultFormatValue,
                resultSizeParam,
                resultSizeValue,
                queryParams,
                entityXpath,
                authMethod,
                authToken,
                resultOutputFormat, requestHeaders);
    }

}
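Most interface parameters above are passed through to RestIterator as plain strings; the one non-trivial conversion is 'requestHeaderMap', a JSON object deserialized into a header map with Gson. A minimal sketch of that conversion, assuming illustrative header names and values (not values mandated by the plugin):

// Hypothetical 'requestHeaderMap' value parsed the same way as in the plugin above.
import java.util.Map;
import com.google.gson.Gson;

public class RequestHeaderMapSketch {
    public static void main(String[] args) {
        String requestHeaderMap = "{\"Accept\":\"application/xml\",\"User-Agent\":\"dnet-collector\"}";
        Map<String, String> requestHeaders = new Gson().fromJson(requestHeaderMap, Map.class);
        requestHeaders.forEach((k, v) -> System.out.println(k + ": " + v));
    }
}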
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/utils/JsonUtils.java

package eu.dnetlib.data.collector.plugins.utils;

import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class JsonUtils {

    private static final Log log = LogFactory.getLog(JsonUtils.class);

    public static final String wrapName = "recordWrap";

    /**
     * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
     * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
     * and work-around for the JSON to XML converting of org.json.XML-package.
     *
     * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
     *
     * @param jsonInput
     * @return convertedJsonKeynameOutput
     */
    public String syntaxConvertJsonKeyNames(String jsonInput) {

        log.trace("before convertJsonKeyNames: " + jsonInput);
        // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
        // replace ' 's in JSON Namens with '_'
        while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
        }

        // replace forward-slash (sign '/' ) in JSON Names with '_'
        while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
        }

        // replace '(' in JSON Names with ''
        while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
        }

        // replace ')' in JSON Names with ''
        while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
        }

        // add prefix of startNumbers in JSON Keynames with 'n_'
        while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
        }
        // add prefix of only numbers in JSON Keynames with 'm_'
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
        }

        // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
        while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
        }

        // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
        // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
        // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
        // }

        // replace '=' in JSON Keynames with '-'
        while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
        }

        // replace '@' in JSON Keynames with 'oat_'
        while (jsonInput.matches(".*\"@([^\"]*)\":.*")) {
            jsonInput = jsonInput.replaceAll("\"@([^\"]*)\":", "\"oat_$1\":");
        }
        log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
        return jsonInput;
    }

    /**
     *
     * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
     * *
     * @param bufferStr - XML string
     * @return
     */
    public String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {

        while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
            bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
        }

        // replace [#x10-#x1f] with ''
        // while (bufferStr.matches(".*[0-9a-f].*")) {
        // bufferStr = bufferStr.replaceAll("([0-9a-fA-F])", "");
        // }

        return bufferStr;
    }

    public String convertToXML(final String jsonRecord) {
        String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";

        log.trace("before convertToXML: " + jsonRecord);
        org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
        resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
        log.trace("before inputStream: " + resultXml);
        resultXml = XmlCleaner.cleanAllEntities(resultXml);
        log.trace("after cleaning and end of convertToXML: " + resultXml);
        return resultXml;
    }
}
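The javadoc above describes how JSON key names are rewritten into valid XML tag names before org.json.XML wraps the record under a single recordWrap root. A short usage sketch of that behavior; the sample record and expected key names are illustrative:

// Usage sketch for JsonUtils (same module); the sample JSON is made up.
public class JsonUtilsSketch {
    public static void main(String[] args) {
        String json = "{\"grant id\":\"123\",\"funder/name\":\"WT\",\"2020\":\"value\",\"@type\":\"award\"}";
        JsonUtils utils = new JsonUtils();
        // keys become: grant_id, funder_name, n_2020, oat_type
        System.out.println(utils.syntaxConvertJsonKeyNames(json));
        // produces <?xml version="1.0" encoding="UTF-8"?><recordWrap>...</recordWrap>
        System.out.println(utils.convertToXML(json));
    }
}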
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/java/eu/dnetlib/data/collector/plugins/researchfi/ResearchFiCollectorPluginTest.java

package eu.dnetlib.data.collector.plugins.researchfi;

import java.util.HashSet;
import java.util.Set;

import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;

public class ResearchFiCollectorPluginTest {

    private final ResearchFiCollectorPlugin plugin = new ResearchFiCollectorPlugin();

    @Before
    public void setUp() throws Exception {}

    @Test
    @Ignore
    public final void testCollect() throws CollectorServiceException, DocumentException {
        final InterfaceDescriptor iface = new InterfaceDescriptor();
        iface.setBaseUrl("https://research.fi/api/rest/v1/funding-decisions?FunderName=AKA&FundingStartYearFrom=2022");
        iface.setProtocol("research_fi");
        iface.getParams().put("auth_url", "https://researchfi-auth.2.rahtiapp.fi/realms/publicapi/protocol/openid-connect/token");
        iface.getParams().put("auth_client_id", "");
        iface.getParams().put("auth_client_secret", "");

        int count = 0;
        final Set<String> ids = new HashSet<>();

        for (final String s : plugin.collect(iface, null, null)) {

            if (count == 0) {
                System.out.println("First: " + s);
            }
            count++;

            final String id = DocumentHelper.parseText(s).valueOf("/recordWrap/funderProjectNumber");
            if (ids.contains(id)) {
                System.out.println("Id already present: " + id);
            }
            ids.add(id);
        }

        System.out.println("Total records: " + count);
        System.out.println("Total identifiers: " + ids.size());
    }

}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/doiresolver/DOIResolverIterator.java

package eu.dnetlib.data.collector.plugins.doiresolver;

import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;

public class DOIResolverIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(DOIResolverIterator.class);

    protected static final String STARTER = "FIRE";
    protected static final String TERMINATOR = "ARNOLD";
    protected static final String BAD_TERMINATOR = "BAD";
    protected static final String UNRESOLVED = "UNRESOLVED";
    protected static long TIMEOUT = 5;
    protected static TimeUnit TIMEOUT_UNIT = TimeUnit.SECONDS;

    /** Path to the dir that contains the files, each a csv with a list of DOIs, one per line. **/
    private String baseDir;
    private String fromDate;

    private ArrayBlockingQueue<String> queue;

    private CrossrefResolver crossrefResolver;

    public DOIResolverIterator(final String baseDir, final CrossrefResolver crossrefResolver, final String fromDate) {
        this.baseDir = baseDir;
        this.fromDate = fromDate;
        this.queue = new ArrayBlockingQueue<>(100);
        this.crossrefResolver = crossrefResolver;
        init();
    }

    private void init() {
        log.info("Init");

        new Thread(() -> {
            try {
                final FileSystemIterator fsi = new FileSystemIterator(baseDir, "csv", fromDate);
                // put first item in the queue
                if (queue.offer(STARTER)) {
                    // read the file, ask the resolvers, put results in a shared queue
                    //whatever exceptions, add terminator to the queue
                    while (fsi.hasNext()) {
                        String filePath = fsi.next();
                        try (Stream<String> stream = Files.lines(Paths.get(filePath))) {

                            stream.forEach(doi -> {
                                try {
                                    String resolved = resolve(doi);
                                    if (!resolved.equals(UNRESOLVED)) queue.offer(resolved, TIMEOUT, TIMEOUT_UNIT);
                                } catch (InterruptedException e) {
                                    log.error("DOI processing aborted, cannot offer resolved doi: " + doi + " . Did the consumer die?");
                                    log.error(e);
                                    queue.offer(BAD_TERMINATOR);
                                }
                            });

                        } catch (IOException e) {
                            log.error("DOI processing aborted");
                            log.error(e);
                            queue.offer(BAD_TERMINATOR);
                        }
                    }
                }
            } catch (Exception e) {
                log.error("DOI processing aborted");
                log.error(e);
                queue.offer(BAD_TERMINATOR);
            }
            queue.offer(TERMINATOR);
            log.info("Finished processing DOI list");
        }
        ).start();
    }

    private String resolve(final String doi) {
        log.debug("Resolving " + doi);
        log.debug("Crossref...");
        String record = crossrefResolver.resolve(cleanDOI(doi));
        if (StringUtils.isNotBlank(record)) return record;
        else {
            //try another resolver
            log.debug("Resolver returned blank item");
        }
        return UNRESOLVED;
    }

    /**
     * Returns the identifier part of the DOI only.
     * @param doi
     * @return the DOI
     */
    protected String cleanDOI(final String doi) {
        return doi.replace("http://dx.doi.org/", "").replace("https://dx.doi.org/", "")
                .replace("https://doi.org/", "").replace("http://doi.org/", "");
    }

    @Override
    public boolean hasNext() {
        return doHasNext();
    }

    private boolean doHasNext() {
        //If I get a null value, the queue is currently empty. so we wait for something
        String element = queue.peek();
        while (element == null) {
            try {
                log.debug("Sleeping while waiting for something in the queue");
                Thread.sleep(1000);
                element = queue.peek();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        log.debug("Found in queue element: " + element);
        switch (element) {
        case TERMINATOR:
        case BAD_TERMINATOR:
            return false;
        case STARTER:
        case UNRESOLVED: //although they should not be inserted at all in the queue
            queue.poll();
            return doHasNext();
        default:
            return true;
        }
    }

    @Override
    public String next() {
        return queue.poll();
    }

    public String getBaseDir() {
        return baseDir;
    }

    public void setBaseDir(String baseDir) {
        this.baseDir = baseDir;
    }

    public CrossrefResolver getCrossrefResolver() {
        return crossrefResolver;
    }

    public void setCrossrefResolver(CrossrefResolver crossrefResolver) {
        this.crossrefResolver = crossrefResolver;
    }
}
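Before a DOI read from the CSV files is handed to the Crossref resolver it is reduced to its bare identifier by cleanDOI(). A small standalone sketch of that normalization, reproducing the same replace chain; the sample DOI is made up:

// Illustrative check of the prefix stripping performed by cleanDOI() above; sample DOI is hypothetical.
public class CleanDoiSketch {
    static String cleanDOI(String doi) {
        return doi.replace("http://dx.doi.org/", "").replace("https://dx.doi.org/", "")
                .replace("https://doi.org/", "").replace("http://doi.org/", "");
    }

    public static void main(String[] args) {
        String[] variants = {
                "https://doi.org/10.1234/abcd",
                "http://dx.doi.org/10.1234/abcd",
                "10.1234/abcd"
        };
        for (String doi : variants) {
            System.out.println(cleanDOI(doi)); // every variant prints the bare identifier 10.1234/abcd
        }
    }
}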
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/datacite/DataciteESIterator.java

package eu.dnetlib.data.collector.plugins.datacite;


import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.Objects;
import java.util.Queue;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.data.collector.plugins.datacite.schema.DataciteSchema;
import eu.dnetlib.data.collector.plugins.datacite.schema.Result;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class DataciteESIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(DataciteESIterator.class);

    private final long timestamp;

    private String scrollId;

    private Queue<String> currentPage;

    private final Gson g = new GsonBuilder().create();

    private String baseURL;

    private static final String START_PATH = "new_scan";
    private static final String NEXT_PATH = "scan/%s";

    public DataciteESIterator(long timestamp, String baseUrl) throws Exception {
        this.timestamp = timestamp;
        this.baseURL = baseUrl;
        currentPage = new ArrayDeque<>();
        startRequest();
    }

    protected static String decompression(final Result r) {
        return decompression(r.getBody().getBytes());
    }

    protected static String decompression(final byte[] bodyBytes) {
        try {
            byte[] byteArray = Base64.decodeBase64(bodyBytes);
            Inflater decompresser = new Inflater();
            decompresser.setInput(byteArray);
            ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
            byte[] buffer = new byte[8192];
            while (!decompresser.finished()) {
                int size = decompresser.inflate(buffer);
                bos.write(buffer, 0, size);
            }
            byte[] unzippeddata = bos.toByteArray();
            decompresser.end();

            return new String(unzippeddata);
        } catch (DataFormatException e) {
            log.warn("Exception when decompressing: " + e.getMessage());
            return null;
        }
    }

    private void fillQueue(final String hits) {
        if (StringUtils.isBlank(hits) || "[]".equalsIgnoreCase(hits.trim()))
            return;
        try {
            DataciteSchema datacitepage = g.fromJson(hits, DataciteSchema.class);
            this.scrollId = datacitepage.getScrollId();
            datacitepage.getResult().stream().map(DataciteESIterator::decompression).filter(Objects::nonNull).forEach(this.currentPage::add);
        } catch (Throwable e) {
            System.out.println(hits);
            e.printStackTrace();
        }
    }

    private void startRequest() throws Exception {
        String url = baseURL + "/" + START_PATH;
        final URL startUrl = new URL(timestamp > 0 ? url + "?timestamp=" + timestamp : url);
        fillQueue(IOUtils.toString(startUrl.openStream()));
    }

    private void getNextPage() throws IOException {
        String url = baseURL + "/" + NEXT_PATH;
        final URL startUrl = new URL(String.format(url, scrollId));
        fillQueue(IOUtils.toString(startUrl.openStream()));
    }

    @Override
    public boolean hasNext() {
        return currentPage.size() > 0;
    }

    @Override
    public String next() {

        if (currentPage.size() == 0) {
            return null;
        }

        String nextItem = currentPage.remove();
        if (currentPage.size() == 0) {
            try {
                getNextPage();
            } catch (Throwable e) {
                throw new RuntimeException(e);
            }
        }

        return XmlCleaner.cleanAllEntities(nextItem);
    }

    public String getBaseURL() {
        return baseURL;
    }

    public void setBaseURL(final String baseURL) {
        this.baseURL = baseURL;
    }
}
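decompression() above implies the record bodies arrive as zlib-deflated, Base64-encoded text. A round-trip sketch of that format, under the assumption that the producer uses plain Deflater + Base64 encoding and that the sketch lives in the same package (the method is protected); the sample record is made up:

// Hypothetical round-trip for the body format expected by DataciteESIterator.decompression().
import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;
import org.apache.commons.codec.binary.Base64;

public class DataciteBodySketch {
    public static void main(String[] args) {
        String record = "<resource><identifier identifierType=\"DOI\">10.1234/xyz</identifier></resource>";

        // compress and Base64-encode, i.e. the inverse of decompression()
        Deflater deflater = new Deflater();
        deflater.setInput(record.getBytes());
        deflater.finish();
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        while (!deflater.finished()) {
            bos.write(buffer, 0, deflater.deflate(buffer));
        }
        byte[] body = Base64.encodeBase64(bos.toByteArray());

        System.out.println(DataciteESIterator.decompression(body)); // prints the original record
    }
}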
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/doiresolver/DOIResolver.java

package eu.dnetlib.data.collector.plugins.doiresolver;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;

public interface DOIResolver {

    String resolve(String doi);

    void setBaseURL(String baseURL);
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/sitemap.xml

<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <sitemap>
        <loc>file:target/test-classes/eu/dnetlib/data/collector/plugins/schemaorg/sitemap_file.xml</loc>
    </sitemap>
</sitemapindex>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/resources/eu/dnetlib/data/collector/plugins/datasets/pangaea-eu-projects_Openaire.csv

4148;FondTara;Fondation Tara Expeditions;tara________::1
60;QUEEN;Quaternary Environment of the Eurasian North;corda_______::304178
4106;EPOCA;European Project on Ocean Acidification;corda_______::211384
4119;HERMIONE;Hotspot Ecosystem Research and Mans Impact On European Seas;corda_______::226354
4122;HYPOX;In situ monitoring of oxygen depletion in hypoxic ecosystems of coastal and open seas and land-locked water bodies;corda_______::226213
4127;CoralFISH;Ecosystem based management of corals, fish and fisheries in the deep waters of Europe and beyond;corda_______::213144
4129;ice2sea;ice2sea;corda_______::226375
4138;ECO2;Sub-seabed CO2 Storage: Impact on Marine Ecosystems;corda_______::265847
4142;MedSeA;Mediterranean Sea Acidification in a Changing Climate;corda_______::265103
4145;DARCLIFE;Deep subsurface Archaea: carbon cycle, life strategies, and role in sedimentary ecosystems;corda_______::247153
4147;EURO-BASIN;Basin Scale Analysis, Synthesis and Integration;corda_______::264933
4154;Past4Future;Climate Change: Learning from the past climate;corda_______::243908
4172;CARBOCHANGE;Changes in the carbon uptake and emissions by oceans in a changing climate;corda_______::264879
4175;ERA-CLIM;European Reanalysis of Global Climate Observations;corda_______::265229
4181;PAGE21;Changing Permafrost in the Arctic and its Global Effects in the 21st Century;corda_______::282700
4182;MicroB3;MicroB3 - Microbial Biodiversity, Bioinformatics and Biotechnology;corda_______::308299
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/datacite/schema/Result.java


package eu.dnetlib.data.collector.plugins.datacite.schema;

import com.google.gson.annotations.Expose;
import com.google.gson.annotations.SerializedName;

public class Result {

    @SerializedName("body")
    @Expose
    private String body;
    @SerializedName("id")
    @Expose
    private String id;
    @SerializedName("originalId")
    @Expose
    private String originalId;
    @SerializedName("timestamp")
    @Expose
    private Integer timestamp;

    public String getBody() {
        return body;
    }

    public void setBody(String body) {
        this.body = body;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getOriginalId() {
        return originalId;
    }

    public void setOriginalId(String originalId) {
        this.originalId = originalId;
    }

    public Integer getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(Integer timestamp) {
        this.timestamp = timestamp;
    }

}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/resources/eu.dnetlib.data.collector.plugins.projects.gtr2/projects.xml | ||
---|---|---|
1 |
<ns2:projects xmlns:ns1="http://gtr.rcuk.ac.uk/gtr/api" |
|
2 |
xmlns:ns2="http://gtr.rcuk.ac.uk/gtr/api/project" |
|
3 |
xmlns:ns3="http://gtr.rcuk.ac.uk/gtr/api/project/outcome" |
|
4 |
xmlns:ns4="http://gtr.rcuk.ac.uk/gtr/api/organisation" |
|
5 |
xmlns:ns5="http://gtr.rcuk.ac.uk/gtr/api/person" xmlns:ns6="http://gtr.rcuk.ac.uk/gtr/api/fund" |
|
6 |
ns1:page="1" ns1:size="20" ns1:totalPages="3417" ns1:totalSize="68323"> |
|
7 |
<ns2:project ns1:id="E178742B-571B-498F-8402-122F17C47546" |
|
8 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/projects/E178742B-571B-498F-8402-122F17C47546" |
|
9 |
ns1:created="2016-11-11T20:42:55Z"> |
|
10 |
<ns1:links test="ciao"> |
|
11 |
<ns1:link |
|
12 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/CB8C3733-D17E-46A8-9E7C-D5A76F36612A" |
|
13 |
ns1:rel="PI_PER"/> |
|
14 |
<ns1:link |
|
15 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/95D46800-A4DF-40AC-8FD3-7EAF5194B22C" |
|
16 |
ns1:rel="COI_PER"/> |
|
17 |
<ns1:link |
|
18 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/8319F78A-DCBD-49F6-BE00-78E1CD75CDA9" |
|
19 |
ns1:rel="LEAD_ORG"/> |
|
20 |
<ns1:link |
|
21 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/funds/B794CB74-BD85-452B-8030-69BD4AF82CE9" |
|
22 |
ns1:rel="FUND" ns1:start="2007-06-01T00:00:00+01:00" |
|
23 |
ns1:end="2010-05-31T00:00:00+01:00"/> |
|
24 |
<ns1:link |
|
25 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/2A19F274-8BAF-4AFC-865C-638BFB5FBDB9" |
|
26 |
ns1:rel="PUBLICATION"/> |
|
27 |
</ns1:links> |
|
28 |
<ns2:identifiers> |
|
29 |
<ns2:identifier ns2:type="RCUK">BB/E021409/1</ns2:identifier> |
|
30 |
</ns2:identifiers> |
|
31 |
<ns2:title> A multicellular 3D stem cell model to define the role of stroma in epithelial |
|
32 |
differentiation </ns2:title> |
|
33 |
<ns2:status>Closed</ns2:status> |
|
34 |
<ns2:grantCategory>Research Grant</ns2:grantCategory> |
|
35 |
<ns2:leadOrganisationDepartment>Biology</ns2:leadOrganisationDepartment> |
|
36 |
<ns2:abstractText> In aging men the disorders of prostate are a major medical problem. |
|
37 |
Benign prostatic hyperplasia and cancer are increasingly prevalent. To find cures for |
|
38 |
these diseases it is essential to understand how the prostate grows and functions |
|
39 |
normally. All organs have their own population of stem cells which grow and develop into |
|
40 |
a variety of cells which communicate to form correct organ architecture and function. |
|
41 |
This occurs as a result of signals from the stem cell's own genes but also from signals |
|
42 |
provided by neighbouring cells, known as stroma. In the prostate, how this occurs is |
|
43 |
unknown. We propose to develop a model to grow gland-like structures from adult stem |
|
44 |
cells in the laboratory. The model will be employed to understand how stromal cells |
|
45 |
influence prostate cellular architecture. We aim to identify proteins which act as |
|
46 |
signals from the stroma to change epithelial shape. The shape of a cell has important |
|
47 |
effects on cell function. These experiments will increase our knowledge of how tissues |
|
48 |
develop and function. Development of tissue-like models based on human cells will |
|
49 |
provide a valuable gap between results from animal models and human clinical studies, to |
|
50 |
help understand the basic mechanisms of human physiology and disease. Such model systems |
|
51 |
will reduce the need for animal experimentation, which is currently the best way to |
|
52 |
investigate complex cell interactions in tissues. We anticipate the model will aid |
|
53 |
university directed research into human differentiation and disease mechanisms, but also |
|
54 |
for the pharmaceutical industry to screen new drugs for efficacy and safety in humans |
|
55 |
before trial. </ns2:abstractText> |
|
56 |
<ns2:techAbstractText> Recent advances in our lab have resulted in the isolation of human |
|
57 |
adult prostate stem cells and the development of 3D models of prostatic acini from basal |
|
58 |
cells. Results from 3D modelling indicate that stroma is important for epithelial |
|
59 |
morphogenesis and differentiation. Importantly, stromal cultures increase epithelial |
|
60 |
cell polarity and columnar cell shape. Using electron microscopy and RT-PCR our |
|
61 |
preliminary data has found that these morphological effects are accompanied by increased |
|
62 |
desmosomal expression. We now wish to develop our tissue engineering to produce a 3D |
|
63 |
model of prostatic acini using a homogeneous population of stem cells. A stem cell model |
|
64 |
will allow the study of full epithelial differentiation and the stem cell niche. It is |
|
65 |
important to model the prostate with human cells because the mouse prostate has a |
|
66 |
different anatomy, cell structure and protein function, and does not develop equivalent |
|
67 |
diseases to humans. The model will be used to investigate our hypothesis that 'stroma |
|
68 |
signals to control epithelial cell shape and polarity'. We will confirm which desmosomal |
|
69 |
isoforms are present in prostate epithelial acini and which are upregulated by stromal |
|
70 |
cultures, using Western Blotting and real time PCR. Upregulated desmosomal isoforms will |
|
71 |
be used as markers for epithelial cell polarity and shape. A differential gene |
|
72 |
expression profile will be generated from stroma grown with epithelial acini in 3D |
|
73 |
culture and stroma grown in 3D culture without acini, using microarray analysis. |
|
74 |
Candidate stromal genes will be identified that signal to upregulate epithelial polarity |
|
75 |
(desmosomal expression) and their function will be confirmed using siRNA knockdown |
|
76 |
studies. This is a novel pathway for epithelial cell differentiation which has not been |
|
77 |
studied before. </ns2:techAbstractText> |
|
78 |
<ns2:healthCategories/> |
|
79 |
<ns2:researchActivities/> |
|
80 |
<ns2:researchSubjects/> |
|
81 |
<ns2:researchTopics/> |
|
82 |
</ns2:project> |
|
83 |
<ns2:project ns1:id="E37C97C5-7489-4205-834F-151D05B7E07A" |
|
84 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/projects/E37C97C5-7489-4205-834F-151D05B7E07A" |
|
85 |
ns1:created="2016-11-11T20:42:55Z"> |
|
86 |
<ns1:links test="helo2"> |
|
87 |
<ns1:link |
|
88 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/AFFB5A85-DAC7-48F2-AE07-952481073BAA" |
|
89 |
ns1:rel="PI_PER"/> |
|
90 |
<ns1:link |
|
91 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/DFA58FA2-CCD6-445F-B2BC-E830C23FA563" |
|
92 |
ns1:rel="COI_PER"/> |
|
93 |
<ns1:link |
|
94 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/6E4302E7-A895-4FF7-AE8C-26C2478A82E6" |
|
95 |
ns1:rel="COI_PER"/> |
|
96 |
<ns1:link |
|
97 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/30A429E3-83B7-4E41-99C0-14A144F07DFE" |
|
98 |
ns1:rel="LEAD_ORG"/> |
|
99 |
<ns1:link |
|
100 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/FADDC755-1F45-47D7-8591-F183B7160CC2" |
|
101 |
ns1:rel="PP_ORG"/> |
|
102 |
<ns1:link |
|
103 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/E82E4BC6-2839-4E7A-82CA-88B033E53B45" |
|
104 |
ns1:rel="PP_ORG"/> |
|
105 |
<ns1:link |
|
106 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/F8B807ED-ACF2-4724-9DC6-291B77059637" |
|
107 |
ns1:rel="PP_ORG"/> |
|
108 |
<ns1:link |
|
109 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/funds/B7A4A2BB-1530-4846-8F2B-891EEBFF3F5F" |
|
110 |
ns1:rel="FUND" ns1:start="2013-10-01T00:00:00+01:00" ns1:end="2017-11-30T00:00:00Z"/> |
|
111 |
<ns1:link |
|
112 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/keyfindings/45480FAD-42D9-4948-ADE2-1B161F6BF481" |
|
113 |
ns1:rel="KEY_FINDING"/> |
|
114 |
<ns1:link |
|
115 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/disseminations/6E8F7769-1A48-4BDD-8248-6B5938CA3495" |
|
116 |
ns1:rel="DISSEMINATION"/> |
|
117 |
<ns1:link |
|
118 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/impactsummaries/01F42709-4FC3-4267-9E2A-08591C7950F8" |
|
119 |
ns1:rel="IMPACT_SUMMARY"/> |
|
120 |
<ns1:link |
|
121 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/76690C2C-FEF9-42AC-A4A2-B09338B33C45" |
|
122 |
ns1:rel="PUBLICATION"/> |
|
123 |
<ns1:link |
|
124 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/7B0D46AB-374C-4327-9EFE-5CA9683560FB" |
|
125 |
ns1:rel="PUBLICATION"/> |
|
126 |
<ns1:link |
|
127 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/03415E7B-3BE0-4207-857D-B595F52E2C65" |
|
128 |
ns1:rel="PUBLICATION"/> |
|
129 |
<ns1:link |
|
130 |
ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/2D27AB50-6985-4A61-83C6-54A102DFCFB8" |
|
131 |
ns1:rel="PUBLICATION"/> |
|
132 |
</ns1:links> |
|
133 |
<ns2:identifiers> |
|
134 |
<ns2:identifier ns2:type="RCUK">NE/K001906/1</ns2:identifier> |
|
135 |
</ns2:identifiers> |
|
136 |
<ns2:title> Biogeochemistry, macronutrient and carbon cycling in the benthic layer </ns2:title> |
|
137 |
<ns2:status>Active</ns2:status> |
|
138 |
<ns2:grantCategory>Research Grant</ns2:grantCategory> |
|
139 |
<ns2:leadOrganisationDepartment>School of Ocean and Earth |
|
140 |
Science</ns2:leadOrganisationDepartment> |
|
141 |
<ns2:abstractText> The coasts and shelf seas that surround us have been the focal point of |
|
142 |
human prosperity and well-being throughout our history and, consequently, have had a |
|
143 |
disproportionate effect on our culture. The societal importance of the shelf seas |
|
144 |
extends beyond food production to include biodiversity, carbon cycling and storage, |
|
145 |
waste disposal, nutrient cycling, recreation and renewable energy. Yet, as increasing |
|
146 |
proportions of the global population move closer to the coast, our seas have become |
|
147 |
progressively eroded by human activities, including overfishing, pollution, habitat |
|
148 |
disturbance and climate change. This is worrying because the condition of the seabed, |
|
149 |
biodiversity and human society are inextricably linked. Hence, there is an urgent need |
|
150 |
to understand the relative sensitivities of a range of shelf habitats so that human |
|
151 |
pressures can be managed more effectively to ensure the long-term sustainability of our |
|
152 |
seas and provision of societal benefits. Achieving these aims is not straightforward, as |
|
153 |
the capacity of the seabed to provide the goods and services we rely upon depends on the |
|
154 |
type of substrate (rock, gravel, sand, mud) and local conditions; some habitats are |
|
155 |
naturally dynamic and relatively insensitive to disturbance, while others are |
|
156 |
comparatively stable and vulnerable to change. This makes it very difficult to assess |
|
157 |
habitat sensitivities or make general statements about what benefits we can expect from |
|
158 |
our seas in the future. Recently, NERC and DEFRA have initiated a major new research |
|
159 |
programme on Shelf Sea Biogeochemistry that will improve knowledge about these issues. |
|
160 |
In response to this call, we have assembled a consortium of leading scientists that |
|
161 |
includes microbiologists, ecologists, physical oceanographers, biogeochemists, |
|
162 |
mathematical modellers and policy advisors. With assistance from organisations like |
|
163 |
CEFAS, Marine Scotland and AFBI, they will carry out a series of research cruises around |
|
164 |
the UK that will map the sensitivity and status of seabed habitats based on their |
|
165 |
physical condition, the microbial and faunal communities that inhabit them, and the size |
|
166 |
and dynamics of the nitrogen and carbon pools found there. The latest marine |
|
167 |
technologies will measure the amount of mixing and flow rates just above the seabed, as |
|
168 |
well as detailed seabed topography. These measurements will allow better understanding |
|
169 |
of the physical processes responsible for movement and mixing of sediment, nutrient, and |
|
170 |
carbon. At the same time, cores will be retrieved containing the microbial and faunal |
|
171 |
communities and their activity and behaviour will be linked to specific biogeochemical |
|
172 |
responses. Highly specialised autonomous vehicles, called landers, will also measure |
|
173 |
nutrient concentrations and fluxes at the seabed. Components of the system can then be |
|
174 |
experimentally manipulated to mimic scenarios of change, such as changing hydrodynamics, |
|
175 |
disturbance or components of climate change. This will be achieved in the field by |
|
176 |
generating different flow regimes using a submerged flume or, in the laboratory, using |
|
177 |
intact sediment communities exposed to different levels of CO2, temperature and oxygen. |
|
178 |
By measuring the biogeochemical response and behaviour of the microbial and faunal |
|
179 |
communities to these changes, we will generate an understanding of what may happen if |
|
180 |
such changes did occur across our shelf seas. We will use all of this information to |
|
181 |
assess the relative vulnerability of areas of the UK seabed by overlaying the |
|
182 |
observation and experimental results over maps of various human pressures, which will be |
|
183 |
of value to planners and policymakers. Mathematical models will test future scenarios of |
|
184 |
change, such as opening or closing vulnerable areas to fishing or anticipated changes in |
|
185 |
the factors that control nutrient and carbon stocks. This will be valuable in exploring |
|
186 |
different responses to external pressures and for deciding which management measures |
|
187 |
should be put in place to preserve our shelf seas for future generations </ns2:abstractText> |
|
188 |
<ns2:potentialImpact> Commercial private sector and the knowledge economy: new and
innovative methodologies, equipment and techniques, and combined state-of-the-art
technologies (>2.3 million in-kind, see JeS) will assess what the primary physical
and biogeochemical controls of shelf productivity are up to shelf sea scales. Since many
interests rely on the marine environment, beneficiaries will be varied. By sharing
expertise and knowledge, a world-leading manufacturer of microsensors and microscale
instrumentation and an internationally recognized marine environmental data acquisition
company will benefit from exploitable opportunities, e.g. new visualisation tools that
enable holistic understanding of large-scale ecosystem processes. Policy professionals,
governmental and devolved governmental organisations: The importance of shelf seas to
society extends beyond fisheries to wider issues, such as biodiversity, carbon cycling
and storage, waste disposal, nutrient cycling, and renewable energy resources.
Consortium expertise will contribute to these UK priority challenges. The UK Marine
& Coastal Access Act (MCAA), UK Climate Change Act, EU Habitats Directive and EU
Marine Strategy Framework Directive (MSFD) support sustainable use of the marine
environment. They also support the UK vision for achieving 'clean, healthy, safe
productive and biologically diverse ocean and seas' (UK Marine Science Strategy). We
will provide a coherent framework for sound evidence based-science in support of these
policy instruments and statutory requirements. For example, the MSFD aims to achieve
Good Environmental Status in EU marine waters by 2020, but we lack understanding of the
magnitude and synchronicity of change in SSEs. Our research will directly inform
Descriptor 1 (biological diversity) and 6 (seabed integrity) for a wide range of
sediment habitats over time, which is important because the determination of good
environmental status may have to be adapted over time (addressed in Task 2) "in view
of the dynamic nature of marine ecosystems and their natural variability, and given that
the pressures and impacts on them may vary with the evolvement of different patterns of
human activity and the impact of climate change" (MSFD). Our work will also inform
environmental monitoring programmes: OSPARs Joint Assessment and Monitoring programme,
the Eutrophication Monitoring Programme and The Clean Seas Environment Monitoring
Programme (CSEMP, led by consortium member CEFAS). Task 1-3 complement the outcomes of
CSEMP and provide scientific evidence to OSPAR. Similarly, experimental scenarios and
modelling approaches will provide needed information for (i) the EU Water Framework
Directive (the requirement for 'good chemical and ecological status' by 2015 does not
account for climate change) and, (ii) the UK White Paper for MCAA (it is unclear how
commitments to "look ahead at the predicted impacts of climate change on the marine
environment, how marine activities will contribute towards it, and how they are affected
by it" will be achieved). Finally, other EU instruments, such as the Habitats
Directive (introduced in 1992), the EU Common Fisheries Policy (revised in 2002) and
national legislation such as the UK MCAA and Scottish Marine Act, assume that removal
(or control) of direct pressures will result in ecosystem recovery and/or species
persistence. Our programme includes experimental scenarios and modelling approaches to
provide further information on the vulnerability of SSEs in environmental futures under
multiple pressures (Task 3). Our outputs will also help NERC meet its science theme
challenges. Public, wider community: active engagement with a variety of organisations
is detailed in Pathways to Impact (PtI). Skills& training: In addition to academic
progression, early career researchers will gain experience and receive mentoring in
running a large interdisciplinary programme, as well as training in communication skills
and scientific methodology </ns2:potentialImpact>
<ns2:healthCategories/>
<ns2:researchActivities/>
<ns2:researchSubjects>
<ns2:researchSubject>
<ns2:id>138395</ns2:id>
<ns2:text>Marine environments</ns2:text>
<ns2:percentage>75</ns2:percentage>
</ns2:researchSubject>
<ns2:researchSubject>
<ns2:id>46902</ns2:id>
<ns2:text>Geosciences</ns2:text>
<ns2:percentage>15</ns2:percentage>
</ns2:researchSubject>
<ns2:researchSubject>
<ns2:id>13097</ns2:id>
<ns2:text>Ecol, biodivers.& systematics</ns2:text>
<ns2:percentage>5</ns2:percentage>
</ns2:researchSubject>
<ns2:researchSubject>
<ns2:id>33851</ns2:id>
<ns2:text>Microbial sciences</ns2:text>
<ns2:percentage>5</ns2:percentage>
</ns2:researchSubject>
</ns2:researchSubjects>
<ns2:researchTopics>
<ns2:researchTopic>
<ns2:id>21005</ns2:id>
<ns2:text>Sediment/Sedimentary Processes</ns2:text>
<ns2:percentage>15</ns2:percentage>
</ns2:researchTopic>
<ns2:researchTopic>
<ns2:id>143045</ns2:id>
<ns2:text>Ecosystem Scale Processes</ns2:text>
<ns2:percentage>15</ns2:percentage>
</ns2:researchTopic>
<ns2:researchTopic>
<ns2:id>63200</ns2:id>
<ns2:text>Biogeochemical Cycles</ns2:text>
<ns2:percentage>60</ns2:percentage>
</ns2:researchTopic>
<ns2:researchTopic>
<ns2:id>108367</ns2:id>
<ns2:text>Community Ecology</ns2:text>
<ns2:percentage>5</ns2:percentage>
</ns2:researchTopic>
<ns2:researchTopic>
<ns2:id>80410</ns2:id>
<ns2:text>Responses to environment</ns2:text>
<ns2:percentage>5</ns2:percentage>
</ns2:researchTopic>
</ns2:researchTopics>
</ns2:project>
</ns2:projects>
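
A record like the one above can be inspected with any namespace-aware XML parser. The following is a minimal, illustrative sketch (not the plugin's own parsing code) that pulls the RCUK identifier and the title out of a single project document using the JDK's built-in DOM API; the projectXml variable is a hypothetical placeholder for one <ns2:project> record.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class Gtr2RecordSketch {

	public static void main(final String[] args) throws Exception {
		// Hypothetical placeholder: in practice this would hold one <ns2:project> record
		final String projectXml = "<doc></doc>";

		final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setNamespaceAware(true); // the GTR payload uses the ns1/ns2 namespace prefixes
		final Document doc = factory.newDocumentBuilder()
				.parse(new ByteArrayInputStream(projectXml.getBytes(StandardCharsets.UTF_8)));

		// Look elements up by local name so the namespace prefixes do not matter
		final NodeList ids = doc.getElementsByTagNameNS("*", "identifier");
		final NodeList titles = doc.getElementsByTagNameNS("*", "title");

		if (ids.getLength() > 0) {
			System.out.println("identifier: " + ids.item(0).getTextContent()); // e.g. NE/K001906/1
		}
		if (titles.getLength() > 0) {
			System.out.println("title: " + titles.item(0).getTextContent().trim());
		}
	}
}
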
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/excel/Read.java
package eu.dnetlib.data.collector.plugins.excel;

/**
 * Created by miriam on 10/05/2017.
 */
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.json.*;

import org.apache.commons.io.FileUtils;

public class Read {
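
The body of the Read class is not shown in this excerpt, but the imports above point to the usual Apache POI pattern for turning a spreadsheet into rows of text. Below is a rough sketch of that pattern only, under stated assumptions: the file name projects.xlsx and the semicolon-joined output are illustrative choices, not taken from the actual Read class.

import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

public class ExcelReadSketch {

	public static void main(final String[] args) throws Exception {
		// Hypothetical local file; the real plugin would receive its source elsewhere
		try (InputStream is = new FileInputStream("projects.xlsx");
				Workbook wb = new XSSFWorkbook(is)) {

			final Sheet sheet = wb.getSheetAt(0);                 // first sheet of the workbook
			final DataFormatter formatter = new DataFormatter();  // renders cells as displayed text

			for (final Row row : sheet) {
				final StringBuilder line = new StringBuilder();
				for (final Cell cell : row) {
					if (line.length() > 0) {
						line.append(';');
					}
					line.append(formatter.formatCellValue(cell));
				}
				System.out.println(line);
			}
		}
	}
}

DataFormatter is used so that numeric and date cells come back as the text shown in Excel rather than as raw double values.
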
[maven-release-plugin] copy for tag dnet-collector-plugins-1.7.8