Project

General

Profile

« Previous | Next » 

Revision 63262

Added by Michele Artini 5 months ago

[maven-release-plugin] copy for tag dnet-collector-plugins-1.7.8

View differences:

modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2ProjectsIterator.java
1
package eu.dnetlib.data.collector.plugins.projects.gtr2;
2

  
3
import java.util.Iterator;
4
import java.util.NoSuchElementException;
5
import java.util.concurrent.ArrayBlockingQueue;
6
import java.util.concurrent.ExecutorService;
7
import java.util.concurrent.Executors;
8
import java.util.concurrent.TimeUnit;
9

  
10
import com.ximpleware.AutoPilot;
11
import com.ximpleware.VTDGen;
12
import com.ximpleware.VTDNav;
13
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
14
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
15
import eu.dnetlib.enabling.resultset.SizedIterable;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19
import org.joda.time.DateTime;
20
import org.joda.time.format.DateTimeFormat;
21
import org.joda.time.format.DateTimeFormatter;
22
import eu.dnetlib.data.collector.plugins.HttpConnector;
23

  
24
/**
25
 * Created by alessia on 28/11/16.
26
 */
27
public class Gtr2ProjectsIterator implements Iterator<String> {
28

  
29
	public static final String TERMINATOR = "ARNOLD";
30
	public static final int WAIT_END_SECONDS = 600;
31
	public static final int PAGE_SZIE = 20;
32

  
33
	private static final Log log = LogFactory.getLog(Gtr2ProjectsIterator.class);
34

  
35
	private String queryURL;
36
	private int total = -1;
37
	private int startFromPage = 1;
38
	private int endAtPage;
39
	private VTDGen vg;
40
	private VTDNav vn;
41
	private AutoPilot ap;
42
	private String namespaces;
43
	private boolean incremental = false;
44
	private DateTime fromDate;
45
	private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
46
	private ArrayBlockingQueue<String> projects = new ArrayBlockingQueue<String>(20);
47
	//private boolean finished = false;
48
	private final ExecutorService es = Executors.newFixedThreadPool(PAGE_SZIE);
49
	private String nextElement = "<doc></doc>";
50
	private HttpConnector connector;
51

  
52
	public boolean hasNext() {
53

  
54
			return !nextElement.equals(TERMINATOR);
55

  
56
	}
57

  
58
	@Override
59
	public String next() {
60
		try{
61
			return nextElement;
62
		}finally{
63
			try {
64
				nextElement = projects.poll(WAIT_END_SECONDS, TimeUnit.SECONDS);
65
			} catch (InterruptedException e) {
66
				throw new RuntimeException(e);
67
			}
68
		}
69

  
70
	}
71

  
72
	@Override
73
	public void remove() {
74
		throw new UnsupportedOperationException();
75
	}
76
	public Gtr2ProjectsIterator(final String baseUrl, final String fromDate) throws CollectorServiceException {
77
		prepare(baseUrl, fromDate);
78
		fillInfo(true);
79
	}
80

  
81
	public Gtr2ProjectsIterator(final String baseUrl, final String fromDate, final int startFromPage, final int endAtPage) throws CollectorServiceException {
82
		prepare(baseUrl, fromDate);
83
		this.setStartFromPage(startFromPage);
84
		this.setEndAtPage(endAtPage);
85
		fillInfo(false);
86
	}
87

  
88
	private void prepare(final String baseUrl, final String fromDate) {
89
		connector = new HttpConnector();
90
		queryURL = baseUrl + "/projects";
91
		vg = new VTDGen();
92
		this.incremental = StringUtils.isNotBlank(fromDate);
93
		if (incremental) {
94
			// I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode
95
			this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
96
			log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
97
		}
98
	}
99

  
100

  
101

  
102
	private void fillInfo(final boolean all) throws CollectorServiceException {
103
		try {
104
			// log.debug("Getting hit count from: " + queryURL);
105
			byte[] bytes = connector.getInputSource(queryURL).getBytes("UTF-8");
106
			vg.setDoc(bytes);
107
			vg.parse(false);
108
			//vg.parseHttpUrl(queryURL, false);
109
			initParser();
110
			String hitCount = vn.toNormalizedString(vn.getAttrVal("totalSize"));
111
			String totalPages = vn.toNormalizedString(vn.getAttrVal("totalPages"));
112
			namespaces = "xmlns:ns1=\"" + vn.toNormalizedString(vn.getAttrVal("ns1")) + "\" ";
113
			namespaces += "xmlns:ns2=\"" + vn.toNormalizedString(vn.getAttrVal("ns2")) + "\" ";
114
			namespaces += "xmlns:ns3=\"" + vn.toNormalizedString(vn.getAttrVal("ns3")) + "\" ";
115
			namespaces += "xmlns:ns4=\"" + vn.toNormalizedString(vn.getAttrVal("ns4")) + "\" ";
116
			namespaces += "xmlns:ns5=\"" + vn.toNormalizedString(vn.getAttrVal("ns5")) + "\" ";
117
			namespaces += "xmlns:ns6=\"" + vn.toNormalizedString(vn.getAttrVal("ns6")) + "\" ";
118
			if (all) {
119
				setEndAtPage(Integer.parseInt(totalPages));
120
				total = Integer.parseInt(hitCount);
121
			}
122
			Thread ft = new Thread(new FillProjectList());
123
			ft.start();
124
			log.debug("Expected number of pages: " + (endAtPage - startFromPage + 1));
125
		} catch (NumberFormatException e) {
126
			log.error("Cannot set the total count or the number of pages");
127
			throw new CollectorServiceException(e);
128
		} catch (Throwable e) {
129
			throw new CollectorServiceException(e);
130
		}
131
	}
132

  
133

  
134
	private void initParser() {
135
		vn = vg.getNav();
136
		ap = new AutoPilot(vn);
137
	}
138

  
139
	public String getQueryURL() {
140
		return queryURL;
141
	}
142

  
143
	public void setQueryURL(final String queryURL) {
144
		this.queryURL = queryURL;
145
	}
146

  
147
	public int getTotal() {
148
		return total;
149
	}
150

  
151
	public void setTotal(final int total) {
152
		this.total = total;
153
	}
154

  
155
	public int getEndAtPage() {
156
		return endAtPage;
157
	}
158

  
159
	public void setEndAtPage(final int endAtPage) {
160
		this.endAtPage = endAtPage;
161
		log.debug("Overriding endAtPage to " + endAtPage);
162
	}
163

  
164
	public VTDGen getVg() {
165
		return vg;
166
	}
167

  
168
	public void setVg(final VTDGen vg) {
169
		this.vg = vg;
170
	}
171

  
172
	public VTDNav getVn() {
173
		return vn;
174
	}
175

  
176
	public void setVn(final VTDNav vn) {
177
		this.vn = vn;
178
	}
179

  
180
	public AutoPilot getAp() {
181
		return ap;
182
	}
183

  
184
	public void setAp(final AutoPilot ap) {
185
		this.ap = ap;
186
	}
187

  
188
	public String getNamespaces() {
189
		return namespaces;
190
	}
191

  
192
	public void setNamespaces(final String namespaces) {
193
		this.namespaces = namespaces;
194
	}
195

  
196
	public int getStartFromPage() {
197
		return startFromPage;
198
	}
199

  
200
	public void setStartFromPage(final int startFromPage) {
201
		this.startFromPage = startFromPage;
202
		log.debug("Overriding startFromPage to " + startFromPage);
203
	}
204

  
205
	private class FillProjectList implements Runnable {
206

  
207
		private boolean morePages = true;
208
		private int pageNumber = startFromPage;
209

  
210
		@Override
211
		public void run() {
212
			String resultPageUrl = "";
213
			try {
214
				do {
215
					resultPageUrl = getNextPageUrl();
216
					log.debug("Page: " + resultPageUrl);
217
					// clear VGen before processing the next file
218
					vg.clear();
219
					byte[] bytes = connector.getInputSource(resultPageUrl).getBytes("UTF-8");
220
					vg.setDoc(bytes);
221
					vg.parse(false);
222
					//vg.parseHttpUrl(resultPageUrl, false);
223
					initParser();
224
					ap.selectXPath("//project");
225
					int res;
226

  
227
					while ((res = ap.evalXPath()) != -1) {
228
						final String projectHref = vn.toNormalizedString(vn.getAttrVal("href"));
229
						Thread t = new Thread(new ParseProject(projectHref));
230
						t.setName("Thread for " + res);
231
						es.execute(t);
232
					}
233
					ap.resetXPath();
234

  
235
				} while (morePages);
236
				es.shutdown();
237
				es.awaitTermination(WAIT_END_SECONDS, TimeUnit.SECONDS);
238
				projects.put(TERMINATOR);
239

  
240
			} catch (Throwable e) {
241
				log.error("Exception processing " + resultPageUrl + "\n" + e.getMessage());
242
			}
243
		}
244

  
245
		private String getNextPageUrl() {
246
			String url = queryURL + "?p=" + pageNumber;
247
			if (pageNumber == endAtPage) {
248
				morePages = false;
249
			}
250
			pageNumber++;
251
			return url;
252
		}
253

  
254
	}
255

  
256
	private class ParseProject implements Runnable {
257

  
258
		VTDNav vn1;
259
		VTDGen vg1;
260
		private String projectRef;
261

  
262
		public ParseProject(String projectHref) {
263
			if(projectHref.contains("gtr.gtr")){
264
				projectHref = projectHref.replace("gtr.gtr","gtr");
265
			}
266
			projectRef = projectHref;
267
			log.debug("strat " + projectRef);
268
			vg1 = new VTDGen();
269
			try {
270
				byte[] bytes = connector.getInputSource(projectRef).getBytes("UTF-8");
271
				vg1.setDoc(bytes);
272
				vg1.parse(false);
273
				//vg1.parseHttpUrl(projectRef, false);
274
				vn1 = vg1.getNav();
275
			}catch(Throwable e){
276
				log.error("Exception processing " + projectRef + "\n" + e.getMessage());
277
			}
278
			log.debug("end " + projectRef);
279
		}
280

  
281
		private int projectsUpdate(String attr) throws CollectorServiceException {
282
			try {
283
				int index = vn1.getAttrVal(attr);
284
				if (index != -1) {
285
					String d = vn1.toNormalizedString(index);
286
					DateTime recordDate = DateTime.parse(d.substring(0, d.indexOf("T")), simpleDateTimeFormatter);
287
					// updated or created after the last time it was collected
288
					if (recordDate.isAfter(fromDate)) {
289
						log.debug("New project to collect");
290
						return index;
291
					}
292
					return -1;
293
				}
294
				return index;
295
			} catch (Throwable e) {
296
				throw new CollectorServiceException(e);
297
			}
298
		}
299

  
300
		private String collectProject() throws CollectorServiceException {
301
			try {
302

  
303
				int p = vn1.getAttrVal("href");
304

  
305
				final String projectHref = vn1.toNormalizedString(p);
306
				log.debug("collecting project at " + projectHref);
307

  
308
				Gtr2Helper gtr2Helper = new Gtr2Helper();
309
				String projectPackage = gtr2Helper.processProject(vn1, namespaces);
310

  
311
				return projectPackage;
312
			} catch (Throwable e) {
313
				throw new CollectorServiceException(e);
314
			}
315
		}
316

  
317
		private boolean add(String attr) throws CollectorServiceException {
318
			return projectsUpdate(attr) != -1;
319
		}
320

  
321
		@Override
322
		public void run() {
323
			log.debug("Getting project info from " + projectRef);
324
			try {
325
				if (!incremental || (incremental && (add("created") || add("updated")))) {
326
					projects.put(collectProject());
327
					log.debug("Project enqueued " + projectRef);
328
				}
329
			} catch (Throwable e) {
330
				log.error("Error on ParseProject " + e.getMessage());
331
				throw new CollectorServiceRuntimeException(e);
332
			}
333
		}
334

  
335
	}
336

  
337
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/projects/grist/GristCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.projects.grist;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6

  
7
/**
8
 * Plugin to collect metadata record about projects and fundings via the europePMC GRIST API (e.g. WT projects).
9
 * <p>
10
 * Documentation on GRIST API: http://europepmc.org/GristAPI.
11
 * </p>
12
 * <p>
13
 * BaseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:"Wellcome Trust"&resultType=core
14
 * where resultType=core asks for the complete information (including abstracts).
15
 * The results returned by the API are XMLs.
16
 * </p>
17
 * <p>
18
 * Pagination: use parameter 'page'. When the response contains empty 'RecordList', it means we reached the end.
19
 * </p>
20
 *
21
 * @author alessia
22
 */
23
public class GristCollectorPlugin extends AbstractCollectorPlugin {
24

  
25
	@Override
26
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
27
			throws CollectorServiceException {
28
		//baseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core
29
		return new GristProjectsIterable(interfaceDescriptor.getBaseUrl());
30
	}
31

  
32
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/pom.xml
1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2
	<modelVersion>4.0.0</modelVersion>
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet45-parent</artifactId>
6
		<version>1.0.0</version>
7
	</parent>
8
	<groupId>eu.dnetlib</groupId>
9
	<artifactId>dnet-collector-plugins</artifactId>
10
	<version>1.7.8</version>
11
	<scm>
12
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8</developerConnection>
13
	</scm>
14

  
15
	<build>
16
		<plugins>
17
			<plugin>
18
				<artifactId>maven-assembly-plugin</artifactId>
19
				<configuration>
20
					<archive>
21
						<manifest>
22
							<mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass>
23
						</manifest>
24
					</archive>
25
					<descriptorRefs>
26
						<descriptorRef>jar-with-dependencies</descriptorRef>
27
					</descriptorRefs>
28
				</configuration>
29
			</plugin>
30
		</plugins>
31
	</build>
32

  
33
	<dependencies>
34
		<dependency>
35
			<groupId>eu.dnetlib</groupId>
36
			<artifactId>dnet-modular-collector-service-rmi</artifactId>
37
			<version>[1.3.0,2.0.0)</version>
38
		</dependency>
39
		<dependency>
40
			<groupId>eu.dnetlib</groupId>
41
			<artifactId>dnet-modular-collector-service</artifactId>
42
			<version>[3.3.26,4.0.0)</version>
43
		</dependency>
44
		<dependency>
45
			<groupId>com.google.code.gson</groupId>
46
			<artifactId>gson</artifactId>
47
			<version>${google.gson.version}</version>
48
		</dependency>
49
		<dependency>
50
			<groupId>commons-io</groupId>
51
			<artifactId>commons-io</artifactId>
52
			<version>${commons.io.version}</version>
53
		</dependency>
54
		<dependency>
55
			<groupId>junit</groupId>
56
			<artifactId>junit</artifactId>
57
			<version>${junit.version}</version>
58
			<scope>test</scope>
59
		</dependency>
60
		<dependency>
61
			<groupId>org.apache.httpcomponents</groupId>
62
			<artifactId>httpclient</artifactId>
63
			<version>4.5</version>
64
		</dependency>
65
		<dependency>
66
			<groupId>eu.dnetlib</groupId>
67
			<artifactId>cnr-resultset-service</artifactId>
68
			<version>[2.0.0, 3.0.0)</version>
69
			<scope>provided</scope>
70
		</dependency>
71
		<dependency>
72
			<groupId>com.ximpleware</groupId>
73
			<artifactId>vtd-xml</artifactId>
74
			<version>[2.12, 3.0.0)</version>
75
		</dependency>
76
		<dependency>
77
			<groupId>joda-time</groupId>
78
			<artifactId>joda-time</artifactId>
79
			<version>2.9.2</version>
80
		</dependency>
81

  
82
		<dependency>
83
			<groupId>org.json</groupId>
84
			<artifactId>json</artifactId>
85
			<version>20180813</version>
86
		 <type>jar</type>
87
		</dependency>
88
		<dependency>
89
			<groupId>org.apache.commons</groupId>
90
			<artifactId>commons-lang3</artifactId>
91
			<version>3.5</version>
92
		</dependency>
93

  
94
		<dependency>
95
			<groupId>org.apache.poi</groupId>
96
			<artifactId>poi</artifactId>
97
			<version>3.16</version>
98
		</dependency>
99
		<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
100
		<dependency>
101
			<groupId>org.apache.poi</groupId>
102
			<artifactId>poi-ooxml</artifactId>
103
			<version>3.16</version>
104
		</dependency>
105
		<dependency>
106
			<groupId>org.jsoup</groupId>
107
			<artifactId>jsoup</artifactId>
108
			<version>1.11.2</version>
109
		</dependency>
110
		<dependency>
111
			<groupId>commons-lang</groupId>
112
			<artifactId>commons-lang</artifactId>
113
			<version>2.6</version>
114
			<scope>compile</scope>
115
		</dependency>
116
        <dependency>
117
            <groupId>org.mockito</groupId>
118
            <artifactId>mockito-core</artifactId>
119
            <version>3.3.3</version>
120
            <scope>test</scope>
121
        </dependency>
122
    </dependencies>
123
</project>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestCollectorPlugin.java
1
/**
2
 * 
3
 */
4
package eu.dnetlib.data.collector.plugins.rest;
5

  
6
import com.google.gson.Gson;
7
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
8
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
9
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
10
import org.apache.commons.lang3.StringUtils;
11
import org.json.JSONObject;
12

  
13
import java.util.Map;
14

  
15
/**
16
 * @author 	js, Andreas Czerniak
17
 * @date 	2020-04-09
18
 *
19
 */
20
public class RestCollectorPlugin extends AbstractCollectorPlugin {
21

  
22
	@Override
23
	public Iterable<String> collect(InterfaceDescriptor ifDescriptor, String arg1, String arg2)
24
			throws CollectorServiceException {
25
		final String baseUrl = ifDescriptor.getBaseUrl();
26
		final String resumptionType = ifDescriptor.getParams().get("resumptionType");
27
		final String resumptionParam = ifDescriptor.getParams().get("resumptionParam");
28
		final String resumptionXpath = ifDescriptor.getParams().get("resumptionXpath");
29
		final String resultTotalXpath = ifDescriptor.getParams().get("resultTotalXpath");
30
		final String resultFormatParam = ifDescriptor.getParams().get("resultFormatParam");
31
		final String resultFormatValue = ifDescriptor.getParams().get("resultFormatValue");
32
		final String resultSizeParam = ifDescriptor.getParams().get("resultSizeParam");
33
		final String resultSizeValue = (StringUtils.isBlank(ifDescriptor.getParams().get("resultSizeValue"))) ? "100" : ifDescriptor.getParams().get("resultSizeValue");
34
        final String queryParams = ifDescriptor.getParams().get("queryParams");
35
		final String entityXpath = ifDescriptor.getParams().get("entityXpath");
36
		final String authMethod = ifDescriptor.getParams().get("authMethod");
37
		final String authToken = ifDescriptor.getParams().get("authToken");
38
		final String requestHeaderMap = ifDescriptor.getParams().get("requestHeaderMap");
39
		Gson gson = new Gson();
40
		Map<String, String> requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
41

  
42

  
43
		if (StringUtils.isBlank(baseUrl)) {throw new CollectorServiceException("Param 'baseUrl' is null or empty");}
44
		if (StringUtils.isBlank(resumptionType)) {throw new CollectorServiceException("Param 'resumptionType' is null or empty");}
45
		if (StringUtils.isBlank(resumptionParam)) {throw new CollectorServiceException("Param 'resumptionParam' is null or empty");}
46
		// if (StringUtils.isBlank(resumptionXpath)) {throw new CollectorServiceException("Param 'resumptionXpath' is null or empty");}
47
		// if (StringUtils.isBlank(resultTotalXpath)) {throw new CollectorServiceException("Param 'resultTotalXpath' is null or empty");}
48
		// resultFormatParam can be emtpy because some Rest-APIs doesn't like this argument in the query
49
		//if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null, empty or whitespace");}
50
		if (StringUtils.isBlank(resultFormatValue)) {throw new CollectorServiceException("Param 'resultFormatValue' is null or empty");}
51
		// if (StringUtils.isBlank(resultSizeParam)) {throw new CollectorServiceException("Param 'resultSizeParam' is null or empty");}
52
		// prevent resumptionType: discover -- if (Integer.valueOf(resultSizeValue) <= 1) {throw new CollectorServiceException("Param 'resultSizeValue' is less than 2");}
53

  
54
		// queryParams could be empty like for DRIS+ API from euroCRIS
55
		//if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
56
		if (StringUtils.isBlank(entityXpath)) {throw new CollectorServiceException("Param 'entityXpath' is null or empty");}
57

  
58
		String resFormat = ifDescriptor.getParams().get("resultOutputFormat");
59
		final String resultOutputFormat = StringUtils.isNotBlank(resFormat) ? resFormat.toLowerCase() : resultFormatValue.toLowerCase();
60
		
61
		return () -> new RestIterator(
62
				baseUrl,
63
				resumptionType,
64
				resumptionParam,
65
				resumptionXpath,
66
				resultTotalXpath,
67
				resultFormatParam,
68
				resultFormatValue,
69
				resultSizeParam,
70
                resultSizeValue,
71
				queryParams,
72
				entityXpath,
73
				authMethod,
74
				authToken,
75
				resultOutputFormat, requestHeaders);
76
	}
77

  
78
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/utils/JsonUtils.java
1
package eu.dnetlib.data.collector.plugins.utils;
2

  
3
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6

  
7
public class JsonUtils {
8

  
9
    private static final Log log = LogFactory.getLog(JsonUtils.class);
10

  
11
    public static final String wrapName = "recordWrap";
12
    /**
13
     * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
14
     * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
15
     * and work-around for the JSON to XML converting of org.json.XML-package.
16
     *
17
     * known bugs:     doesn't prevent     "key name":" ["sexy name",": penari","erotic dance"],
18
     *
19
     * @param jsonInput
20
     * @return convertedJsonKeynameOutput
21
     */
22
    public String syntaxConvertJsonKeyNames(String jsonInput) {
23

  
24
        log.trace("before convertJsonKeyNames: " + jsonInput);
25
        // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
26
        // replace ' 's in JSON Namens with '_'
27
        while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
28
            jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
29
        }
30

  
31
        // replace forward-slash (sign '/' ) in JSON Names with '_'
32
        while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
33
            jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
34
        }
35

  
36
        // replace '(' in JSON Names with ''
37
        while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
38
            jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
39
        }
40

  
41
        // replace ')' in JSON Names with ''
42
        while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
43
            jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
44
        }
45

  
46
        // add prefix of startNumbers in JSON Keynames with 'n_'
47
        while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
48
            jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
49
        }
50
        // add prefix of only numbers in JSON Keynames with 'm_'
51
        while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
52
            jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
53
        }
54

  
55
        // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
56
        while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
57
            jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
58
        }
59

  
60
        // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
61
        //            while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
62
        //                jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
63
        //            }
64

  
65
        // replace '=' in JSON Keynames with '-'
66
        while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
67
            jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
68
        }
69

  
70
        // replace '@' in JSON Keynames with 'oat_'
71
        while (jsonInput.matches(".*\"@([^\"]*)\":.*")) {
72
            jsonInput = jsonInput.replaceAll("\"@([^\"]*)\":", "\"oat_$1\":");
73
        }
74
        log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
75
        return jsonInput;
76
    }
77

  
78
    /**
79
     *
80
     * https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
81
     *          *
82
     * @param bufferStr - XML string
83
     * @return
84
     */
85
    public String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
86

  
87
        while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
88
            bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
89
        }
90

  
91
        // replace [#x10-#x1f] with ''
92
        //            while (bufferStr.matches(".*&#x1[0-9a-f].*")) {
93
        //                bufferStr = bufferStr.replaceAll("&#x1([0-9a-fA-F])", "");
94
        //            }
95

  
96
        return bufferStr;
97
    }
98

  
99
    public String convertToXML(final String jsonRecord){
100
        String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
101

  
102
        log.trace("before convertToXML: " + jsonRecord);
103
        org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
104
        resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
105
        log.trace("before inputStream: " + resultXml);
106
        resultXml = XmlCleaner.cleanAllEntities(resultXml);
107
        log.trace("after cleaning and end of convertToXML: " + resultXml);
108
        return resultXml;
109
    }
110
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/java/eu/dnetlib/data/collector/plugins/researchfi/ResearchFiCollectorPluginTest.java
1
package eu.dnetlib.data.collector.plugins.researchfi;
2

  
3
import java.util.HashSet;
4
import java.util.Set;
5

  
6
import org.dom4j.DocumentException;
7
import org.dom4j.DocumentHelper;
8
import org.junit.Before;
9
import org.junit.Ignore;
10
import org.junit.Test;
11

  
12
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
13
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
14

  
15
public class ResearchFiCollectorPluginTest {
16

  
17
	private final ResearchFiCollectorPlugin plugin = new ResearchFiCollectorPlugin();
18

  
19
	@Before
20
	public void setUp() throws Exception {}
21

  
22
	@Test
23
	@Ignore
24
	public final void testCollect() throws CollectorServiceException, DocumentException {
25
		final InterfaceDescriptor iface = new InterfaceDescriptor();
26
		iface.setBaseUrl("https://research.fi/api/rest/v1/funding-decisions?FunderName=AKA&FundingStartYearFrom=2022");
27
		iface.setProtocol("research_fi");
28
		iface.getParams().put("auth_url", "https://researchfi-auth.2.rahtiapp.fi/realms/publicapi/protocol/openid-connect/token");
29
		iface.getParams().put("auth_client_id", "");
30
		iface.getParams().put("auth_client_secret", "");
31

  
32
		int count = 0;
33
		final Set<String> ids = new HashSet<>();
34

  
35
		for (final String s : plugin.collect(iface, null, null)) {
36

  
37
			if (count == 0) {
38
				System.out.println("First: " + s);
39
			}
40
			count++;
41

  
42
			final String id = DocumentHelper.parseText(s).valueOf("/recordWrap/funderProjectNumber");
43
			if (ids.contains(id)) {
44
				System.out.println("Id already present: " + id);
45
			}
46
			ids.add(id);
47
		}
48

  
49
		System.out.println("Total records: " + count);
50
		System.out.println("Total identifiers: " + ids.size());
51
	}
52

  
53
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/doiresolver/DOIResolverIterator.java
1
package eu.dnetlib.data.collector.plugins.doiresolver;
2

  
3
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
4
import org.apache.commons.lang.StringUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7

  
8
import java.io.IOException;
9
import java.nio.file.Files;
10
import java.nio.file.Paths;
11
import java.util.Iterator;
12
import java.util.concurrent.ArrayBlockingQueue;
13
import java.util.concurrent.TimeUnit;
14
import java.util.stream.Stream;
15

  
16
public class DOIResolverIterator implements Iterator<String> {
17

  
18
    private static final Log log = LogFactory.getLog(DOIResolverIterator.class);
19

  
20
    protected static final String STARTER = "FIRE";
21
    protected static final String TERMINATOR = "ARNOLD";
22
    protected static final String BAD_TERMINATOR = "BAD";
23
    protected static final String UNRESOLVED = "UNRESOLVED";
24
    protected static long TIMEOUT = 5;
25
    protected static TimeUnit TIMEOUT_UNIT = TimeUnit.SECONDS;
26

  
27
    /** Path to the dir that contains the files, each a csv with a list of DOIs, one per line. **/
28
    private String baseDir;
29
    private String fromDate;
30

  
31
    private ArrayBlockingQueue<String> queue;
32

  
33
    private CrossrefResolver crossrefResolver;
34

  
35

  
36
    public DOIResolverIterator(final String baseDir, final CrossrefResolver crossrefResolver, final String fromDate) {
37
        this.baseDir = baseDir;
38
        this.fromDate = fromDate;
39
        this.queue = new ArrayBlockingQueue<>(100);
40
        this.crossrefResolver = crossrefResolver;
41
        init();
42
    }
43

  
44
    private void init(){
45
        log.info("Init");
46

  
47
        new Thread(() -> {
48
            try{
49
                final FileSystemIterator fsi = new FileSystemIterator(baseDir, "csv", fromDate);
50
                // put first item in the queue
51
                if(queue.offer(STARTER)) {
52
                    // read the file, ask the resolvers, put results in a shared queue
53
                    //whatever exceptions, add terminator to the queue
54
                    while (fsi.hasNext()) {
55
                        String filePath = fsi.next();
56
                        try (Stream<String> stream = Files.lines(Paths.get(filePath))) {
57

  
58
                            stream.forEach(doi -> {
59
                                try {
60
                                    String resolved = resolve(doi);
61
                                    if(!resolved.equals(UNRESOLVED)) queue.offer(resolved, TIMEOUT, TIMEOUT_UNIT);
62
                                } catch (InterruptedException e) {
63
                                    log.error("DOI processing aborted, cannot offer resolved doi: "+doi+" . Did the consumer die?");
64
                                    log.error(e);
65
                                    queue.offer(BAD_TERMINATOR);
66
                                }
67
                            });
68

  
69
                        } catch (IOException e) {
70
                            log.error("DOI processing aborted");
71
                            log.error(e);
72
                            queue.offer(BAD_TERMINATOR);
73
                        }
74
                    }
75
                }
76
            } catch (Exception e) {
77
                log.error("DOI processing aborted");
78
                log.error(e);
79
                queue.offer(BAD_TERMINATOR);
80
            }
81
            queue.offer(TERMINATOR);
82
            log.info("Finished processing DOI list");
83
        }
84
        ).start();
85
    }
86

  
87
    private String resolve(final String doi){
88
       log.debug("Resolving "+doi);
89
       log.debug("Crossref...");
90
       String record = crossrefResolver.resolve(cleanDOI(doi));
91
       if(StringUtils.isNotBlank(record)) return record;
92
       else {
93
           //try another resolver
94
           log.debug("Resolver returned blank item");
95
       }
96
       return UNRESOLVED;
97
    }
98

  
99
    /**
100
     * Returns the identifier part of the DOI only.
101
     * @param doi
102
     * @return the DOI
103
     */
104
    protected String cleanDOI(final String doi){
105
       return doi.replace("http://dx.doi.org/", "").replace("https://dx.doi.org/", "")
106
               .replace("https://doi.org/", "").replace("http://doi.org/", "");
107
    }
108

  
109
    @Override
110
    public boolean hasNext() {
111
       return doHasNext();
112
    }
113

  
114
    private boolean doHasNext(){
115
        //If I get a null value, the queue is currently empty. so we wait for something
116
        String element = queue.peek();
117
        while(element == null) {
118
            try {
119
                log.debug("Sleeping while waiting for something in the queue");
120
                Thread.sleep(1000);
121
                element = queue.peek();
122
            } catch (InterruptedException e) {
123
                e.printStackTrace();
124
            }
125
        }
126
        log.debug("Found in queue element: "+element);
127
        switch(element){
128
            case TERMINATOR:
129
            case BAD_TERMINATOR:
130
                return false;
131
            case STARTER:
132
            case UNRESOLVED: //although they should not be inserted at all in the queue
133
                queue.poll();
134
                return doHasNext();
135
            default:
136
                return true;
137
        }
138
    }
139

  
140
    @Override
141
    public String next() {
142
        return queue.poll();
143
    }
144

  
145
    public String getBaseDir() {
146
        return baseDir;
147
    }
148

  
149
    public void setBaseDir(String baseDir) {
150
        this.baseDir = baseDir;
151
    }
152

  
153
    public CrossrefResolver getCrossrefResolver() {
154
        return crossrefResolver;
155
    }
156

  
157
    public void setCrossrefResolver(CrossrefResolver crossrefResolver) {
158
        this.crossrefResolver = crossrefResolver;
159
    }
160
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/datacite/DataciteESIterator.java
1
package eu.dnetlib.data.collector.plugins.datacite;
2

  
3

  
4
import java.io.ByteArrayOutputStream;
5
import java.io.IOException;
6
import java.net.URL;
7
import java.util.ArrayDeque;
8
import java.util.Iterator;
9
import java.util.Objects;
10
import java.util.Queue;
11
import java.util.zip.DataFormatException;
12
import java.util.zip.Inflater;
13

  
14
import com.google.gson.Gson;
15
import com.google.gson.GsonBuilder;
16
import eu.dnetlib.data.collector.plugins.datacite.schema.DataciteSchema;
17
import eu.dnetlib.data.collector.plugins.datacite.schema.Result;
18
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
19
import org.apache.commons.codec.binary.Base64;
20
import org.apache.commons.io.IOUtils;
21
import org.apache.commons.lang3.StringUtils;
22
import org.apache.commons.logging.Log;
23
import org.apache.commons.logging.LogFactory;
24

  
25
public class DataciteESIterator implements Iterator<String> {
26

  
27
    private static final Log log = LogFactory.getLog(DataciteESIterator.class);
28

  
29
    private final long timestamp;
30

  
31
    private String scrollId;
32

  
33
    private Queue<String> currentPage;
34

  
35
    private final Gson g =  new GsonBuilder().create();
36

  
37
    private String baseURL;
38

  
39
    private static final String START_PATH = "new_scan";
40
    private static final String NEXT_PATH = "scan/%s";
41

  
42

  
43
    public DataciteESIterator(long timestamp, String baseUrl) throws Exception {
44
        this.timestamp = timestamp;
45
        this.baseURL = baseUrl;
46
        currentPage = new ArrayDeque<>();
47
        startRequest();
48
    }
49

  
50
    protected static String decompression(final Result r) {
51
        return decompression(r.getBody().getBytes());
52
    }
53

  
54
    protected static String decompression(final byte[] bodyBytes){
55
        try {
56
            byte[] byteArray = Base64.decodeBase64(bodyBytes);
57
            Inflater decompresser = new Inflater();
58
            decompresser.setInput(byteArray);
59
            ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
60
            byte[] buffer = new byte[8192];
61
            while (!decompresser.finished()) {
62
                int size = decompresser.inflate(buffer);
63
                bos.write(buffer, 0, size);
64
            }
65
            byte[] unzippeddata = bos.toByteArray();
66
            decompresser.end();
67

  
68
            return new String(unzippeddata);
69
        } catch (DataFormatException e) {
70
            log.warn("Exception when decompressing: "+e.getMessage());
71
            return null;
72
        }
73
    }
74

  
75
    private void fillQueue(final String hits) {
76
        if (StringUtils.isBlank(hits) || "[]".equalsIgnoreCase(hits.trim()))
77
            return;
78
        try {
79
            DataciteSchema datacitepage = g.fromJson(hits, DataciteSchema.class);
80
            this.scrollId = datacitepage.getScrollId();
81
            datacitepage.getResult().stream().map(DataciteESIterator::decompression).filter(Objects::nonNull).forEach(this.currentPage::add);
82
        } catch (Throwable e) {
83
            System.out.println(hits);
84
            e.printStackTrace();
85
        }
86
    }
87

  
88
    private void startRequest() throws Exception {
89
        String url = baseURL+"/"+START_PATH;
90
        final URL startUrl = new URL(timestamp >0 ? url + "?timestamp="+timestamp : url);
91
        fillQueue(IOUtils.toString(startUrl.openStream()));
92
    }
93

  
94
    private void getNextPage() throws IOException {
95
        String url = baseURL+"/"+NEXT_PATH;
96
        final URL startUrl = new URL(String.format(url,scrollId));
97
        fillQueue(IOUtils.toString(startUrl.openStream()));
98
    }
99

  
100

  
101
    @Override
102
    public boolean hasNext() {
103
        return  currentPage.size() >0;
104
    }
105

  
106
    @Override
107
    public String next() {
108

  
109
        if (currentPage.size() == 0) {
110

  
111
            return null;
112
        }
113

  
114
        String nextItem = currentPage.remove();
115
        if (currentPage.size() == 0) {
116
            try {
117
                getNextPage();
118
            } catch (Throwable e) {
119
                throw new RuntimeException(e);
120
            }
121
        }
122

  
123
        return XmlCleaner.cleanAllEntities(nextItem);
124
    }
125

  
126
    public String getBaseURL() {
127
        return baseURL;
128
    }
129

  
130
    public void setBaseURL(final String baseURL) {
131
        this.baseURL = baseURL;
132
    }
133
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/doiresolver/DOIResolver.java
1
package eu.dnetlib.data.collector.plugins.doiresolver;
2

  
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4

  
5
/**
 * Contract of a component able to resolve a DOI against a remote service.
 */
public interface DOIResolver {

    /**
     * Sets the base URL used by the resolver.
     *
     * @param baseURL the base URL
     */
    void setBaseURL(String baseURL);

    /**
     * Resolves the given DOI.
     *
     * @param doi the DOI to resolve
     * @return the outcome of the resolution as a string; its format is decided by the implementation
     */
    String resolve(String doi);
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/sitemap.xml
1
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
2
    <sitemap>
3
        <loc>file:target/test-classes/eu/dnetlib/data/collector/plugins/schemaorg/sitemap_file.xml</loc>
4
    </sitemap>
5
</sitemapindex>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/resources/eu/dnetlib/data/collector/plugins/datasets/pangaea-eu-projects_Openaire.csv
1
4148;FondTara;Fondation Tara Expeditions;tara________::1
2
60;QUEEN;Quaternary Environment of the Eurasian North;corda_______::304178
3
4106;EPOCA;European Project on Ocean Acidification;corda_______::211384
4
4119;HERMIONE;Hotspot Ecosystem Research and Mans Impact On European Seas;corda_______::226354
5
4122;HYPOX;In situ monitoring of oxygen depletion in hypoxic ecosystems of coastal and open seas and land-locked water bodies;corda_______::226213
6
4127;CoralFISH;Ecosystem based management of corals, fish and fisheries in the deep waters of Europe and beyond;corda_______::213144
7
4129;ice2sea;ice2sea;corda_______::226375
8
4138;ECO2;Sub-seabed CO2 Storage: Impact on Marine Ecosystems;corda_______::265847
9
4142;MedSeA;Mediterranean Sea Acidification in a Changing Climate;corda_______::265103
10
4145;DARCLIFE;Deep subsurface Archaea: carbon cycle, life strategies, and role in sedimentary ecosystems;corda_______::247153
11
4147;EURO-BASIN;Basin Scale Analysis, Synthesis and Integration;corda_______::264933
12
4154;Past4Future;Climate Change: Learning from the past climate;corda_______::243908
13
4172;CARBOCHANGE;Changes in the carbon uptake and emissions by oceans in a changing climate;corda_______::264879
14
4175;ERA-CLIM;European Reanalysis of Global Climate Observations;corda_______::265229
15
4181;PAGE21;Changing Permafrost in the Arctic and its Global Effects in the 21st Century;corda_______::282700
16
4182;MicroB3;MicroB3 - Microbial Biodiversity, Bioinformatics and Biotechnology;corda_______::308299
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/datacite/schema/Result.java
1

  
2
package eu.dnetlib.data.collector.plugins.datacite.schema;
3

  
4
import com.google.gson.annotations.Expose;
5
import com.google.gson.annotations.SerializedName;
6

  
7
public class Result {
8

  
9
    @SerializedName("body")
10
    @Expose
11
    private String body;
12
    @SerializedName("id")
13
    @Expose
14
    private String id;
15
    @SerializedName("originalId")
16
    @Expose
17
    private String originalId;
18
    @SerializedName("timestamp")
19
    @Expose
20
    private Integer timestamp;
21

  
22
    public String getBody() {
23
        return body;
24
    }
25

  
26
    public void setBody(String body) {
27
        this.body = body;
28
    }
29

  
30
    public String getId() {
31
        return id;
32
    }
33

  
34
    public void setId(String id) {
35
        this.id = id;
36
    }
37

  
38
    public String getOriginalId() {
39
        return originalId;
40
    }
41

  
42
    public void setOriginalId(String originalId) {
43
        this.originalId = originalId;
44
    }
45

  
46
    public Integer getTimestamp() {
47
        return timestamp;
48
    }
49

  
50
    public void setTimestamp(Integer timestamp) {
51
        this.timestamp = timestamp;
52
    }
53

  
54
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/test/resources/eu.dnetlib.data.collector.plugins.projects.gtr2/projects.xml
1
<ns2:projects xmlns:ns1="http://gtr.rcuk.ac.uk/gtr/api"
2
              xmlns:ns2="http://gtr.rcuk.ac.uk/gtr/api/project"
3
              xmlns:ns3="http://gtr.rcuk.ac.uk/gtr/api/project/outcome"
4
              xmlns:ns4="http://gtr.rcuk.ac.uk/gtr/api/organisation"
5
              xmlns:ns5="http://gtr.rcuk.ac.uk/gtr/api/person" xmlns:ns6="http://gtr.rcuk.ac.uk/gtr/api/fund"
6
              ns1:page="1" ns1:size="20" ns1:totalPages="3417" ns1:totalSize="68323">
7
	<ns2:project ns1:id="E178742B-571B-498F-8402-122F17C47546"
8
	             ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/projects/E178742B-571B-498F-8402-122F17C47546"
9
	             ns1:created="2016-11-11T20:42:55Z">
10
		<ns1:links test="ciao">
11
			<ns1:link
12
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/CB8C3733-D17E-46A8-9E7C-D5A76F36612A"
13
					ns1:rel="PI_PER"/>
14
			<ns1:link
15
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/95D46800-A4DF-40AC-8FD3-7EAF5194B22C"
16
					ns1:rel="COI_PER"/>
17
			<ns1:link
18
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/8319F78A-DCBD-49F6-BE00-78E1CD75CDA9"
19
					ns1:rel="LEAD_ORG"/>
20
			<ns1:link
21
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/funds/B794CB74-BD85-452B-8030-69BD4AF82CE9"
22
					ns1:rel="FUND" ns1:start="2007-06-01T00:00:00+01:00"
23
					ns1:end="2010-05-31T00:00:00+01:00"/>
24
			<ns1:link
25
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/2A19F274-8BAF-4AFC-865C-638BFB5FBDB9"
26
					ns1:rel="PUBLICATION"/>
27
		</ns1:links>
28
		<ns2:identifiers>
29
			<ns2:identifier ns2:type="RCUK">BB/E021409/1</ns2:identifier>
30
		</ns2:identifiers>
31
		<ns2:title> A multicellular 3D stem cell model to define the role of stroma in epithelial
32
			differentiation </ns2:title>
33
		<ns2:status>Closed</ns2:status>
34
		<ns2:grantCategory>Research Grant</ns2:grantCategory>
35
		<ns2:leadOrganisationDepartment>Biology</ns2:leadOrganisationDepartment>
36
		<ns2:abstractText> In aging men the disorders of prostate are a major medical problem.
37
			Benign prostatic hyperplasia and cancer are increasingly prevalent. To find cures for
38
			these diseases it is essential to understand how the prostate grows and functions
39
			normally. All organs have their own population of stem cells which grow and develop into
40
			a variety of cells which communicate to form correct organ architecture and function.
41
			This occurs as a result of signals from the stem cell's own genes but also from signals
42
			provided by neighbouring cells, known as stroma. In the prostate, how this occurs is
43
			unknown. We propose to develop a model to grow gland-like structures from adult stem
44
			cells in the laboratory. The model will be employed to understand how stromal cells
45
			influence prostate cellular architecture. We aim to identify proteins which act as
46
			signals from the stroma to change epithelial shape. The shape of a cell has important
47
			effects on cell function. These experiments will increase our knowledge of how tissues
48
			develop and function. Development of tissue-like models based on human cells will
49
			provide a valuable gap between results from animal models and human clinical studies, to
50
			help understand the basic mechanisms of human physiology and disease. Such model systems
51
			will reduce the need for animal experimentation, which is currently the best way to
52
			investigate complex cell interactions in tissues. We anticipate the model will aid
53
			university directed research into human differentiation and disease mechanisms, but also
54
			for the pharmaceutical industry to screen new drugs for efficacy and safety in humans
55
			before trial. </ns2:abstractText>
56
		<ns2:techAbstractText> Recent advances in our lab have resulted in the isolation of human
57
			adult prostate stem cells and the development of 3D models of prostatic acini from basal
58
			cells. Results from 3D modelling indicate that stroma is important for epithelial
59
			morphogenesis and differentiation. Importantly, stromal cultures increase epithelial
60
			cell polarity and columnar cell shape. Using electron microscopy and RT-PCR our
61
			preliminary data has found that these morphological effects are accompanied by increased
62
			desmosomal expression. We now wish to develop our tissue engineering to produce a 3D
63
			model of prostatic acini using a homogeneous population of stem cells. A stem cell model
64
			will allow the study of full epithelial differentiation and the stem cell niche. It is
65
			important to model the prostate with human cells because the mouse prostate has a
66
			different anatomy, cell structure and protein function, and does not develop equivalent
67
			diseases to humans. The model will be used to investigate our hypothesis that 'stroma
68
			signals to control epithelial cell shape and polarity'. We will confirm which desmosomal
69
			isoforms are present in prostate epithelial acini and which are upregulated by stromal
70
			cultures, using Western Blotting and real time PCR. Upregulated desmosomal isoforms will
71
			be used as markers for epithelial cell polarity and shape. A differential gene
72
			expression profile will be generated from stroma grown with epithelial acini in 3D
73
			culture and stroma grown in 3D culture without acini, using microarray analysis.
74
			Candidate stromal genes will be identified that signal to upregulate epithelial polarity
75
			(desmosomal expression) and their function will be confirmed using siRNA knockdown
76
			studies. This is a novel pathway for epithelial cell differentiation which has not been
77
			studied before. </ns2:techAbstractText>
78
		<ns2:healthCategories/>
79
		<ns2:researchActivities/>
80
		<ns2:researchSubjects/>
81
		<ns2:researchTopics/>
82
	</ns2:project>
83
	<ns2:project ns1:id="E37C97C5-7489-4205-834F-151D05B7E07A"
84
	             ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/projects/E37C97C5-7489-4205-834F-151D05B7E07A"
85
	             ns1:created="2016-11-11T20:42:55Z">
86
		<ns1:links test="helo2">
87
			<ns1:link
88
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/AFFB5A85-DAC7-48F2-AE07-952481073BAA"
89
					ns1:rel="PI_PER"/>
90
			<ns1:link
91
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/DFA58FA2-CCD6-445F-B2BC-E830C23FA563"
92
					ns1:rel="COI_PER"/>
93
			<ns1:link
94
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/persons/6E4302E7-A895-4FF7-AE8C-26C2478A82E6"
95
					ns1:rel="COI_PER"/>
96
			<ns1:link
97
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/30A429E3-83B7-4E41-99C0-14A144F07DFE"
98
					ns1:rel="LEAD_ORG"/>
99
			<ns1:link
100
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/FADDC755-1F45-47D7-8591-F183B7160CC2"
101
					ns1:rel="PP_ORG"/>
102
			<ns1:link
103
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/E82E4BC6-2839-4E7A-82CA-88B033E53B45"
104
					ns1:rel="PP_ORG"/>
105
			<ns1:link
106
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/organisations/F8B807ED-ACF2-4724-9DC6-291B77059637"
107
					ns1:rel="PP_ORG"/>
108
			<ns1:link
109
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/funds/B7A4A2BB-1530-4846-8F2B-891EEBFF3F5F"
110
					ns1:rel="FUND" ns1:start="2013-10-01T00:00:00+01:00" ns1:end="2017-11-30T00:00:00Z"/>
111
			<ns1:link
112
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/keyfindings/45480FAD-42D9-4948-ADE2-1B161F6BF481"
113
					ns1:rel="KEY_FINDING"/>
114
			<ns1:link
115
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/disseminations/6E8F7769-1A48-4BDD-8248-6B5938CA3495"
116
					ns1:rel="DISSEMINATION"/>
117
			<ns1:link
118
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/impactsummaries/01F42709-4FC3-4267-9E2A-08591C7950F8"
119
					ns1:rel="IMPACT_SUMMARY"/>
120
			<ns1:link
121
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/76690C2C-FEF9-42AC-A4A2-B09338B33C45"
122
					ns1:rel="PUBLICATION"/>
123
			<ns1:link
124
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/7B0D46AB-374C-4327-9EFE-5CA9683560FB"
125
					ns1:rel="PUBLICATION"/>
126
			<ns1:link
127
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/03415E7B-3BE0-4207-857D-B595F52E2C65"
128
					ns1:rel="PUBLICATION"/>
129
			<ns1:link
130
					ns1:href="http://gtr.rcuk.ac.uk:80/gtr/api/outcomes/publications/2D27AB50-6985-4A61-83C6-54A102DFCFB8"
131
					ns1:rel="PUBLICATION"/>
132
		</ns1:links>
133
		<ns2:identifiers>
134
			<ns2:identifier ns2:type="RCUK">NE/K001906/1</ns2:identifier>
135
		</ns2:identifiers>
136
		<ns2:title> Biogeochemistry, macronutrient and carbon cycling in the benthic layer </ns2:title>
137
		<ns2:status>Active</ns2:status>
138
		<ns2:grantCategory>Research Grant</ns2:grantCategory>
139
		<ns2:leadOrganisationDepartment>School of Ocean and Earth
140
			Science</ns2:leadOrganisationDepartment>
141
		<ns2:abstractText> The coasts and shelf seas that surround us have been the focal point of
142
			human prosperity and well-being throughout our history and, consequently, have had a
143
			disproportionate effect on our culture. The societal importance of the shelf seas
144
			extends beyond food production to include biodiversity, carbon cycling and storage,
145
			waste disposal, nutrient cycling, recreation and renewable energy. Yet, as increasing
146
			proportions of the global population move closer to the coast, our seas have become
147
			progressively eroded by human activities, including overfishing, pollution, habitat
148
			disturbance and climate change. This is worrying because the condition of the seabed,
149
			biodiversity and human society are inextricably linked. Hence, there is an urgent need
150
			to understand the relative sensitivities of a range of shelf habitats so that human
151
			pressures can be managed more effectively to ensure the long-term sustainability of our
152
			seas and provision of societal benefits. Achieving these aims is not straightforward, as
153
			the capacity of the seabed to provide the goods and services we rely upon depends on the
154
			type of substrate (rock, gravel, sand, mud) and local conditions; some habitats are
155
			naturally dynamic and relatively insensitive to disturbance, while others are
156
			comparatively stable and vulnerable to change. This makes it very difficult to assess
157
			habitat sensitivities or make general statements about what benefits we can expect from
158
			our seas in the future. Recently, NERC and DEFRA have initiated a major new research
159
			programme on Shelf Sea Biogeochemistry that will improve knowledge about these issues.
160
			In response to this call, we have assembled a consortium of leading scientists that
161
			includes microbiologists, ecologists, physical oceanographers, biogeochemists,
162
			mathematical modellers and policy advisors. With assistance from organisations like
163
			CEFAS, Marine Scotland and AFBI, they will carry out a series of research cruises around
164
			the UK that will map the sensitivity and status of seabed habitats based on their
165
			physical condition, the microbial and faunal communities that inhabit them, and the size
166
			and dynamics of the nitrogen and carbon pools found there. The latest marine
167
			technologies will measure the amount of mixing and flow rates just above the seabed, as
168
			well as detailed seabed topography. These measurements will allow better understanding
169
			of the physical processes responsible for movement and mixing of sediment, nutrient, and
170
			carbon. At the same time, cores will be retrieved containing the microbial and faunal
171
			communities and their activity and behaviour will be linked to specific biogeochemical
172
			responses. Highly specialised autonomous vehicles, called landers, will also measure
173
			nutrient concentrations and fluxes at the seabed. Components of the system can then be
174
			experimentally manipulated to mimic scenarios of change, such as changing hydrodynamics,
175
			disturbance or components of climate change. This will be achieved in the field by
176
			generating different flow regimes using a submerged flume or, in the laboratory, using
177
			intact sediment communities exposed to different levels of CO2, temperature and oxygen.
178
			By measuring the biogeochemical response and behaviour of the microbial and faunal
179
			communities to these changes, we will generate an understanding of what may happen if
180
			such changes did occur across our shelf seas. We will use all of this information to
181
			assess the relative vulnerability of areas of the UK seabed by overlaying the
182
			observation and experimental results over maps of various human pressures, which will be
183
			of value to planners and policymakers. Mathematical models will test future scenarios of
184
			change, such as opening or closing vulnerable areas to fishing or anticipated changes in
185
			the factors that control nutrient and carbon stocks. This will be valuable in exploring
186
			different responses to external pressures and for deciding which management measures
187
			should be put in place to preserve our shelf seas for future generations </ns2:abstractText>
188
		<ns2:potentialImpact> Commercial private sector and the knowledge economy: new and
189
			innovative methodologies, equipment and techniques, and combined state-of-the-art
190
			technologies (&gt;2.3 million in-kind, see JeS) will assess what the primary physical
191
			and biogeochemical controls of shelf productivity are up to shelf sea scales. Since many
192
			interests rely on the marine environment, beneficiaries will be varied. By sharing
193
			expertise and knowledge, a world-leading manufacturer of microsensors and microscale
194
			instrumentation and an internationally recognized marine environmental data acquisition
195
			company will benefit from exploitable opportunities, e.g. new visualisation tools that
196
			enable holistic understanding of large-scale ecosystem processes. Policy professionals,
197
			governmental and devolved governmental organisations: The importance of shelf seas to
198
			society extends beyond fisheries to wider issues, such as biodiversity, carbon cycling
199
			and storage, waste disposal, nutrient cycling, and renewable energy resources.
200
			Consortium expertise will contribute to these UK priority challenges. The UK Marine
201
			&amp; Coastal Access Act (MCAA), UK Climate Change Act, EU Habitats Directive and EU
202
			Marine Strategy Framework Directive (MSFD) support sustainable use of the marine
203
			environment. They also support the UK vision for achieving 'clean, healthy, safe
204
			productive and biologically diverse ocean and seas' (UK Marine Science Strategy). We
205
			will provide a coherent framework for sound evidence based-science in support of these
206
			policy instruments and statutory requirements. For example, the MSFD aims to achieve
207
			Good Environmental Status in EU marine waters by 2020, but we lack understanding of the
208
			magnitude and synchronicity of change in SSEs. Our research will directly inform
209
			Descriptor 1 (biological diversity) and 6 (seabed integrity) for a wide range of
210
			sediment habitats over time, which is important because the determination of good
211
			environmental status may have to be adapted over time (addressed in Task 2) &quot;in vie
212
			of the dynamic nature of marine ecosystems and their natural variability, and given that
213
			the pressures and impacts on them may vary with the evolvement of different patterns of
214
			human activity and the impact of climate change&quot; (MSFD). Our work will also inform
215
			environmental monitoring programmes: OSPARs Joint Assessment and Monitoring programme,
216
			the Eutrophication Monitoring Programme and The Clean Seas Environment Monitoring
217
			Programme (CSEMP, led by consortium member CEFAS). Task 1-3 complement the outcomes of
218
			CESEMP and provide scientific evidence to OSPAR. Similarly, experimental scenarios and
219
			modelling approaches will provide needed information for (i) the EU Water Framework
220
			Directive (the requirement for 'good chemical and ecological status' by 2015 does not
221
			account for climate change) and, (ii) the UK White Paper for MCAA (it is unclear how
222
			commitments to &quot;look ahead at the predicted impacts of climate change on the marine
223
			environment, how marine activities will contribute towards it, and how they are affected
224
			by it&quot; will be achieved). Finally, other EU instruments, such as the Habitats
225
			Directive (introduced in 1992), the EU Common Fisheries Policy (revised in 2002) and
226
			national legislation such as the UK MCAA and Scottish Marine Act, assume that removal
227
			(or control) of direct pressures will result in ecosystem recovery and/or species
228
			persistence. Our programme includes experimental scenarios and modelling approaches to
229
			provide further information on the vulnerability of SSEs in environmental futures under
230
			multiple pressures (Task 3). Our outputs will also help NERC meet its science theme
231
			challenges. Public, wider community: active engagement with a variety of organisations
232
			is detailed in Pathways to Impact (PtI). Skills&amp; training: In addition to academic
233
			progression, early career researchers will gain experience and receive mentoring in
234
			running a large interdisciplinary programme, as well as training in communication skills
235
			and scientific methodology </ns2:potentialImpact>
236
		<ns2:healthCategories/>
237
		<ns2:researchActivities/>
238
		<ns2:researchSubjects>
239
			<ns2:researchSubject>
240
				<ns2:id>138395</ns2:id>
241
				<ns2:text>Marine environments</ns2:text>
242
				<ns2:percentage>75</ns2:percentage>
243
			</ns2:researchSubject>
244
			<ns2:researchSubject>
245
				<ns2:id>46902</ns2:id>
246
				<ns2:text>Geosciences</ns2:text>
247
				<ns2:percentage>15</ns2:percentage>
248
			</ns2:researchSubject>
249
			<ns2:researchSubject>
250
				<ns2:id>13097</ns2:id>
251
				<ns2:text>Ecol, biodivers.&amp; systematics</ns2:text>
252
				<ns2:percentage>5</ns2:percentage>
253
			</ns2:researchSubject>
254
			<ns2:researchSubject>
255
				<ns2:id>33851</ns2:id>
256
				<ns2:text>Microbial sciences</ns2:text>
257
				<ns2:percentage>5</ns2:percentage>
258
			</ns2:researchSubject>
259
		</ns2:researchSubjects>
260
		<ns2:researchTopics>
261
			<ns2:researchTopic>
262
				<ns2:id>21005</ns2:id>
263
				<ns2:text>Sediment/Sedimentary Processes</ns2:text>
264
				<ns2:percentage>15</ns2:percentage>
265
			</ns2:researchTopic>
266
			<ns2:researchTopic>
267
				<ns2:id>143045</ns2:id>
268
				<ns2:text>Ecosystem Scale Processes</ns2:text>
269
				<ns2:percentage>15</ns2:percentage>
270
			</ns2:researchTopic>
271
			<ns2:researchTopic>
272
				<ns2:id>63200</ns2:id>
273
				<ns2:text>Biogeochemical Cycles</ns2:text>
274
				<ns2:percentage>60</ns2:percentage>
275
			</ns2:researchTopic>
276
			<ns2:researchTopic>
277
				<ns2:id>108367</ns2:id>
278
				<ns2:text>Community Ecology</ns2:text>
279
				<ns2:percentage>5</ns2:percentage>
280
			</ns2:researchTopic>
281
			<ns2:researchTopic>
282
				<ns2:id>80410</ns2:id>
283
				<ns2:text>Responses to environment</ns2:text>
284
				<ns2:percentage>5</ns2:percentage>
285
			</ns2:researchTopic>
286
		</ns2:researchTopics>
287
	</ns2:project>
288
</ns2:projects>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.7.8/src/main/java/eu/dnetlib/data/collector/plugins/excel/Read.java
1
package eu.dnetlib.data.collector.plugins.excel;
2

  
3
/**
4
 * Created by miriam on 10/05/2017.
5
 */
6
import java.io.File;
7
import java.io.FileInputStream;
8
import java.io.IOException;
9
import java.net.URL;
10
import java.util.ArrayList;
11
import java.util.HashMap;
12
import java.util.Iterator;
13

  
14
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
15
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19
import org.apache.poi.ss.usermodel.Cell;
20
import org.apache.poi.ss.usermodel.DataFormatter;
21
import org.apache.poi.ss.usermodel.Row;
22
import org.apache.poi.ss.usermodel.Sheet;
23
import org.apache.poi.ss.usermodel.Workbook;
24
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
25
import org.json.*;
26

  
27
import org.apache.commons.io.FileUtils;
28

  
29
public class Read {
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff