Project

General

Profile

« Previous | Next » 

Revision 53614

Added the schema.org harvesting plugin. It supports sitemapindex files and API listing calls for retrieving the list of endpoints.

View differences:

modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgSitemapIteratorTest.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6
import org.junit.Assert;
7
import org.junit.Before;
8
import org.junit.Ignore;
9
import org.junit.Test;
10

  
11
import java.net.URL;
12
import java.nio.charset.StandardCharsets;
13
import java.util.HashMap;
14
import java.util.concurrent.TimeUnit;
15

  
16
@Ignore
17
public class SchemaOrgSitemapIteratorTest {
18
	@Before
19
	public void setUp() throws Exception {
20
	}
21

  
22
	@Test
23
	public void test() throws CollectorServiceException {
24
		URL resource = SchemaOrgSitemapIteratorTest.class.getResource("sitemap.xml");
25

  
26
		HashMap<String,String> params = new HashMap<>();
27
		params.put("repositoryAccessType", "sitemapindex");
28
		params.put("consumerBlockPolling", Boolean.toString(true));
29
		params.put("consumerBlockPollingTimeout", "2");
30
		params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
31
		params.put("endpointCharset", StandardCharsets.UTF_8.name());
32
		params.put("updatedDateFormat", "YYYY-MM-DD");
33
		params.put("createdDateFormat", "YYYY-MM-DD");
34
		params.put("publicationDateFormat", "YYYY-MM-DD");
35
		params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
36
		params.put("identifierFallbackType", null);
37
		params.put("identifierFallbackURL", Boolean.toString(true));
38
		params.put("identifierMappingARK", "ark, ARK");
39
		params.put("identifierMappingDOI", "doi, DOI");
40
		params.put("identifierMappingHandle", "Handle, HANDLE");
41
		params.put("identifierMappingPURL", "purl, PURL");
42
		params.put("identifierMappingURN", "urn, URN");
43
		params.put("identifierMappingURL", "url, URL");
44

  
45
		params.put("repositoryAccessType", "sitemapindex");
46
		params.put("sitemap_queueSize", "100");
47
		params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
48
		params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
49
		params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
50
		params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text.toString());
51

  
52
		InterfaceDescriptor descriptor = new InterfaceDescriptor();
53
		descriptor.setId("schema.org - reactome");
54
		descriptor.setBaseUrl(resource.toString());
55
		descriptor.setParams(params);
56

  
57
		SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
58

  
59
		Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
60

  
61
		int length =0;
62
		int count =0;
63
		int nullcount =0;
64
		for(String item : iterable) {
65
			count += 1;
66
			if(item == null) {
67
				nullcount+=1;
68
				continue;
69
			}
70
			length = item.length();
71
		}
72
		Assert.assertEquals(1, nullcount);
73
		Assert.assertEquals(2, count);
74
		Assert.assertEquals(1626, length);
75

  
76
	}
77
}
modules/dnet-collector-plugins/trunk/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/sitemap_file.xml
1
file:target/test-classes/eu/dnetlib/data/collector/plugins/schemaorg/index.html
modules/dnet-collector-plugins/trunk/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/sitemap.xml
1
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
2
    <sitemap>
3
        <loc>file:target/test-classes/eu/dnetlib/data/collector/plugins/schemaorg/sitemap_file.xml</loc>
4
    </sitemap>
5
</sitemapindex>
modules/dnet-collector-plugins/trunk/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/index.html
1
<html xmlns="//www.w3.org/1999/xhtml" xml:lang="en-gb" lang="en-gb" dir="ltr" >
2
<head>
3
    <script type="application/ld+json">
4
	{
5
		"@context": "http://schema.org",
6
		"@type": "WebSite",
7
		"url": "https://reactome.org/",
8
		"potentialAction": {
9
			"@type": "SearchAction",
10
			"target": "https://reactome.org/content/query?q={term}",
11
			"query-input": "required name=term"
12
		}
13
	}
14
</script>
15
    <script type="application/ld+json">
16
{
17
	"@context": "http://schema.org",
18
	"@type": "Organization",
19
	"url": "https://reactome.org",
20
	"logo": "https://reactome.org/templates/favourite/images/logo/logo.png",
21
	"email": "help@reactome.org"
22
}
23
</script>
24
    <script type="application/ld+json">
25
	{"name":"Binding of the influenza virion to the host cell","description":"Influenza viruses bind via their surface HA (hemagglutinin) to sialic acid in alpha 2,3 or alpha 2,6 linkage with galactose on the host cell surface. Sialic acid in 2,6 linkages is characteristic of human cells while 2,3 linkages are characteristic of avian cells. The specificity of influenza HA for sialic acid in alpha 2,6 or alpha 2,3 linkages is a feature restricting the transfer of influenza viruses between avian species and humans. This species barrier can be overcome, however. Notably, passaged viruses adapt to their host through mutation in the receptor binding site of the viral HA gene.","url":"https://reactome.org/PathwayBrowser/#/R-HSA-168272","sameAs":null,"version":"66","keywords":["Reaction"],"creator":[],"includedInDataCatalog":{"url":"https://reactome.org","name":"Reactome","@type":"DataCatalog"},"distribution":[{"contentUrl":"https://reactome.org/ContentService/exporter/sbml/168272.xml","fileFormat":"SBML","@type":"DataDownload"},{"contentUrl":"https://reactome.org/ReactomeRESTfulAPI/RESTfulWS/sbgnExporter/168272","fileFormat":"SBGN","@type":"DataDownload"},{"contentUrl":"https://reactome.org/ReactomeRESTfulAPI/RESTfulWS/biopaxExporter/Level2/168272","fileFormat":"BIOPAX2","@type":"DataDownload"},{"contentUrl":"https://reactome.org/ReactomeRESTfulAPI/RESTfulWS/biopaxExporter/Level3/168272","fileFormat":"BIOPAX3","@type":"DataDownload"},{"contentUrl":"https://reactome.org/cgi-bin/pdfexporter?DB=gk_current&ID=168272","fileFormat":"PDF","@type":"DataDownload"},{"contentUrl":"https://reactome.org/cgi-bin/rtfexporter?DB=gk_current&ID=168272","fileFormat":"DOCX","@type":"DataDownload"},{"contentUrl":"https://reactome.org/cgi-bin/protegeexporter?DB=gk_current&ID=168272","fileFormat":"OWL","@type":"DataDownload"}],"citation":["http://www.ncbi.nlm.nih.gov/pubmed/0"],"license":"https://creativecommons.org/licenses/by/4.0/","@context":"http://schema.org/","@type":"DataSet"}
26
</script>
27
</head>
28
<body>
29
this is the body of the page
30
</body>
31
</html>
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIterableOLD.java
1
//package eu.dnetlib.data.collector.plugins.schemaorg;
2
//
3
//import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
//import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
5
//import org.apache.commons.logging.Log;
6
//import org.apache.commons.logging.LogFactory;
7
//
8
//import java.net.URL;
9
//import java.util.Iterator;
10
//import java.util.concurrent.ArrayBlockingQueue;
11
//import java.util.concurrent.ExecutorService;
12
//import java.util.concurrent.Executors;
13
//
14
//public class SchemaOrgIterableOLD implements Iterable<String> {
15
//	private static final Log log = LogFactory.getLog(SchemaOrgIterable.class);
16
//
17
//	public static class Options {
18
//		private SchemaOrgIterator.Options schemaOrgIteratorOptions;
19
//		private SitemapIndexIterator.Options sitemapIndexIteratorOptions;
20
//		private SitemapFileIterator.Options sitemapFileIteratorOptions;
21
//		private EndpointAccessIterator.Options endpointAccessIteratorOptions;
22
//		private DatasetMappingIterator.Options datasetMappingIteratorOptions;
23
//
24
//		private int queueSize;
25
//
26
//		public DatasetMappingIterator.Options getDatasetMappingIteratorOptions() {
27
//			return datasetMappingIteratorOptions;
28
//		}
29
//
30
//		public void setDatasetMappingIteratorOptions(DatasetMappingIterator.Options datasetMappingIteratorOptions) {
31
//			this.datasetMappingIteratorOptions = datasetMappingIteratorOptions;
32
//		}
33
//
34
//		public EndpointAccessIterator.Options getEndpointAccessIteratorOptions() {
35
//			return endpointAccessIteratorOptions;
36
//		}
37
//
38
//		public void setEndpointAccessIteratorOptions(EndpointAccessIterator.Options endpointAccessIteratorOptions) {
39
//			this.endpointAccessIteratorOptions = endpointAccessIteratorOptions;
40
//		}
41
//
42
//		public SitemapFileIterator.Options getSitemapFileIteratorOptions() {
43
//			return sitemapFileIteratorOptions;
44
//		}
45
//
46
//		public void setSitemapFileIteratorOptions(SitemapFileIterator.Options sitemapFileIteratorOptions) {
47
//			this.sitemapFileIteratorOptions = sitemapFileIteratorOptions;
48
//		}
49
//
50
//		public SitemapIndexIterator.Options getSitemapIndexIteratorOptions() {
51
//			return sitemapIndexIteratorOptions;
52
//		}
53
//
54
//		public void setSitemapIndexIteratorOptions(SitemapIndexIterator.Options sitemapIndexIteratorOptions) {
55
//			this.sitemapIndexIteratorOptions = sitemapIndexIteratorOptions;
56
//		}
57
//
58
//		public SchemaOrgIterator.Options getSchemaOrgIteratorOptions() {
59
//			return schemaOrgIteratorOptions;
60
//		}
61
//
62
//		public void setSchemaOrgIteratorOptions(SchemaOrgIterator.Options schemaOrgIteratorOptions) {
63
//			this.schemaOrgIteratorOptions = schemaOrgIteratorOptions;
64
//		}
65
//
66
//		public int getQueueSize() {
67
//			return queueSize;
68
//		}
69
//
70
//		public void setQueueSize(int queueSize) {
71
//			this.queueSize = queueSize;
72
//		}
73
//	}
74
//
75
//	private Options options;
76
//	private ArrayBlockingQueue<String> queue;
77
//
78
//	public SchemaOrgIterable(Options options) {
79
//		this.options = options;
80
//		this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize(), true);
81
//	}
82
//
83
//	public void bootstrap() {
84
//		ExecutorService executor = Executors.newSingleThreadExecutor();
85
//		executor.execute(new Harvester());
86
//		executor.shutdown();
87
//	}
88
//
89
//	@Override
90
//	public Iterator<String> iterator() {
91
//		return new SchemaOrgIterator(this.options.getSchemaOrgIteratorOptions(), this.queue);
92
//	}
93
//
94
//	private class Harvester implements Runnable{
95
//
96
//		@Override
97
//		public void run() {
98
//			this.execute();
99
//		}
100
//
101
//		private void execute(){
102
//			try {
103
//				SitemapIndexIterator sitemapIndexIterator = new SitemapIndexIterator(options.getSitemapIndexIteratorOptions());
104
//				sitemapIndexIterator.bootstrap();
105
//
106
//				while (sitemapIndexIterator.hasNext()) {
107
//					String sitemapFile = sitemapIndexIterator.next();
108
//					if(sitemapFile == null) continue;
109
//
110
//					SitemapFileIterator.Options sitemapFileIteratorOptions = (SitemapFileIterator.Options)options.getSitemapFileIteratorOptions().clone();
111
//					sitemapFileIteratorOptions.setFileUrl(new URL(sitemapFile));
112
//					SitemapFileIterator sitemapFileIterator = new SitemapFileIterator(sitemapFileIteratorOptions);
113
//					sitemapFileIterator.bootstrap();
114
//
115
//					EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessIteratorOptions(), sitemapFileIterator);
116
//					DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingIteratorOptions(), endpointAccessIterator);
117
//
118
//					while (datasetMappingIterator.hasNext()) {
119
//						String xml = datasetMappingIterator.next();
120
//						if(xml == null) continue;
121
//
122
//						queue.put(xml);
123
//					}
124
//				}
125
//			}catch(Exception ex){
126
//				log.error("problem execution harvesting", ex);
127
//			}
128
//			finally {
129
//				try {
130
//					queue.put(Conventions.TerminateHint);
131
//				} catch (Exception ex) {
132
//					log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
133
//				}
134
//			}
135
//		}
136
//	}
137
//}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5

  
6
import java.util.Iterator;
7
import java.util.concurrent.ArrayBlockingQueue;
8

  
9
public class SchemaOrgIterable implements Iterable<String> {
10
	private static final Log log = LogFactory.getLog(SchemaOrgIterable.class);
11

  
12
	public static class Options {
13
		private EndpointAccessIterator.Options endpointAccessOptions;
14
		private DatasetMappingIterator.Options datasetMappingOptions;
15

  
16
		public EndpointAccessIterator.Options getEndpointAccessOptions() {
17
			return endpointAccessOptions;
18
		}
19

  
20
		public void setEndpointAccessOptions(EndpointAccessIterator.Options endpointAccessOptions) {
21
			this.endpointAccessOptions = endpointAccessOptions;
22
		}
23

  
24
		public DatasetMappingIterator.Options getDatasetMappingOptions() {
25
			return datasetMappingOptions;
26
		}
27

  
28
		public void setDatasetMappingOptions(DatasetMappingIterator.Options datasetMappingOptions) {
29
			this.datasetMappingOptions = datasetMappingOptions;
30
		}
31
	}
32

  
33
	private Options options;
34
	private RepositoryIterable repository;
35

  
36
	public SchemaOrgIterable(Options options, RepositoryIterable repository){
37
		this.options = options;
38
		this.repository = repository;
39
	}
40

  
41
	@Override
42
	public Iterator<String> iterator() {
43
		Iterator<String> repositoryIterator = this.repository.iterator();
44
		EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessOptions(), repositoryIterator);
45
		DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingOptions(), endpointAccessIterator);
46

  
47
		return datasetMappingIterator;
48
	}
49
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/Utils.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.json.JSONArray;
4
import org.json.JSONObject;
5
import org.w3c.dom.Document;
6
import org.w3c.dom.NodeList;
7
import org.xml.sax.InputSource;
8

  
9
import javax.xml.parsers.DocumentBuilder;
10
import javax.xml.parsers.DocumentBuilderFactory;
11
import javax.xml.xpath.XPath;
12
import javax.xml.xpath.XPathConstants;
13
import javax.xml.xpath.XPathExpression;
14
import javax.xml.xpath.XPathFactory;
15
import java.io.File;
16
import java.io.FileInputStream;
17
import java.io.FileOutputStream;
18
import java.io.StringReader;
19
import java.nio.charset.Charset;
20
import java.nio.charset.UnsupportedCharsetException;
21
import java.util.ArrayList;
22
import java.util.EnumSet;
23
import java.util.HashMap;
24
import java.util.List;
25
import java.util.zip.GZIPInputStream;
26

  
27
/**
 * Static helper routines: XPath extraction over XML, gzip decompression, and typed lookups
 * in string-valued parameter maps that fall back to a caller-supplied default.
 */
public class Utils {

	/**
	 * Creates a DOM parser that neither resolves external general/parameter entities nor
	 * loads external DTDs, so XML obtained from outside the application cannot pull local
	 * files or remote resources into the parsed document (XXE). Internal entities and
	 * documents carrying a DOCTYPE are still accepted.
	 *
	 * @throws Exception if the underlying parser implementation rejects one of the features
	 */
	private static DocumentBuilder newDocumentBuilder() throws Exception {
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
		factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
		factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
		factory.setXIncludeAware(false);
		return factory.newDocumentBuilder();
	}

	/**
	 * Parses {@code xml} and returns the node values selected by {@code xpath}.
	 *
	 * @param xml   XML document text
	 * @param xpath XPath expression evaluated as a node set
	 * @return one entry per selected node, see {@link #collectAsStrings(Document, String)}
	 * @throws Exception on parse or XPath errors
	 */
	public static List<String> collectAsStrings(String xml, String xpath) throws Exception{
		Document doc = newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
		return Utils.collectAsStrings(doc, xpath);
	}

	/**
	 * Parses {@code file} and returns the node values selected by {@code xpath}.
	 *
	 * @param file  XML file to parse
	 * @param xpath XPath expression evaluated as a node set
	 * @return one entry per selected node, see {@link #collectAsStrings(Document, String)}
	 * @throws Exception on I/O, parse or XPath errors
	 */
	public static List<String> collectAsStrings(File file, String xpath) throws Exception{
		Document doc = newDocumentBuilder().parse(file);
		return Utils.collectAsStrings(doc, xpath);
	}

	/**
	 * Evaluates {@code xpath} against {@code doc} and collects {@code getNodeValue()} of every
	 * selected node, in result order. Per the DOM specification the node value of an element
	 * node is {@code null}, so expressions should select text or attribute nodes
	 * (for example {@code //a/text()}) to obtain content.
	 *
	 * @param doc   parsed document
	 * @param xpath XPath expression evaluated as a node set
	 * @return list with one (possibly {@code null}) value per selected node
	 * @throws Exception if the expression cannot be compiled or evaluated
	 */
	public static List<String> collectAsStrings(Document doc, String xpath) throws Exception{
		XPathFactory xPathfactory = XPathFactory.newInstance();
		XPath path = xPathfactory.newXPath();
		XPathExpression expr = path.compile(xpath);
		NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);

		List<String> values = new ArrayList<>();
		for (int i = 0; i < nodes.getLength(); i++) {
			values.add(nodes.item(i).getNodeValue());
		}
		return values;
	}

	/**
	 * Decompresses the gzip file {@code input} into {@code output}.
	 *
	 * @param input  gzip-compressed source file
	 * @param output destination file; only opened once the gzip header has been read
	 * @throws Exception on I/O errors or if {@code input} is not valid gzip data
	 */
	public static void decompressGZipTo(File input, File output) throws Exception {
		// The raw stream is its own resource so it is closed even when the GZIPInputStream
		// constructor throws (e.g. the input is not gzip data).
		try (FileInputStream raw = new FileInputStream(input);
			 GZIPInputStream in = new GZIPInputStream(raw);
			 FileOutputStream out = new FileOutputStream(output)) {
			byte[] buffer = new byte[8192];
			int len;
			while((len = in.read(buffer)) != -1){
				out.write(buffer, 0, len);
			}
		}
	}

	/**
	 * @return the value mapped to {@code key}, or {@code defaultValue} if the mapped value is {@code null}
	 */
	public static String getAsString(HashMap<String,String> map, String key, String defaultValue)
	{
		String value = map.get(key);
		return value == null ? defaultValue : value;
	}

	/**
	 * Splits the value mapped to {@code key} on commas, trims each part and drops parts that
	 * are empty after trimming.
	 *
	 * @return the trimmed non-empty parts (possibly an empty list), or {@code defaultValue}
	 *         if the mapped value is {@code null}
	 */
	public static List<String> getAsStringCsv(HashMap<String,String> map, String key, List<String> defaultValue)
	{
		String value = map.get(key);
		if(value == null) return defaultValue;
		List<String> curated = new ArrayList<>();
		for(String item : value.split(",")){
			String trimmed = item.trim();
			if(trimmed.length() == 0) continue;
			curated.add(trimmed);
		}
		return curated;
	}

	/**
	 * @return the value mapped to {@code key} parsed with {@link Integer#parseInt(String)}, or
	 *         {@code defaultValue} if the value is {@code null} or not parseable
	 */
	public static int getAsInt(HashMap<String,String> map, String key, int defaultValue)
	{
		String value = map.get(key);
		if(value == null) return defaultValue;
		try {
			return Integer.parseInt(value);
		} catch (NumberFormatException e) {
			return defaultValue;
		}
	}

	/**
	 * @return the value mapped to {@code key} parsed with {@link Long#parseLong(String)}, or
	 *         {@code defaultValue} if the value is {@code null} or not parseable
	 */
	public static long getAsLong(HashMap<String,String> map, String key, long defaultValue)
	{
		String value = map.get(key);
		if(value == null) return defaultValue;
		try {
			return Long.parseLong(value);
		} catch (NumberFormatException e) {
			return defaultValue;
		}
	}

	/**
	 * Looks up an enum constant of {@code clazz} by name, ignoring case.
	 *
	 * @return the constant whose name matches the mapped value, or {@code defaultValue} if
	 *         the value is {@code null} or matches no constant
	 */
	public static <E extends Enum<E>> E getAsEnum(HashMap<String,String> map, String key, E defaultValue, Class<E> clazz) {
		String value = map.get(key);
		if (value == null) return defaultValue;
		for(E val : EnumSet.allOf(clazz)){
			if(val.name().equalsIgnoreCase(value)) return val;
		}
		return defaultValue;
	}

	/**
	 * @return {@link Boolean#parseBoolean(String)} of the mapped value (so anything other than
	 *         "true", ignoring case, yields {@code false}), or {@code defaultValue} if the
	 *         value is {@code null}
	 */
	public static Boolean getAsBoolean(HashMap<String,String> map, String key, Boolean defaultValue) {
		String value = map.get(key);
		if (value == null) return defaultValue;
		return Boolean.parseBoolean(value);
	}

	/**
	 * @return the charset named by the mapped value, or {@code defaultValue} if the value is
	 *         {@code null}, names an unsupported charset, or is not a legal charset name
	 */
	public static Charset getAsCharset(HashMap<String,String> map, String key, Charset defaultValue)
	{
		String value = map.get(key);
		if(value == null) return defaultValue;
		try {
			return Charset.forName(value);
		} catch (java.nio.charset.IllegalCharsetNameException | UnsupportedCharsetException e) {
			// Charset.forName reports a malformed name with IllegalCharsetNameException, which
			// is not a subtype of UnsupportedCharsetException; both mean "fall back".
			return defaultValue;
		}
	}

}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIteratorOLD.java
1
//package eu.dnetlib.data.collector.plugins.schemaorg;
2
//
3
//import org.apache.commons.logging.Log;
4
//import org.apache.commons.logging.LogFactory;
5
//
6
//import java.util.Iterator;
7
//import java.util.NoSuchElementException;
8
//import java.util.concurrent.ArrayBlockingQueue;
9
//import java.util.concurrent.TimeUnit;
10
//
11
//public class SchemaOrgIteratorOLD implements Iterator<String> {
12
//	private static final Log log = LogFactory.getLog(SchemaOrgIterator.class);
13
//
14
//	public static class Options {
15
//		private Boolean blockPolling;
16
//		private long pollTimeout;
17
//		private TimeUnit pollTimeoutUnit;
18
//
19
//		public Boolean getBlockPolling() {
20
//			return blockPolling;
21
//		}
22
//
23
//		public void setBlockPolling(Boolean blockPolling) {
24
//			this.blockPolling = blockPolling;
25
//		}
26
//
27
//		public long getPollTimeout() {
28
//			return pollTimeout;
29
//		}
30
//
31
//		public void setPollTimeout(long pollTimeout) {
32
//			this.pollTimeout = pollTimeout;
33
//		}
34
//
35
//		public TimeUnit getPollTimeoutUnit() {
36
//			return pollTimeoutUnit;
37
//		}
38
//
39
//		public void setPollTimeoutUnit(TimeUnit pollTimeoutUnit) {
40
//			this.pollTimeoutUnit = pollTimeoutUnit;
41
//		}
42
//	}
43
//
44
//	private ArrayBlockingQueue<String> queue;
45
//	private Options options;
46
//	private boolean hasTerminated;
47
//
48
//	public SchemaOrgIterator(Options options, ArrayBlockingQueue<String> queue) {
49
//		this.options = options;
50
//		this.queue = queue;
51
//		this.hasTerminated = false;
52
//	}
53
//
54
//	public void bootstrap(){
55
//
56
//	}
57
//
58
//	@Override
59
//	public boolean hasNext() {
60
//		if(this.hasTerminated) return false;
61
//		return true;
62
//	}
63
//
64
//	@Override
65
//	public String next() {
66
//		String next = this.poll();
67
//		if (next != null && next.equalsIgnoreCase(Conventions.TerminateHint)) {
68
//			this.hasTerminated = true;
69
//			next = null;
70
//		}
71
//		return next;
72
//	}
73
//
74
//	private String poll(){
75
//		if(this.options.getBlockPolling()) {
76
//			try {
77
//				return this.queue.poll(this.options.getPollTimeout(), this.options.getPollTimeoutUnit());
78
//			} catch (InterruptedException ex) {
79
//				log.warn(String.format("could not poll elements from queue for more than %s %s. throwing", this.options.getPollTimeout(), this.options.getPollTimeoutUnit()));
80
//				throw new NoSuchElementException(ex.getMessage());
81
//			}
82
//		}
83
//		else {
84
//			return this.queue.poll();
85
//		}
86
//	}
87
//}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetDocument.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.w3c.dom.Attr;
4
import org.w3c.dom.Document;
5
import org.w3c.dom.Element;
6

  
7
import javax.xml.parsers.DocumentBuilder;
8
import javax.xml.parsers.DocumentBuilderFactory;
9
import javax.xml.parsers.ParserConfigurationException;
10
import javax.xml.transform.Transformer;
11
import javax.xml.transform.TransformerFactory;
12
import javax.xml.transform.dom.DOMSource;
13
import javax.xml.transform.stream.StreamResult;
14
import java.io.StringWriter;
15
import java.time.LocalDate;
16
import java.time.format.DateTimeFormatter;
17
import java.util.Calendar;
18
import java.util.Date;
19
import java.util.List;
20

  
21
public class DatasetDocument {
22
	private List<Identifier> identifiers;
23
	private List<Creator> creators;
24
	private List<String> titles;
25
	private List<String> alternativeTitles;
26
	private List<String> publishers;
27
	private List<LocalDate> publicationDates;
28
	private List<String> subjects;
29
	private List<Contributor> contributors;
30
	private List<LocalDate> createdDates;
31
	private List<LocalDate> updatedDates;
32
	private List<String> languages;
33
	private List<ResourceType> resourceTypes;
34
	private List<AlternateIdentifier> alternateIdentifier;
35
	private List<Citation> citations;
36
	private List<String> sizes;
37
	private List<String> format;
38
	private List<String> version;
39
	private List<License> licenses;
40
	private List<String> descriptions;
41
	private List<String> disambiguatingDescriptions;
42
	private List<SpatialCoverage> geoLocations;
43

  
44
	public List<Identifier> getIdentifiers() {
45
		return identifiers;
46
	}
47

  
48
	public void setIdentifiers(List<Identifier> identifiers) {
49
		this.identifiers = identifiers;
50
	}
51

  
52
	public List<Creator> getCreators() {
53
		return creators;
54
	}
55

  
56
	public void setCreators(List<Creator> creators) {
57
		this.creators = creators;
58
	}
59

  
60
	public List<String> getTitles() {
61
		return titles;
62
	}
63

  
64
	public void setTitles(List<String> titles) {
65
		this.titles = titles;
66
	}
67

  
68
	public List<String> getAlternativeTitles() {
69
		return alternativeTitles;
70
	}
71

  
72
	public void setAlternativeTitles(List<String> alternativeTitles) {
73
		this.alternativeTitles = alternativeTitles;
74
	}
75

  
76
	public List<String> getPublishers() {
77
		return publishers;
78
	}
79

  
80
	public void setPublishers(List<String> publishers) {
81
		this.publishers = publishers;
82
	}
83

  
84
	public List<LocalDate> getPublicationDates() {
85
		return publicationDates;
86
	}
87

  
88
	public void setPublicationDates(List<LocalDate> publicationDates) {
89
		this.publicationDates = publicationDates;
90
	}
91

  
92
	public List<String> getSubjects() {
93
		return subjects;
94
	}
95

  
96
	public void setSubjects(List<String> subjects) {
97
		this.subjects = subjects;
98
	}
99

  
100
	public List<Contributor> getContributors() {
101
		return contributors;
102
	}
103

  
104
	public void setContributors(List<Contributor> contributors) {
105
		this.contributors = contributors;
106
	}
107

  
108
	public List<LocalDate> getCreatedDates() {
109
		return createdDates;
110
	}
111

  
112
	public void setCreatedDates(List<LocalDate> createdDates) {
113
		this.createdDates = createdDates;
114
	}
115

  
116
	public List<LocalDate> getUpdatedDates() {
117
		return updatedDates;
118
	}
119

  
120
	public void setUpdatedDates(List<LocalDate> updatedDates) {
121
		this.updatedDates = updatedDates;
122
	}
123

  
124
	public List<String> getLanguages() {
125
		return languages;
126
	}
127

  
128
	public void setLanguages(List<String> languages) {
129
		this.languages = languages;
130
	}
131

  
132
	public List<ResourceType> getResourceTypes() {
133
		return resourceTypes;
134
	}
135

  
136
	public void setResourceTypes(List<ResourceType> resourceTypes) {
137
		this.resourceTypes = resourceTypes;
138
	}
139

  
140
	public List<AlternateIdentifier> getAlternateIdentifier() {
141
		return alternateIdentifier;
142
	}
143

  
144
	public void setAlternateIdentifier(List<AlternateIdentifier> alternateIdentifier) {
145
		this.alternateIdentifier = alternateIdentifier;
146
	}
147

  
148
	public List<Citation> getCitations() {
149
		return citations;
150
	}
151

  
152
	public void setCitations(List<Citation> citations) {
153
		this.citations = citations;
154
	}
155

  
156
	public List<String> getSizes() {
157
		return sizes;
158
	}
159

  
160
	public void setSizes(List<String> sizes) {
161
		this.sizes = sizes;
162
	}
163

  
164
	public List<String> getFormat() {
165
		return format;
166
	}
167

  
168
	public void setFormat(List<String> format) {
169
		this.format = format;
170
	}
171

  
172
	public List<String> getVersion() {
173
		return version;
174
	}
175

  
176
	public void setVersion(List<String> version) {
177
		this.version = version;
178
	}
179

  
180
	public List<License> getLicenses() {
181
		return licenses;
182
	}
183

  
184
	public void setLicenses(List<License> licenses) {
185
		this.licenses = licenses;
186
	}
187

  
188
	public List<String> getDescriptions() {
189
		return descriptions;
190
	}
191

  
192
	public void setDescriptions(List<String> descriptions) {
193
		this.descriptions = descriptions;
194
	}
195

  
196
	public List<String> getDisambiguatingDescriptions() {
197
		return disambiguatingDescriptions;
198
	}
199

  
200
	public void setDisambiguatingDescriptions(List<String> disambiguatingDescriptions) {
201
		this.disambiguatingDescriptions = disambiguatingDescriptions;
202
	}
203

  
204
	public List<SpatialCoverage> getGeoLocations() {
205
		return geoLocations;
206
	}
207

  
208
	public void setGeoLocations(List<SpatialCoverage> geoLocations) {
209
		this.geoLocations = geoLocations;
210
	}
211

  
212
	public String toXml() throws Exception {
213
		DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
214
		DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
215
		Document doc = docBuilder.newDocument();
216

  
217
		Element root = doc.createElement("dataset");
218
		doc.appendChild(root);
219

  
220
		if(this.identifiers!=null){
221
			for(Identifier item : this.identifiers){
222
				item.toXml(root);
223
			}
224
		}
225
		if(this.creators!=null){
226
			Element creators = doc.createElement("creators");
227
			root.appendChild(creators);
228
			for(Creator item : this.creators){
229
				item.toXml(creators);
230
			}
231
		}
232
		if(this.titles!=null || this.alternativeTitles!=null){
233
			Element titles = doc.createElement("titles");
234
			root.appendChild(titles);
235
			if(this.titles!=null) {
236
				for (String item : this.titles) {
237
					Element title = doc.createElement("title");
238
					titles.appendChild(title);
239
					title.appendChild(doc.createTextNode(item));
240
				}
241
			}
242
			if(this.alternativeTitles!=null) {
243
				for (String item : this.alternativeTitles) {
244
					Element title = doc.createElement("title");
245
					titles.appendChild(title);
246
					title.setAttribute("titleType", "AlternativeTitle");
247
					title.appendChild(doc.createTextNode(item));
248
				}
249
			}
250
		}
251
		if(this.publishers!=null){
252
			for(String item : this.publishers){
253
				Element publisher = doc.createElement("publisher");
254
				root.appendChild(publisher);
255
				publisher.appendChild(doc.createTextNode(item));
256
			}
257
		}
258
		if(this.publicationDates!=null){
259
			for(LocalDate item : this.publicationDates){
260
				Element publicationYear = doc.createElement("publicationYear");
261
				root.appendChild(publicationYear);
262
				publicationYear.appendChild(doc.createTextNode(Integer.toString(item.getYear())));
263
			}
264
		}
265
		if(this.subjects!=null){
266
			Element subjects = doc.createElement("subjects");
267
			root.appendChild(subjects);
268
			for(String item : this.subjects){
269
				Element subject = doc.createElement("subject");
270
				subjects.appendChild(subject);
271
				subject.appendChild(doc.createTextNode(item));
272
			}
273
		}
274
		if(this.contributors!=null){
275
			for(Contributor item : this.contributors){
276
				item.toXml(root);
277
			}
278
		}
279
		if(this.createdDates!=null || this.updatedDates!=null){
280
			Element dates = doc.createElement("dates");
281
			root.appendChild(dates);
282

  
283
			DateTimeFormatter formatter = DateTimeFormatter.ofPattern("YYYY-MM-DD");
284

  
285
			if(createdDates!=null) {
286
				for (LocalDate item : this.createdDates) {
287
					Element date = doc.createElement("date");
288
					root.appendChild(date);
289
					date.setAttribute("dateType", "Created");
290
					date.appendChild(doc.createTextNode(item.format(formatter)));
291
				}
292
			}
293
			if(updatedDates!=null) {
294
				for (LocalDate item : this.updatedDates) {
295
					Element date = doc.createElement("date");
296
					root.appendChild(date);
297
					date.setAttribute("dateType", "Updated");
298
					date.appendChild(doc.createTextNode(item.format(formatter)));
299
				}
300
			}
301
		}
302
		if(this.languages!=null){
303
			for(String item : this.languages){
304
				Element language = doc.createElement("language");
305
				root.appendChild(language);
306
				language.appendChild(doc.createTextNode(item));
307
			}
308
		}
309
		if(this.resourceTypes!=null){
310
			for(ResourceType item : this.resourceTypes){
311
				item.toXml(root);
312
			}
313
		}
314
		if(this.alternateIdentifier!=null){
315
			Element alternateIdentifiers = doc.createElement("alternateIdentifiers");
316
			root.appendChild(alternateIdentifiers);
317
			for(AlternateIdentifier item : this.alternateIdentifier){
318
				item.toXml(alternateIdentifiers);
319
			}
320
		}
321
		if(this.citations!=null){
322
			for(Citation item : this.citations){
323
				item.toXml(root);
324
			}
325
		}
326
		if(this.sizes!=null){
327
			Element sizes = doc.createElement("sizes");
328
			root.appendChild(sizes);
329
			for(String item : this.sizes){
330
				Element size = doc.createElement("size");
331
				sizes.appendChild(size);
332
				size.appendChild(doc.createTextNode(item));
333
			}
334
		}
335
		if(this.format!=null){
336
			Element formats = doc.createElement("formats");
337
			root.appendChild(formats);
338
			for(String item : this.format){
339
				Element format = doc.createElement("format");
340
				formats.appendChild(format);
341
				format.appendChild(doc.createTextNode(item));
342
			}
343
		}
344
		if(this.version!=null){
345
			for(String item : this.version){
346
				Element version = doc.createElement("version");
347
				root.appendChild(version);
348
				version.appendChild(doc.createTextNode(item));
349
			}
350
		}
351
		if(this.licenses!=null){
352
			Element rightsList = doc.createElement("rightsList");
353
			root.appendChild(rightsList);
354
			for(License item : this.licenses){
355
				item.toXml(rightsList);
356
			}
357
		}
358
		if(this.descriptions!=null || this.disambiguatingDescriptions!=null){
359
			Element descriptions = doc.createElement("descriptions");
360
			root.appendChild(descriptions);
361
			if(this.descriptions!=null) {
362
				for (String item : this.descriptions) {
363
					Element description = doc.createElement("description");
364
					descriptions.appendChild(description);
365
					description.setAttribute("descriptionType", "Abstract");
366
					description.appendChild(doc.createTextNode(item));
367
				}
368
			}
369
			if(this.disambiguatingDescriptions!=null) {
370
				for (String item : this.disambiguatingDescriptions) {
371
					Element description = doc.createElement("description");
372
					descriptions.appendChild(description);
373
					description.setAttribute("descriptionType", "Other");
374
					description.appendChild(doc.createTextNode(item));
375
				}
376
			}
377
		}
378
		if(this.geoLocations!=null){
379
			Element geoLocations = doc.createElement("geoLocations");
380
			root.appendChild(geoLocations);
381
			for(SpatialCoverage item : this.geoLocations){
382
				item.toXml(geoLocations);
383
			}
384
		}
385

  
386
		TransformerFactory tf = TransformerFactory.newInstance();
387
		Transformer transformer = tf.newTransformer();
388
		StringWriter writer = new StringWriter();
389
		transformer.transform(new DOMSource(doc), new StreamResult(writer));
390
		String xml = writer.getBuffer().toString();
391
		return xml;
392
	}
393

  
394
	public static class SpatialCoverage{
395
		public static class Point{
396
			public String latitude;
397
			public String longitude;
398

  
399
			public Point() {}
400

  
401
			public Point(String latitude, String longitude){
402
				this.latitude = latitude;
403
				this.longitude = longitude;
404
			}
405
		}
406
		public String name;
407
		public List<Point> points;
408
		public List<String> boxes;
409

  
410
		public SpatialCoverage() {}
411

  
412
		public SpatialCoverage(String name, List<Point> points, List<String> boxes ) {
413
			this.name = name;
414
			this.points = points;
415
			this.boxes = boxes;
416
		}
417

  
418
		public void toXml(Element parent){
419
			Element node = parent.getOwnerDocument().createElement("geoLocation");
420
			parent.appendChild(node);
421

  
422
			if(this.points!=null) {
423
				for(Point point : this.points) {
424
					if(point.latitude == null || point.longitude == null) continue;
425
					Element geoLocationPoint = parent.getOwnerDocument().createElement("geoLocationPoint");
426
					geoLocationPoint.appendChild(parent.getOwnerDocument().createTextNode(String.format("%s %s", point.latitude, point.longitude)));
427
					node.appendChild(geoLocationPoint);
428
				}
429
			}
430
			if(this.boxes!=null) {
431
				for(String box : this.boxes) {
432
					if(box == null) continue;
433
					Element geoLocationBox = parent.getOwnerDocument().createElement("geoLocationBox");
434
					geoLocationBox.appendChild(parent.getOwnerDocument().createTextNode(box));
435
					node.appendChild(geoLocationBox);
436
				}
437
			}
438
			if(this.name!=null) {
439
				Element geoLocationPlace = parent.getOwnerDocument().createElement("geoLocationPlace");
440
				geoLocationPlace.appendChild(parent.getOwnerDocument().createTextNode(this.name));
441
				node.appendChild(geoLocationPlace);
442
			}
443
		}
444
	}
445

  
446
	public static class License{
447
		public String name;
448
		public String url;
449

  
450
		public License() {}
451

  
452
		public License(String name, String url) {
453
			this.name = name;
454
			this.url = url;
455
		}
456

  
457
		public void toXml(Element parent){
458
			Element node = parent.getOwnerDocument().createElement("rights");
459
			parent.appendChild(node);
460

  
461
			if(this.url!=null) {
462
				node.setAttribute("rightsURI", this.url);
463
			}
464
			if(this.name!=null) {
465
				node.appendChild(parent.getOwnerDocument().createTextNode(this.name));
466
			}
467
		}
468
	}
469

  
470
	public static class Citation{
471
		public enum CitationIdentifierType{
472
			ARK, arXiv, bibcode, DOI, EAN13, EISSN, Handle, ISBN, ISSN, ISTC, LISSN, LSID, PMID,
473
			PURL, UPC, URL, URN
474
		}
475

  
476
		public CitationIdentifierType type;
477
		public String value;
478

  
479
		public Citation() {}
480

  
481
		public Citation(String value, CitationIdentifierType type) {
482
			this.value = value;
483
			this.type = type;
484
		}
485

  
486
		public void toXml(Element parent){
487
			Element node = parent.getOwnerDocument().createElement("relatedIdentifier");
488
			parent.appendChild(node);
489

  
490
			node.setAttribute("relatedIdentifierType", this.type.toString());
491
			node.setAttribute("relationType", "Cites");
492
			node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
493
		}
494
	}
495

  
496
	public static class Contributor{
497
		public enum ContributorType{
498
			ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Funder, HostingInstitution,
499
			Producer, ProjectLeader, ProjectManager, ProjectMember, RegistrationAgency, RegistrationAuthority,
500
			RelatedPerson, Researcher, ResearchGroup, RightsHolder, Sponsor, Supervisor, WorkPackageLeader, Other
501
		}
502

  
503
		public String name;
504
		public List<String> affiliations;
505
		public ContributorType type;
506

  
507
		public Contributor() {
508
		}
509

  
510
		public Contributor(String name) {
511
			this.name = name;
512
		}
513

  
514
		public Contributor(String name, List<String> affiliations) {
515
			this.name = name;
516
			this.affiliations = affiliations;
517
		}
518

  
519
		public Contributor(String name, List<String> affiliations, ContributorType type) {
520
			this.name = name;
521
			this.affiliations = affiliations;
522
			this.type = type;
523
		}
524

  
525
		public void toXml(Element parent){
526
			Element node = parent.getOwnerDocument().createElement("contributor");
527
			parent.appendChild(node);
528

  
529
			node.setAttribute("contributorType", this.type.toString());
530

  
531
			if(this.name!=null) {
532
				Element contributorName = parent.getOwnerDocument().createElement("contributorName");
533
				node.appendChild(contributorName);
534
				contributorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
535
			}
536
			if(this.affiliations!=null) {
537
				for(String item : this.affiliations) {
538
					Element affiliation = parent.getOwnerDocument().createElement("affiliation");
539
					node.appendChild(affiliation);
540
					affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
541
				}
542
			}
543
		}
544
	}
545

  
546
	public static class AlternateIdentifier{
547
		public String identifier;
548
		public String type;
549

  
550
		public AlternateIdentifier() {}
551

  
552
		public AlternateIdentifier(String identifier, String type) {
553
			this.identifier = identifier;
554
			this.type = type;
555
		}
556

  
557
		public void toXml(Element parent){
558
			Element node = parent.getOwnerDocument().createElement("alternateIdentifier");
559
			parent.appendChild(node);
560

  
561
			if(this.type!=null) {
562
				node.setAttribute("alternateIdentifierType", this.type);
563
			}
564
			if(this.identifier!=null) {
565
				node.appendChild(parent.getOwnerDocument().createTextNode(this.identifier));
566
			}
567
		}
568
	}
569

  
570
	public static class ResourceType{
571
		public enum ResourceTypeGeneralType {
572
			Audiovisual, Collection, Dataset, Event, Image, InteractiveResource, Model, PhysicalObject, Service,
573
			Software, Sound, Text, Workflow, Other
574
		}
575

  
576
		public ResourceTypeGeneralType type;
577

  
578
		public ResourceType() {}
579

  
580
		public ResourceType(ResourceTypeGeneralType type) {
581
			this.type = type;
582
		}
583

  
584
		public void toXml(Element parent){
585
			Element node = parent.getOwnerDocument().createElement("resourceType");
586
			parent.appendChild(node);
587

  
588
			if(this.type!=null) {
589
				node.setAttribute("resourceTypeGeneral", this.type.toString());
590
			}
591
		}
592
	}
593

  
594
	public static class Creator {
595
		public String name;
596
		public List<String> affiliations;
597

  
598
		public Creator() {
599
		}
600

  
601
		public Creator(String name) {
602
			this.name = name;
603
		}
604

  
605
		public Creator(String name, List<String> affiliations) {
606
			this.name = name;
607
			this.affiliations = affiliations;
608
		}
609

  
610
		public void toXml(Element parent){
611
			Element node = parent.getOwnerDocument().createElement("creator");
612
			parent.appendChild(node);
613

  
614
			if(this.name!=null) {
615
				Element creatorName = parent.getOwnerDocument().createElement("creatorName");
616
				node.appendChild(creatorName);
617
				creatorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
618
			}
619
			if(this.affiliations!=null) {
620
				for(String item : this.affiliations) {
621
					Element affiliation = parent.getOwnerDocument().createElement("affiliation");
622
					node.appendChild(affiliation);
623
					affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
624
				}
625
			}
626
		}
627
	}
628

  
629
	public static class Identifier {
630
		public enum IdentifierType {
631
			ARK, DOI, Handle, PURL, URN, URL
632
		}
633

  
634
		public String value;
635
		public IdentifierType type;
636

  
637
		public Identifier() {
638
		}
639

  
640
		public Identifier(IdentifierType type, String value) {
641
			this.type = type;
642
			this.value = value;
643
		}
644

  
645
		public void toXml(Element parent){
646
			Element node = parent.getOwnerDocument().createElement("identifier");
647
			parent.appendChild(node);
648

  
649
			node.setAttribute("identifierType", this.type.toString());
650
			if(this.value!=null) {
651
				node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
652
			}
653
		}
654
	}
655
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import java.util.Iterator;
4

  
5
/**
 * An {@link Iterable} over repository entries, each delivered as a string.
 */
public interface RepositoryIterable extends Iterable<String> {
	/**
	 * Fixed sentinel value shared by implementations.
	 * NOTE(review): presumably placed in the stream to signal that no more
	 * entries will follow — confirm against the producers and consumers.
	 */
	String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a";
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgPlugin.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable;
5
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
6
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
7
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable;
8
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
9
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12

  
13
import java.net.MalformedURLException;
14
import java.net.URL;
15
import java.nio.charset.StandardCharsets;
16
import java.util.concurrent.TimeUnit;
17

  
18
public class SchemaOrgPlugin extends AbstractCollectorPlugin {
19

  
20
    private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class);
21

  
22
    public String hello(){
23
        return "hello";
24
    }
25

  
26
    @Override
27
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
28
        try {
29
			RepositoryIterable repository = null;
30
        	String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null);
31
        	switch(repositoryAccessType) {
32
				case "sitemapindex": {
33
					SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor);
34
					SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions);
35
					repositoryIterable.bootstrap();
36
					repository = repositoryIterable;
37
					break;
38
				}
39
				case "httpapi-kaggle": {
40
					KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor);
41
					KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions);
42
					repositoryIterable.bootstrap();
43
					repository = repositoryIterable;
44
					break;
45
				}
46
				default:
47
					throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType));
48
			}
49
			SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor);
50
            SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository);
51
            return iterable;
52
        } catch (Exception e) {
53
            throw new CollectorServiceException("Could not create iterator", e);
54
        }
55
    }
56

  
57
	private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
58
		KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options();
59
		kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100));
60
		kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8));
61
		kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null));
62
		kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}"));
63
		kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"));
64
		kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"));
65
		kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"));
66
		kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl()));
67
		kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
68
		return kaggleRepositoryOptions;
69

  
70
	}
71

  
72
    private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
73
		SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options();
74
		sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8));
75
		sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl()));
76
		return sitemapIndexIteratorOptions;
77

  
78
	}
79

  
80
	private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
81
		SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options();
82
		sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8));
83
		sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class));
84
		sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class));
85
		return sitemapFileIteratorOptions;
86
	}
87

  
88
	private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
89
		RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options();
90
		repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true));
91
		repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2));
92
		repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
93
		return repositoryQueueIteratorOptions;
94
	}
95

  
96
	private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
97
		SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options();
98
		sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100));
99
		sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
100
		sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor));
101
		sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor));
102
		return sitemapIndexRepositoryIterableOptions;
103
	}
104

  
105
	private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
106
		EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options();
107
		endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8));
108
		return endpointAccessIteratorOptions;
109
	}
110

  
111
	private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
112
		DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options();
113

  
114
		DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions();
115
		datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD");
116
		datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions);
117

  
118
		DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions();
119
		datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD");
120
		datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions);
121

  
122
		DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions();
123
		datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD");
124
		datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions);
125

  
126
		DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions();
127
		datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class);
128
		datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions);
129

  
130
		DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions();
131
		datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class);
132
		datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true);
133
		datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null);
134
		datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null);
135
		datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null);
136
		datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null);
137
		datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null);
138
		datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null);
139
		datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions);
140
		return datasetMappingIteratorOptions;
141
	}
142

  
143
	private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
144
		SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options();
145
		schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor));
146
		schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor));
147
		return schemaOrgIterableOptions;
148
	}
149
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/HttpApiRepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
4

  
5
public interface HttpApiRepositoryIterable extends RepositoryIterable {
6
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/kaggle/KaggleRepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
4
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
5
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable;
6
import org.apache.commons.io.IOUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.json.JSONArray;
10
import org.json.JSONObject;
11

  
12
import java.net.URL;
13
import java.nio.charset.Charset;
14
import java.util.Iterator;
15
import java.util.concurrent.ArrayBlockingQueue;
16
import java.util.concurrent.ExecutorService;
17
import java.util.concurrent.Executors;
18

  
19
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable {
20
	private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class);
21

  
22
	public static class Options {
23
		private String queryUrl;
24
		private String queryPagePlaceholder;
25
		private Charset charset;
26
		private String responsePropertyTotalDataset;
27
		private String responsePropertyDatasetList;
28
		private String responsePropertyDatasetUrl;
29
		private String responseBaseDatasetUrl;
30

  
31
		private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
32

  
33
		private int queueSize;
34

  
35
		public int getQueueSize() {
36
			return queueSize;
37
		}
38

  
39
		public void setQueueSize(int queueSize) {
40
			this.queueSize = queueSize;
41
		}
42

  
43
		public String getResponseBaseDatasetUrl() {
44
			return responseBaseDatasetUrl;
45
		}
46

  
47
		public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) {
48
			this.responseBaseDatasetUrl = responseBaseDatasetUrl;
49
		}
50

  
51
		public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
52
			return repositoryQueueIteratorOptions;
53
		}
54

  
55
		public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
56
			this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
57
		}
58

  
59
		public String getResponsePropertyDatasetUrl() {
60
			return responsePropertyDatasetUrl;
61
		}
62

  
63
		public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) {
64
			this.responsePropertyDatasetUrl = responsePropertyDatasetUrl;
65
		}
66

  
67
		public String getResponsePropertyDatasetList() {
68
			return responsePropertyDatasetList;
69
		}
70

  
71
		public void setResponsePropertyDatasetList(String responsePropertyDatasetList) {
72
			this.responsePropertyDatasetList = responsePropertyDatasetList;
73
		}
74

  
75
		public String getResponsePropertyTotalDataset() {
76
			return responsePropertyTotalDataset;
77
		}
78

  
79
		public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) {
80
			this.responsePropertyTotalDataset = responsePropertyTotalDataset;
81
		}
82

  
83
		public Charset getCharset() {
84
			return charset;
85
		}
86

  
87
		public void setCharset(Charset charset) {
88
			this.charset = charset;
89
		}
90

  
91
		public String getQueryPagePlaceholder() {
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff