Revision 53614
Added by Giorgos Papanikos over 5 years ago
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgSitemapIteratorTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
6 |
import org.junit.Assert; |
|
7 |
import org.junit.Before; |
|
8 |
import org.junit.Ignore; |
|
9 |
import org.junit.Test; |
|
10 |
|
|
11 |
import java.net.URL; |
|
12 |
import java.nio.charset.StandardCharsets; |
|
13 |
import java.util.HashMap; |
|
14 |
import java.util.concurrent.TimeUnit; |
|
15 |
|
|
16 |
@Ignore |
|
17 |
public class SchemaOrgSitemapIteratorTest { |
|
18 |
@Before |
|
19 |
public void setUp() throws Exception { |
|
20 |
} |
|
21 |
|
|
22 |
@Test |
|
23 |
public void test() throws CollectorServiceException { |
|
24 |
URL resource = SchemaOrgSitemapIteratorTest.class.getResource("sitemap.xml"); |
|
25 |
|
|
26 |
HashMap<String,String> params = new HashMap<>(); |
|
27 |
params.put("repositoryAccessType", "sitemapindex"); |
|
28 |
params.put("consumerBlockPolling", Boolean.toString(true)); |
|
29 |
params.put("consumerBlockPollingTimeout", "2"); |
|
30 |
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
31 |
params.put("endpointCharset", StandardCharsets.UTF_8.name()); |
|
32 |
params.put("updatedDateFormat", "YYYY-MM-DD"); |
|
33 |
params.put("createdDateFormat", "YYYY-MM-DD"); |
|
34 |
params.put("publicationDateFormat", "YYYY-MM-DD"); |
|
35 |
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString()); |
|
36 |
params.put("identifierFallbackType", null); |
|
37 |
params.put("identifierFallbackURL", Boolean.toString(true)); |
|
38 |
params.put("identifierMappingARK", "ark, ARK"); |
|
39 |
params.put("identifierMappingDOI", "doi, DOI"); |
|
40 |
params.put("identifierMappingHandle", "Handle, HANDLE"); |
|
41 |
params.put("identifierMappingPURL", "purl, PURL"); |
|
42 |
params.put("identifierMappingURN", "urn, URN"); |
|
43 |
params.put("identifierMappingURL", "url, URL"); |
|
44 |
|
|
45 |
params.put("repositoryAccessType", "sitemapindex"); |
|
46 |
params.put("sitemap_queueSize", "100"); |
|
47 |
params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name()); |
|
48 |
params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name()); |
|
49 |
params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString()); |
|
50 |
params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text.toString()); |
|
51 |
|
|
52 |
InterfaceDescriptor descriptor = new InterfaceDescriptor(); |
|
53 |
descriptor.setId("schema.org - reactome"); |
|
54 |
descriptor.setBaseUrl(resource.toString()); |
|
55 |
descriptor.setParams(params); |
|
56 |
|
|
57 |
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin(); |
|
58 |
|
|
59 |
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null); |
|
60 |
|
|
61 |
int length =0; |
|
62 |
int count =0; |
|
63 |
int nullcount =0; |
|
64 |
for(String item : iterable) { |
|
65 |
count += 1; |
|
66 |
if(item == null) { |
|
67 |
nullcount+=1; |
|
68 |
continue; |
|
69 |
} |
|
70 |
length = item.length(); |
|
71 |
} |
|
72 |
Assert.assertEquals(1, nullcount); |
|
73 |
Assert.assertEquals(2, count); |
|
74 |
Assert.assertEquals(1626, length); |
|
75 |
|
|
76 |
} |
|
77 |
} |
modules/dnet-collector-plugins/trunk/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/sitemap_file.xml | ||
---|---|---|
1 |
file:target/test-classes/eu/dnetlib/data/collector/plugins/schemaorg/index.html |
modules/dnet-collector-plugins/trunk/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/sitemap.xml | ||
---|---|---|
1 |
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> |
|
2 |
<sitemap> |
|
3 |
<loc>file:target/test-classes/eu/dnetlib/data/collector/plugins/schemaorg/sitemap_file.xml</loc> |
|
4 |
</sitemap> |
|
5 |
</sitemapindex> |
modules/dnet-collector-plugins/trunk/src/test/resources/eu/dnetlib/data/collector/plugins/schemaorg/index.html | ||
---|---|---|
1 |
<html xmlns="//www.w3.org/1999/xhtml" xml:lang="en-gb" lang="en-gb" dir="ltr" > |
|
2 |
<head> |
|
3 |
<script type="application/ld+json"> |
|
4 |
{ |
|
5 |
"@context": "http://schema.org", |
|
6 |
"@type": "WebSite", |
|
7 |
"url": "https://reactome.org/", |
|
8 |
"potentialAction": { |
|
9 |
"@type": "SearchAction", |
|
10 |
"target": "https://reactome.org/content/query?q={term}", |
|
11 |
"query-input": "required name=term" |
|
12 |
} |
|
13 |
} |
|
14 |
</script> |
|
15 |
<script type="application/ld+json"> |
|
16 |
{ |
|
17 |
"@context": "http://schema.org", |
|
18 |
"@type": "Organization", |
|
19 |
"url": "https://reactome.org", |
|
20 |
"logo": "https://reactome.org/templates/favourite/images/logo/logo.png", |
|
21 |
"email": "help@reactome.org" |
|
22 |
} |
|
23 |
</script> |
|
24 |
<script type="application/ld+json"> |
|
25 |
{"name":"Binding of the influenza virion to the host cell","description":"Influenza viruses bind via their surface HA (hemagglutinin) to sialic acid in alpha 2,3 or alpha 2,6 linkage with galactose on the host cell surface. Sialic acid in 2,6 linkages is characteristic of human cells while 2,3 linkages are characteristic of avian cells. The specificity of influenza HA for sialic acid in alpha 2,6 or alpha 2,3 linkages is a feature restricting the transfer of influenza viruses between avian species and humans. This species barrier can be overcome, however. Notably, passaged viruses adapt to their host through mutation in the receptor binding site of the viral HA gene.","url":"https://reactome.org/PathwayBrowser/#/R-HSA-168272","sameAs":null,"version":"66","keywords":["Reaction"],"creator":[],"includedInDataCatalog":{"url":"https://reactome.org","name":"Reactome","@type":"DataCatalog"},"distribution":[{"contentUrl":"https://reactome.org/ContentService/exporter/sbml/168272.xml","fileFormat":"SBML","@type":"DataDownload"},{"contentUrl":"https://reactome.org/ReactomeRESTfulAPI/RESTfulWS/sbgnExporter/168272","fileFormat":"SBGN","@type":"DataDownload"},{"contentUrl":"https://reactome.org/ReactomeRESTfulAPI/RESTfulWS/biopaxExporter/Level2/168272","fileFormat":"BIOPAX2","@type":"DataDownload"},{"contentUrl":"https://reactome.org/ReactomeRESTfulAPI/RESTfulWS/biopaxExporter/Level3/168272","fileFormat":"BIOPAX3","@type":"DataDownload"},{"contentUrl":"https://reactome.org/cgi-bin/pdfexporter?DB=gk_current&ID=168272","fileFormat":"PDF","@type":"DataDownload"},{"contentUrl":"https://reactome.org/cgi-bin/rtfexporter?DB=gk_current&ID=168272","fileFormat":"DOCX","@type":"DataDownload"},{"contentUrl":"https://reactome.org/cgi-bin/protegeexporter?DB=gk_current&ID=168272","fileFormat":"OWL","@type":"DataDownload"}],"citation":["http://www.ncbi.nlm.nih.gov/pubmed/0"],"license":"https://creativecommons.org/licenses/by/4.0/","@context":"http://schema.org/","@type":"DataSet"} |
|
26 |
</script> |
|
27 |
</head> |
|
28 |
<body> |
|
29 |
this is the body of the page |
|
30 |
</body> |
|
31 |
</html> |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIterableOLD.java | ||
---|---|---|
1 |
//package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
// |
|
3 |
//import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator; |
|
4 |
//import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator; |
|
5 |
//import org.apache.commons.logging.Log; |
|
6 |
//import org.apache.commons.logging.LogFactory; |
|
7 |
// |
|
8 |
//import java.net.URL; |
|
9 |
//import java.util.Iterator; |
|
10 |
//import java.util.concurrent.ArrayBlockingQueue; |
|
11 |
//import java.util.concurrent.ExecutorService; |
|
12 |
//import java.util.concurrent.Executors; |
|
13 |
// |
|
14 |
//public class SchemaOrgIterableOLD implements Iterable<String> { |
|
15 |
// private static final Log log = LogFactory.getLog(SchemaOrgIterable.class); |
|
16 |
// |
|
17 |
// public static class Options { |
|
18 |
// private SchemaOrgIterator.Options schemaOrgIteratorOptions; |
|
19 |
// private SitemapIndexIterator.Options sitemapIndexIteratorOptions; |
|
20 |
// private SitemapFileIterator.Options sitemapFileIteratorOptions; |
|
21 |
// private EndpointAccessIterator.Options endpointAccessIteratorOptions; |
|
22 |
// private DatasetMappingIterator.Options datasetMappingIteratorOptions; |
|
23 |
// |
|
24 |
// private int queueSize; |
|
25 |
// |
|
26 |
// public DatasetMappingIterator.Options getDatasetMappingIteratorOptions() { |
|
27 |
// return datasetMappingIteratorOptions; |
|
28 |
// } |
|
29 |
// |
|
30 |
// public void setDatasetMappingIteratorOptions(DatasetMappingIterator.Options datasetMappingIteratorOptions) { |
|
31 |
// this.datasetMappingIteratorOptions = datasetMappingIteratorOptions; |
|
32 |
// } |
|
33 |
// |
|
34 |
// public EndpointAccessIterator.Options getEndpointAccessIteratorOptions() { |
|
35 |
// return endpointAccessIteratorOptions; |
|
36 |
// } |
|
37 |
// |
|
38 |
// public void setEndpointAccessIteratorOptions(EndpointAccessIterator.Options endpointAccessIteratorOptions) { |
|
39 |
// this.endpointAccessIteratorOptions = endpointAccessIteratorOptions; |
|
40 |
// } |
|
41 |
// |
|
42 |
// public SitemapFileIterator.Options getSitemapFileIteratorOptions() { |
|
43 |
// return sitemapFileIteratorOptions; |
|
44 |
// } |
|
45 |
// |
|
46 |
// public void setSitemapFileIteratorOptions(SitemapFileIterator.Options sitemapFileIteratorOptions) { |
|
47 |
// this.sitemapFileIteratorOptions = sitemapFileIteratorOptions; |
|
48 |
// } |
|
49 |
// |
|
50 |
// public SitemapIndexIterator.Options getSitemapIndexIteratorOptions() { |
|
51 |
// return sitemapIndexIteratorOptions; |
|
52 |
// } |
|
53 |
// |
|
54 |
// public void setSitemapIndexIteratorOptions(SitemapIndexIterator.Options sitemapIndexIteratorOptions) { |
|
55 |
// this.sitemapIndexIteratorOptions = sitemapIndexIteratorOptions; |
|
56 |
// } |
|
57 |
// |
|
58 |
// public SchemaOrgIterator.Options getSchemaOrgIteratorOptions() { |
|
59 |
// return schemaOrgIteratorOptions; |
|
60 |
// } |
|
61 |
// |
|
62 |
// public void setSchemaOrgIteratorOptions(SchemaOrgIterator.Options schemaOrgIteratorOptions) { |
|
63 |
// this.schemaOrgIteratorOptions = schemaOrgIteratorOptions; |
|
64 |
// } |
|
65 |
// |
|
66 |
// public int getQueueSize() { |
|
67 |
// return queueSize; |
|
68 |
// } |
|
69 |
// |
|
70 |
// public void setQueueSize(int queueSize) { |
|
71 |
// this.queueSize = queueSize; |
|
72 |
// } |
|
73 |
// } |
|
74 |
// |
|
75 |
// private Options options; |
|
76 |
// private ArrayBlockingQueue<String> queue; |
|
77 |
// |
|
78 |
// public SchemaOrgIterable(Options options) { |
|
79 |
// this.options = options; |
|
80 |
// this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize(), true); |
|
81 |
// } |
|
82 |
// |
|
83 |
// public void bootstrap() { |
|
84 |
// ExecutorService executor = Executors.newSingleThreadExecutor(); |
|
85 |
// executor.execute(new Harvester()); |
|
86 |
// executor.shutdown(); |
|
87 |
// } |
|
88 |
// |
|
89 |
// @Override |
|
90 |
// public Iterator<String> iterator() { |
|
91 |
// return new SchemaOrgIterator(this.options.getSchemaOrgIteratorOptions(), this.queue); |
|
92 |
// } |
|
93 |
// |
|
94 |
// private class Harvester implements Runnable{ |
|
95 |
// |
|
96 |
// @Override |
|
97 |
// public void run() { |
|
98 |
// this.execute(); |
|
99 |
// } |
|
100 |
// |
|
101 |
// private void execute(){ |
|
102 |
// try { |
|
103 |
// SitemapIndexIterator sitemapIndexIterator = new SitemapIndexIterator(options.getSitemapIndexIteratorOptions()); |
|
104 |
// sitemapIndexIterator.bootstrap(); |
|
105 |
// |
|
106 |
// while (sitemapIndexIterator.hasNext()) { |
|
107 |
// String sitemapFile = sitemapIndexIterator.next(); |
|
108 |
// if(sitemapFile == null) continue; |
|
109 |
// |
|
110 |
// SitemapFileIterator.Options sitemapFileIteratorOptions = (SitemapFileIterator.Options)options.getSitemapFileIteratorOptions().clone(); |
|
111 |
// sitemapFileIteratorOptions.setFileUrl(new URL(sitemapFile)); |
|
112 |
// SitemapFileIterator sitemapFileIterator = new SitemapFileIterator(sitemapFileIteratorOptions); |
|
113 |
// sitemapFileIterator.bootstrap(); |
|
114 |
// |
|
115 |
// EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessIteratorOptions(), sitemapFileIterator); |
|
116 |
// DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingIteratorOptions(), endpointAccessIterator); |
|
117 |
// |
|
118 |
// while (datasetMappingIterator.hasNext()) { |
|
119 |
// String xml = datasetMappingIterator.next(); |
|
120 |
// if(xml == null) continue; |
|
121 |
// |
|
122 |
// queue.put(xml); |
|
123 |
// } |
|
124 |
// } |
|
125 |
// }catch(Exception ex){ |
|
126 |
// log.error("problem execution harvesting", ex); |
|
127 |
// } |
|
128 |
// finally { |
|
129 |
// try { |
|
130 |
// queue.put(Conventions.TerminateHint); |
|
131 |
// } catch (Exception ex) { |
|
132 |
// log.fatal("could not add termination hint. the process will not terminate gracefully", ex); |
|
133 |
// } |
|
134 |
// } |
|
135 |
// } |
|
136 |
// } |
|
137 |
//} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.apache.commons.logging.Log; |
|
4 |
import org.apache.commons.logging.LogFactory; |
|
5 |
|
|
6 |
import java.util.Iterator; |
|
7 |
import java.util.concurrent.ArrayBlockingQueue; |
|
8 |
|
|
9 |
public class SchemaOrgIterable implements Iterable<String> { |
|
10 |
private static final Log log = LogFactory.getLog(SchemaOrgIterable.class); |
|
11 |
|
|
12 |
public static class Options { |
|
13 |
private EndpointAccessIterator.Options endpointAccessOptions; |
|
14 |
private DatasetMappingIterator.Options datasetMappingOptions; |
|
15 |
|
|
16 |
public EndpointAccessIterator.Options getEndpointAccessOptions() { |
|
17 |
return endpointAccessOptions; |
|
18 |
} |
|
19 |
|
|
20 |
public void setEndpointAccessOptions(EndpointAccessIterator.Options endpointAccessOptions) { |
|
21 |
this.endpointAccessOptions = endpointAccessOptions; |
|
22 |
} |
|
23 |
|
|
24 |
public DatasetMappingIterator.Options getDatasetMappingOptions() { |
|
25 |
return datasetMappingOptions; |
|
26 |
} |
|
27 |
|
|
28 |
public void setDatasetMappingOptions(DatasetMappingIterator.Options datasetMappingOptions) { |
|
29 |
this.datasetMappingOptions = datasetMappingOptions; |
|
30 |
} |
|
31 |
} |
|
32 |
|
|
33 |
private Options options; |
|
34 |
private RepositoryIterable repository; |
|
35 |
|
|
36 |
public SchemaOrgIterable(Options options, RepositoryIterable repository){ |
|
37 |
this.options = options; |
|
38 |
this.repository = repository; |
|
39 |
} |
|
40 |
|
|
41 |
@Override |
|
42 |
public Iterator<String> iterator() { |
|
43 |
Iterator<String> repositoryIterator = this.repository.iterator(); |
|
44 |
EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessOptions(), repositoryIterator); |
|
45 |
DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingOptions(), endpointAccessIterator); |
|
46 |
|
|
47 |
return datasetMappingIterator; |
|
48 |
} |
|
49 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/Utils.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.json.JSONArray; |
|
4 |
import org.json.JSONObject; |
|
5 |
import org.w3c.dom.Document; |
|
6 |
import org.w3c.dom.NodeList; |
|
7 |
import org.xml.sax.InputSource; |
|
8 |
|
|
9 |
import javax.xml.parsers.DocumentBuilder; |
|
10 |
import javax.xml.parsers.DocumentBuilderFactory; |
|
11 |
import javax.xml.xpath.XPath; |
|
12 |
import javax.xml.xpath.XPathConstants; |
|
13 |
import javax.xml.xpath.XPathExpression; |
|
14 |
import javax.xml.xpath.XPathFactory; |
|
15 |
import java.io.File; |
|
16 |
import java.io.FileInputStream; |
|
17 |
import java.io.FileOutputStream; |
|
18 |
import java.io.StringReader; |
|
19 |
import java.nio.charset.Charset; |
|
20 |
import java.nio.charset.UnsupportedCharsetException; |
|
21 |
import java.util.ArrayList; |
|
22 |
import java.util.EnumSet; |
|
23 |
import java.util.HashMap; |
|
24 |
import java.util.List; |
|
25 |
import java.util.zip.GZIPInputStream; |
|
26 |
|
|
27 |
/**
 * Static helpers: XPath extraction from XML, GZip decompression, and typed lookups with
 * defaults in string-to-string parameter maps.
 */
public class Utils {

    /**
     * Parser features that are switched off so that parsed XML cannot make the parser
     * fetch external entities or external DTDs.
     */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    /**
     * Creates a {@link DocumentBuilder} with external entity and external DTD loading disabled
     * where the underlying parser supports those switches.
     */
    private static DocumentBuilder newDocumentBuilder() throws javax.xml.parsers.ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                factory.setFeature(feature, false);
            } catch (javax.xml.parsers.ParserConfigurationException ignored) {
                // Best-effort hardening: a parser that does not know this feature is still usable,
                // so parsing proceeds exactly as it did before the hardening was added.
            }
        }
        return factory.newDocumentBuilder();
    }

    /**
     * Parses the given XML text and evaluates the XPath expression against it.
     *
     * @param xml   XML document as text
     * @param xpath XPath expression selecting a node set
     * @return the node values of the selected nodes, in document order of the node set
     * @throws Exception if the XML cannot be parsed or the expression cannot be compiled/evaluated
     */
    public static List<String> collectAsStrings(String xml, String xpath) throws Exception{
        Document doc = newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
        return Utils.collectAsStrings(doc, xpath);
    }

    /**
     * Parses the given XML file and evaluates the XPath expression against it.
     *
     * @param file  XML file to parse
     * @param xpath XPath expression selecting a node set
     * @return the node values of the selected nodes
     * @throws Exception if the file cannot be read/parsed or the expression cannot be compiled/evaluated
     */
    public static List<String> collectAsStrings(File file, String xpath) throws Exception{
        Document doc = newDocumentBuilder().parse(file);
        return Utils.collectAsStrings(doc, xpath);
    }

    /**
     * Evaluates the XPath expression against an already parsed document.
     *
     * <p>The returned strings are the {@code getNodeValue()} of each selected node, so the
     * expression should select text or attribute nodes; for element nodes the DOM API
     * returns {@code null} as node value.
     *
     * @param doc   parsed document
     * @param xpath XPath expression selecting a node set
     * @return one entry per selected node
     * @throws Exception if the expression cannot be compiled or evaluated
     */
    public static List<String> collectAsStrings(Document doc, String xpath) throws Exception{
        XPathFactory xPathfactory = XPathFactory.newInstance();
        XPath path = xPathfactory.newXPath();
        XPathExpression expr = path.compile(xpath);
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);

        List<String> values = new ArrayList<>();
        for (int i = 0; i < nodes.getLength(); i++) {
            values.add(nodes.item(i).getNodeValue());
        }
        return values;
    }

    /**
     * Decompresses a GZip file into another file.
     *
     * @param input  GZip-compressed source file
     * @param output destination file; it is opened with {@link FileOutputStream} and written from the start
     * @throws Exception if reading, decompressing or writing fails
     */
    public static void decompressGZipTo(File input, File output) throws Exception {
        try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(input));
             FileOutputStream out = new FileOutputStream(output)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }

    /**
     * @return the value mapped to {@code key}, or {@code defaultValue} when the map holds no
     *         value (or a {@code null} value) for it
     */
    public static String getAsString(HashMap<String,String> map, String key, String defaultValue)
    {
        String value = map.get(key);
        if (value == null) return defaultValue;
        return value;
    }

    /**
     * Reads a comma-separated value and returns its trimmed, non-empty items.
     *
     * @return the items, or {@code defaultValue} when the map holds no value for {@code key}
     */
    public static List<String> getAsStringCsv(HashMap<String,String> map, String key, List<String> defaultValue)
    {
        String value = map.get(key);
        if (value == null) return defaultValue;
        List<String> curated = new ArrayList<>();
        for (String item : value.split(",")) {
            String trimmed = item.trim();
            if (trimmed.length() == 0) continue;
            curated.add(trimmed);
        }
        return curated;
    }

    /**
     * @return the value parsed with {@link Integer#parseInt(String)}, or {@code defaultValue}
     *         when the value is missing or is not a valid integer
     */
    public static int getAsInt(HashMap<String,String> map, String key, int defaultValue)
    {
        String value = map.get(key);
        if (value == null) return defaultValue;
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /**
     * @return the value parsed with {@link Long#parseLong(String)}, or {@code defaultValue}
     *         when the value is missing or is not a valid long
     */
    public static long getAsLong(HashMap<String,String> map, String key, long defaultValue)
    {
        String value = map.get(key);
        if (value == null) return defaultValue;
        try {
            return Long.parseLong(value);
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /**
     * Looks up an enum constant by name, ignoring case.
     *
     * @param clazz enum type whose constants are searched
     * @return the matching constant, or {@code defaultValue} when the value is missing or
     *         matches no constant name
     */
    public static <E extends Enum<E>> E getAsEnum(HashMap<String,String> map, String key, E defaultValue, Class<E> clazz) {
        EnumSet<E> values = EnumSet.allOf(clazz);
        String value = map.get(key);
        if (value == null) return defaultValue;
        for (E val : values) {
            if (val.name().equalsIgnoreCase(value)) return val;
        }
        return defaultValue;
    }

    /**
     * @return the value parsed with {@link Boolean#parseBoolean(String)} (so anything other
     *         than "true", ignoring case, is {@code false}), or {@code defaultValue} when the
     *         value is missing
     */
    public static Boolean getAsBoolean(HashMap<String,String> map, String key, Boolean defaultValue) {
        String value = map.get(key);
        if (value == null) return defaultValue;
        return Boolean.parseBoolean(value);
    }

    /**
     * @return the charset named by the value, or {@code defaultValue} when the value is
     *         missing, names a charset this JVM does not support, or is not a legal charset name
     */
    public static Charset getAsCharset(HashMap<String,String> map, String key, Charset defaultValue)
    {
        String value = map.get(key);
        if (value == null) return defaultValue;
        try {
            return Charset.forName(value);
        } catch (IllegalArgumentException e) {
            // Charset.forName throws UnsupportedCharsetException for unknown names and
            // IllegalCharsetNameException for malformed ones (e.g. "" or "utf 8"); both are
            // IllegalArgumentExceptions and both mean "fall back to the default".
            return defaultValue;
        }
    }

}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIteratorOLD.java | ||
---|---|---|
1 |
//package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
// |
|
3 |
//import org.apache.commons.logging.Log; |
|
4 |
//import org.apache.commons.logging.LogFactory; |
|
5 |
// |
|
6 |
//import java.util.Iterator; |
|
7 |
//import java.util.NoSuchElementException; |
|
8 |
//import java.util.concurrent.ArrayBlockingQueue; |
|
9 |
//import java.util.concurrent.TimeUnit; |
|
10 |
// |
|
11 |
//public class SchemaOrgIteratorOLD implements Iterator<String> { |
|
12 |
// private static final Log log = LogFactory.getLog(SchemaOrgIterator.class); |
|
13 |
// |
|
14 |
// public static class Options { |
|
15 |
// private Boolean blockPolling; |
|
16 |
// private long pollTimeout; |
|
17 |
// private TimeUnit pollTimeoutUnit; |
|
18 |
// |
|
19 |
// public Boolean getBlockPolling() { |
|
20 |
// return blockPolling; |
|
21 |
// } |
|
22 |
// |
|
23 |
// public void setBlockPolling(Boolean blockPolling) { |
|
24 |
// this.blockPolling = blockPolling; |
|
25 |
// } |
|
26 |
// |
|
27 |
// public long getPollTimeout() { |
|
28 |
// return pollTimeout; |
|
29 |
// } |
|
30 |
// |
|
31 |
// public void setPollTimeout(long pollTimeout) { |
|
32 |
// this.pollTimeout = pollTimeout; |
|
33 |
// } |
|
34 |
// |
|
35 |
// public TimeUnit getPollTimeoutUnit() { |
|
36 |
// return pollTimeoutUnit; |
|
37 |
// } |
|
38 |
// |
|
39 |
// public void setPollTimeoutUnit(TimeUnit pollTimeoutUnit) { |
|
40 |
// this.pollTimeoutUnit = pollTimeoutUnit; |
|
41 |
// } |
|
42 |
// } |
|
43 |
// |
|
44 |
// private ArrayBlockingQueue<String> queue; |
|
45 |
// private Options options; |
|
46 |
// private boolean hasTerminated; |
|
47 |
// |
|
48 |
// public SchemaOrgIterator(Options options, ArrayBlockingQueue<String> queue) { |
|
49 |
// this.options = options; |
|
50 |
// this.queue = queue; |
|
51 |
// this.hasTerminated = false; |
|
52 |
// } |
|
53 |
// |
|
54 |
// public void bootstrap(){ |
|
55 |
// |
|
56 |
// } |
|
57 |
// |
|
58 |
// @Override |
|
59 |
// public boolean hasNext() { |
|
60 |
// if(this.hasTerminated) return false; |
|
61 |
// return true; |
|
62 |
// } |
|
63 |
// |
|
64 |
// @Override |
|
65 |
// public String next() { |
|
66 |
// String next = this.poll(); |
|
67 |
// if (next != null && next.equalsIgnoreCase(Conventions.TerminateHint)) { |
|
68 |
// this.hasTerminated = true; |
|
69 |
// next = null; |
|
70 |
// } |
|
71 |
// return next; |
|
72 |
// } |
|
73 |
// |
|
74 |
// private String poll(){ |
|
75 |
// if(this.options.getBlockPolling()) { |
|
76 |
// try { |
|
77 |
// return this.queue.poll(this.options.getPollTimeout(), this.options.getPollTimeoutUnit()); |
|
78 |
// } catch (InterruptedException ex) { |
|
79 |
// log.warn(String.format("could not poll elements from queue for more than %s %s. throwing", this.options.getPollTimeout(), this.options.getPollTimeoutUnit())); |
|
80 |
// throw new NoSuchElementException(ex.getMessage()); |
|
81 |
// } |
|
82 |
// } |
|
83 |
// else { |
|
84 |
// return this.queue.poll(); |
|
85 |
// } |
|
86 |
// } |
|
87 |
//} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetDocument.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.w3c.dom.Attr; |
|
4 |
import org.w3c.dom.Document; |
|
5 |
import org.w3c.dom.Element; |
|
6 |
|
|
7 |
import javax.xml.parsers.DocumentBuilder; |
|
8 |
import javax.xml.parsers.DocumentBuilderFactory; |
|
9 |
import javax.xml.parsers.ParserConfigurationException; |
|
10 |
import javax.xml.transform.Transformer; |
|
11 |
import javax.xml.transform.TransformerFactory; |
|
12 |
import javax.xml.transform.dom.DOMSource; |
|
13 |
import javax.xml.transform.stream.StreamResult; |
|
14 |
import java.io.StringWriter; |
|
15 |
import java.time.LocalDate; |
|
16 |
import java.time.format.DateTimeFormatter; |
|
17 |
import java.util.Calendar; |
|
18 |
import java.util.Date; |
|
19 |
import java.util.List; |
|
20 |
|
|
21 |
public class DatasetDocument { |
|
22 |
private List<Identifier> identifiers; |
|
23 |
private List<Creator> creators; |
|
24 |
private List<String> titles; |
|
25 |
private List<String> alternativeTitles; |
|
26 |
private List<String> publishers; |
|
27 |
private List<LocalDate> publicationDates; |
|
28 |
private List<String> subjects; |
|
29 |
private List<Contributor> contributors; |
|
30 |
private List<LocalDate> createdDates; |
|
31 |
private List<LocalDate> updatedDates; |
|
32 |
private List<String> languages; |
|
33 |
private List<ResourceType> resourceTypes; |
|
34 |
private List<AlternateIdentifier> alternateIdentifier; |
|
35 |
private List<Citation> citations; |
|
36 |
private List<String> sizes; |
|
37 |
private List<String> format; |
|
38 |
private List<String> version; |
|
39 |
private List<License> licenses; |
|
40 |
private List<String> descriptions; |
|
41 |
private List<String> disambiguatingDescriptions; |
|
42 |
private List<SpatialCoverage> geoLocations; |
|
43 |
|
|
44 |
public List<Identifier> getIdentifiers() { |
|
45 |
return identifiers; |
|
46 |
} |
|
47 |
|
|
48 |
public void setIdentifiers(List<Identifier> identifiers) { |
|
49 |
this.identifiers = identifiers; |
|
50 |
} |
|
51 |
|
|
52 |
public List<Creator> getCreators() { |
|
53 |
return creators; |
|
54 |
} |
|
55 |
|
|
56 |
public void setCreators(List<Creator> creators) { |
|
57 |
this.creators = creators; |
|
58 |
} |
|
59 |
|
|
60 |
public List<String> getTitles() { |
|
61 |
return titles; |
|
62 |
} |
|
63 |
|
|
64 |
public void setTitles(List<String> titles) { |
|
65 |
this.titles = titles; |
|
66 |
} |
|
67 |
|
|
68 |
public List<String> getAlternativeTitles() { |
|
69 |
return alternativeTitles; |
|
70 |
} |
|
71 |
|
|
72 |
public void setAlternativeTitles(List<String> alternativeTitles) { |
|
73 |
this.alternativeTitles = alternativeTitles; |
|
74 |
} |
|
75 |
|
|
76 |
public List<String> getPublishers() { |
|
77 |
return publishers; |
|
78 |
} |
|
79 |
|
|
80 |
public void setPublishers(List<String> publishers) { |
|
81 |
this.publishers = publishers; |
|
82 |
} |
|
83 |
|
|
84 |
public List<LocalDate> getPublicationDates() { |
|
85 |
return publicationDates; |
|
86 |
} |
|
87 |
|
|
88 |
public void setPublicationDates(List<LocalDate> publicationDates) { |
|
89 |
this.publicationDates = publicationDates; |
|
90 |
} |
|
91 |
|
|
92 |
public List<String> getSubjects() { |
|
93 |
return subjects; |
|
94 |
} |
|
95 |
|
|
96 |
public void setSubjects(List<String> subjects) { |
|
97 |
this.subjects = subjects; |
|
98 |
} |
|
99 |
|
|
100 |
public List<Contributor> getContributors() { |
|
101 |
return contributors; |
|
102 |
} |
|
103 |
|
|
104 |
public void setContributors(List<Contributor> contributors) { |
|
105 |
this.contributors = contributors; |
|
106 |
} |
|
107 |
|
|
108 |
public List<LocalDate> getCreatedDates() { |
|
109 |
return createdDates; |
|
110 |
} |
|
111 |
|
|
112 |
public void setCreatedDates(List<LocalDate> createdDates) { |
|
113 |
this.createdDates = createdDates; |
|
114 |
} |
|
115 |
|
|
116 |
public List<LocalDate> getUpdatedDates() { |
|
117 |
return updatedDates; |
|
118 |
} |
|
119 |
|
|
120 |
public void setUpdatedDates(List<LocalDate> updatedDates) { |
|
121 |
this.updatedDates = updatedDates; |
|
122 |
} |
|
123 |
|
|
124 |
public List<String> getLanguages() { |
|
125 |
return languages; |
|
126 |
} |
|
127 |
|
|
128 |
public void setLanguages(List<String> languages) { |
|
129 |
this.languages = languages; |
|
130 |
} |
|
131 |
|
|
132 |
public List<ResourceType> getResourceTypes() { |
|
133 |
return resourceTypes; |
|
134 |
} |
|
135 |
|
|
136 |
public void setResourceTypes(List<ResourceType> resourceTypes) { |
|
137 |
this.resourceTypes = resourceTypes; |
|
138 |
} |
|
139 |
|
|
140 |
public List<AlternateIdentifier> getAlternateIdentifier() { |
|
141 |
return alternateIdentifier; |
|
142 |
} |
|
143 |
|
|
144 |
public void setAlternateIdentifier(List<AlternateIdentifier> alternateIdentifier) { |
|
145 |
this.alternateIdentifier = alternateIdentifier; |
|
146 |
} |
|
147 |
|
|
148 |
public List<Citation> getCitations() { |
|
149 |
return citations; |
|
150 |
} |
|
151 |
|
|
152 |
public void setCitations(List<Citation> citations) { |
|
153 |
this.citations = citations; |
|
154 |
} |
|
155 |
|
|
156 |
public List<String> getSizes() { |
|
157 |
return sizes; |
|
158 |
} |
|
159 |
|
|
160 |
public void setSizes(List<String> sizes) { |
|
161 |
this.sizes = sizes; |
|
162 |
} |
|
163 |
|
|
164 |
public List<String> getFormat() { |
|
165 |
return format; |
|
166 |
} |
|
167 |
|
|
168 |
public void setFormat(List<String> format) { |
|
169 |
this.format = format; |
|
170 |
} |
|
171 |
|
|
172 |
public List<String> getVersion() { |
|
173 |
return version; |
|
174 |
} |
|
175 |
|
|
176 |
public void setVersion(List<String> version) { |
|
177 |
this.version = version; |
|
178 |
} |
|
179 |
|
|
180 |
public List<License> getLicenses() { |
|
181 |
return licenses; |
|
182 |
} |
|
183 |
|
|
184 |
public void setLicenses(List<License> licenses) { |
|
185 |
this.licenses = licenses; |
|
186 |
} |
|
187 |
|
|
188 |
public List<String> getDescriptions() { |
|
189 |
return descriptions; |
|
190 |
} |
|
191 |
|
|
192 |
public void setDescriptions(List<String> descriptions) { |
|
193 |
this.descriptions = descriptions; |
|
194 |
} |
|
195 |
|
|
196 |
public List<String> getDisambiguatingDescriptions() { |
|
197 |
return disambiguatingDescriptions; |
|
198 |
} |
|
199 |
|
|
200 |
public void setDisambiguatingDescriptions(List<String> disambiguatingDescriptions) { |
|
201 |
this.disambiguatingDescriptions = disambiguatingDescriptions; |
|
202 |
} |
|
203 |
|
|
204 |
public List<SpatialCoverage> getGeoLocations() { |
|
205 |
return geoLocations; |
|
206 |
} |
|
207 |
|
|
208 |
public void setGeoLocations(List<SpatialCoverage> geoLocations) { |
|
209 |
this.geoLocations = geoLocations; |
|
210 |
} |
|
211 |
|
|
212 |
public String toXml() throws Exception { |
|
213 |
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); |
|
214 |
DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); |
|
215 |
Document doc = docBuilder.newDocument(); |
|
216 |
|
|
217 |
Element root = doc.createElement("dataset"); |
|
218 |
doc.appendChild(root); |
|
219 |
|
|
220 |
if(this.identifiers!=null){ |
|
221 |
for(Identifier item : this.identifiers){ |
|
222 |
item.toXml(root); |
|
223 |
} |
|
224 |
} |
|
225 |
if(this.creators!=null){ |
|
226 |
Element creators = doc.createElement("creators"); |
|
227 |
root.appendChild(creators); |
|
228 |
for(Creator item : this.creators){ |
|
229 |
item.toXml(creators); |
|
230 |
} |
|
231 |
} |
|
232 |
if(this.titles!=null || this.alternativeTitles!=null){ |
|
233 |
Element titles = doc.createElement("titles"); |
|
234 |
root.appendChild(titles); |
|
235 |
if(this.titles!=null) { |
|
236 |
for (String item : this.titles) { |
|
237 |
Element title = doc.createElement("title"); |
|
238 |
titles.appendChild(title); |
|
239 |
title.appendChild(doc.createTextNode(item)); |
|
240 |
} |
|
241 |
} |
|
242 |
if(this.alternativeTitles!=null) { |
|
243 |
for (String item : this.alternativeTitles) { |
|
244 |
Element title = doc.createElement("title"); |
|
245 |
titles.appendChild(title); |
|
246 |
title.setAttribute("titleType", "AlternativeTitle"); |
|
247 |
title.appendChild(doc.createTextNode(item)); |
|
248 |
} |
|
249 |
} |
|
250 |
} |
|
251 |
if(this.publishers!=null){ |
|
252 |
for(String item : this.publishers){ |
|
253 |
Element publisher = doc.createElement("publisher"); |
|
254 |
root.appendChild(publisher); |
|
255 |
publisher.appendChild(doc.createTextNode(item)); |
|
256 |
} |
|
257 |
} |
|
258 |
if(this.publicationDates!=null){ |
|
259 |
for(LocalDate item : this.publicationDates){ |
|
260 |
Element publicationYear = doc.createElement("publicationYear"); |
|
261 |
root.appendChild(publicationYear); |
|
262 |
publicationYear.appendChild(doc.createTextNode(Integer.toString(item.getYear()))); |
|
263 |
} |
|
264 |
} |
|
265 |
if(this.subjects!=null){ |
|
266 |
Element subjects = doc.createElement("subjects"); |
|
267 |
root.appendChild(subjects); |
|
268 |
for(String item : this.subjects){ |
|
269 |
Element subject = doc.createElement("subject"); |
|
270 |
subjects.appendChild(subject); |
|
271 |
subject.appendChild(doc.createTextNode(item)); |
|
272 |
} |
|
273 |
} |
|
274 |
if(this.contributors!=null){ |
|
275 |
for(Contributor item : this.contributors){ |
|
276 |
item.toXml(root); |
|
277 |
} |
|
278 |
} |
|
279 |
if(this.createdDates!=null || this.updatedDates!=null){ |
|
280 |
Element dates = doc.createElement("dates"); |
|
281 |
root.appendChild(dates); |
|
282 |
|
|
283 |
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("YYYY-MM-DD"); |
|
284 |
|
|
285 |
if(createdDates!=null) { |
|
286 |
for (LocalDate item : this.createdDates) { |
|
287 |
Element date = doc.createElement("date"); |
|
288 |
root.appendChild(date); |
|
289 |
date.setAttribute("dateType", "Created"); |
|
290 |
date.appendChild(doc.createTextNode(item.format(formatter))); |
|
291 |
} |
|
292 |
} |
|
293 |
if(updatedDates!=null) { |
|
294 |
for (LocalDate item : this.updatedDates) { |
|
295 |
Element date = doc.createElement("date"); |
|
296 |
root.appendChild(date); |
|
297 |
date.setAttribute("dateType", "Updated"); |
|
298 |
date.appendChild(doc.createTextNode(item.format(formatter))); |
|
299 |
} |
|
300 |
} |
|
301 |
} |
|
302 |
if(this.languages!=null){ |
|
303 |
for(String item : this.languages){ |
|
304 |
Element language = doc.createElement("language"); |
|
305 |
root.appendChild(language); |
|
306 |
language.appendChild(doc.createTextNode(item)); |
|
307 |
} |
|
308 |
} |
|
309 |
if(this.resourceTypes!=null){ |
|
310 |
for(ResourceType item : this.resourceTypes){ |
|
311 |
item.toXml(root); |
|
312 |
} |
|
313 |
} |
|
314 |
if(this.alternateIdentifier!=null){ |
|
315 |
Element alternateIdentifiers = doc.createElement("alternateIdentifiers"); |
|
316 |
root.appendChild(alternateIdentifiers); |
|
317 |
for(AlternateIdentifier item : this.alternateIdentifier){ |
|
318 |
item.toXml(alternateIdentifiers); |
|
319 |
} |
|
320 |
} |
|
321 |
if(this.citations!=null){ |
|
322 |
for(Citation item : this.citations){ |
|
323 |
item.toXml(root); |
|
324 |
} |
|
325 |
} |
|
326 |
if(this.sizes!=null){ |
|
327 |
Element sizes = doc.createElement("sizes"); |
|
328 |
root.appendChild(sizes); |
|
329 |
for(String item : this.sizes){ |
|
330 |
Element size = doc.createElement("size"); |
|
331 |
sizes.appendChild(size); |
|
332 |
size.appendChild(doc.createTextNode(item)); |
|
333 |
} |
|
334 |
} |
|
335 |
if(this.format!=null){ |
|
336 |
Element formats = doc.createElement("formats"); |
|
337 |
root.appendChild(formats); |
|
338 |
for(String item : this.format){ |
|
339 |
Element format = doc.createElement("format"); |
|
340 |
formats.appendChild(format); |
|
341 |
format.appendChild(doc.createTextNode(item)); |
|
342 |
} |
|
343 |
} |
|
344 |
if(this.version!=null){ |
|
345 |
for(String item : this.version){ |
|
346 |
Element version = doc.createElement("version"); |
|
347 |
root.appendChild(version); |
|
348 |
version.appendChild(doc.createTextNode(item)); |
|
349 |
} |
|
350 |
} |
|
351 |
if(this.licenses!=null){ |
|
352 |
Element rightsList = doc.createElement("rightsList"); |
|
353 |
root.appendChild(rightsList); |
|
354 |
for(License item : this.licenses){ |
|
355 |
item.toXml(rightsList); |
|
356 |
} |
|
357 |
} |
|
358 |
if(this.descriptions!=null || this.disambiguatingDescriptions!=null){ |
|
359 |
Element descriptions = doc.createElement("descriptions"); |
|
360 |
root.appendChild(descriptions); |
|
361 |
if(this.descriptions!=null) { |
|
362 |
for (String item : this.descriptions) { |
|
363 |
Element description = doc.createElement("description"); |
|
364 |
descriptions.appendChild(description); |
|
365 |
description.setAttribute("descriptionType", "Abstract"); |
|
366 |
description.appendChild(doc.createTextNode(item)); |
|
367 |
} |
|
368 |
} |
|
369 |
if(this.disambiguatingDescriptions!=null) { |
|
370 |
for (String item : this.disambiguatingDescriptions) { |
|
371 |
Element description = doc.createElement("description"); |
|
372 |
descriptions.appendChild(description); |
|
373 |
description.setAttribute("descriptionType", "Other"); |
|
374 |
description.appendChild(doc.createTextNode(item)); |
|
375 |
} |
|
376 |
} |
|
377 |
} |
|
378 |
if(this.geoLocations!=null){ |
|
379 |
Element geoLocations = doc.createElement("geoLocations"); |
|
380 |
root.appendChild(geoLocations); |
|
381 |
for(SpatialCoverage item : this.geoLocations){ |
|
382 |
item.toXml(geoLocations); |
|
383 |
} |
|
384 |
} |
|
385 |
|
|
386 |
TransformerFactory tf = TransformerFactory.newInstance(); |
|
387 |
Transformer transformer = tf.newTransformer(); |
|
388 |
StringWriter writer = new StringWriter(); |
|
389 |
transformer.transform(new DOMSource(doc), new StreamResult(writer)); |
|
390 |
String xml = writer.getBuffer().toString(); |
|
391 |
return xml; |
|
392 |
} |
|
393 |
|
|
394 |
public static class SpatialCoverage{ |
|
395 |
public static class Point{ |
|
396 |
public String latitude; |
|
397 |
public String longitude; |
|
398 |
|
|
399 |
public Point() {} |
|
400 |
|
|
401 |
public Point(String latitude, String longitude){ |
|
402 |
this.latitude = latitude; |
|
403 |
this.longitude = longitude; |
|
404 |
} |
|
405 |
} |
|
406 |
public String name; |
|
407 |
public List<Point> points; |
|
408 |
public List<String> boxes; |
|
409 |
|
|
410 |
public SpatialCoverage() {} |
|
411 |
|
|
412 |
public SpatialCoverage(String name, List<Point> points, List<String> boxes ) { |
|
413 |
this.name = name; |
|
414 |
this.points = points; |
|
415 |
this.boxes = boxes; |
|
416 |
} |
|
417 |
|
|
418 |
public void toXml(Element parent){ |
|
419 |
Element node = parent.getOwnerDocument().createElement("geoLocation"); |
|
420 |
parent.appendChild(node); |
|
421 |
|
|
422 |
if(this.points!=null) { |
|
423 |
for(Point point : this.points) { |
|
424 |
if(point.latitude == null || point.longitude == null) continue; |
|
425 |
Element geoLocationPoint = parent.getOwnerDocument().createElement("geoLocationPoint"); |
|
426 |
geoLocationPoint.appendChild(parent.getOwnerDocument().createTextNode(String.format("%s %s", point.latitude, point.longitude))); |
|
427 |
node.appendChild(geoLocationPoint); |
|
428 |
} |
|
429 |
} |
|
430 |
if(this.boxes!=null) { |
|
431 |
for(String box : this.boxes) { |
|
432 |
if(box == null) continue; |
|
433 |
Element geoLocationBox = parent.getOwnerDocument().createElement("geoLocationBox"); |
|
434 |
geoLocationBox.appendChild(parent.getOwnerDocument().createTextNode(box)); |
|
435 |
node.appendChild(geoLocationBox); |
|
436 |
} |
|
437 |
} |
|
438 |
if(this.name!=null) { |
|
439 |
Element geoLocationPlace = parent.getOwnerDocument().createElement("geoLocationPlace"); |
|
440 |
geoLocationPlace.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
441 |
node.appendChild(geoLocationPlace); |
|
442 |
} |
|
443 |
} |
|
444 |
} |
|
445 |
|
|
446 |
public static class License{ |
|
447 |
public String name; |
|
448 |
public String url; |
|
449 |
|
|
450 |
public License() {} |
|
451 |
|
|
452 |
public License(String name, String url) { |
|
453 |
this.name = name; |
|
454 |
this.url = url; |
|
455 |
} |
|
456 |
|
|
457 |
public void toXml(Element parent){ |
|
458 |
Element node = parent.getOwnerDocument().createElement("rights"); |
|
459 |
parent.appendChild(node); |
|
460 |
|
|
461 |
if(this.url!=null) { |
|
462 |
node.setAttribute("rightsURI", this.url); |
|
463 |
} |
|
464 |
if(this.name!=null) { |
|
465 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
466 |
} |
|
467 |
} |
|
468 |
} |
|
469 |
|
|
470 |
public static class Citation{ |
|
471 |
public enum CitationIdentifierType{ |
|
472 |
ARK, arXiv, bibcode, DOI, EAN13, EISSN, Handle, ISBN, ISSN, ISTC, LISSN, LSID, PMID, |
|
473 |
PURL, UPC, URL, URN |
|
474 |
} |
|
475 |
|
|
476 |
public CitationIdentifierType type; |
|
477 |
public String value; |
|
478 |
|
|
479 |
public Citation() {} |
|
480 |
|
|
481 |
public Citation(String value, CitationIdentifierType type) { |
|
482 |
this.value = value; |
|
483 |
this.type = type; |
|
484 |
} |
|
485 |
|
|
486 |
public void toXml(Element parent){ |
|
487 |
Element node = parent.getOwnerDocument().createElement("relatedIdentifier"); |
|
488 |
parent.appendChild(node); |
|
489 |
|
|
490 |
node.setAttribute("relatedIdentifierType", this.type.toString()); |
|
491 |
node.setAttribute("relationType", "Cites"); |
|
492 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.value)); |
|
493 |
} |
|
494 |
} |
|
495 |
|
|
496 |
public static class Contributor{ |
|
497 |
public enum ContributorType{ |
|
498 |
ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Funder, HostingInstitution, |
|
499 |
Producer, ProjectLeader, ProjectManager, ProjectMember, RegistrationAgency, RegistrationAuthority, |
|
500 |
RelatedPerson, Researcher, ResearchGroup, RightsHolder, Sponsor, Supervisor, WorkPackageLeader, Other |
|
501 |
} |
|
502 |
|
|
503 |
public String name; |
|
504 |
public List<String> affiliations; |
|
505 |
public ContributorType type; |
|
506 |
|
|
507 |
public Contributor() { |
|
508 |
} |
|
509 |
|
|
510 |
public Contributor(String name) { |
|
511 |
this.name = name; |
|
512 |
} |
|
513 |
|
|
514 |
public Contributor(String name, List<String> affiliations) { |
|
515 |
this.name = name; |
|
516 |
this.affiliations = affiliations; |
|
517 |
} |
|
518 |
|
|
519 |
public Contributor(String name, List<String> affiliations, ContributorType type) { |
|
520 |
this.name = name; |
|
521 |
this.affiliations = affiliations; |
|
522 |
this.type = type; |
|
523 |
} |
|
524 |
|
|
525 |
public void toXml(Element parent){ |
|
526 |
Element node = parent.getOwnerDocument().createElement("contributor"); |
|
527 |
parent.appendChild(node); |
|
528 |
|
|
529 |
node.setAttribute("contributorType", this.type.toString()); |
|
530 |
|
|
531 |
if(this.name!=null) { |
|
532 |
Element contributorName = parent.getOwnerDocument().createElement("contributorName"); |
|
533 |
node.appendChild(contributorName); |
|
534 |
contributorName.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
535 |
} |
|
536 |
if(this.affiliations!=null) { |
|
537 |
for(String item : this.affiliations) { |
|
538 |
Element affiliation = parent.getOwnerDocument().createElement("affiliation"); |
|
539 |
node.appendChild(affiliation); |
|
540 |
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item)); |
|
541 |
} |
|
542 |
} |
|
543 |
} |
|
544 |
} |
|
545 |
|
|
546 |
public static class AlternateIdentifier{ |
|
547 |
public String identifier; |
|
548 |
public String type; |
|
549 |
|
|
550 |
public AlternateIdentifier() {} |
|
551 |
|
|
552 |
public AlternateIdentifier(String identifier, String type) { |
|
553 |
this.identifier = identifier; |
|
554 |
this.type = type; |
|
555 |
} |
|
556 |
|
|
557 |
public void toXml(Element parent){ |
|
558 |
Element node = parent.getOwnerDocument().createElement("alternateIdentifier"); |
|
559 |
parent.appendChild(node); |
|
560 |
|
|
561 |
if(this.type!=null) { |
|
562 |
node.setAttribute("alternateIdentifierType", this.type); |
|
563 |
} |
|
564 |
if(this.identifier!=null) { |
|
565 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.identifier)); |
|
566 |
} |
|
567 |
} |
|
568 |
} |
|
569 |
|
|
570 |
public static class ResourceType{ |
|
571 |
public enum ResourceTypeGeneralType { |
|
572 |
Audiovisual, Collection, Dataset, Event, Image, InteractiveResource, Model, PhysicalObject, Service, |
|
573 |
Software, Sound, Text, Workflow, Other |
|
574 |
} |
|
575 |
|
|
576 |
public ResourceTypeGeneralType type; |
|
577 |
|
|
578 |
public ResourceType() {} |
|
579 |
|
|
580 |
public ResourceType(ResourceTypeGeneralType type) { |
|
581 |
this.type = type; |
|
582 |
} |
|
583 |
|
|
584 |
public void toXml(Element parent){ |
|
585 |
Element node = parent.getOwnerDocument().createElement("resourceType"); |
|
586 |
parent.appendChild(node); |
|
587 |
|
|
588 |
if(this.type!=null) { |
|
589 |
node.setAttribute("resourceTypeGeneral", this.type.toString()); |
|
590 |
} |
|
591 |
} |
|
592 |
} |
|
593 |
|
|
594 |
public static class Creator { |
|
595 |
public String name; |
|
596 |
public List<String> affiliations; |
|
597 |
|
|
598 |
public Creator() { |
|
599 |
} |
|
600 |
|
|
601 |
public Creator(String name) { |
|
602 |
this.name = name; |
|
603 |
} |
|
604 |
|
|
605 |
public Creator(String name, List<String> affiliations) { |
|
606 |
this.name = name; |
|
607 |
this.affiliations = affiliations; |
|
608 |
} |
|
609 |
|
|
610 |
public void toXml(Element parent){ |
|
611 |
Element node = parent.getOwnerDocument().createElement("creator"); |
|
612 |
parent.appendChild(node); |
|
613 |
|
|
614 |
if(this.name!=null) { |
|
615 |
Element creatorName = parent.getOwnerDocument().createElement("creatorName"); |
|
616 |
node.appendChild(creatorName); |
|
617 |
creatorName.appendChild(parent.getOwnerDocument().createTextNode(this.name)); |
|
618 |
} |
|
619 |
if(this.affiliations!=null) { |
|
620 |
for(String item : this.affiliations) { |
|
621 |
Element affiliation = parent.getOwnerDocument().createElement("affiliation"); |
|
622 |
node.appendChild(affiliation); |
|
623 |
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item)); |
|
624 |
} |
|
625 |
} |
|
626 |
} |
|
627 |
} |
|
628 |
|
|
629 |
public static class Identifier { |
|
630 |
public enum IdentifierType { |
|
631 |
ARK, DOI, Handle, PURL, URN, URL |
|
632 |
} |
|
633 |
|
|
634 |
public String value; |
|
635 |
public IdentifierType type; |
|
636 |
|
|
637 |
public Identifier() { |
|
638 |
} |
|
639 |
|
|
640 |
public Identifier(IdentifierType type, String value) { |
|
641 |
this.type = type; |
|
642 |
this.value = value; |
|
643 |
} |
|
644 |
|
|
645 |
public void toXml(Element parent){ |
|
646 |
Element node = parent.getOwnerDocument().createElement("identifier"); |
|
647 |
parent.appendChild(node); |
|
648 |
|
|
649 |
node.setAttribute("identifierType", this.type.toString()); |
|
650 |
if(this.value!=null) { |
|
651 |
node.appendChild(parent.getOwnerDocument().createTextNode(this.value)); |
|
652 |
} |
|
653 |
} |
|
654 |
} |
|
655 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
|
|
5 |
public interface RepositoryIterable extends Iterable<String> { |
|
6 |
public static String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a"; |
|
7 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable; |
|
5 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator; |
|
6 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator; |
|
7 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable; |
|
8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
9 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
|
|
13 |
import java.net.MalformedURLException; |
|
14 |
import java.net.URL; |
|
15 |
import java.nio.charset.StandardCharsets; |
|
16 |
import java.util.concurrent.TimeUnit; |
|
17 |
|
|
18 |
public class SchemaOrgPlugin extends AbstractCollectorPlugin { |
|
19 |
|
|
20 |
private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class); |
|
21 |
|
|
22 |
public String hello(){ |
|
23 |
return "hello"; |
|
24 |
} |
|
25 |
|
|
26 |
@Override |
|
27 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException { |
|
28 |
try { |
|
29 |
RepositoryIterable repository = null; |
|
30 |
String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null); |
|
31 |
switch(repositoryAccessType) { |
|
32 |
case "sitemapindex": { |
|
33 |
SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor); |
|
34 |
SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions); |
|
35 |
repositoryIterable.bootstrap(); |
|
36 |
repository = repositoryIterable; |
|
37 |
break; |
|
38 |
} |
|
39 |
case "httpapi-kaggle": { |
|
40 |
KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor); |
|
41 |
KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions); |
|
42 |
repositoryIterable.bootstrap(); |
|
43 |
repository = repositoryIterable; |
|
44 |
break; |
|
45 |
} |
|
46 |
default: |
|
47 |
throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType)); |
|
48 |
} |
|
49 |
SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor); |
|
50 |
SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository); |
|
51 |
return iterable; |
|
52 |
} catch (Exception e) { |
|
53 |
throw new CollectorServiceException("Could not create iterator", e); |
|
54 |
} |
|
55 |
} |
|
56 |
|
|
57 |
private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
58 |
KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options(); |
|
59 |
kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100)); |
|
60 |
kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8)); |
|
61 |
kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null)); |
|
62 |
kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}")); |
|
63 |
kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems")); |
|
64 |
kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems")); |
|
65 |
kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl")); |
|
66 |
kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl())); |
|
67 |
kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor)); |
|
68 |
return kaggleRepositoryOptions; |
|
69 |
|
|
70 |
} |
|
71 |
|
|
72 |
private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
73 |
SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options(); |
|
74 |
sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8)); |
|
75 |
sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl())); |
|
76 |
return sitemapIndexIteratorOptions; |
|
77 |
|
|
78 |
} |
|
79 |
|
|
80 |
private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
81 |
SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options(); |
|
82 |
sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8)); |
|
83 |
sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class)); |
|
84 |
sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class)); |
|
85 |
return sitemapFileIteratorOptions; |
|
86 |
} |
|
87 |
|
|
88 |
private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
89 |
RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options(); |
|
90 |
repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true)); |
|
91 |
repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2)); |
|
92 |
repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
93 |
return repositoryQueueIteratorOptions; |
|
94 |
} |
|
95 |
|
|
96 |
private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
97 |
SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options(); |
|
98 |
sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100)); |
|
99 |
sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor)); |
|
100 |
sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor)); |
|
101 |
sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor)); |
|
102 |
return sitemapIndexRepositoryIterableOptions; |
|
103 |
} |
|
104 |
|
|
105 |
private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
106 |
EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options(); |
|
107 |
endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8)); |
|
108 |
return endpointAccessIteratorOptions; |
|
109 |
} |
|
110 |
|
|
111 |
private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
112 |
DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options(); |
|
113 |
|
|
114 |
DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions(); |
|
115 |
datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD"); |
|
116 |
datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions); |
|
117 |
|
|
118 |
DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions(); |
|
119 |
datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD"); |
|
120 |
datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions); |
|
121 |
|
|
122 |
DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions(); |
|
123 |
datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD"); |
|
124 |
datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions); |
|
125 |
|
|
126 |
DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions(); |
|
127 |
datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class); |
|
128 |
datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions); |
|
129 |
|
|
130 |
DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions(); |
|
131 |
datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class); |
|
132 |
datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true); |
|
133 |
datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null); |
|
134 |
datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null); |
|
135 |
datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null); |
|
136 |
datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null); |
|
137 |
datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null); |
|
138 |
datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null); |
|
139 |
datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions); |
|
140 |
return datasetMappingIteratorOptions; |
|
141 |
} |
|
142 |
|
|
143 |
private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
144 |
SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options(); |
|
145 |
schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor)); |
|
146 |
schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor)); |
|
147 |
return schemaOrgIterableOptions; |
|
148 |
} |
|
149 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/HttpApiRepositoryIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable; |
|
4 |
|
|
5 |
public interface HttpApiRepositoryIterable extends RepositoryIterable { |
|
6 |
} |
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/kaggle/KaggleRepositoryIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable; |
|
4 |
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator; |
|
5 |
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable; |
|
6 |
import org.apache.commons.io.IOUtils; |
|
7 |
import org.apache.commons.logging.Log; |
|
8 |
import org.apache.commons.logging.LogFactory; |
|
9 |
import org.json.JSONArray; |
|
10 |
import org.json.JSONObject; |
|
11 |
|
|
12 |
import java.net.URL; |
|
13 |
import java.nio.charset.Charset; |
|
14 |
import java.util.Iterator; |
|
15 |
import java.util.concurrent.ArrayBlockingQueue; |
|
16 |
import java.util.concurrent.ExecutorService; |
|
17 |
import java.util.concurrent.Executors; |
|
18 |
|
|
19 |
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable { |
|
20 |
private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class); |
|
21 |
|
|
22 |
public static class Options { |
|
23 |
private String queryUrl; |
|
24 |
private String queryPagePlaceholder; |
|
25 |
private Charset charset; |
|
26 |
private String responsePropertyTotalDataset; |
|
27 |
private String responsePropertyDatasetList; |
|
28 |
private String responsePropertyDatasetUrl; |
|
29 |
private String responseBaseDatasetUrl; |
|
30 |
|
|
31 |
private RepositoryQueueIterator.Options repositoryQueueIteratorOptions; |
|
32 |
|
|
33 |
private int queueSize; |
|
34 |
|
|
35 |
public int getQueueSize() { |
|
36 |
return queueSize; |
|
37 |
} |
|
38 |
|
|
39 |
public void setQueueSize(int queueSize) { |
|
40 |
this.queueSize = queueSize; |
|
41 |
} |
|
42 |
|
|
43 |
public String getResponseBaseDatasetUrl() { |
|
44 |
return responseBaseDatasetUrl; |
|
45 |
} |
|
46 |
|
|
47 |
public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) { |
|
48 |
this.responseBaseDatasetUrl = responseBaseDatasetUrl; |
|
49 |
} |
|
50 |
|
|
51 |
public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() { |
|
52 |
return repositoryQueueIteratorOptions; |
|
53 |
} |
|
54 |
|
|
55 |
public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) { |
|
56 |
this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions; |
|
57 |
} |
|
58 |
|
|
59 |
public String getResponsePropertyDatasetUrl() { |
|
60 |
return responsePropertyDatasetUrl; |
|
61 |
} |
|
62 |
|
|
63 |
public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) { |
|
64 |
this.responsePropertyDatasetUrl = responsePropertyDatasetUrl; |
|
65 |
} |
|
66 |
|
|
67 |
public String getResponsePropertyDatasetList() { |
|
68 |
return responsePropertyDatasetList; |
|
69 |
} |
|
70 |
|
|
71 |
public void setResponsePropertyDatasetList(String responsePropertyDatasetList) { |
|
72 |
this.responsePropertyDatasetList = responsePropertyDatasetList; |
|
73 |
} |
|
74 |
|
|
75 |
public String getResponsePropertyTotalDataset() { |
|
76 |
return responsePropertyTotalDataset; |
|
77 |
} |
|
78 |
|
|
79 |
public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) { |
|
80 |
this.responsePropertyTotalDataset = responsePropertyTotalDataset; |
|
81 |
} |
|
82 |
|
|
83 |
public Charset getCharset() { |
|
84 |
return charset; |
|
85 |
} |
|
86 |
|
|
87 |
public void setCharset(Charset charset) { |
|
88 |
this.charset = charset; |
|
89 |
} |
|
90 |
|
|
91 |
public String getQueryPagePlaceholder() { |
Also available in: Unified diff
Added schema.org harvesting plugin. Supports sitemap index files and API listing calls to retrieve the list of endpoints.