Revision 62189
Added by Michele Artini over 2 years ago
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
4 |
import org.apache.commons.io.FileUtils; |
|
5 |
import org.apache.commons.logging.Log; |
|
6 |
import org.apache.commons.logging.LogFactory; |
|
7 |
import org.apache.log4j.ConsoleAppender; |
|
8 |
import org.apache.log4j.Level; |
|
9 |
import org.apache.log4j.Logger; |
|
10 |
import org.apache.log4j.PatternLayout; |
|
11 |
|
|
12 |
import java.io.File; |
|
13 |
import java.nio.charset.StandardCharsets; |
|
14 |
import java.util.HashMap; |
|
15 |
import java.util.concurrent.TimeUnit; |
|
16 |
|
|
17 |
public class SchemaOrgMainKaggle { |
|
18 |
|
|
19 |
private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class); |
|
20 |
|
|
21 |
public static void main(String[] args) throws Exception { |
|
22 |
|
|
23 |
ConsoleAppender console = new ConsoleAppender(); |
|
24 |
console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n")); |
|
25 |
console.setThreshold(Level.DEBUG); |
|
26 |
console.activateOptions(); |
|
27 |
Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console); |
|
28 |
|
|
29 |
HashMap<String,String> params = new HashMap<>(); |
|
30 |
params.put("consumerBlockPolling", Boolean.toString(true)); |
|
31 |
params.put("consumerBlockPollingTimeout", "2"); |
|
32 |
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
33 |
params.put("endpointCharset", StandardCharsets.UTF_8.name()); |
|
34 |
params.put("updatedDateFormat", "YYYY-MM-DD"); |
|
35 |
params.put("createdDateFormat", "YYYY-MM-DD"); |
|
36 |
params.put("publicationDateFormat", "YYYY-MM-DD"); |
|
37 |
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString()); |
|
38 |
params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString()); |
|
39 |
params.put("identifierFallbackURL", Boolean.toString(true)); |
|
40 |
params.put("identifierMappingARK", "ark, ARK"); |
|
41 |
params.put("identifierMappingDOI", "doi, DOI"); |
|
42 |
params.put("identifierMappingHandle", "Handle, HANDLE"); |
|
43 |
params.put("identifierMappingPURL", "purl, PURL"); |
|
44 |
params.put("identifierMappingURN", "urn, URN"); |
|
45 |
params.put("identifierMappingURL", "url, URL"); |
|
46 |
|
|
47 |
params.put("repositoryAccessType", "httpapi-kaggle"); |
|
48 |
|
|
49 |
params.put("httpapi-kaggle_queueSize", "100"); |
|
50 |
params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name()); |
|
51 |
params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll"); |
|
52 |
params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}"); |
|
53 |
params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"); |
|
54 |
params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"); |
|
55 |
params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"); |
|
56 |
params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com"); |
|
57 |
params.put("httpapi-kaggle_producerBlockPollingTimeout", "2"); |
|
58 |
params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString()); |
|
59 |
|
|
60 |
InterfaceDescriptor descriptor = new InterfaceDescriptor(); |
|
61 |
descriptor.setId("schema.org - kaggle"); |
|
62 |
descriptor.setBaseUrl("https://www.kaggle.com"); |
|
63 |
|
|
64 |
descriptor.setParams(params); |
|
65 |
|
|
66 |
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin(); |
|
67 |
|
|
68 |
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null); |
|
69 |
|
|
70 |
String outDir = params.get("repositoryAccessType"); |
|
71 |
|
|
72 |
log.info("saving content in " + outDir); |
|
73 |
|
|
74 |
File directory = new File(outDir); |
|
75 |
if (directory.exists()) { |
|
76 |
log.info(directory.getAbsolutePath() + " exists, cleaning up"); |
|
77 |
FileUtils.deleteDirectory(directory); |
|
78 |
} |
|
79 |
FileUtils.forceMkdir(directory); |
|
80 |
Utils.writeFiles(iterable, outDir); |
|
81 |
|
|
82 |
} |
|
83 |
|
|
84 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
4 |
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable; |
|
5 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator; |
|
6 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator; |
|
7 |
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable; |
|
8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
9 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
|
|
13 |
import java.net.MalformedURLException; |
|
14 |
import java.net.URL; |
|
15 |
import java.nio.charset.StandardCharsets; |
|
16 |
import java.util.concurrent.TimeUnit; |
|
17 |
|
|
18 |
public class SchemaOrgPlugin extends AbstractCollectorPlugin { |
|
19 |
|
|
20 |
private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class); |
|
21 |
|
|
22 |
public String hello(){ |
|
23 |
return "hello"; |
|
24 |
} |
|
25 |
|
|
26 |
@Override |
|
27 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException { |
|
28 |
try { |
|
29 |
RepositoryIterable repository = null; |
|
30 |
String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null); |
|
31 |
switch(repositoryAccessType) { |
|
32 |
case "sitemapindex": { |
|
33 |
SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor); |
|
34 |
SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions); |
|
35 |
repositoryIterable.bootstrap(); |
|
36 |
repository = repositoryIterable; |
|
37 |
break; |
|
38 |
} |
|
39 |
case "httpapi-kaggle": { |
|
40 |
KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor); |
|
41 |
KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions); |
|
42 |
repositoryIterable.bootstrap(); |
|
43 |
repository = repositoryIterable; |
|
44 |
break; |
|
45 |
} |
|
46 |
default: |
|
47 |
throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType)); |
|
48 |
} |
|
49 |
SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor); |
|
50 |
SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository); |
|
51 |
return iterable; |
|
52 |
} catch (Exception e) { |
|
53 |
throw new CollectorServiceException("Could not create iterator", e); |
|
54 |
} |
|
55 |
} |
|
56 |
|
|
57 |
private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
58 |
KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options(); |
|
59 |
kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100)); |
|
60 |
kaggleRepositoryOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20)); |
|
61 |
kaggleRepositoryOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
62 |
kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8)); |
|
63 |
kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null)); |
|
64 |
kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}")); |
|
65 |
kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems")); |
|
66 |
kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems")); |
|
67 |
kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl")); |
|
68 |
kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl())); |
|
69 |
kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor)); |
|
70 |
return kaggleRepositoryOptions; |
|
71 |
|
|
72 |
} |
|
73 |
|
|
74 |
private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
75 |
SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options(); |
|
76 |
sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8)); |
|
77 |
sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl())); |
|
78 |
return sitemapIndexIteratorOptions; |
|
79 |
|
|
80 |
} |
|
81 |
|
|
82 |
private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
83 |
SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options(); |
|
84 |
sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8)); |
|
85 |
sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class)); |
|
86 |
sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class)); |
|
87 |
return sitemapFileIteratorOptions; |
|
88 |
} |
|
89 |
|
|
90 |
private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
91 |
RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options(); |
|
92 |
repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true)); |
|
93 |
repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2)); |
|
94 |
repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
95 |
return repositoryQueueIteratorOptions; |
|
96 |
} |
|
97 |
|
|
98 |
private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
99 |
SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options(); |
|
100 |
sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100)); |
|
101 |
sitemapIndexRepositoryIterableOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20)); |
|
102 |
sitemapIndexRepositoryIterableOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class)); |
|
103 |
sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor)); |
|
104 |
sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor)); |
|
105 |
sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor)); |
|
106 |
return sitemapIndexRepositoryIterableOptions; |
|
107 |
} |
|
108 |
|
|
109 |
private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
110 |
EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options(); |
|
111 |
endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8)); |
|
112 |
return endpointAccessIteratorOptions; |
|
113 |
} |
|
114 |
|
|
115 |
private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
116 |
DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options(); |
|
117 |
|
|
118 |
DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions(); |
|
119 |
datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD"); |
|
120 |
datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions); |
|
121 |
|
|
122 |
DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions(); |
|
123 |
datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD"); |
|
124 |
datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions); |
|
125 |
|
|
126 |
DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions(); |
|
127 |
datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD"); |
|
128 |
datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions); |
|
129 |
|
|
130 |
DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions(); |
|
131 |
datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class); |
|
132 |
datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions); |
|
133 |
|
|
134 |
DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions(); |
|
135 |
datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class); |
|
136 |
datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true); |
|
137 |
datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null); |
|
138 |
datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null); |
|
139 |
datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null); |
|
140 |
datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null); |
|
141 |
datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null); |
|
142 |
datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null); |
|
143 |
datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions); |
|
144 |
return datasetMappingIteratorOptions; |
|
145 |
} |
|
146 |
|
|
147 |
private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException { |
|
148 |
SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options(); |
|
149 |
schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor)); |
|
150 |
schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor)); |
|
151 |
return schemaOrgIterableOptions; |
|
152 |
} |
|
153 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/pom.xml | ||
---|---|---|
1 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|
2 |
<modelVersion>4.0.0</modelVersion> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet45-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
</parent> |
|
8 |
<groupId>eu.dnetlib</groupId> |
|
9 |
<artifactId>dnet-collector-plugins</artifactId> |
|
10 |
<version>1.6.0</version> |
|
11 |
<scm> |
|
12 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0</developerConnection> |
|
13 |
</scm> |
|
14 |
|
|
15 |
<build> |
|
16 |
<plugins> |
|
17 |
<plugin> |
|
18 |
<artifactId>maven-assembly-plugin</artifactId> |
|
19 |
<configuration> |
|
20 |
<archive> |
|
21 |
<manifest> |
|
22 |
<mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass> |
|
23 |
</manifest> |
|
24 |
</archive> |
|
25 |
<descriptorRefs> |
|
26 |
<descriptorRef>jar-with-dependencies</descriptorRef> |
|
27 |
</descriptorRefs> |
|
28 |
</configuration> |
|
29 |
</plugin> |
|
30 |
</plugins> |
|
31 |
</build> |
|
32 |
|
|
33 |
<dependencies> |
|
34 |
<dependency> |
|
35 |
<groupId>eu.dnetlib</groupId> |
|
36 |
<artifactId>dnet-modular-collector-service-rmi</artifactId> |
|
37 |
<version>[1.3.0,2.0.0)</version> |
|
38 |
</dependency> |
|
39 |
<dependency> |
|
40 |
<groupId>eu.dnetlib</groupId> |
|
41 |
<artifactId>dnet-modular-collector-service</artifactId> |
|
42 |
<version>[3.3.26,4.0.0)</version> |
|
43 |
</dependency> |
|
44 |
<dependency> |
|
45 |
<groupId>com.google.code.gson</groupId> |
|
46 |
<artifactId>gson</artifactId> |
|
47 |
<version>${google.gson.version}</version> |
|
48 |
</dependency> |
|
49 |
<dependency> |
|
50 |
<groupId>commons-io</groupId> |
|
51 |
<artifactId>commons-io</artifactId> |
|
52 |
<version>${commons.io.version}</version> |
|
53 |
</dependency> |
|
54 |
<dependency> |
|
55 |
<groupId>junit</groupId> |
|
56 |
<artifactId>junit</artifactId> |
|
57 |
<version>${junit.version}</version> |
|
58 |
<scope>test</scope> |
|
59 |
</dependency> |
|
60 |
<dependency> |
|
61 |
<groupId>org.apache.httpcomponents</groupId> |
|
62 |
<artifactId>httpclient</artifactId> |
|
63 |
<version>4.5</version> |
|
64 |
</dependency> |
|
65 |
<dependency> |
|
66 |
<groupId>eu.dnetlib</groupId> |
|
67 |
<artifactId>cnr-resultset-service</artifactId> |
|
68 |
<version>[2.0.0, 3.0.0)</version> |
|
69 |
<scope>provided</scope> |
|
70 |
</dependency> |
|
71 |
<dependency> |
|
72 |
<groupId>com.ximpleware</groupId> |
|
73 |
<artifactId>vtd-xml</artifactId> |
|
74 |
<version>[2.12, 3.0.0)</version> |
|
75 |
</dependency> |
|
76 |
<dependency> |
|
77 |
<groupId>joda-time</groupId> |
|
78 |
<artifactId>joda-time</artifactId> |
|
79 |
<version>2.9.2</version> |
|
80 |
</dependency> |
|
81 |
|
|
82 |
<dependency> |
|
83 |
<groupId>org.json</groupId> |
|
84 |
<artifactId>json</artifactId> |
|
85 |
<version>20180813</version> |
|
86 |
<type>jar</type> |
|
87 |
</dependency> |
|
88 |
<dependency> |
|
89 |
<groupId>org.apache.commons</groupId> |
|
90 |
<artifactId>commons-lang3</artifactId> |
|
91 |
<version>3.5</version> |
|
92 |
</dependency> |
|
93 |
|
|
94 |
<dependency> |
|
95 |
<groupId>org.apache.poi</groupId> |
|
96 |
<artifactId>poi</artifactId> |
|
97 |
<version>3.16</version> |
|
98 |
</dependency> |
|
99 |
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> |
|
100 |
<dependency> |
|
101 |
<groupId>org.apache.poi</groupId> |
|
102 |
<artifactId>poi-ooxml</artifactId> |
|
103 |
<version>3.16</version> |
|
104 |
</dependency> |
|
105 |
<dependency> |
|
106 |
<groupId>org.jsoup</groupId> |
|
107 |
<artifactId>jsoup</artifactId> |
|
108 |
<version>1.11.2</version> |
|
109 |
</dependency> |
|
110 |
<dependency> |
|
111 |
<groupId>commons-lang</groupId> |
|
112 |
<artifactId>commons-lang</artifactId> |
|
113 |
<version>2.6</version> |
|
114 |
<scope>compile</scope> |
|
115 |
</dependency> |
|
116 |
<dependency> |
|
117 |
<groupId>org.mockito</groupId> |
|
118 |
<artifactId>mockito-core</artifactId> |
|
119 |
<version>3.3.3</version> |
|
120 |
<scope>test</scope> |
|
121 |
</dependency> |
|
122 |
</dependencies> |
|
123 |
</project> |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/fairsharing/FairSharingPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.fairsharing; |
|
2 |
|
|
3 |
import java.io.UnsupportedEncodingException; |
|
4 |
|
|
5 |
import org.apache.commons.io.IOUtils; |
|
6 |
import org.apache.commons.lang3.StringUtils; |
|
7 |
import org.apache.commons.logging.Log; |
|
8 |
import org.apache.commons.logging.LogFactory; |
|
9 |
import org.apache.http.HttpEntity; |
|
10 |
import org.apache.http.client.methods.CloseableHttpResponse; |
|
11 |
import org.apache.http.client.methods.HttpPost; |
|
12 |
import org.apache.http.entity.StringEntity; |
|
13 |
import org.apache.http.impl.client.CloseableHttpClient; |
|
14 |
import org.apache.http.impl.client.HttpClients; |
|
15 |
import org.json.JSONObject; |
|
16 |
|
|
17 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
18 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
19 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
20 |
|
|
21 |
public class FairSharingPlugin extends AbstractCollectorPlugin { |
|
22 |
|
|
23 |
private static final int PAGE_SIZE = 100; |
|
24 |
|
|
25 |
private static final Log log = LogFactory.getLog(FairSharingPlugin.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
26 |
|
|
27 |
// Suggested values: |
|
28 |
// baseUrl = https://api.fairsharing.org |
|
29 |
// XPATH_ID = /record/id |
|
30 |
|
|
31 |
@Override |
|
32 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
33 |
throws CollectorServiceException { |
|
34 |
|
|
35 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
36 |
final String login = interfaceDescriptor.getParams().get("login"); |
|
37 |
final String password = interfaceDescriptor.getParams().get("password"); |
|
38 |
|
|
39 |
final String authCode = login(baseUrl, login, password); |
|
40 |
|
|
41 |
log.debug("authCode: " + authCode); |
|
42 |
|
|
43 |
if (StringUtils.isBlank(authCode)) { throw new CollectorServiceException("Authorization failed: authCode is empty"); } |
|
44 |
|
|
45 |
return () -> new FairSharingIterator(baseUrl, authCode, PAGE_SIZE); |
|
46 |
} |
|
47 |
|
|
48 |
private String login(final String baseUrl, final String login, final String password) throws CollectorServiceException { |
|
49 |
final HttpPost req = new HttpPost(baseUrl + "/users/sign_in"); |
|
50 |
req.addHeader("Accept", "application/json"); |
|
51 |
req.addHeader("Content-Type", "application/json"); |
|
52 |
req.setEntity(prepareCredentials(login, password)); |
|
53 |
|
|
54 |
try (final CloseableHttpClient client = HttpClients.createDefault()) { |
|
55 |
try (final CloseableHttpResponse response = client.execute(req)) { |
|
56 |
final String content = IOUtils.toString(response.getEntity().getContent()); |
|
57 |
final JSONObject obj = new JSONObject(content); |
|
58 |
return obj.getString("jwt"); |
|
59 |
} |
|
60 |
} catch (final Exception e) { |
|
61 |
throw new CollectorServiceException("Error perfoming login", e); |
|
62 |
} |
|
63 |
} |
|
64 |
|
|
65 |
public HttpEntity prepareCredentials(final String login, final String password) throws CollectorServiceException { |
|
66 |
|
|
67 |
final JSONObject objUser = new JSONObject(); |
|
68 |
objUser.put("login", login); |
|
69 |
objUser.put("password", password); |
|
70 |
|
|
71 |
final JSONObject objCredentials = new JSONObject(); |
|
72 |
objCredentials.put("user", objUser); |
|
73 |
|
|
74 |
try { |
|
75 |
return new StringEntity(objCredentials.toString()); |
|
76 |
} catch (final UnsupportedEncodingException e) { |
|
77 |
throw new CollectorServiceException("Error preparing http entity for login"); |
|
78 |
} |
|
79 |
} |
|
80 |
|
|
81 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/fairsharing/FairSharingIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.fairsharing; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
import java.util.Queue; |
|
5 |
import java.util.concurrent.PriorityBlockingQueue; |
|
6 |
|
|
7 |
import org.apache.commons.io.IOUtils; |
|
8 |
import org.apache.commons.logging.Log; |
|
9 |
import org.apache.commons.logging.LogFactory; |
|
10 |
import org.apache.http.client.methods.CloseableHttpResponse; |
|
11 |
import org.apache.http.client.methods.HttpGet; |
|
12 |
import org.apache.http.impl.client.CloseableHttpClient; |
|
13 |
import org.apache.http.impl.client.HttpClients; |
|
14 |
import org.json.JSONObject; |
|
15 |
import org.json.XML; |
|
16 |
|
|
17 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
18 |
|
|
19 |
public class FairSharingIterator implements Iterator<String> { |
|
20 |
|
|
21 |
private static final Log log = LogFactory.getLog(FairSharingIterator.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
22 |
|
|
23 |
private final Queue<String> queue = new PriorityBlockingQueue<>(); |
|
24 |
|
|
25 |
private final String baseUrl; |
|
26 |
private final String authCode; |
|
27 |
private final int pageSize; |
|
28 |
|
|
29 |
private String nextUrl; |
|
30 |
private boolean started; |
|
31 |
|
|
32 |
public FairSharingIterator(final String baseUrl, final String authCode, final int pageSize) { |
|
33 |
this.baseUrl = baseUrl; |
|
34 |
this.authCode = authCode; |
|
35 |
this.pageSize = pageSize; |
|
36 |
this.started = false; |
|
37 |
} |
|
38 |
|
|
39 |
private void verifyStarted() { |
|
40 |
if (!this.started) { |
|
41 |
this.started = true; |
|
42 |
try { |
|
43 |
final String url = baseUrl + "/fairsharing_records/?page%5Bnumber%5D=1&page%5Bsize%5D=" + pageSize; |
|
44 |
this.nextUrl = downloadPage(url); |
|
45 |
} catch (final CollectorServiceException e) { |
|
46 |
throw new RuntimeException(e); |
|
47 |
} |
|
48 |
} |
|
49 |
} |
|
50 |
|
|
51 |
@Override |
|
52 |
public boolean hasNext() { |
|
53 |
synchronized (queue) { |
|
54 |
verifyStarted(); |
|
55 |
return !queue.isEmpty(); |
|
56 |
} |
|
57 |
} |
|
58 |
|
|
59 |
@Override |
|
60 |
public String next() { |
|
61 |
synchronized (queue) { |
|
62 |
verifyStarted(); |
|
63 |
final String res = queue.poll(); |
|
64 |
while (queue.isEmpty() && nextUrl != null && !nextUrl.isEmpty()) { |
|
65 |
try { |
|
66 |
nextUrl = downloadPage(nextUrl); |
|
67 |
} catch (final CollectorServiceException e) { |
|
68 |
throw new RuntimeException(e); |
|
69 |
} |
|
70 |
} |
|
71 |
return res; |
|
72 |
} |
|
73 |
} |
|
74 |
|
|
75 |
@Override |
|
76 |
public void remove() {} |
|
77 |
|
|
78 |
private String downloadPage(final String url) throws CollectorServiceException { |
|
79 |
log.debug("Fetching url: " + url); |
|
80 |
|
|
81 |
final HttpGet req = new HttpGet(url); |
|
82 |
req.addHeader("Accept", "application/json"); |
|
83 |
req.addHeader("Content-Type", "application/json"); |
|
84 |
req.addHeader("Authorization", "Bearer " + authCode); |
|
85 |
|
|
86 |
try (final CloseableHttpClient client = HttpClients.createDefault()) { |
|
87 |
try (final CloseableHttpResponse response = client.execute(req)) { |
|
88 |
final String content = IOUtils.toString(response.getEntity().getContent()); |
|
89 |
final JSONObject obj = new JSONObject(content); |
|
90 |
|
|
91 |
obj.getJSONArray("data") |
|
92 |
.forEach(x -> queue.add(XML.toString(x, "record"))); |
|
93 |
|
|
94 |
final JSONObject links = obj.getJSONObject("links"); |
|
95 |
|
|
96 |
return links.isNull("next") ? null : links.getString("next"); |
|
97 |
} |
|
98 |
} catch (final Exception e) { |
|
99 |
throw new CollectorServiceException("Error perfoming call fro login", e); |
|
100 |
} |
|
101 |
} |
|
102 |
|
|
103 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/eosc/EoscServicesPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.eosc; |
|
2 |
|
|
3 |
import org.apache.commons.lang3.math.NumberUtils; |
|
4 |
import org.apache.http.client.methods.CloseableHttpResponse; |
|
5 |
import org.apache.http.client.methods.HttpGet; |
|
6 |
import org.apache.http.impl.client.CloseableHttpClient; |
|
7 |
import org.apache.http.impl.client.HttpClients; |
|
8 |
import org.dom4j.Document; |
|
9 |
import org.dom4j.DocumentHelper; |
|
10 |
import org.dom4j.Element; |
|
11 |
import org.dom4j.Node; |
|
12 |
import org.dom4j.io.SAXReader; |
|
13 |
|
|
14 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
15 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
16 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
17 |
|
|
18 |
public class EoscServicesPlugin extends AbstractCollectorPlugin { |
|
19 |
|
|
20 |
// Suggested values: |
|
21 |
// baseUrl = https://api.eosc-portal.eu |
|
22 |
// maxProviders = 10000 |
|
23 |
// XPATH_ID = /record/organization/id |
|
24 |
|
|
25 |
@SuppressWarnings("unchecked") |
|
26 |
@Override |
|
27 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
28 |
throws CollectorServiceException { |
|
29 |
|
|
30 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
31 |
final long maxProviders = NumberUtils.toLong(interfaceDescriptor.getParams().get("maxProviders"), 10000); |
|
32 |
|
|
33 |
return () -> httpCall(baseUrl + "/provider/all?quantity=" + maxProviders) |
|
34 |
.selectNodes("/Paging/results/results") |
|
35 |
.stream() |
|
36 |
.map(o -> processProvider(baseUrl, (Node) o)) |
|
37 |
.iterator(); |
|
38 |
} |
|
39 |
|
|
40 |
private String processProvider(final String baseUrl, final Node nodeProv) { |
|
41 |
final String orgId = nodeProv.valueOf("./id"); |
|
42 |
|
|
43 |
final Document newDoc = DocumentHelper.createDocument(); |
|
44 |
final Element newRoot = DocumentHelper.createElement("record"); |
|
45 |
|
|
46 |
newDoc.setRootElement(newRoot); |
|
47 |
|
|
48 |
nodeProv.setName("organization"); |
|
49 |
newRoot.add(nodeProv.detach()); |
|
50 |
|
|
51 |
final Document docSrvs = httpCall(baseUrl + "/provider/services/" + orgId); |
|
52 |
for (final Object o : docSrvs.selectNodes("/List/item")) { |
|
53 |
final Node nodeSrv = (Node) o; |
|
54 |
nodeSrv.setName("service"); |
|
55 |
newRoot.add(nodeSrv.detach()); |
|
56 |
} |
|
57 |
|
|
58 |
return newDoc.asXML(); |
|
59 |
} |
|
60 |
|
|
61 |
private Document httpCall(final String url) { |
|
62 |
final SAXReader reader = new SAXReader(); |
|
63 |
|
|
64 |
final HttpGet req = new HttpGet(url); |
|
65 |
req.addHeader("Accept", "application/xml"); |
|
66 |
|
|
67 |
try (final CloseableHttpClient client = HttpClients.createDefault()) { |
|
68 |
try (final CloseableHttpResponse response = client.execute(req)) { |
|
69 |
return reader.read(response.getEntity().getContent()); |
|
70 |
} |
|
71 |
} catch (final Exception e) { |
|
72 |
throw new RuntimeException(e); |
|
73 |
} |
|
74 |
} |
|
75 |
|
|
76 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/Connector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httpfilename; |
|
2 |
|
|
3 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
|
4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
5 |
|
|
6 |
|
|
7 |
/** |
|
8 |
* Created by miriam on 07/05/2018. |
|
9 |
*/ |
|
10 |
public class Connector extends HttpConnector implements ConnectorInterface { |
|
11 |
private String response; |
|
12 |
|
|
13 |
@Override |
|
14 |
public void get(final String requestUrl) throws CollectorServiceException { |
|
15 |
response = getInputSource(requestUrl); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public String getResponse() { |
|
20 |
return response; |
|
21 |
} |
|
22 |
|
|
23 |
@Override |
|
24 |
public boolean isStatusOk() { |
|
25 |
return (response != null); |
|
26 |
} |
|
27 |
|
|
28 |
@Override |
|
29 |
public boolean responseTypeContains(String string) { |
|
30 |
String responseType = getResponseType(); |
|
31 |
if (responseType != null) |
|
32 |
return responseType.contains(string); |
|
33 |
return false; |
|
34 |
} |
|
35 |
|
|
36 |
|
|
37 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestIterator.java | ||
---|---|---|
1 |
/** |
|
2 |
* log.debug(...) equal to log.trace(...) in the application-logs |
|
3 |
* <p> |
|
4 |
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue |
|
5 |
*/ |
|
6 |
package eu.dnetlib.data.collector.plugins.rest; |
|
7 |
|
|
8 |
import java.io.InputStream; |
|
9 |
import java.io.StringWriter; |
|
10 |
import java.io.UnsupportedEncodingException; |
|
11 |
import java.net.URL; |
|
12 |
import java.net.URLEncoder; |
|
13 |
import java.nio.charset.StandardCharsets; |
|
14 |
import java.net.HttpURLConnection; |
|
15 |
import java.util.Iterator; |
|
16 |
import java.util.Map; |
|
17 |
import java.util.Queue; |
|
18 |
import java.util.concurrent.PriorityBlockingQueue; |
|
19 |
import java.util.regex.Pattern; |
|
20 |
import java.util.regex.Matcher; |
|
21 |
import javax.xml.transform.OutputKeys; |
|
22 |
import javax.xml.transform.Transformer; |
|
23 |
import javax.xml.transform.TransformerConfigurationException; |
|
24 |
import javax.xml.transform.TransformerFactory; |
|
25 |
import javax.xml.transform.dom.DOMSource; |
|
26 |
import javax.xml.transform.stream.StreamResult; |
|
27 |
import javax.xml.xpath.*; |
|
28 |
|
|
29 |
import com.google.common.collect.Maps; |
|
30 |
import eu.dnetlib.data.collector.plugins.utils.JsonUtils; |
|
31 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
32 |
import org.apache.commons.io.IOUtils; |
|
33 |
import org.apache.commons.lang3.StringUtils; |
|
34 |
import org.apache.commons.logging.Log; |
|
35 |
import org.apache.commons.logging.LogFactory; |
|
36 |
import org.apache.http.client.methods.CloseableHttpResponse; |
|
37 |
import org.apache.http.client.methods.HttpGet; |
|
38 |
import org.apache.http.impl.client.HttpClients; |
|
39 |
import org.w3c.dom.Node; |
|
40 |
import org.w3c.dom.NodeList; |
|
41 |
import org.xml.sax.InputSource; |
|
42 |
|
|
43 |
/** |
|
44 |
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak, Alessia Bardi, Miriam Baglioni |
|
45 |
* @date 2020-04-09 |
|
46 |
*/ |
|
47 |
public class RestIterator implements Iterator<String> { |
|
48 |
private final String AUTHBASIC = "basic"; |
|
49 |
|
|
50 |
// TODO: clean up the comments of replaced source code |
|
51 |
private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
52 |
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; |
|
53 |
private static final String EMPTY_XML = XML_HEADER + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">"; |
|
54 |
private JsonUtils jsonUtils; |
|
55 |
|
|
56 |
private String baseUrl; |
|
57 |
private String resumptionType; |
|
58 |
private String resumptionParam; |
|
59 |
private String resultFormatValue; |
|
60 |
private String queryParams = ""; |
|
61 |
private int resultSizeValue; |
|
62 |
private int resumptionInt = 0; // integer resumption token (first record to harvest) |
|
63 |
private int resultTotal = -1; |
|
64 |
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest or token scanned from results) |
|
65 |
private InputStream resultStream; |
|
66 |
private Transformer transformer; |
|
67 |
private XPath xpath; |
|
68 |
private String query; |
|
69 |
private XPathExpression xprResultTotalPath; |
|
70 |
private XPathExpression xprResumptionPath; |
|
71 |
private XPathExpression xprEntity; |
|
72 |
private String queryFormat; |
|
73 |
private String querySize; |
|
74 |
private String authMethod; |
|
75 |
private String authToken; |
|
76 |
private Queue<String> recordQueue = new PriorityBlockingQueue<String>(); |
|
77 |
private int discoverResultSize = 0; |
|
78 |
private int pagination = 1; |
|
79 |
/* |
|
80 |
While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in json. |
|
81 |
useful for cases when the target API expects a resultFormatValue != json, but the results are returned in json. |
|
82 |
An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format. |
|
83 |
*/ |
|
84 |
private String resultOutputFormat; |
|
85 |
/* |
|
86 |
Can be used to set additional request headers, like for content negotiation |
|
87 |
*/ |
|
88 |
private Map<String, String> requestHeaders; |
|
89 |
|
|
90 |
|
|
91 |
public RestIterator( |
|
92 |
final String baseUrl, |
|
93 |
final String resumptionType, |
|
94 |
final String resumptionParam, |
|
95 |
final String resumptionXpath, |
|
96 |
final String resultTotalXpath, |
|
97 |
final String resultFormatParam, |
|
98 |
final String resultFormatValue, |
|
99 |
final String resultSizeParam, |
|
100 |
final String resultSizeValueStr, |
|
101 |
final String queryParams, |
|
102 |
final String entityXpath, |
|
103 |
final String authMethod, |
|
104 |
final String authToken, |
|
105 |
final String resultOutputFormat, |
|
106 |
final Map<String, String> requestHeaders |
|
107 |
) { |
|
108 |
this.jsonUtils = new JsonUtils(); |
|
109 |
this.baseUrl = baseUrl; |
|
110 |
this.resumptionType = resumptionType; |
|
111 |
this.resumptionParam = resumptionParam; |
|
112 |
this.resultFormatValue = resultFormatValue; |
|
113 |
this.queryParams = queryParams; |
|
114 |
this.resultSizeValue = Integer.valueOf(resultSizeValueStr); |
|
115 |
this.authMethod = authMethod; |
|
116 |
this.authToken = authToken; |
|
117 |
this.resultOutputFormat = resultOutputFormat; |
|
118 |
this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap(); |
|
119 |
|
|
120 |
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; |
|
121 |
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : ""; |
|
122 |
|
|
123 |
try { |
|
124 |
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); |
|
125 |
} catch (Exception e) { |
|
126 |
throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); |
|
127 |
} |
|
128 |
initQueue(); |
|
129 |
} |
|
130 |
|
|
131 |
|
|
132 |
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) |
|
133 |
throws TransformerConfigurationException, XPathExpressionException { |
|
134 |
transformer = TransformerFactory.newInstance().newTransformer(); |
|
135 |
transformer.setOutputProperty(OutputKeys.INDENT, "yes"); |
|
136 |
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); |
|
137 |
xpath = XPathFactory.newInstance().newXPath(); |
|
138 |
xprResultTotalPath = xpath.compile(resultTotalXpath); |
|
139 |
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); |
|
140 |
xprEntity = xpath.compile(entityXpath); |
|
141 |
} |
|
142 |
|
|
143 |
private void initQueue() { |
|
144 |
if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) { |
|
145 |
query = baseUrl; |
|
146 |
} else { |
|
147 |
query = baseUrl + "?" + queryParams + querySize + queryFormat; |
|
148 |
} |
|
149 |
|
|
150 |
log.info("RestIterator.initQueue():: REST calls starting with " + query); |
|
151 |
} |
|
152 |
|
|
153 |
private void disconnect() { |
|
154 |
// TODO close inputstream |
|
155 |
} |
|
156 |
|
|
157 |
/* (non-Javadoc) |
|
158 |
* @see java.util.Iterator#hasNext() |
|
159 |
*/ |
|
160 |
@Override |
|
161 |
public boolean hasNext() { |
|
162 |
if (recordQueue.isEmpty() && query.isEmpty()) { |
|
163 |
disconnect(); |
|
164 |
return false; |
|
165 |
} else { |
|
166 |
return true; |
|
167 |
} |
|
168 |
} |
|
169 |
|
|
170 |
/* (non-Javadoc) |
|
171 |
* @see java.util.Iterator#next() |
|
172 |
*/ |
|
173 |
@Override |
|
174 |
public String next() { |
|
175 |
synchronized (recordQueue) { |
|
176 |
while (recordQueue.isEmpty() && !query.isEmpty()) { |
|
177 |
try { |
|
178 |
log.debug("RestIterator.next():: get Query: " + query); |
|
179 |
query = downloadPage(query); |
|
180 |
log.debug("RestIterator.next():: next queryURL from downloadPage(): " + query); |
|
181 |
} catch (CollectorServiceException e) { |
|
182 |
log.debug("RestIterator.next():: CollectorPlugin.next()-Exception: " + e); |
|
183 |
throw new RuntimeException(e); |
|
184 |
} |
|
185 |
} |
|
186 |
return recordQueue.poll(); |
|
187 |
} |
|
188 |
} |
|
189 |
|
|
190 |
/* |
|
191 |
* download page and return nextQuery |
|
192 |
*/ |
|
193 |
private String downloadPage(String query) throws CollectorServiceException { |
|
194 |
String resultJson; |
|
195 |
String resultXml = XML_HEADER; |
|
196 |
String nextQuery = ""; |
|
197 |
Node resultNode = null; |
|
198 |
NodeList nodeList = null; |
|
199 |
String qUrlArgument = ""; |
|
200 |
int urlOldResumptionSize = 0; |
|
201 |
InputStream theHttpInputStream; |
|
202 |
|
|
203 |
// modifying request URL |
|
204 |
// check if cursor=* is initial set otherwise add it to the queryParam URL |
|
205 |
if (resumptionType.equalsIgnoreCase("deep-cursor")) { |
|
206 |
log.debug("RestIterator.downloadPage():: check resumptionType deep-cursor and check cursor=*?" + query); |
|
207 |
if (!query.contains("&cursor=")) { |
|
208 |
query += "&cursor=*"; |
|
209 |
} |
|
210 |
} |
|
211 |
// find pagination page start number in queryParam and remove before start the first query |
|
212 |
if((resumptionType.toLowerCase().equals("pagination")) && (query.contains("paginationStart="))) { |
|
213 |
|
|
214 |
final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query); |
|
215 |
m.find(); // guaranteed to be true for this regex |
|
216 |
|
|
217 |
String[] pageVal = m.group(0).split("="); |
|
218 |
pagination = Integer.parseInt(pageVal[1]); |
|
219 |
|
|
220 |
// remove page start number from queryParams |
|
221 |
query = query.replaceFirst("&?paginationStart=[0-9]+", ""); |
|
222 |
|
|
223 |
} |
|
224 |
|
|
225 |
|
|
226 |
try { |
|
227 |
|
|
228 |
URL qUrl = new URL(query); |
|
229 |
log.debug("authMethod :" + authMethod); |
|
230 |
if (this.authMethod == "bearer") { |
|
231 |
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); |
|
232 |
requestHeaders.put("Authorization", "Bearer " + authToken); |
|
233 |
//requestHeaders.put("Content-Type", "application/json"); |
|
234 |
} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) { |
|
235 |
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml); |
|
236 |
requestHeaders.put("Authorization", "Basic " + authToken); |
|
237 |
//requestHeaders.put("accept", "application/xml"); |
|
238 |
} |
|
239 |
|
|
240 |
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); |
|
241 |
conn.setRequestMethod("GET"); |
|
242 |
this.setRequestHeader(conn); |
|
243 |
resultStream = conn.getInputStream(); |
|
244 |
|
|
245 |
if ("json".equals(resultOutputFormat)) { |
|
246 |
resultJson = IOUtils.toString(resultStream, "UTF-8"); |
|
247 |
resultXml = jsonUtils.convertToXML(resultJson); |
|
248 |
resultStream = IOUtils.toInputStream(resultXml, "UTF-8"); |
|
249 |
} |
|
250 |
|
|
251 |
if (!isEmptyXml(resultXml)) { |
|
252 |
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE); |
|
253 |
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET); |
|
254 |
log.debug("RestIterator.downloadPage():: nodeList.length=" + nodeList.getLength()); |
|
255 |
for (int i = 0; i < nodeList.getLength(); i++) { |
|
256 |
StringWriter sw = new StringWriter(); |
|
257 |
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); |
|
258 |
String toEnqueue = sw.toString(); |
|
259 |
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) { |
|
260 |
log.warn("RestIterator.downloadPage():: The following record resulted in empty item for the feeding queue: " + resultXml); |
|
261 |
} else { |
|
262 |
recordQueue.add(sw.toString()); |
|
263 |
} |
|
264 |
} |
|
265 |
} else { |
|
266 |
log.warn("resultXml is equal with emptyXml"); |
|
267 |
} |
|
268 |
|
|
269 |
resumptionInt += resultSizeValue; |
|
270 |
|
|
271 |
switch (resumptionType.toLowerCase()) { |
|
272 |
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items |
|
273 |
resumptionStr = xprResumptionPath.evaluate(resultNode); |
|
274 |
break; |
|
275 |
|
|
276 |
case "count": // begin at one step for all records, iterate over items |
|
277 |
resumptionStr = Integer.toString(resumptionInt); |
|
278 |
break; |
|
279 |
|
|
280 |
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) |
|
281 |
if (resultSizeValue < 2) { |
|
282 |
log.debug("RestIterator.downloadPage().discover:: ode: discover, Param 'resultSizeValue' must greater then 1"); |
|
283 |
throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' must greater then 1"); |
|
284 |
} |
|
285 |
log.debug("RestIterator.downloadPage().discover:: resumptionInt="+Integer.toString(resumptionInt)+"; "); |
|
286 |
qUrlArgument = qUrl.getQuery(); |
|
287 |
|
|
288 |
if( qUrlArgument != null ) { |
|
289 |
String[] arrayQUrlArgument = qUrlArgument.split("&"); |
|
290 |
|
|
291 |
// check if URL arguments given |
|
292 |
if( arrayQUrlArgument != null ) { |
|
293 |
for (String arrayUrlArgStr : arrayQUrlArgument) { |
|
294 |
log.debug("RestIterator.downloadPage/discover:: "+arrayUrlArgStr); |
|
295 |
if (arrayUrlArgStr.startsWith(resumptionParam)) { |
|
296 |
String[] resumptionKeyValue = arrayUrlArgStr.split("="); |
|
297 |
if (isInteger(resumptionKeyValue[1])) { |
|
298 |
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); |
|
299 |
log.debug("RestIterator.downloadPage():discover:: OldResumptionSize from Url (int): " + urlOldResumptionSize); |
|
300 |
} else { |
|
301 |
log.debug("RestIterator.downloadPage().discover:: OldResumptionSize from Url (str): " + resumptionKeyValue[1]); |
|
302 |
} |
|
303 |
} |
|
304 |
} |
|
305 |
} |
|
306 |
} |
|
307 |
log.debug("RestIterator.downloadPage().discover:: nodeList.length=" + nodeList.getLength()); |
|
308 |
|
|
309 |
if (isEmptyXml(resultXml) || ((nodeList != null) && (nodeList.getLength() < resultSizeValue)) |
|
310 |
) { |
|
311 |
// resumptionStr = ""; |
|
312 |
if (nodeList != null) { |
|
313 |
discoverResultSize += nodeList.getLength(); |
|
314 |
} |
|
315 |
resultTotal = discoverResultSize; |
|
316 |
} else { |
|
317 |
resumptionStr = Integer.toString(resumptionInt); |
|
318 |
resultTotal = resumptionInt + 1; |
|
319 |
if (nodeList != null) { |
|
320 |
discoverResultSize += nodeList.getLength(); |
|
321 |
} |
|
322 |
} |
|
323 |
log.debug("RestIterator.downloadPage().discover:: discoverResultSize=" + discoverResultSize); |
|
324 |
break; |
|
325 |
|
|
326 |
case "pagination": |
|
327 |
case "page": // pagination, iterate over page numbers |
|
328 |
// find start page number |
|
329 |
pagination += 1; |
|
330 |
if (nodeList != null) { |
|
331 |
discoverResultSize += nodeList.getLength(); |
|
332 |
} else { |
|
333 |
resultTotal = discoverResultSize; |
|
334 |
pagination = discoverResultSize; |
|
335 |
} |
|
336 |
resumptionInt = pagination; |
|
337 |
resumptionStr = Integer.toString(resumptionInt); |
|
338 |
|
|
339 |
log.debug("RestIterator.downloadPage().pagination:: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams + " ; resultTotal: " + resultTotal + " ; discoverResultSize: " + discoverResultSize); |
|
340 |
|
|
341 |
break; |
|
342 |
|
|
343 |
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in solr) |
|
344 |
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");} |
|
345 |
|
|
346 |
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode)); |
|
347 |
queryParams = queryParams.replace("&cursor=*", ""); |
|
348 |
|
|
349 |
// terminating if length of nodeList is 0 |
|
350 |
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) { |
|
351 |
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue); |
|
352 |
} else { |
|
353 |
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue because the iteration is over real length and the resultSizeValue is added before the switch() |
|
354 |
} |
|
355 |
|
|
356 |
discoverResultSize = nodeList.getLength(); |
|
357 |
|
|
358 |
log.debug("RestIterator.downloadPage().deep-cursor:: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams + " resumptionLengthIncreased: " + resumptionInt); |
|
359 |
|
|
360 |
break; |
|
361 |
|
|
362 |
default: // otherwise: abort |
|
363 |
// resultTotal = resumptionInt; |
|
364 |
break; |
|
365 |
} |
|
366 |
|
|
367 |
} catch (Exception e) { |
|
368 |
log.error(e); |
|
369 |
throw new IllegalStateException("collection failed: " + e.getMessage()); |
|
370 |
} |
|
371 |
|
|
372 |
try { |
|
373 |
String resultTotalXpathEval = xprResultTotalPath.evaluate(resultNode); |
|
374 |
|
|
375 |
log.debug("downloadPage():: resInt: " +resumptionInt + "; resultTotal: " + resultTotal + " ; resultTotalXpath eval.: " + resultTotalXpathEval ); |
|
376 |
if ((resultTotal == -1) && (!resultTotalXpathEval.isEmpty())) { |
|
377 |
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); |
|
378 |
if (resumptionType.toLowerCase().equals("page") && !AUTHBASIC.equalsIgnoreCase(authMethod)) { |
|
379 |
resultTotal += 1; |
|
380 |
} // to correct the upper bound |
|
381 |
log.info("resultTotal was -1 is now: " + resultTotal); |
|
382 |
} |
|
383 |
} catch (Exception e) { |
|
384 |
log.error(e); |
|
385 |
throw new IllegalStateException("downloadPage() resultTotal couldn't parse: " + e.getMessage()); |
|
386 |
} |
|
387 |
log.debug("resultTotal: " + resultTotal + " ; resInt: " + resumptionInt); |
|
388 |
if (resumptionInt <= resultTotal) { |
|
389 |
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat; |
|
390 |
} else { |
|
391 |
nextQuery = ""; |
|
392 |
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the resumptionInt and prevent a NullPointer Exception at mdStore |
|
393 |
} |
|
394 |
log.debug("downloadPage() nextQueryUrl: " + nextQuery); |
|
395 |
return nextQuery; |
|
396 |
|
|
397 |
|
|
398 |
} |
|
399 |
|
|
400 |
private boolean isEmptyXml(String s){ |
|
401 |
return EMPTY_XML.equalsIgnoreCase(s); |
|
402 |
} |
|
403 |
|
|
404 |
|
|
405 |
private boolean isInteger(String s) { |
|
406 |
boolean isValidInteger = false; |
|
407 |
try { |
|
408 |
Integer.parseInt(s); |
|
409 |
|
|
410 |
// s is a valid integer |
|
411 |
|
|
412 |
isValidInteger = true; |
|
413 |
} catch (NumberFormatException ex) { |
|
414 |
// s is not an integer |
|
415 |
} |
|
416 |
|
|
417 |
return isValidInteger; |
|
418 |
} |
|
419 |
|
|
420 |
// Method to encode a string value using `UTF-8` encoding scheme |
|
421 |
private String encodeValue(String value) { |
|
422 |
try { |
|
423 |
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString()); |
|
424 |
} catch (UnsupportedEncodingException ex) { |
|
425 |
throw new RuntimeException(ex.getCause()); |
|
426 |
} |
|
427 |
} |
|
428 |
|
|
429 |
/** |
|
430 |
* setRequestHeader |
|
431 |
* |
|
432 |
* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value. |
|
433 |
* @param conn |
|
434 |
*/ |
|
435 |
private void setRequestHeader(HttpURLConnection conn) { |
|
436 |
if (requestHeaders != null) { |
|
437 |
for (String key : requestHeaders.keySet()) { |
|
438 |
conn.setRequestProperty(key, requestHeaders.get(key)); |
|
439 |
} |
|
440 |
log.debug("Set Request Header with: " + requestHeaders); |
|
441 |
} |
|
442 |
|
|
443 |
} |
|
444 |
|
|
445 |
public String getResultFormatValue() { |
|
446 |
return resultFormatValue; |
|
447 |
} |
|
448 |
|
|
449 |
public String getResultOutputFormat() { |
|
450 |
return resultOutputFormat; |
|
451 |
} |
|
452 |
|
|
453 |
} |
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.schemaorg; |
|
2 |
|
|
3 |
import org.apache.commons.logging.Log; |
|
4 |
import org.apache.commons.logging.LogFactory; |
|
5 |
import org.json.JSONObject; |
|
6 |
|
|
7 |
import java.net.URL; |
|
8 |
import java.time.LocalDate; |
|
9 |
import java.time.format.DateTimeFormatter; |
|
10 |
import java.util.*; |
|
11 |
|
|
12 |
public class DatasetMappingIterator implements Iterator<String> { |
|
13 |
private static final Log log = LogFactory.getLog(EndpointAccessIterator.class); |
|
14 |
|
|
15 |
public static class Options { |
|
16 |
public static class IdentifierOptions{ |
|
17 |
public List<String> mappingARK; |
|
18 |
public List<String> mappingDOI; |
|
19 |
public List<String> mappingHandle; |
|
20 |
public List<String> mappingPURL; |
|
21 |
public List<String> mappingURN; |
|
22 |
public List<String> mappingURL; |
|
23 |
public DatasetDocument.Identifier.IdentifierType fallbackType; |
|
24 |
public Boolean fallbackURL; |
|
25 |
} |
|
26 |
|
|
27 |
public static class ContributorOptions{ |
|
28 |
public DatasetDocument.Contributor.ContributorType fallbackType; |
|
29 |
} |
|
30 |
|
|
31 |
public static class PublicationDateOptions{ |
|
32 |
public String format; |
|
33 |
} |
|
34 |
|
|
35 |
public static class CreatedDateOptions{ |
|
36 |
public String format; |
|
37 |
} |
|
38 |
|
|
39 |
public static class UpdatedDateOptions{ |
|
40 |
public String format; |
|
41 |
} |
|
42 |
|
|
43 |
private IdentifierOptions identifierOptions; |
|
44 |
private PublicationDateOptions publicationDateOptions; |
|
45 |
private ContributorOptions contributorOptions; |
|
46 |
private CreatedDateOptions createdDateOptions; |
|
47 |
private UpdatedDateOptions updatedDateOptions; |
|
48 |
|
|
49 |
public UpdatedDateOptions getUpdatedDateOptions() { |
|
50 |
return updatedDateOptions; |
|
51 |
} |
|
52 |
|
|
53 |
public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) { |
|
54 |
this.updatedDateOptions = updatedDateOptions; |
|
55 |
} |
|
56 |
|
|
57 |
public CreatedDateOptions getCreatedDateOptions() { |
|
58 |
return createdDateOptions; |
|
59 |
} |
|
60 |
|
|
61 |
public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) { |
|
62 |
this.createdDateOptions = createdDateOptions; |
|
63 |
} |
|
64 |
|
|
65 |
public ContributorOptions getContributorOptions() { |
|
66 |
return contributorOptions; |
|
67 |
} |
|
68 |
|
|
69 |
public void setContributorOptions(ContributorOptions contributorOptions) { |
|
70 |
this.contributorOptions = contributorOptions; |
|
71 |
} |
|
72 |
|
|
73 |
public PublicationDateOptions getPublicationDateOptions() { |
|
74 |
return publicationDateOptions; |
|
75 |
} |
|
76 |
|
|
77 |
public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) { |
|
78 |
this.publicationDateOptions = publicationDateOptions; |
|
79 |
} |
|
80 |
|
|
81 |
public IdentifierOptions getIdentifierOptions() { |
|
82 |
return identifierOptions; |
|
83 |
} |
|
84 |
|
|
85 |
public void setIdentifierOptions(IdentifierOptions identifierOptions) { |
|
86 |
this.identifierOptions = identifierOptions; |
|
87 |
} |
|
88 |
} |
|
89 |
|
|
90 |
// Mapping configuration (identifier fallbacks, date formats, contributor fallback).
private Options options;
// Upstream iterator yielding one raw schema.org JSON-LD document per call to next().
private EndpointAccessIterator endpointAccessIterator;

/**
 * Wraps an EndpointAccessIterator and maps each JSON-LD document it yields
 * into a dataset XML record, driven by the given options.
 */
public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
	this.options = options;
	this.endpointAccessIterator = endpointAccessIterator;
}
|
97 |
|
|
98 |
@Override |
|
99 |
public boolean hasNext() { |
|
100 |
return this.endpointAccessIterator.hasNext(); |
|
101 |
} |
|
102 |
|
|
103 |
@Override |
|
104 |
public String next() { |
|
105 |
JSONObject document = this.endpointAccessIterator.next(); |
|
106 |
String xml = null; |
|
107 |
if (document == null) { |
|
108 |
log.debug("no document provided to process. returning empty"); |
|
109 |
xml = DatasetDocument.emptyXml(); |
|
110 |
} |
|
111 |
else { |
|
112 |
log.debug("building document"); |
|
113 |
xml = this.buildDataset(document); |
|
114 |
if (!Utils.validateXml(xml)) { |
|
115 |
log.debug("xml not valid. setting to empty"); |
|
116 |
xml = null; |
|
117 |
} |
|
118 |
if (xml == null) { |
|
119 |
log.debug("could not build xml. returning empty"); |
|
120 |
xml = DatasetDocument.emptyXml(); |
|
121 |
} |
|
122 |
} |
|
123 |
|
|
124 |
//if all else fails |
|
125 |
if(xml == null){ |
|
126 |
log.debug("could not build xml. returning empty"); |
|
127 |
xml = "<dataset/>"; |
|
128 |
} |
|
129 |
|
|
130 |
log.debug("xml document for dataset is: "+xml); |
|
131 |
|
|
132 |
return xml; |
|
133 |
} |
|
134 |
|
|
135 |
/**
 * Maps one schema.org JSON-LD document onto a DatasetDocument and serializes it to XML.
 * Returns null when any extraction or the serialization throws (logged, not rethrown),
 * so the caller can substitute an empty record.
 */
private String buildDataset(JSONObject document){
	String xml = null;
	try{
		DatasetDocument dataset = new DatasetDocument();

		// Populate every supported field from the JSON-LD document, one extractor each.
		dataset.setIdentifiers(this.extractIdentifier(document));
		dataset.setCreators(this.extractCreator(document));
		dataset.setTitles(this.extractTitles(document));
		dataset.setAlternativeTitles(this.extractAlternateTitles(document));
		dataset.setPublishers(this.extractPublisher(document));
		dataset.setPublicationDates(this.extractPublicationDate(document));
		dataset.setSubjects(this.extractSubjects(document));
		dataset.setContributors(this.extractContributors(document));
		dataset.setCreatedDates(this.extractCreatedDate(document));
		dataset.setUpdatedDates(this.extractUpdatedDate(document));
		dataset.setLanguages(this.extractLanguages(document));
		dataset.setResourceTypes(this.extractResourceTypes(document));
		dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
		dataset.setCitations(this.extractCitations(document));
		dataset.setSizes(this.extractSize(document));
		dataset.setFormat(this.extractEncodingFormat(document));
		dataset.setVersion(this.extractVersion(document));
		dataset.setLicenses(this.extractLicense(document));
		dataset.setDescriptions(this.extractDescription(document));
		dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
		dataset.setGeoLocations(this.extractSpatialCoverage(document));

		log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));

		// No native identifier found: optionally (per options) fall back to the document URL.
		if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
			this.options.getIdentifierOptions().fallbackURL){
			log.debug("falling back to url identifier");
			dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
			log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
		}

		xml = dataset.toXml();
	}
	catch(Exception ex){
		// Best-effort mapping: any failure yields a null record for the caller to replace.
		log.error("problem constructing dataset xml. returning empty", ex);
		xml = null;
	}
	return xml;
}
|
179 |
|
|
180 |
private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){ |
|
181 |
List<String> urls = JSONLDUtils.extractString(document, "url"); |
|
182 |
|
|
183 |
ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>(); |
|
184 |
for(String item : urls){ |
|
185 |
if(item == null || item.trim().length() == 0) continue; |
|
186 |
curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL, item.trim())); |
|
187 |
} |
|
188 |
return curated; |
|
189 |
} |
|
190 |
|
|
191 |
/**
 * Extracts schema.org "spatialCoverage" places into SpatialCoverage entries,
 * each carrying the place name, its point coordinates and its geo-shape boxes.
 */
private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
	List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");

	ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
	for(JSONLDUtils.PlaceInfo item : spatials){
		// Skip places with no usable content at all: no name, no coordinates, no shapes.
		if((item.name == null || item.name.trim().length() == 0) &&
			(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
			(item.geoShapes == null || item.geoShapes.size() == 0)) continue;

		List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
		List<String> boxes = new ArrayList<>();
		// Collect lat/lon points, if any.
		if(item.geoCoordinates!=null) {
			for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
				points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
			}
		}
		// Collect bounding boxes, if any.
		if(item.geoShapes!=null) {
			for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
				boxes.add(iter.box);
			}
		}
		curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
	}
	return curated;
}
|
216 |
|
|
217 |
private List<String> extractDescription(JSONObject document){ |
|
218 |
List<String> descriptions = JSONLDUtils.extractString(document, "description"); |
|
219 |
|
|
220 |
ArrayList<String> curated = new ArrayList<>(); |
|
221 |
for(String item : descriptions){ |
|
222 |
if(item == null || item.trim().length() == 0) continue; |
|
223 |
curated.add(item); |
|
224 |
} |
|
225 |
return curated; |
|
226 |
} |
|
227 |
|
|
228 |
private List<String> extractDisambiguatingDescription(JSONObject document){ |
|
229 |
List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription"); |
|
230 |
|
|
231 |
ArrayList<String> curated = new ArrayList<>(); |
|
232 |
for(String item : descriptions){ |
|
233 |
if(item == null || item.trim().length() == 0) continue; |
|
234 |
curated.add(item); |
|
235 |
} |
|
236 |
return curated; |
|
237 |
} |
|
238 |
|
|
239 |
private List<DatasetDocument.License> extractLicense(JSONObject document){ |
|
240 |
List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license"); |
|
241 |
|
|
242 |
ArrayList<DatasetDocument.License> curated = new ArrayList<>(); |
|
243 |
for(JSONLDUtils.LicenseInfo item : licenses){ |
|
244 |
if(item.url == null || item.url.trim().length() == 0) continue; |
|
245 |
curated.add(new DatasetDocument.License(item.name, item.url)); |
|
246 |
} |
|
247 |
return curated; |
|
248 |
} |
|
249 |
|
|
250 |
private List<String> extractVersion(JSONObject document){ |
|
251 |
List<String> versions = JSONLDUtils.extractString(document, "version"); |
|
252 |
|
|
253 |
ArrayList<String> curated = new ArrayList<>(); |
|
254 |
for(String item : versions){ |
|
255 |
if(item == null || item.trim().length() == 0) continue; |
|
256 |
curated.add(item); |
|
257 |
} |
|
258 |
return curated; |
|
259 |
} |
|
260 |
|
|
261 |
private List<String> extractSize(JSONObject document) { |
|
262 |
List<String> sizes = JSONLDUtils.extractSize(document, "distribution"); |
|
263 |
|
|
264 |
HashSet<String> curated = new HashSet<>(); |
|
265 |
for (String item : sizes) { |
|
266 |
if (item == null || item.trim().length() == 0) continue; |
|
267 |
curated.add(item); |
|
268 |
} |
|
269 |
return new ArrayList<>(curated); |
|
270 |
} |
|
271 |
|
|
272 |
private List<String> extractEncodingFormat(JSONObject document){ |
|
273 |
List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution"); |
|
274 |
|
|
275 |
HashSet<String> curated = new HashSet<>(); |
|
276 |
for(String item : formats){ |
|
277 |
if(item == null || item.trim().length() == 0) continue; |
|
278 |
curated.add(item); |
|
279 |
} |
|
280 |
return new ArrayList<>(curated); |
|
281 |
} |
|
282 |
|
|
283 |
//TODO: Handle different citation types. Currently only urls |
|
284 |
private List<DatasetDocument.Citation> extractCitations(JSONObject document){ |
|
285 |
List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation"); |
|
286 |
|
|
287 |
ArrayList<DatasetDocument.Citation> curated = new ArrayList<>(); |
|
288 |
for(JSONLDUtils.CitationInfo item : citations){ |
|
289 |
if(item.url == null || item.url.trim().length() == 0) continue; |
|
290 |
try{ |
|
291 |
new URL(item.url); |
|
292 |
}catch (Exception ex){ |
|
293 |
continue; |
|
294 |
} |
|
295 |
curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL)); |
|
296 |
} |
|
297 |
return curated; |
|
298 |
} |
|
299 |
|
|
300 |
private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){ |
|
301 |
List<String> issns = JSONLDUtils.extractString(document, "issn"); |
|
302 |
List<String> urls = JSONLDUtils.extractString(document, "url"); |
|
303 |
|
|
304 |
ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>(); |
|
305 |
for(String item : issns){ |
|
306 |
if(item == null || item.trim().length() == 0) continue; |
|
307 |
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN")); |
|
308 |
} |
|
309 |
for(String item : urls){ |
|
310 |
if(item == null || item.trim().length() == 0) continue; |
|
311 |
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL")); |
|
312 |
} |
|
313 |
return curated; |
|
314 |
} |
|
315 |
|
|
316 |
private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){ |
|
317 |
List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>(); |
|
318 |
resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset)); |
|
319 |
return resourceTypes; |
|
320 |
} |
|
321 |
|
|
322 |
private List<String> extractLanguages(JSONObject document){ |
|
323 |
List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage"); |
|
324 |
|
|
325 |
ArrayList<String> curated = new ArrayList<>(); |
|
326 |
for(String item : languages){ |
|
327 |
if(item == null || item.trim().length() == 0) continue; |
|
328 |
curated.add(item); |
|
329 |
} |
|
330 |
return curated; |
|
331 |
} |
|
332 |
|
|
333 |
private List<LocalDate> extractUpdatedDate(JSONObject document){ |
|
334 |
List<LocalDate> updatedDates = new ArrayList<>(); |
|
335 |
if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates; |
|
336 |
|
|
337 |
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format); |
|
338 |
|
|
339 |
List<String> dates = JSONLDUtils.extractString(document, "dateModified"); |
|
340 |
for(String updatedDate : dates){ |
|
341 |
if(updatedDate == null || updatedDate.trim().length() == 0) continue; |
|
342 |
try { |
|
343 |
LocalDate localDate = LocalDate.parse(updatedDate, formatter); |
|
344 |
updatedDates.add(localDate); |
|
345 |
} catch (Exception e) { |
|
346 |
continue; |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-collector-plugins-1.6.0