Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable;
5
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
6
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
7
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable;
8
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
9
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12

    
13
import java.net.MalformedURLException;
14
import java.net.URL;
15
import java.nio.charset.StandardCharsets;
16
import java.util.concurrent.TimeUnit;
17

    
18
public class SchemaOrgPlugin extends AbstractCollectorPlugin {
19

    
20
    private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class);
21

    
22
    public String hello(){
23
        return "hello";
24
    }
25

    
26
    @Override
27
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
28
        try {
29
			RepositoryIterable repository = null;
30
        	String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null);
31
        	switch(repositoryAccessType) {
32
				case "sitemapindex": {
33
					SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor);
34
					SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions);
35
					repositoryIterable.bootstrap();
36
					repository = repositoryIterable;
37
					break;
38
				}
39
				case "httpapi-kaggle": {
40
					KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor);
41
					KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions);
42
					repositoryIterable.bootstrap();
43
					repository = repositoryIterable;
44
					break;
45
				}
46
				default:
47
					throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType));
48
			}
49
			SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor);
50
            SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository);
51
            return iterable;
52
        } catch (Exception e) {
53
            throw new CollectorServiceException("Could not create iterator", e);
54
        }
55
    }
56

    
57
	private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
58
		KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options();
59
		kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100));
60
		kaggleRepositoryOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20));
61
		kaggleRepositoryOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
62
		kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8));
63
		kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null));
64
		kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}"));
65
		kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"));
66
		kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"));
67
		kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"));
68
		kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl()));
69
		kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
70
		return kaggleRepositoryOptions;
71

    
72
	}
73

    
74
    private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
75
		SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options();
76
		sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8));
77
		sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl()));
78
		return sitemapIndexIteratorOptions;
79

    
80
	}
81

    
82
	private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
83
		SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options();
84
		sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8));
85
		sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class));
86
		sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class));
87
		return sitemapFileIteratorOptions;
88
	}
89

    
90
	private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
91
		RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options();
92
		repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true));
93
		repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2));
94
		repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
95
		return repositoryQueueIteratorOptions;
96
	}
97

    
98
	private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
99
		SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options();
100
		sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100));
101
		sitemapIndexRepositoryIterableOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20));
102
		sitemapIndexRepositoryIterableOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
103
		sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
104
		sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor));
105
		sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor));
106
		return sitemapIndexRepositoryIterableOptions;
107
	}
108

    
109
	private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
110
		EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options();
111
		endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8));
112
		return endpointAccessIteratorOptions;
113
	}
114

    
115
	private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
116
		DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options();
117

    
118
		DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions();
119
		datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD");
120
		datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions);
121

    
122
		DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions();
123
		datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD");
124
		datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions);
125

    
126
		DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions();
127
		datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD");
128
		datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions);
129

    
130
		DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions();
131
		datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class);
132
		datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions);
133

    
134
		DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions();
135
		datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class);
136
		datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true);
137
		datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null);
138
		datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null);
139
		datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null);
140
		datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null);
141
		datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null);
142
		datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null);
143
		datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions);
144
		return datasetMappingIteratorOptions;
145
	}
146

    
147
	private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
148
		SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options();
149
		schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor));
150
		schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor));
151
		return schemaOrgIterableOptions;
152
	}
153
}
(10-10/11)