Project

General

Profile

« Previous | Next » 

Revision 62189

[maven-release-plugin] copy for tag dnet-collector-plugins-1.6.0

View differences:

modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainKaggle.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
4
import org.apache.commons.io.FileUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.apache.log4j.ConsoleAppender;
8
import org.apache.log4j.Level;
9
import org.apache.log4j.Logger;
10
import org.apache.log4j.PatternLayout;
11

  
12
import java.io.File;
13
import java.nio.charset.StandardCharsets;
14
import java.util.HashMap;
15
import java.util.concurrent.TimeUnit;
16

  
17
public class SchemaOrgMainKaggle {
18

  
19
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
20

  
21
    public static void main(String[] args) throws Exception {
22

  
23
        ConsoleAppender console = new ConsoleAppender();
24
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
25
        console.setThreshold(Level.DEBUG);
26
        console.activateOptions();
27
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
28

  
29
        HashMap<String,String> params = new HashMap<>();
30
        params.put("consumerBlockPolling", Boolean.toString(true));
31
        params.put("consumerBlockPollingTimeout", "2");
32
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
33
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
34
        params.put("updatedDateFormat", "YYYY-MM-DD");
35
        params.put("createdDateFormat", "YYYY-MM-DD");
36
        params.put("publicationDateFormat", "YYYY-MM-DD");
37
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
38
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
39
        params.put("identifierFallbackURL", Boolean.toString(true));
40
        params.put("identifierMappingARK", "ark, ARK");
41
        params.put("identifierMappingDOI", "doi, DOI");
42
        params.put("identifierMappingHandle", "Handle, HANDLE");
43
        params.put("identifierMappingPURL", "purl, PURL");
44
        params.put("identifierMappingURN", "urn, URN");
45
        params.put("identifierMappingURL", "url, URL");
46

  
47
        params.put("repositoryAccessType", "httpapi-kaggle");
48

  
49
        params.put("httpapi-kaggle_queueSize", "100");
50
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
51
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
52
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
53
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
54
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
55
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
56
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
57
        params.put("httpapi-kaggle_producerBlockPollingTimeout", "2");
58
        params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
59

  
60
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
61
        descriptor.setId("schema.org - kaggle");
62
        descriptor.setBaseUrl("https://www.kaggle.com");
63

  
64
        descriptor.setParams(params);
65

  
66
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
67

  
68
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
69

  
70
        String outDir = params.get("repositoryAccessType");
71

  
72
        log.info("saving content in " + outDir);
73

  
74
        File directory = new File(outDir);
75
        if (directory.exists()) {
76
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
77
            FileUtils.deleteDirectory(directory);
78
        }
79
        FileUtils.forceMkdir(directory);
80
        Utils.writeFiles(iterable, outDir);
81

  
82
    }
83

  
84
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgPlugin.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable;
5
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
6
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
7
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable;
8
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
9
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12

  
13
import java.net.MalformedURLException;
14
import java.net.URL;
15
import java.nio.charset.StandardCharsets;
16
import java.util.concurrent.TimeUnit;
17

  
18
public class SchemaOrgPlugin extends AbstractCollectorPlugin {
19

  
20
    private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class);
21

  
22
    public String hello(){
23
        return "hello";
24
    }
25

  
26
    @Override
27
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
28
        try {
29
			RepositoryIterable repository = null;
30
        	String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null);
31
        	switch(repositoryAccessType) {
32
				case "sitemapindex": {
33
					SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor);
34
					SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions);
35
					repositoryIterable.bootstrap();
36
					repository = repositoryIterable;
37
					break;
38
				}
39
				case "httpapi-kaggle": {
40
					KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor);
41
					KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions);
42
					repositoryIterable.bootstrap();
43
					repository = repositoryIterable;
44
					break;
45
				}
46
				default:
47
					throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType));
48
			}
49
			SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor);
50
            SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository);
51
            return iterable;
52
        } catch (Exception e) {
53
            throw new CollectorServiceException("Could not create iterator", e);
54
        }
55
    }
56

  
57
	private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
58
		KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options();
59
		kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100));
60
		kaggleRepositoryOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20));
61
		kaggleRepositoryOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
62
		kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8));
63
		kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null));
64
		kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}"));
65
		kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"));
66
		kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"));
67
		kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"));
68
		kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl()));
69
		kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
70
		return kaggleRepositoryOptions;
71

  
72
	}
73

  
74
    private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
75
		SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options();
76
		sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8));
77
		sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl()));
78
		return sitemapIndexIteratorOptions;
79

  
80
	}
81

  
82
	private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
83
		SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options();
84
		sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8));
85
		sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class));
86
		sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class));
87
		return sitemapFileIteratorOptions;
88
	}
89

  
90
	private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
91
		RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options();
92
		repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true));
93
		repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2));
94
		repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
95
		return repositoryQueueIteratorOptions;
96
	}
97

  
98
	private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
99
		SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options();
100
		sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100));
101
		sitemapIndexRepositoryIterableOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20));
102
		sitemapIndexRepositoryIterableOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
103
		sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
104
		sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor));
105
		sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor));
106
		return sitemapIndexRepositoryIterableOptions;
107
	}
108

  
109
	private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
110
		EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options();
111
		endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8));
112
		return endpointAccessIteratorOptions;
113
	}
114

  
115
	private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
116
		DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options();
117

  
118
		DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions();
119
		datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD");
120
		datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions);
121

  
122
		DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions();
123
		datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD");
124
		datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions);
125

  
126
		DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions();
127
		datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD");
128
		datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions);
129

  
130
		DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions();
131
		datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class);
132
		datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions);
133

  
134
		DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions();
135
		datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class);
136
		datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true);
137
		datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null);
138
		datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null);
139
		datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null);
140
		datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null);
141
		datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null);
142
		datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null);
143
		datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions);
144
		return datasetMappingIteratorOptions;
145
	}
146

  
147
	private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
148
		SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options();
149
		schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor));
150
		schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor));
151
		return schemaOrgIterableOptions;
152
	}
153
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/pom.xml
1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2
	<modelVersion>4.0.0</modelVersion>
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet45-parent</artifactId>
6
		<version>1.0.0</version>
7
	</parent>
8
	<groupId>eu.dnetlib</groupId>
9
	<artifactId>dnet-collector-plugins</artifactId>
10
	<version>1.6.0</version>
11
	<scm>
12
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0</developerConnection>
13
	</scm>
14

  
15
	<build>
16
		<plugins>
17
			<plugin>
18
				<artifactId>maven-assembly-plugin</artifactId>
19
				<configuration>
20
					<archive>
21
						<manifest>
22
							<mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass>
23
						</manifest>
24
					</archive>
25
					<descriptorRefs>
26
						<descriptorRef>jar-with-dependencies</descriptorRef>
27
					</descriptorRefs>
28
				</configuration>
29
			</plugin>
30
		</plugins>
31
	</build>
32

  
33
	<dependencies>
34
		<dependency>
35
			<groupId>eu.dnetlib</groupId>
36
			<artifactId>dnet-modular-collector-service-rmi</artifactId>
37
			<version>[1.3.0,2.0.0)</version>
38
		</dependency>
39
		<dependency>
40
			<groupId>eu.dnetlib</groupId>
41
			<artifactId>dnet-modular-collector-service</artifactId>
42
			<version>[3.3.26,4.0.0)</version>
43
		</dependency>
44
		<dependency>
45
			<groupId>com.google.code.gson</groupId>
46
			<artifactId>gson</artifactId>
47
			<version>${google.gson.version}</version>
48
		</dependency>
49
		<dependency>
50
			<groupId>commons-io</groupId>
51
			<artifactId>commons-io</artifactId>
52
			<version>${commons.io.version}</version>
53
		</dependency>
54
		<dependency>
55
			<groupId>junit</groupId>
56
			<artifactId>junit</artifactId>
57
			<version>${junit.version}</version>
58
			<scope>test</scope>
59
		</dependency>
60
		<dependency>
61
			<groupId>org.apache.httpcomponents</groupId>
62
			<artifactId>httpclient</artifactId>
63
			<version>4.5</version>
64
		</dependency>
65
		<dependency>
66
			<groupId>eu.dnetlib</groupId>
67
			<artifactId>cnr-resultset-service</artifactId>
68
			<version>[2.0.0, 3.0.0)</version>
69
			<scope>provided</scope>
70
		</dependency>
71
		<dependency>
72
			<groupId>com.ximpleware</groupId>
73
			<artifactId>vtd-xml</artifactId>
74
			<version>[2.12, 3.0.0)</version>
75
		</dependency>
76
		<dependency>
77
			<groupId>joda-time</groupId>
78
			<artifactId>joda-time</artifactId>
79
			<version>2.9.2</version>
80
		</dependency>
81

  
82
		<dependency>
83
			<groupId>org.json</groupId>
84
			<artifactId>json</artifactId>
85
			<version>20180813</version>
86
		 <type>jar</type>
87
		</dependency>
88
		<dependency>
89
			<groupId>org.apache.commons</groupId>
90
			<artifactId>commons-lang3</artifactId>
91
			<version>3.5</version>
92
		</dependency>
93

  
94
		<dependency>
95
			<groupId>org.apache.poi</groupId>
96
			<artifactId>poi</artifactId>
97
			<version>3.16</version>
98
		</dependency>
99
		<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
100
		<dependency>
101
			<groupId>org.apache.poi</groupId>
102
			<artifactId>poi-ooxml</artifactId>
103
			<version>3.16</version>
104
		</dependency>
105
		<dependency>
106
			<groupId>org.jsoup</groupId>
107
			<artifactId>jsoup</artifactId>
108
			<version>1.11.2</version>
109
		</dependency>
110
		<dependency>
111
			<groupId>commons-lang</groupId>
112
			<artifactId>commons-lang</artifactId>
113
			<version>2.6</version>
114
			<scope>compile</scope>
115
		</dependency>
116
        <dependency>
117
            <groupId>org.mockito</groupId>
118
            <artifactId>mockito-core</artifactId>
119
            <version>3.3.3</version>
120
            <scope>test</scope>
121
        </dependency>
122
    </dependencies>
123
</project>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/fairsharing/FairSharingPlugin.java
1
package eu.dnetlib.data.collector.plugins.fairsharing;
2

  
3
import java.io.UnsupportedEncodingException;
4

  
5
import org.apache.commons.io.IOUtils;
6
import org.apache.commons.lang3.StringUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.apache.http.HttpEntity;
10
import org.apache.http.client.methods.CloseableHttpResponse;
11
import org.apache.http.client.methods.HttpPost;
12
import org.apache.http.entity.StringEntity;
13
import org.apache.http.impl.client.CloseableHttpClient;
14
import org.apache.http.impl.client.HttpClients;
15
import org.json.JSONObject;
16

  
17
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
18
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
19
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
20

  
21
public class FairSharingPlugin extends AbstractCollectorPlugin {
22

  
23
	private static final int PAGE_SIZE = 100;
24

  
25
	private static final Log log = LogFactory.getLog(FairSharingPlugin.class); // NOPMD by marko on 11/24/08 5:02 PM
26

  
27
	// Suggested values:
28
	// baseUrl = https://api.fairsharing.org
29
	// XPATH_ID = /record/id
30

  
31
	@Override
32
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
33
		throws CollectorServiceException {
34

  
35
		final String baseUrl = interfaceDescriptor.getBaseUrl();
36
		final String login = interfaceDescriptor.getParams().get("login");
37
		final String password = interfaceDescriptor.getParams().get("password");
38

  
39
		final String authCode = login(baseUrl, login, password);
40

  
41
		log.debug("authCode: " + authCode);
42

  
43
		if (StringUtils.isBlank(authCode)) { throw new CollectorServiceException("Authorization failed: authCode is empty"); }
44

  
45
		return () -> new FairSharingIterator(baseUrl, authCode, PAGE_SIZE);
46
	}
47

  
48
	private String login(final String baseUrl, final String login, final String password) throws CollectorServiceException {
49
		final HttpPost req = new HttpPost(baseUrl + "/users/sign_in");
50
		req.addHeader("Accept", "application/json");
51
		req.addHeader("Content-Type", "application/json");
52
		req.setEntity(prepareCredentials(login, password));
53

  
54
		try (final CloseableHttpClient client = HttpClients.createDefault()) {
55
			try (final CloseableHttpResponse response = client.execute(req)) {
56
				final String content = IOUtils.toString(response.getEntity().getContent());
57
				final JSONObject obj = new JSONObject(content);
58
				return obj.getString("jwt");
59
			}
60
		} catch (final Exception e) {
61
			throw new CollectorServiceException("Error perfoming login", e);
62
		}
63
	}
64

  
65
	public HttpEntity prepareCredentials(final String login, final String password) throws CollectorServiceException {
66

  
67
		final JSONObject objUser = new JSONObject();
68
		objUser.put("login", login);
69
		objUser.put("password", password);
70

  
71
		final JSONObject objCredentials = new JSONObject();
72
		objCredentials.put("user", objUser);
73

  
74
		try {
75
			return new StringEntity(objCredentials.toString());
76
		} catch (final UnsupportedEncodingException e) {
77
			throw new CollectorServiceException("Error preparing http entity for login");
78
		}
79
	}
80

  
81
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/fairsharing/FairSharingIterator.java
1
package eu.dnetlib.data.collector.plugins.fairsharing;
2

  
3
import java.util.Iterator;
4
import java.util.Queue;
5
import java.util.concurrent.PriorityBlockingQueue;
6

  
7
import org.apache.commons.io.IOUtils;
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.apache.http.client.methods.CloseableHttpResponse;
11
import org.apache.http.client.methods.HttpGet;
12
import org.apache.http.impl.client.CloseableHttpClient;
13
import org.apache.http.impl.client.HttpClients;
14
import org.json.JSONObject;
15
import org.json.XML;
16

  
17
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
18

  
19
public class FairSharingIterator implements Iterator<String> {
20

  
21
	private static final Log log = LogFactory.getLog(FairSharingIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
22

  
23
	private final Queue<String> queue = new PriorityBlockingQueue<>();
24

  
25
	private final String baseUrl;
26
	private final String authCode;
27
	private final int pageSize;
28

  
29
	private String nextUrl;
30
	private boolean started;
31

  
32
	public FairSharingIterator(final String baseUrl, final String authCode, final int pageSize) {
33
		this.baseUrl = baseUrl;
34
		this.authCode = authCode;
35
		this.pageSize = pageSize;
36
		this.started = false;
37
	}
38

  
39
	private void verifyStarted() {
40
		if (!this.started) {
41
			this.started = true;
42
			try {
43
				final String url = baseUrl + "/fairsharing_records/?page%5Bnumber%5D=1&page%5Bsize%5D=" + pageSize;
44
				this.nextUrl = downloadPage(url);
45
			} catch (final CollectorServiceException e) {
46
				throw new RuntimeException(e);
47
			}
48
		}
49
	}
50

  
51
	@Override
52
	public boolean hasNext() {
53
		synchronized (queue) {
54
			verifyStarted();
55
			return !queue.isEmpty();
56
		}
57
	}
58

  
59
	@Override
60
	public String next() {
61
		synchronized (queue) {
62
			verifyStarted();
63
			final String res = queue.poll();
64
			while (queue.isEmpty() && nextUrl != null && !nextUrl.isEmpty()) {
65
				try {
66
					nextUrl = downloadPage(nextUrl);
67
				} catch (final CollectorServiceException e) {
68
					throw new RuntimeException(e);
69
				}
70
			}
71
			return res;
72
		}
73
	}
74

  
75
	@Override
76
	public void remove() {}
77

  
78
	private String downloadPage(final String url) throws CollectorServiceException {
79
		log.debug("Fetching url: " + url);
80

  
81
		final HttpGet req = new HttpGet(url);
82
		req.addHeader("Accept", "application/json");
83
		req.addHeader("Content-Type", "application/json");
84
		req.addHeader("Authorization", "Bearer " + authCode);
85

  
86
		try (final CloseableHttpClient client = HttpClients.createDefault()) {
87
			try (final CloseableHttpResponse response = client.execute(req)) {
88
				final String content = IOUtils.toString(response.getEntity().getContent());
89
				final JSONObject obj = new JSONObject(content);
90

  
91
				obj.getJSONArray("data")
92
					.forEach(x -> queue.add(XML.toString(x, "record")));
93

  
94
				final JSONObject links = obj.getJSONObject("links");
95

  
96
				return links.isNull("next") ? null : links.getString("next");
97
			}
98
		} catch (final Exception e) {
99
			throw new CollectorServiceException("Error perfoming call fro login", e);
100
		}
101
	}
102

  
103
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/eosc/EoscServicesPlugin.java
1
package eu.dnetlib.data.collector.plugins.eosc;
2

  
3
import org.apache.commons.lang3.math.NumberUtils;
4
import org.apache.http.client.methods.CloseableHttpResponse;
5
import org.apache.http.client.methods.HttpGet;
6
import org.apache.http.impl.client.CloseableHttpClient;
7
import org.apache.http.impl.client.HttpClients;
8
import org.dom4j.Document;
9
import org.dom4j.DocumentHelper;
10
import org.dom4j.Element;
11
import org.dom4j.Node;
12
import org.dom4j.io.SAXReader;
13

  
14
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
15
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
16
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
17

  
18
public class EoscServicesPlugin extends AbstractCollectorPlugin {
19

  
20
	// Suggested values:
21
	// baseUrl = https://api.eosc-portal.eu
22
	// maxProviders = 10000
23
	// XPATH_ID = /record/organization/id
24

  
25
	@SuppressWarnings("unchecked")
26
	@Override
27
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
28
		throws CollectorServiceException {
29

  
30
		final String baseUrl = interfaceDescriptor.getBaseUrl();
31
		final long maxProviders = NumberUtils.toLong(interfaceDescriptor.getParams().get("maxProviders"), 10000);
32

  
33
		return () -> httpCall(baseUrl + "/provider/all?quantity=" + maxProviders)
34
			.selectNodes("/Paging/results/results")
35
			.stream()
36
			.map(o -> processProvider(baseUrl, (Node) o))
37
			.iterator();
38
	}
39

  
40
	private String processProvider(final String baseUrl, final Node nodeProv) {
41
		final String orgId = nodeProv.valueOf("./id");
42

  
43
		final Document newDoc = DocumentHelper.createDocument();
44
		final Element newRoot = DocumentHelper.createElement("record");
45

  
46
		newDoc.setRootElement(newRoot);
47

  
48
		nodeProv.setName("organization");
49
		newRoot.add(nodeProv.detach());
50

  
51
		final Document docSrvs = httpCall(baseUrl + "/provider/services/" + orgId);
52
		for (final Object o : docSrvs.selectNodes("/List/item")) {
53
			final Node nodeSrv = (Node) o;
54
			nodeSrv.setName("service");
55
			newRoot.add(nodeSrv.detach());
56
		}
57

  
58
		return newDoc.asXML();
59
	}
60

  
61
	private Document httpCall(final String url) {
62
		final SAXReader reader = new SAXReader();
63

  
64
		final HttpGet req = new HttpGet(url);
65
		req.addHeader("Accept", "application/xml");
66

  
67
		try (final CloseableHttpClient client = HttpClients.createDefault()) {
68
			try (final CloseableHttpResponse response = client.execute(req)) {
69
				return reader.read(response.getEntity().getContent());
70
			}
71
		} catch (final Exception e) {
72
			throw new RuntimeException(e);
73
		}
74
	}
75

  
76
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/Connector.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import eu.dnetlib.data.collector.plugins.HttpConnector;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5

  
6

  
7
/**
8
 * Created by miriam on 07/05/2018.
9
 */
10
public class Connector extends HttpConnector implements ConnectorInterface  {
11
    private String response;
12

  
13
    @Override
14
    public void get(final String requestUrl) throws CollectorServiceException {
15
        response = getInputSource(requestUrl);
16
    }
17

  
18
    @Override
19
    public String getResponse() {
20
        return response;
21
    }
22

  
23
    @Override
24
    public boolean isStatusOk() {
25
        return (response != null);
26
    }
27

  
28
    @Override
29
    public boolean responseTypeContains(String string) {
30
        String responseType = getResponseType();
31
        if (responseType != null)
32
            return responseType.contains(string);
33
        return false;
34
    }
35

  
36

  
37
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/rest/RestIterator.java
1
/**
2
 * log.debug(...) equal to  log.trace(...) in the application-logs
3
 * <p>
4
 * known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
5
 */
6
package eu.dnetlib.data.collector.plugins.rest;
7

  
8
import java.io.InputStream;
9
import java.io.StringWriter;
10
import java.io.UnsupportedEncodingException;
11
import java.net.URL;
12
import java.net.URLEncoder;
13
import java.nio.charset.StandardCharsets;
14
import java.net.HttpURLConnection;
15
import java.util.Iterator;
16
import java.util.Map;
17
import java.util.Queue;
18
import java.util.concurrent.PriorityBlockingQueue;
19
import java.util.regex.Pattern;
20
import java.util.regex.Matcher;
21
import javax.xml.transform.OutputKeys;
22
import javax.xml.transform.Transformer;
23
import javax.xml.transform.TransformerConfigurationException;
24
import javax.xml.transform.TransformerFactory;
25
import javax.xml.transform.dom.DOMSource;
26
import javax.xml.transform.stream.StreamResult;
27
import javax.xml.xpath.*;
28

  
29
import com.google.common.collect.Maps;
30
import eu.dnetlib.data.collector.plugins.utils.JsonUtils;
31
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
32
import org.apache.commons.io.IOUtils;
33
import org.apache.commons.lang3.StringUtils;
34
import org.apache.commons.logging.Log;
35
import org.apache.commons.logging.LogFactory;
36
import org.apache.http.client.methods.CloseableHttpResponse;
37
import org.apache.http.client.methods.HttpGet;
38
import org.apache.http.impl.client.HttpClients;
39
import org.w3c.dom.Node;
40
import org.w3c.dom.NodeList;
41
import org.xml.sax.InputSource;
42

  
43
/**
44
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak, Alessia Bardi, Miriam Baglioni
45
 * @date 2020-04-09
46
 */
47
public class RestIterator implements Iterator<String> {
48
    private final String AUTHBASIC = "basic";
49

  
50
    // TODO: clean up the comments of replaced source code
51
    private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
52
    private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
53
    private static final String EMPTY_XML = XML_HEADER + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
54
    private JsonUtils jsonUtils;
55

  
56
    private String baseUrl;
57
    private String resumptionType;
58
    private String resumptionParam;
59
    private String resultFormatValue;
60
    private String queryParams = "";
61
    private int resultSizeValue;
62
    private int resumptionInt = 0;            // integer resumption token (first record to harvest)
63
    private int resultTotal = -1;
64
    private String resumptionStr = Integer.toString(resumptionInt);  // string resumption token (first record to harvest or token scanned from results)
65
    private InputStream resultStream;
66
    private Transformer transformer;
67
    private XPath xpath;
68
    private String query;
69
    private XPathExpression xprResultTotalPath;
70
    private XPathExpression xprResumptionPath;
71
    private XPathExpression xprEntity;
72
    private String queryFormat;
73
    private String querySize;
74
    private String authMethod;
75
    private String authToken;
76
    private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
77
    private int discoverResultSize = 0;
78
    private int pagination = 1;
79
    /*
80
    While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in json.
81
    useful for cases when the target API expects a resultFormatValue != json, but the results are returned in json.
82
    An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
83
     */
84
    private String resultOutputFormat;
85
    /*
86
    Can be used to set additional request headers, like for content negotiation
87
     */
88
    private Map<String, String> requestHeaders;
89

  
90

  
91
    /**
     * Creates an iterator over the records exposed by a paged REST endpoint.
     * No request is sent here: the first page is downloaded by the first call to {@link #next()}.
     *
     * @param baseUrl            address of the endpoint, without query string
     * @param resumptionType     paging strategy; the values handled by this class are
     *                           {@code scan}, {@code count}, {@code discover}, {@code pagination}, {@code page} and {@code deep-cursor}
     * @param resumptionParam    name of the request parameter carrying the resumption value
     * @param resumptionXpath    XPath of the resumption token in a response; {@code /} is used when blank
     * @param resultTotalXpath   XPath of the total number of results in a response
     * @param resultFormatParam  name of the format request parameter; omitted from the URL when blank
     * @param resultFormatValue  value of the format request parameter
     * @param resultSizeParam    name of the page size request parameter; omitted from the URL when blank
     * @param resultSizeValueStr page size, must be parseable as an integer
     * @param queryParams        additional query string, appended verbatim after {@code ?}
     * @param entityXpath        XPath selecting the individual records of a response
     * @param authMethod         {@code bearer} or {@code basic}; any other value sends no Authorization header
     * @param authToken          credential appended to the Authorization header
     * @param resultOutputFormat {@code json} when responses must be converted to XML before parsing
     * @param requestHeaders     extra request headers; {@code null} is treated as no extra headers
     * @throws NumberFormatException if {@code resultSizeValueStr} is not an integer
     * @throws IllegalStateException if the XML tooling cannot be set up or an XPath does not compile
     */
    public RestIterator(
            final String baseUrl,
            final String resumptionType,
            final String resumptionParam,
            final String resumptionXpath,
            final String resultTotalXpath,
            final String resultFormatParam,
            final String resultFormatValue,
            final String resultSizeParam,
            final String resultSizeValueStr,
            final String queryParams,
            final String entityXpath,
            final String authMethod,
            final String authToken,
            final String resultOutputFormat,
            final Map<String, String> requestHeaders
    ) {
        this.jsonUtils = new JsonUtils();
        this.baseUrl = baseUrl;
        this.resumptionType = resumptionType;
        this.resumptionParam = resumptionParam;
        this.resultFormatValue = resultFormatValue;
        this.queryParams = queryParams;
        this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
        this.authMethod = authMethod;
        this.authToken = authToken;
        this.resultOutputFormat = resultOutputFormat;
        this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();

        queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
        querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

        try {
            initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
        } catch (Exception e) {
            // keep the original exception as cause so the failing XPath / factory problem stays visible
            throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
        }
        initQueue();
    }
130

  
131

  
132
    private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
133
            throws TransformerConfigurationException, XPathExpressionException {
134
        transformer = TransformerFactory.newInstance().newTransformer();
135
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
136
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
137
        xpath = XPathFactory.newInstance().newXPath();
138
        xprResultTotalPath = xpath.compile(resultTotalXpath);
139
        xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
140
        xprEntity = xpath.compile(entityXpath);
141
    }
142

  
143
    private void initQueue() {
144
        if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
145
            query = baseUrl;
146
        } else {
147
            query = baseUrl + "?" + queryParams + querySize + queryFormat;
148
        }
149

  
150
        log.info("RestIterator.initQueue():: REST calls starting with " + query);
151
    }
152

  
153
    private void disconnect() {
154
        // TODO close inputstream
155
    }
156

  
157
    /* (non-Javadoc)
158
     * @see java.util.Iterator#hasNext()
159
     */
160
    @Override
161
    public boolean hasNext() {
162
        if (recordQueue.isEmpty() && query.isEmpty()) {
163
            disconnect();
164
            return false;
165
        } else {
166
            return true;
167
        }
168
    }
169

  
170
    /* (non-Javadoc)
171
     * @see java.util.Iterator#next()
172
     */
173
    @Override
174
    public String next() {
175
        synchronized (recordQueue) {
176
            while (recordQueue.isEmpty() && !query.isEmpty()) {
177
                try {
178
                    log.debug("RestIterator.next():: get Query: " + query);
179
                    query = downloadPage(query);
180
                    log.debug("RestIterator.next():: next queryURL from downloadPage(): " + query);
181
                } catch (CollectorServiceException e) {
182
                    log.debug("RestIterator.next():: CollectorPlugin.next()-Exception: " + e);
183
                    throw new RuntimeException(e);
184
                }
185
            }
186
            return recordQueue.poll();
187
        }
188
    }
189

  
190
    /*
191
     * download page and return nextQuery
192
     */
193
    private String downloadPage(String query) throws CollectorServiceException {
194
        String resultJson;
195
        String resultXml = XML_HEADER;
196
        String nextQuery = "";
197
        Node resultNode = null;
198
        NodeList nodeList = null;
199
        String qUrlArgument = "";
200
        int urlOldResumptionSize = 0;
201
        InputStream theHttpInputStream;
202

  
203
        // modifying request URL
204
        // check if cursor=* is initial set otherwise add it to the queryParam URL
205
        if (resumptionType.equalsIgnoreCase("deep-cursor")) {
206
            log.debug("RestIterator.downloadPage():: check resumptionType deep-cursor and check cursor=*?" + query);
207
            if (!query.contains("&cursor=")) {
208
                query += "&cursor=*";
209
            }
210
        }
211
        // find pagination page start number in queryParam and remove before start the first query
212
        if((resumptionType.toLowerCase().equals("pagination")) && (query.contains("paginationStart="))) {
213

  
214
            final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
215
            m.find(); // guaranteed to be true for this regex
216

  
217
            String[] pageVal = m.group(0).split("=");
218
            pagination = Integer.parseInt(pageVal[1]);
219

  
220
            // remove page start number from queryParams
221
            query = query.replaceFirst("&?paginationStart=[0-9]+", "");
222

  
223
        }
224
        
225

  
226
        try {
227

  
228
            URL qUrl = new URL(query);
229
            log.debug("authMethod :" + authMethod);
230
            if (this.authMethod == "bearer") {
231
                log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
232
                requestHeaders.put("Authorization", "Bearer " + authToken);
233
                //requestHeaders.put("Content-Type", "application/json");
234
            } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
235
                log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
236
                requestHeaders.put("Authorization", "Basic " + authToken);
237
                //requestHeaders.put("accept", "application/xml");
238
            }
239

  
240
            HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
241
            conn.setRequestMethod("GET");
242
            this.setRequestHeader(conn);
243
            resultStream = conn.getInputStream();
244

  
245
            if ("json".equals(resultOutputFormat)) {
246
                resultJson = IOUtils.toString(resultStream, "UTF-8");
247
                resultXml = jsonUtils.convertToXML(resultJson);
248
                resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
249
            }
250

  
251
            if (!isEmptyXml(resultXml)) {
252
                resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
253
                nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
254
                log.debug("RestIterator.downloadPage():: nodeList.length=" + nodeList.getLength());
255
                for (int i = 0; i < nodeList.getLength(); i++) {
256
                    StringWriter sw = new StringWriter();
257
                    transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
258
                    String toEnqueue = sw.toString();
259
                    if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
260
                        log.warn("RestIterator.downloadPage():: The following record resulted in empty item for the feeding queue: " + resultXml);
261
                    } else {
262
                        recordQueue.add(sw.toString());
263
                    }
264
                }
265
            } else {
266
                log.warn("resultXml is equal with emptyXml");
267
            }
268

  
269
            resumptionInt += resultSizeValue;
270

  
271
            switch (resumptionType.toLowerCase()) {
272
                case "scan":    // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
273
                    resumptionStr = xprResumptionPath.evaluate(resultNode);
274
                    break;
275

  
276
                case "count":   // begin at one step for all records, iterate over items
277
                    resumptionStr = Integer.toString(resumptionInt);
278
                    break;
279

  
280
                case "discover":   // size of result items unknown, iterate over items  (for openDOAR - 201808)
281
                    if (resultSizeValue < 2) {
282
                        log.debug("RestIterator.downloadPage().discover:: ode: discover, Param 'resultSizeValue' must greater then 1");
283
                        throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' must greater then 1");
284
                    }
285
                    log.debug("RestIterator.downloadPage().discover:: resumptionInt="+Integer.toString(resumptionInt)+"; ");
286
                    qUrlArgument = qUrl.getQuery();
287

  
288
                    if( qUrlArgument != null ) {
289
                    String[] arrayQUrlArgument = qUrlArgument.split("&");
290

  
291
                    // check if URL arguments given
292
                    if( arrayQUrlArgument != null ) {
293
                        for (String arrayUrlArgStr : arrayQUrlArgument) {
294
                            log.debug("RestIterator.downloadPage/discover:: "+arrayUrlArgStr);
295
                            if (arrayUrlArgStr.startsWith(resumptionParam)) {
296
                                String[] resumptionKeyValue = arrayUrlArgStr.split("=");
297
                                if (isInteger(resumptionKeyValue[1])) {
298
                                    urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
299
                                    log.debug("RestIterator.downloadPage():discover:: OldResumptionSize from Url (int): " + urlOldResumptionSize);
300
                                } else {
301
                                    log.debug("RestIterator.downloadPage().discover:: OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
302
                                }
303
                            }
304
                        }
305
                    }
306
                    }
307
                    log.debug("RestIterator.downloadPage().discover:: nodeList.length=" + nodeList.getLength());
308

  
309
                    if (isEmptyXml(resultXml) || ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
310
                    ) {
311
                        // resumptionStr = "";
312
                        if (nodeList != null) {
313
                            discoverResultSize += nodeList.getLength();
314
                        }
315
                        resultTotal = discoverResultSize;
316
                    } else {
317
                        resumptionStr = Integer.toString(resumptionInt);
318
                        resultTotal = resumptionInt + 1;
319
                        if (nodeList != null) {
320
                            discoverResultSize += nodeList.getLength();
321
                        }
322
                    }
323
                    log.debug("RestIterator.downloadPage().discover:: discoverResultSize=" + discoverResultSize);
324
                    break;
325

  
326
                case "pagination":
327
                case "page":         // pagination, iterate over page numbers
328
                    // find start page number
329
                    pagination += 1;
330
                    if (nodeList != null) {
331
                        discoverResultSize += nodeList.getLength();
332
                    } else {
333
                        resultTotal = discoverResultSize;
334
                        pagination = discoverResultSize;
335
                    }
336
                    resumptionInt = pagination;
337
                    resumptionStr = Integer.toString(resumptionInt);
338

  
339
                    log.debug("RestIterator.downloadPage().pagination:: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams + " ; resultTotal: " + resultTotal + " ; discoverResultSize: " + discoverResultSize);
340

  
341
                    break;
342

  
343
                case "deep-cursor":   // size of result items unknown, iterate over items  (for supporting deep cursor in solr)
344
                    // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: deep-cursor, Param 'resultSizeValue' is less than 2");}
345

  
346
                    resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
347
                    queryParams = queryParams.replace("&cursor=*", "");
348

  
349
                    // terminating if length of nodeList is 0
350
                    if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
351
                        resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
352
                    } else {
353
                        resumptionInt += (nodeList.getLength() - resultSizeValue);    // subtract the resultSizeValue because the iteration is over real length and the resultSizeValue is added before the switch()
354
                    }
355

  
356
                    discoverResultSize = nodeList.getLength();
357

  
358
                    log.debug("RestIterator.downloadPage().deep-cursor:: resumptionStr=" + resumptionStr + " ; queryParams=" + queryParams + " resumptionLengthIncreased: " + resumptionInt);
359

  
360
                    break;
361

  
362
                default:        // otherwise: abort
363
                    // resultTotal = resumptionInt;
364
                    break;
365
            }
366

  
367
        } catch (Exception e) {
368
            log.error(e);
369
            throw new IllegalStateException("collection failed: " + e.getMessage());
370
        }
371

  
372
        try {
373
            String resultTotalXpathEval = xprResultTotalPath.evaluate(resultNode);
374
 
375
            log.debug("downloadPage():: resInt: " +resumptionInt + "; resultTotal: " + resultTotal + " ; resultTotalXpath eval.: " + resultTotalXpathEval );
376
            if ((resultTotal == -1) && (!resultTotalXpathEval.isEmpty())) {
377
                resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
378
                if (resumptionType.toLowerCase().equals("page") && !AUTHBASIC.equalsIgnoreCase(authMethod)) {
379
                    resultTotal += 1;
380
                }           // to correct the upper bound
381
                log.info("resultTotal was -1 is now: " + resultTotal);
382
            }
383
        } catch (Exception e) {
384
            log.error(e);
385
            throw new IllegalStateException("downloadPage() resultTotal couldn't parse: " + e.getMessage());
386
        }
387
        log.debug("resultTotal: " + resultTotal + " ; resInt: " + resumptionInt);
388
        if (resumptionInt <= resultTotal) {
389
            nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
390
        } else {
391
            nextQuery = "";
392
            // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; }    	// correct the resumptionInt and prevent a NullPointer Exception at mdStore
393
        }
394
        log.debug("downloadPage() nextQueryUrl: " + nextQuery);
395
        return nextQuery;
396

  
397

  
398
    }
399

  
400
    private boolean isEmptyXml(String s){
401
        return EMPTY_XML.equalsIgnoreCase(s);
402
    }
403

  
404

  
405
    private boolean isInteger(String s) {
406
        boolean isValidInteger = false;
407
        try {
408
            Integer.parseInt(s);
409

  
410
            // s is a valid integer
411

  
412
            isValidInteger = true;
413
        } catch (NumberFormatException ex) {
414
            // s is not an integer
415
        }
416

  
417
        return isValidInteger;
418
    }
419

  
420
    // Method to encode a string value using `UTF-8` encoding scheme
421
    private String encodeValue(String value) {
422
        try {
423
            return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
424
        } catch (UnsupportedEncodingException ex) {
425
            throw new RuntimeException(ex.getCause());
426
        }
427
    }
428

  
429
    /**
430
     * setRequestHeader
431
     * 
432
     * setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
433
     * @param conn
434
     */
435
    private void setRequestHeader(HttpURLConnection conn) {
436
        if (requestHeaders != null) {
437
            for (String key : requestHeaders.keySet()) {
438
                conn.setRequestProperty(key, requestHeaders.get(key));
439
            }
440
            log.debug("Set Request Header with: " + requestHeaders);
441
        }
442

  
443
    }
444

  
445
    public String getResultFormatValue() {
446
        return resultFormatValue;
447
    }
448

  
449
    public String getResultOutputFormat() {
450
        return resultOutputFormat;
451
    }
452

  
453
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.6.0/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/DatasetMappingIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5
import org.json.JSONObject;
6

  
7
import java.net.URL;
8
import java.time.LocalDate;
9
import java.time.format.DateTimeFormatter;
10
import java.util.*;
11

  
12
public class DatasetMappingIterator implements Iterator<String> {
13
	// Logger bound to this class, so that its messages are not attributed to EndpointAccessIterator.
	private static final Log log = LogFactory.getLog(DatasetMappingIterator.class);
14

  
15
	public static class Options {
16
		public static class IdentifierOptions{
17
			public List<String> mappingARK;
18
			public List<String> mappingDOI;
19
			public List<String> mappingHandle;
20
			public List<String> mappingPURL;
21
			public List<String> mappingURN;
22
			public List<String> mappingURL;
23
			public DatasetDocument.Identifier.IdentifierType fallbackType;
24
			public Boolean fallbackURL;
25
		}
26

  
27
		public static class ContributorOptions{
28
			public DatasetDocument.Contributor.ContributorType fallbackType;
29
		}
30

  
31
		public static class PublicationDateOptions{
32
			public String format;
33
		}
34

  
35
		public static class CreatedDateOptions{
36
			public String format;
37
		}
38

  
39
		public static class UpdatedDateOptions{
40
			public String format;
41
		}
42

  
43
		private IdentifierOptions identifierOptions;
44
		private PublicationDateOptions publicationDateOptions;
45
		private ContributorOptions contributorOptions;
46
		private CreatedDateOptions createdDateOptions;
47
		private UpdatedDateOptions updatedDateOptions;
48

  
49
		public UpdatedDateOptions getUpdatedDateOptions() {
50
			return updatedDateOptions;
51
		}
52

  
53
		public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
54
			this.updatedDateOptions = updatedDateOptions;
55
		}
56

  
57
		public CreatedDateOptions getCreatedDateOptions() {
58
			return createdDateOptions;
59
		}
60

  
61
		public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
62
			this.createdDateOptions = createdDateOptions;
63
		}
64

  
65
		public ContributorOptions getContributorOptions() {
66
			return contributorOptions;
67
		}
68

  
69
		public void setContributorOptions(ContributorOptions contributorOptions) {
70
			this.contributorOptions = contributorOptions;
71
		}
72

  
73
		public PublicationDateOptions getPublicationDateOptions() {
74
			return publicationDateOptions;
75
		}
76

  
77
		public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
78
			this.publicationDateOptions = publicationDateOptions;
79
		}
80

  
81
		public IdentifierOptions getIdentifierOptions() {
82
			return identifierOptions;
83
		}
84

  
85
		public void setIdentifierOptions(IdentifierOptions identifierOptions) {
86
			this.identifierOptions = identifierOptions;
87
		}
88
	}
89

  
90
	/** Source of the JSON documents to map. */
	private EndpointAccessIterator endpointAccessIterator;
	/** Mapping configuration. */
	private Options options;

	/**
	 * Creates a mapping iterator on top of an endpoint iterator.
	 *
	 * @param options                mapping configuration
	 * @param endpointAccessIterator provider of the JSON documents; it drives {@link #hasNext()} and {@link #next()}
	 */
	public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
		this.endpointAccessIterator = endpointAccessIterator;
		this.options = options;
	}
97

  
98
	@Override
99
	public boolean hasNext() {
100
		return this.endpointAccessIterator.hasNext();
101
	}
102

  
103
	@Override
104
	public String next() {
105
		JSONObject document = this.endpointAccessIterator.next();
106
		String xml = null;
107
		if (document == null) {
108
			log.debug("no document provided to process. returning empty");
109
			xml = DatasetDocument.emptyXml();
110
		}
111
		else {
112
			log.debug("building document");
113
			xml = this.buildDataset(document);
114
			if (!Utils.validateXml(xml)) {
115
				log.debug("xml not valid. setting to empty");
116
				xml = null;
117
			}
118
			if (xml == null) {
119
				log.debug("could not build xml. returning empty");
120
				xml = DatasetDocument.emptyXml();
121
			}
122
		}
123

  
124
		//if all else fails
125
		if(xml == null){
126
			log.debug("could not build xml. returning empty");
127
			xml = "<dataset/>";
128
		}
129

  
130
		log.debug("xml document for dataset is: "+xml);
131

  
132
		return xml;
133
	}
134

  
135
	private String buildDataset(JSONObject document){
136
		String xml = null;
137
		try{
138
			DatasetDocument dataset = new DatasetDocument();
139

  
140
			dataset.setIdentifiers(this.extractIdentifier(document));
141
			dataset.setCreators(this.extractCreator(document));
142
			dataset.setTitles(this.extractTitles(document));
143
			dataset.setAlternativeTitles(this.extractAlternateTitles(document));
144
			dataset.setPublishers(this.extractPublisher(document));
145
			dataset.setPublicationDates(this.extractPublicationDate(document));
146
			dataset.setSubjects(this.extractSubjects(document));
147
			dataset.setContributors(this.extractContributors(document));
148
			dataset.setCreatedDates(this.extractCreatedDate(document));
149
			dataset.setUpdatedDates(this.extractUpdatedDate(document));
150
			dataset.setLanguages(this.extractLanguages(document));
151
			dataset.setResourceTypes(this.extractResourceTypes(document));
152
			dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
153
			dataset.setCitations(this.extractCitations(document));
154
			dataset.setSizes(this.extractSize(document));
155
			dataset.setFormat(this.extractEncodingFormat(document));
156
			dataset.setVersion(this.extractVersion(document));
157
			dataset.setLicenses(this.extractLicense(document));
158
			dataset.setDescriptions(this.extractDescription(document));
159
			dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
160
			dataset.setGeoLocations(this.extractSpatialCoverage(document));
161

  
162
			log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
163

  
164
			if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
165
					this.options.getIdentifierOptions().fallbackURL){
166
				log.debug("falling back to url identifier");
167
				dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
168
				log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
169
			}
170

  
171
			xml = dataset.toXml();
172
		}
173
		catch(Exception ex){
174
			log.error("problem constructing dataset xml. returning empty", ex);
175
			xml = null;
176
		}
177
		return xml;
178
	}
179

  
180
	private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
181
		List<String> urls = JSONLDUtils.extractString(document, "url");
182

  
183
		ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
184
		for(String item : urls){
185
			if(item == null || item.trim().length() == 0) continue;
186
			curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL,  item.trim()));
187
		}
188
		return curated;
189
	}
190

  
191
	private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
192
		List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
193

  
194
		ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
195
		for(JSONLDUtils.PlaceInfo item : spatials){
196
			if((item.name == null || item.name.trim().length() == 0) &&
197
					(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
198
					(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
199

  
200
			List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
201
			List<String> boxes = new ArrayList<>();
202
			if(item.geoCoordinates!=null) {
203
				for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
204
					points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
205
				}
206
			}
207
			if(item.geoShapes!=null) {
208
				for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
209
					boxes.add(iter.box);
210
				}
211
			}
212
			curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
213
		}
214
		return curated;
215
	}
216

  
217
	private List<String> extractDescription(JSONObject document){
218
		List<String> descriptions = JSONLDUtils.extractString(document, "description");
219

  
220
		ArrayList<String> curated = new ArrayList<>();
221
		for(String item : descriptions){
222
			if(item == null || item.trim().length() == 0) continue;
223
			curated.add(item);
224
		}
225
		return curated;
226
	}
227

  
228
	private List<String> extractDisambiguatingDescription(JSONObject document){
229
		List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
230

  
231
		ArrayList<String> curated = new ArrayList<>();
232
		for(String item : descriptions){
233
			if(item == null || item.trim().length() == 0) continue;
234
			curated.add(item);
235
		}
236
		return curated;
237
	}
238

  
239
	private List<DatasetDocument.License> extractLicense(JSONObject document){
240
		List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
241

  
242
		ArrayList<DatasetDocument.License> curated = new ArrayList<>();
243
		for(JSONLDUtils.LicenseInfo item : licenses){
244
			if(item.url == null || item.url.trim().length() == 0) continue;
245
			curated.add(new DatasetDocument.License(item.name, item.url));
246
		}
247
		return curated;
248
	}
249

  
250
	private List<String> extractVersion(JSONObject document){
251
		List<String> versions = JSONLDUtils.extractString(document, "version");
252

  
253
		ArrayList<String> curated = new ArrayList<>();
254
		for(String item : versions){
255
			if(item == null || item.trim().length() == 0) continue;
256
			curated.add(item);
257
		}
258
		return curated;
259
	}
260

  
261
	private List<String> extractSize(JSONObject document) {
262
		List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
263

  
264
		HashSet<String> curated = new HashSet<>();
265
		for (String item : sizes) {
266
			if (item == null || item.trim().length() == 0) continue;
267
			curated.add(item);
268
		}
269
		return new ArrayList<>(curated);
270
	}
271

  
272
	private List<String> extractEncodingFormat(JSONObject document){
273
		List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
274

  
275
		HashSet<String> curated = new HashSet<>();
276
		for(String item : formats){
277
			if(item == null || item.trim().length() == 0) continue;
278
			curated.add(item);
279
		}
280
		return new ArrayList<>(curated);
281
	}
282

  
283
	//TODO: Handle different citation types. Currently only urls
284
	private List<DatasetDocument.Citation> extractCitations(JSONObject document){
285
		List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
286

  
287
		ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
288
		for(JSONLDUtils.CitationInfo item : citations){
289
			if(item.url == null || item.url.trim().length() == 0) continue;
290
			try{
291
				new URL(item.url);
292
			}catch (Exception ex){
293
				continue;
294
			}
295
			curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
296
		}
297
		return curated;
298
	}
299

  
300
	private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
301
		List<String> issns = JSONLDUtils.extractString(document, "issn");
302
		List<String> urls = JSONLDUtils.extractString(document, "url");
303

  
304
		ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
305
		for(String item : issns){
306
			if(item == null || item.trim().length() == 0) continue;
307
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
308
		}
309
		for(String item : urls){
310
			if(item == null || item.trim().length() == 0) continue;
311
			curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
312
		}
313
		return curated;
314
	}
315

  
316
	private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
317
		List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
318
		resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
319
		return resourceTypes;
320
	}
321

  
322
	private List<String> extractLanguages(JSONObject document){
323
		List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
324

  
325
		ArrayList<String> curated = new ArrayList<>();
326
		for(String item : languages){
327
			if(item == null || item.trim().length() == 0) continue;
328
			curated.add(item);
329
		}
330
		return curated;
331
	}
332

  
333
	private List<LocalDate> extractUpdatedDate(JSONObject document){
334
		List<LocalDate> updatedDates = new ArrayList<>();
335
		if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
336

  
337
		DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
338

  
339
		List<String> dates = JSONLDUtils.extractString(document, "dateModified");
340
		for(String updatedDate : dates){
341
			if(updatedDate == null || updatedDate.trim().length() == 0) continue;
342
			try {
343
				LocalDate localDate = LocalDate.parse(updatedDate, formatter);
344
				updatedDates.add(localDate);
345
			} catch (Exception e) {
346
				continue;
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff