Project

General

Profile

« Previous | Next » 

Revision 63299

Added by Michele Artini 3 months ago

new implementation of gtr2 plugin

View differences:

modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/excel/ReadExcelPlugin.java
1
package eu.dnetlib.data.collector.plugins.excel;
2

  
3

  
4
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
5
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
6
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
7
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.springframework.beans.factory.annotation.Autowired;
11
import org.springframework.beans.factory.annotation.Required;
12

  
13
/**
14
 * Created by miriam on 10/05/2017.
15
 */
16
public class ReadExcelPlugin extends AbstractCollectorPlugin{
17

  
18
	private static final Log log = LogFactory.getLog(ReadExcelPlugin.class);
19
	@Autowired
20
	HttpCSVCollectorPlugin httpCSVCollectorPlugin;
21

  
22

  
23

  
24
	@Override
25
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
26
			throws CollectorServiceException {
27
		Read r = new Read(interfaceDescriptor);
28
		r.setCollector(httpCSVCollectorPlugin);
29

  
30
		try {
31
			return r.parseFile();
32
		}catch(Exception e){
33
			log.error("Error importing excel file");
34
			throw new CollectorServiceException(e);
35
		}
36

  
37

  
38
	}
39
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/datasources/Re3DataCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.datasources;
2

  
3
import java.io.IOException;
4

  
5
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
6
import eu.dnetlib.data.collector.plugins.HttpConnector;
7
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
8
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
9
import org.apache.commons.io.IOUtils;
10
import org.springframework.beans.factory.annotation.Autowired;
11

  
12
/**
13
 * Plugin to collect metadata record about data repositories from re3data.
14
 * <p>
15
 * Documentation on re3data API: http://service.re3data.org/api/doc.
16
 * </p>
17
 * <p>
18
 * BaseURL: http://service.re3data.org
19
 * </p>
20
 * <p>
21
 * API to get the list of repos: baseURL + /api/v1/repositories
22
 * </p>
23
 * <p>
24
 * API to get a repository: baseURL + content of link/@href of the above list
25
 * </p>
26
 *
27
 * @author alessia
28
 *
29
 */
30
public class Re3DataCollectorPlugin extends AbstractCollectorPlugin {
31

  
32
	private String repositoryListPath = "/api/v1/repositories";
33

  
34
	@Autowired
35
	private HttpConnector httpConnector;
36

  
37
	@Override
38
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
39
			throws CollectorServiceException {
40
		String repositoryListURL = interfaceDescriptor.getBaseUrl() + repositoryListPath;
41
		String input;
42
		try {
43
			input = httpConnector.getInputSource(repositoryListURL);
44
			return new Re3DataRepositoriesIterator(IOUtils.toInputStream(input, "UTF-8"), interfaceDescriptor.getBaseUrl(), getHttpConnector());
45
		} catch (IOException e) {
46
			throw new CollectorServiceException(e);
47
		}
48

  
49
	}
50

  
51
	public String getRepositoryListPath() {
52
		return repositoryListPath;
53
	}
54

  
55
	public void setRepositoryListPath(final String repositoryListPath) {
56
		this.repositoryListPath = repositoryListPath;
57
	}
58

  
59
	public HttpConnector getHttpConnector() {
60
		return httpConnector;
61
	}
62

  
63
	public void setHttpConnector(final HttpConnector httpConnector) {
64
		this.httpConnector = httpConnector;
65
	}
66
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgMainReactome.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
4
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8
import org.apache.log4j.ConsoleAppender;
9
import org.apache.log4j.Level;
10
import org.apache.log4j.Logger;
11
import org.apache.log4j.PatternLayout;
12

  
13
import java.io.File;
14
import java.nio.charset.StandardCharsets;
15
import java.util.HashMap;
16
import java.util.concurrent.TimeUnit;
17

  
18
public class SchemaOrgMainReactome {
19

  
20
    private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);
21

  
22
    public static void main(String[] args) throws Exception {
23

  
24
        ConsoleAppender console = new ConsoleAppender();
25
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
26
        console.setThreshold(Level.DEBUG);
27
        console.activateOptions();
28
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
29

  
30
        HashMap<String,String> params = new HashMap<>();
31
        params.put("consumerBlockPolling", Boolean.toString(true));
32
        params.put("consumerBlockPollingTimeout", "2");
33
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
34
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
35
        params.put("updatedDateFormat", "YYYY-MM-DD");
36
        params.put("createdDateFormat", "YYYY-MM-DD");
37
        params.put("publicationDateFormat", "YYYY-MM-DD");
38
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
39
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
40
        params.put("identifierFallbackURL", Boolean.toString(true));
41
        params.put("identifierMappingARK", "ark, ARK");
42
        params.put("identifierMappingDOI", "doi, DOI");
43
        params.put("identifierMappingHandle", "Handle, HANDLE");
44
        params.put("identifierMappingPURL", "purl, PURL");
45
        params.put("identifierMappingURN", "urn, URN");
46
        params.put("identifierMappingURL", "url, URL");
47

  
48
        params.put("repositoryAccessType", "sitemapindex");
49
        params.put("sitemap_queueSize", "100");
50
        params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
51
        params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
52
        params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
53
        params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
54
        params.put("sitemap_producerBlockPollingTimeout", "2");
55
        params.put("sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
56

  
57
        InterfaceDescriptor descriptor = new InterfaceDescriptor();
58
        descriptor.setId("schema.org - reactome");
59
        descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
60

  
61
        descriptor.setParams(params);
62

  
63
        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
64

  
65
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
66

  
67
        String outDir = params.get("repositoryAccessType");
68

  
69
        log.info("saving content in " + outDir);
70

  
71
        File directory = new File(outDir);
72
        if (directory.exists()) {
73
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
74
            FileUtils.deleteDirectory(directory);
75
        }
76
        FileUtils.forceMkdir(directory);
77
        Utils.writeFiles(iterable, outDir);
78
    }
79

  
80
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/kaggle/KaggleRepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
4
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
5
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable;
6
import org.apache.commons.io.IOUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.json.JSONArray;
10
import org.json.JSONObject;
11

  
12
import java.net.URL;
13
import java.nio.charset.Charset;
14
import java.util.Iterator;
15
import java.util.NoSuchElementException;
16
import java.util.concurrent.ArrayBlockingQueue;
17
import java.util.concurrent.ExecutorService;
18
import java.util.concurrent.Executors;
19
import java.util.concurrent.TimeUnit;
20

  
21
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable {
22
	private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class);
23

  
24
	public static class Options {
25
		private String queryUrl;
26
		private String queryPagePlaceholder;
27
		private Charset charset;
28
		private String responsePropertyTotalDataset;
29
		private String responsePropertyDatasetList;
30
		private String responsePropertyDatasetUrl;
31
		private String responseBaseDatasetUrl;
32
		private long putTimeout;
33
		private TimeUnit putTimeoutUnit;
34

  
35
		private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
36

  
37
		private int queueSize;
38

  
39
		public long getPutTimeout() {
40
			return putTimeout;
41
		}
42

  
43
		public void setPutTimeout(long putTimeout) {
44
			this.putTimeout = putTimeout;
45
		}
46

  
47
		public TimeUnit getPutTimeoutUnit() {
48
			return putTimeoutUnit;
49
		}
50

  
51
		public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
52
			this.putTimeoutUnit = putTimeoutUnit;
53
		}
54

  
55
		public int getQueueSize() {
56
			return queueSize;
57
		}
58

  
59
		public void setQueueSize(int queueSize) {
60
			this.queueSize = queueSize;
61
		}
62

  
63
		public String getResponseBaseDatasetUrl() {
64
			return responseBaseDatasetUrl;
65
		}
66

  
67
		public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) {
68
			this.responseBaseDatasetUrl = responseBaseDatasetUrl;
69
		}
70

  
71
		public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
72
			return repositoryQueueIteratorOptions;
73
		}
74

  
75
		public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
76
			this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
77
		}
78

  
79
		public String getResponsePropertyDatasetUrl() {
80
			return responsePropertyDatasetUrl;
81
		}
82

  
83
		public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) {
84
			this.responsePropertyDatasetUrl = responsePropertyDatasetUrl;
85
		}
86

  
87
		public String getResponsePropertyDatasetList() {
88
			return responsePropertyDatasetList;
89
		}
90

  
91
		public void setResponsePropertyDatasetList(String responsePropertyDatasetList) {
92
			this.responsePropertyDatasetList = responsePropertyDatasetList;
93
		}
94

  
95
		public String getResponsePropertyTotalDataset() {
96
			return responsePropertyTotalDataset;
97
		}
98

  
99
		public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) {
100
			this.responsePropertyTotalDataset = responsePropertyTotalDataset;
101
		}
102

  
103
		public Charset getCharset() {
104
			return charset;
105
		}
106

  
107
		public void setCharset(Charset charset) {
108
			this.charset = charset;
109
		}
110

  
111
		public String getQueryPagePlaceholder() {
112
			return queryPagePlaceholder;
113
		}
114

  
115
		public void setQueryPagePlaceholder(String queryPagePlaceholder) {
116
			this.queryPagePlaceholder = queryPagePlaceholder;
117
		}
118

  
119
		public String getQueryUrl() {
120
			return queryUrl;
121
		}
122

  
123
		public void setQueryUrl(String queryUrl) {
124
			this.queryUrl = queryUrl;
125
		}
126
	}
127

  
128
	private Options options;
129
	private ArrayBlockingQueue<String> queue;
130

  
131
	public KaggleRepositoryIterable(Options options) {
132
		this.options = options;
133
//		this.currentPage = 1;
134
//		this.terminated = false;
135
	}
136

  
137
	public void bootstrap() {
138
		this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
139

  
140
		Thread ft = new Thread(new Harvester() );
141
		ft.start();
142
//		ExecutorService executor = Executors.newSingleThreadExecutor();
143
//		executor.execute(new Harvester());
144
//		executor.shutdown();
145
	}
146

  
147
	@Override
148
	public Iterator<String> iterator() {
149
		return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
150
	}
151

  
152
	private class Harvester implements Runnable{
153

  
154
		@Override
155
		public void run() {
156
			this.execute();
157
		}
158
		private void execute() {
159
			try {
160
				int currentPage = 1;
161
				int totalDatasets = 0;
162
				int readDatasets = 0;
163
				while (true) {
164
					String query = options.getQueryUrl().replace(options.getQueryPagePlaceholder(), Integer.toString(currentPage));
165
					String response = IOUtils.toString(new URL(query), options.getCharset());
166
					currentPage += 1;
167

  
168
					JSONObject pageObject = new JSONObject(response);
169
					totalDatasets = pageObject.optInt(options.getResponsePropertyTotalDataset());
170
					JSONArray datasets = pageObject.optJSONArray(options.getResponsePropertyDatasetList());
171

  
172
					if (datasets == null || datasets.length() == 0) break;
173

  
174
					readDatasets += datasets.length();
175

  
176
					for (int i = 0; i < datasets.length(); i += 1) {
177
						JSONObject item = datasets.optJSONObject(i);
178
						String urlFragment = item.optString(options.getResponsePropertyDatasetUrl());
179
						if (urlFragment == null || urlFragment.trim().length() == 0) continue;
180
						String endpoint = String.format("%s%s", options.getResponseBaseDatasetUrl(), urlFragment);
181

  
182
						log.debug("adding endpoint in queue");
183
						log.debug("queue size: " + queue.size());
184

  
185
						try {
186
							queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
187
						} catch (InterruptedException ex) {
188
							log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
189
							break;
190
						}
191
						log.debug("endpoint added in queue");
192
						log.debug("queue size: " + queue.size());
193
					}
194

  
195
					if (readDatasets >= totalDatasets) break;
196
				}
197
			} catch (Exception ex) {
198
				log.error("problem execution harvesting", ex);
199
			} finally {
200
				try {
201
					queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
202
				} catch (Exception ex) {
203
					log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
204
				}
205
			}
206
		}
207
	}
208
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapIndexRepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
4
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7

  
8
import java.net.URL;
9
import java.util.Iterator;
10
import java.util.concurrent.ArrayBlockingQueue;
11
import java.util.concurrent.ExecutorService;
12
import java.util.concurrent.Executors;
13
import java.util.concurrent.TimeUnit;
14

  
15
public class SitemapIndexRepositoryIterable implements RepositoryIterable {
16
	private static final Log log = LogFactory.getLog(SitemapIndexRepositoryIterable.class);
17

  
18
	public static class Options {
19
		private SitemapIndexIterator.Options sitemapIndexIteratorOptions;
20
		private SitemapFileIterator.Options sitemapFileIteratorOptions;
21
		private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
22
		private long putTimeout;
23
		private TimeUnit putTimeoutUnit;
24

  
25
		private int queueSize;
26

  
27
		public long getPutTimeout() {
28
			return putTimeout;
29
		}
30

  
31
		public void setPutTimeout(long putTimeout) {
32
			this.putTimeout = putTimeout;
33
		}
34

  
35
		public TimeUnit getPutTimeoutUnit() {
36
			return putTimeoutUnit;
37
		}
38

  
39
		public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
40
			this.putTimeoutUnit = putTimeoutUnit;
41
		}
42

  
43
		public int getQueueSize() {
44
			return queueSize;
45
		}
46

  
47
		public void setQueueSize(int queueSize) {
48
			this.queueSize = queueSize;
49
		}
50

  
51
		public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
52
			return repositoryQueueIteratorOptions;
53
		}
54

  
55
		public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
56
			this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
57
		}
58

  
59
		public SitemapIndexIterator.Options getSitemapIndexIteratorOptions() {
60
			return sitemapIndexIteratorOptions;
61
		}
62

  
63
		public void setSitemapIndexIteratorOptions(SitemapIndexIterator.Options sitemapIndexIteratorOptions) {
64
			this.sitemapIndexIteratorOptions = sitemapIndexIteratorOptions;
65
		}
66

  
67
		public SitemapFileIterator.Options getSitemapFileIteratorOptions() {
68
			return sitemapFileIteratorOptions;
69
		}
70

  
71
		public void setSitemapFileIteratorOptions(SitemapFileIterator.Options sitemapFileIteratorOptions) {
72
			this.sitemapFileIteratorOptions = sitemapFileIteratorOptions;
73
		}
74
	}
75

  
76
	private Options options;
77
	private ArrayBlockingQueue<String> queue;
78

  
79
	public SitemapIndexRepositoryIterable(Options options) {
80
		this.options = options;
81
	}
82

  
83
	public void bootstrap() {
84
		this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
85

  
86
		Thread ft = new Thread(new Harvester() );
87
		ft.start();
88
//		ExecutorService executor = Executors.newSingleThreadExecutor();
89
//		executor.execute(new Harvester());
90
//		executor.shutdown();
91
	}
92

  
93
	@Override
94
	public Iterator<String> iterator() {
95
		return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
96
	}
97

  
98
	private class Harvester implements Runnable{
99

  
100
		@Override
101
		public void run() {
102
			this.execute();
103
		}
104

  
105
		private void execute(){
106
			try {
107
				SitemapIndexIterator sitemapIndexIterator = new SitemapIndexIterator(options.getSitemapIndexIteratorOptions());
108
				sitemapIndexIterator.bootstrap();
109

  
110
				while (sitemapIndexIterator.hasNext()) {
111
					String sitemapFile = sitemapIndexIterator.next();
112
					if(sitemapFile == null) continue;
113

  
114
					SitemapFileIterator.Options sitemapFileIteratorOptions = (SitemapFileIterator.Options)options.getSitemapFileIteratorOptions().clone();
115
					sitemapFileIteratorOptions.setFileUrl(new URL(sitemapFile));
116
					SitemapFileIterator sitemapFileIterator = new SitemapFileIterator(sitemapFileIteratorOptions);
117
					sitemapFileIterator.bootstrap();
118

  
119
					while(sitemapFileIterator.hasNext()){
120
						String endpoint = sitemapFileIterator.next();
121
						if(endpoint == null) continue;;
122

  
123
						log.debug("adding endpoint in queue");
124
						log.debug("queue size: " + queue.size());
125
						try {
126
							queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
127
						} catch (InterruptedException ex) {
128
							log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
129
							break;
130
						}
131
						log.debug("endpoint added in queue");
132
						log.debug("queue size: " + queue.size());
133
					}
134
				}
135
			}catch(Exception ex){
136
				log.error("problem execution harvesting", ex);
137
			}
138
			finally {
139
				try {
140
					queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
141
				} catch (Exception ex) {
142
					log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
143
				}
144
			}
145
		}
146
	}
147
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/Utils.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.io.IOUtils;
4
import org.apache.commons.lang3.StringUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.dom4j.DocumentException;
8
import org.dom4j.io.SAXReader;
9
import org.w3c.dom.Document;
10
import org.w3c.dom.NodeList;
11
import org.xml.sax.InputSource;
12

  
13
import javax.xml.parsers.DocumentBuilder;
14
import javax.xml.parsers.DocumentBuilderFactory;
15
import javax.xml.xpath.XPath;
16
import javax.xml.xpath.XPathConstants;
17
import javax.xml.xpath.XPathExpression;
18
import javax.xml.xpath.XPathFactory;
19
import java.io.*;
20
import java.net.URL;
21
import java.nio.charset.Charset;
22
import java.nio.charset.UnsupportedCharsetException;
23
import java.util.ArrayList;
24
import java.util.EnumSet;
25
import java.util.HashMap;
26
import java.util.List;
27
import java.util.zip.GZIPInputStream;
28

  
29
public class Utils {
30
	private static final Log log = LogFactory.getLog(Utils.class);
31

  
32
	public static List<String> collectAsStrings(String xml, String xpath) throws Exception{
33
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
34
		DocumentBuilder builder = factory.newDocumentBuilder();
35
		Document doc = builder.parse(new InputSource(new StringReader(xml)));
36
		return Utils.collectAsStrings(doc, xpath);
37
	}
38

  
39
	public static List<String> collectAsStrings(File file, String xpath) throws Exception{
40
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
41
		DocumentBuilder builder = factory.newDocumentBuilder();
42
		Document doc = builder.parse(file);
43
		return Utils.collectAsStrings(doc, xpath);
44
	}
45

  
46
	public static List<String> collectAsStrings(Document doc, String xpath) throws Exception{
47
		XPathFactory xPathfactory = XPathFactory.newInstance();
48
		XPath path = xPathfactory.newXPath();
49
		XPathExpression expr = path.compile(xpath);
50
		NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
51

  
52
		List<String> values = new ArrayList<>();
53

  
54
		for (int i = 0; i < nodes.getLength(); i++)
55
			values.add(nodes.item(i).getNodeValue());
56

  
57
		return values;
58
	}
59

  
60
	public static void decompressGZipTo(File input, File output) throws Exception {
61
		try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(input))){
62
			try (FileOutputStream out = new FileOutputStream(output)){
63
				byte[] buffer = new byte[1024];
64
				int len;
65
				while((len = in.read(buffer)) != -1){
66
					out.write(buffer, 0, len);
67
				}
68
			}
69
		}
70
	}
71

  
72
	public static String getAsString(HashMap<String,String> map, String key, String defaultValue)
73
	{
74
		String value = map.get(key);
75
		if(value == null) return defaultValue;
76
		return value;
77
	}
78

  
79
	public static List<String> getAsStringCsv(HashMap<String,String> map, String key, List<String> defaultValue)
80
	{
81
		String value = map.get(key);
82
		if(value == null) return defaultValue;
83
		String[] splits = value.split(",");
84
		List<String> curated = new ArrayList<>();
85
		for(String item : splits){
86
			if(item == null || item.trim().length() == 0) continue;
87
			curated.add(item.trim());
88
		}
89
		return curated;
90
	}
91

  
92
	public static int getAsInt(HashMap<String,String> map, String key, int defaultValue)
93
	{
94
		String value = map.get(key);
95
		if(value == null) return defaultValue;
96
		try {
97
			return Integer.parseInt(value);
98
		} catch (NumberFormatException e) {
99
			return defaultValue;
100
		}
101
	}
102

  
103
	public static long getAsLong(HashMap<String,String> map, String key, long defaultValue)
104
	{
105
		String value = map.get(key);
106
		if(value == null) return defaultValue;
107
		try {
108
			return Long.parseLong(value);
109
		} catch (NumberFormatException e) {
110
			return defaultValue;
111
		}
112
	}
113

  
114
	public static <E extends Enum<E>> E getAsEnum(HashMap<String,String> map, String key, E defaultValue, Class<E> clazz) {
115
		//EnumSet<E> values = EnumSet.allOf(defaultValue.getClass());
116
		EnumSet<E> values = EnumSet.allOf(clazz);
117
		String value = map.get(key);
118
		if (value == null) return defaultValue;
119
		for(E val : values){
120
			if(!val.name().equalsIgnoreCase(value)) continue;
121
			return val;
122
		}
123
		return defaultValue;
124
	}
125

  
126
	public static Boolean getAsBoolean(HashMap<String,String> map, String key, Boolean defaultValue) {
127
		String value = map.get(key);
128
		if (value == null) return defaultValue;
129
		return Boolean.parseBoolean(value);
130
	}
131

  
132
	public static Charset getAsCharset(HashMap<String,String> map, String key, Charset defaultValue)
133
	{
134
		String value = map.get(key);
135
		if(value == null) return defaultValue;
136
		try {
137
			return Charset.forName(value);
138
		} catch (UnsupportedCharsetException e) {
139
			return defaultValue;
140
		}
141
	}
142

  
143

  
144
	public static String RemoteAccessWithRetry(int retryCount, long waitBetweenRetriesMillis, URL endpoint, Charset charset) throws IOException {
145
		int retry =0;
146
		while(retry < retryCount) {
147
			try {
148
				return IOUtils.toString(endpoint, charset);
149
			} catch (Exception ex) {
150
				retry += 1;
151
				if (retry < retryCount) {
152
					log.debug("problem accessing url " + endpoint + ". will retry after " + waitBetweenRetriesMillis + " milliseconds");
153
					try {
154
						Thread.sleep(waitBetweenRetriesMillis);
155
					} catch (Exception e) {
156
					}
157
				}
158
				else{
159
					log.debug("problem accessing url " + endpoint + ". throwing");
160
					throw  ex;
161
				}
162
			}
163
		}
164
		return null;
165
	}
166

  
167
	public static Boolean validateXml(String xml){
168
		try {
169
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
170
			DocumentBuilder builder = factory.newDocumentBuilder();
171
			InputSource is = new InputSource(new StringReader(xml));
172
			builder.parse(is);
173
			return true;
174
		}catch(Exception ex){
175
			return false;
176
		}
177
	}
178

  
179
	public static void writeFiles(final Iterable<String> iterable, final String outDir) throws DocumentException, IOException {
180

  
181
		int skipped = 0;
182
		int count = 0;
183

  
184
		for(String item : iterable) {
185

  
186
			final org.dom4j.Document doc = new SAXReader().read(new StringReader(item));
187

  
188
			if (StringUtils.isNotBlank(doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"))) {
189
				log.info(item);
190
				String fileName = outDir + "/" + count++;
191

  
192
				try(BufferedWriter w = new BufferedWriter(new FileWriter(fileName))) {
193
					w.write(item);
194
				}
195
				log.info("wrote " + fileName);
196
			} else {
197
				skipped++;
198
			}
199
			if (skipped % 100 == 0) {
200
				log.info("skipped so far " + skipped);
201
			}
202
			if (count % 100 == 0) {
203
				log.info("stored so far " + count);
204
			}
205
		}
206
		log.info(String.format("Done! skipped %s, stored %s", skipped, count));
207
	}
208
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapIndexIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
4
import org.apache.commons.io.IOUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7

  
8
import java.net.URL;
9
import java.nio.charset.Charset;
10
import java.util.*;
11

  
12
public class SitemapIndexIterator implements Iterator<String> {
13
	private static final Log log = LogFactory.getLog(SitemapIndexIterator.class);
14

  
15
	public static class Options {
16
		private URL indexUrl;
17
		private Charset charset;
18

  
19
		public Options(){}
20

  
21
		public Options(URL indexUrl, Charset charset){
22
			this.indexUrl = indexUrl;
23
			this.charset = charset;
24
		}
25

  
26
		public URL getIndexUrl() {
27
			return indexUrl;
28
		}
29

  
30
		public void setIndexUrl(URL indexUrl) {
31
			this.indexUrl = indexUrl;
32
		}
33

  
34
		public Charset getCharset() {
35
			return charset;
36
		}
37

  
38
		public void setCharset(Charset charset) {
39
			this.charset = charset;
40
		}
41
	}
42

  
43
	private Options options;
44
	private Queue<String> sitemapFiles;
45

  
46
	public SitemapIndexIterator(Options options) {
47
		this.options = options;
48
	}
49

  
50
	public void bootstrap() {
51
		List<String> files = null;
52
		try {
53
			log.debug("bootstrapping sitemapindex index access");
54
			String sitemapIndexPayload = Utils.RemoteAccessWithRetry(3, 5000, this.options.getIndexUrl(), this.options.getCharset());
55
			log.debug(String.format("sitemapindex payload is: %s", sitemapIndexPayload));
56
			files = Utils.collectAsStrings(sitemapIndexPayload, "/sitemapindex/sitemap/loc/text()");
57
			log.debug(String.format("extracted %d sitemapindex files", files.size()));
58
		}catch(Exception ex){
59
			log.error("problem bootstrapping sitemapindex index access. returning 0 files", ex);
60
			files = new ArrayList<>();
61
		}
62
		this.sitemapFiles = new PriorityQueue<String>(files);
63
	}
64

  
65
	@Override
66
	public boolean hasNext() {
67
		return !this.sitemapFiles.isEmpty();
68
	}
69

  
70
	@Override
71
	public String next() {
72
		return this.sitemapFiles.poll();
73
	}
74
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/EndpointAccessIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.io.IOUtils;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.json.JSONObject;
7
import org.jsoup.Jsoup;
8
import org.jsoup.nodes.Document;
9
import org.jsoup.nodes.Element;
10
import org.jsoup.select.Elements;
11

  
12
import java.net.URL;
13
import java.nio.charset.Charset;
14
import java.util.Iterator;
15

  
16
public class EndpointAccessIterator implements Iterator<JSONObject> {
17
	private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
18

  
19
	public static class Options {
20

  
21
		private Charset charset;
22

  
23
		public Options(){}
24

  
25
		public Options(Charset charset) {
26
			this.charset = charset;
27
		}
28

  
29
		public Charset getCharset() {
30
			return charset;
31
		}
32

  
33
		public void setCharset(Charset charset) {
34
			this.charset = charset;
35
		}
36
	}
37

  
38
	private Options options;
39
	private Iterator<String> repositoryIterator;
40

  
41
	public EndpointAccessIterator(Options options, Iterator<String> repositoryIterator) {
42
		this.options = options;
43
		this.repositoryIterator = repositoryIterator;
44
	}
45

  
46
	@Override
47
	public boolean hasNext() {
48
		return this.repositoryIterator.hasNext();
49
	}
50

  
51
	@Override
52
	public JSONObject next() {
53
		String endpoint = this.repositoryIterator.next();
54
		if(endpoint == null) return null;
55

  
56
		log.debug(String.format("processing: %s", endpoint));
57

  
58
		JSONObject dataset = this.extractDatasetRecord(endpoint);
59

  
60
		return dataset;
61
	}
62

  
63
	private JSONObject extractDatasetRecord(String endpoint) {
64
		JSONObject datasetDocument = null;
65
		try {
66
			URL urlEndpoint = new URL(endpoint);
67
			log.debug("downloading endpoint "+urlEndpoint);
68
			String payload = Utils.RemoteAccessWithRetry(3, 5000, urlEndpoint, this.options.getCharset());
69

  
70
			log.trace("downloaded payload id: "+payload);
71
			Document doc = Jsoup.parse(payload);
72
			Elements scriptTags = doc.getElementsByTag("script");
73
			for (Element scriptTag : scriptTags) {
74
				if (!scriptTag.hasAttr("type")) continue;
75
				String scriptType = scriptTag.attr("type");
76
				if (!scriptType.equalsIgnoreCase("application/ld+json")) continue;
77

  
78
				String data = scriptTag.data();
79
				JSONObject schemaItem = new JSONObject(data);
80
				String context = schemaItem.optString("@context");
81
				String type = schemaItem.optString("@type");
82

  
83
				if (context == null || type == null) continue;
84

  
85
				Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org");
86
				Boolean isDataset = type.equalsIgnoreCase("dataset");
87

  
88
				if (!isSchemaOrgContext || !isDataset) continue;
89

  
90
				log.debug(String.format("discovered dataset document: %s", schemaItem.toString()));
91

  
92
				datasetDocument = schemaItem;
93
				break;
94
			}
95
		}catch(Exception ex){
96
			log.error("problem extracting dataset document. returning empty", ex);
97
			datasetDocument = null;
98
		}
99
		if(datasetDocument == null){
100
			log.debug("did not find any dataset document in endpoint");
101
		}
102
		else{
103
			log.debug("found dataset document in endpoint :"+datasetDocument.toString());
104
		}
105
		return datasetDocument;
106
	}
107
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import java.util.Iterator;
4

  
5
public interface RepositoryIterable extends Iterable<String> {
6
	public static String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a";
7
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorIterable.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import java.util.*;
4
import java.util.concurrent.ArrayBlockingQueue;
5
import java.util.concurrent.TimeUnit;
6

  
7
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.json.JSONObject;
11
import org.json.XML;
12
import org.jsoup.Jsoup;
13
import org.jsoup.nodes.Document;
14
import org.jsoup.nodes.Element;
15
import org.jsoup.select.Elements;
16

  
17
/**
18
 * Created by miriam on 04/05/2018.
19
 */
20
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
21

  
22
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
23

  
24
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
25
    public static final String APP_JSON = "application/json";
26
    public static final String APP_XML = "application/xml";
27
    public static final String TEXT_HTML = "text/html";
28
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
29

  
30

  
31

  
32

  
33
    private String filterParam;
34

  
35
    int total = 0;
36
    int filtered = 0;
37

  
38
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
39

  
40
        this.filterParam = filter;
41
        Thread ft = new Thread(new FillMetaQueue(startUrl) );
42
        ft.start();
43
    }
44

  
45

  
46
    @Override
47
    public Iterator<String> iterator() {
48
        return new HttpWithFileNameCollectorIterator(queue);
49
    }
50

  
51
    private class FillMetaQueue implements Runnable {
52
        final Connector c = new Connector();
53

  
54
        private final List<String> metas = Collections.synchronizedList(new ArrayList<String>());
55
        private final List<String> urls = Collections.synchronizedList(new ArrayList<>());
56

  
57
        public FillMetaQueue(String startUrl){
58
            if(!startUrl.isEmpty()){
59
                urls.add(startUrl);
60
            }
61
        }
62

  
63

  
64
        public void fillQueue() {
65
            String url;
66

  
67
            while((metas.size()>0 || urls.size() > 0 )) {
68
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
69
                if (metas.size() > 0) {
70
                    url = metas.remove(0);
71
                    try {
72
                        c.get(url);
73
                    } catch (CollectorServiceException e) {
74
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
75
                    }
76
                    if(c.isStatusOk()){
77
                        try {
78
                            String ret = c.getResponse();
79
                            if (ret != null && ret.length()>0) {
80
                                if (!containsFilter(ret))
81
                                    queue.put(addFilePath(ret, url, url.endsWith(".json")));
82
                                    //queue.offer(addFilePath(ret, url, url.endsWith(".json")), HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
83
                                else
84
                                    filtered++;
85
                                total++;
86
                            }
87
                        } catch (InterruptedException e) {
88
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
89

  
90
                        }
91
                    }
92
                } else {
93
                    url = urls.remove(0);
94
                    try {
95
                        c.get(url);
96
                    } catch (CollectorServiceException e) {
97
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
98
                    }
99
                    if(c.isStatusOk()) {
100
                        if (c.responseTypeContains(TEXT_HTML)){
101
                            recurFolder(c.getResponse(), url);
102
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
103
                            try {
104
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
105
                                //queue.offer(element, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
106
                                queue.put(element);
107
                            } catch (InterruptedException e) {
108
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
109
                            }
110
                        }
111
                    }
112
                }
113

  
114
            }
115
            try {
116
                //queue.offer(HttpWithFileNameCollectorIterator.TERMINATOR, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
117
                queue.put(HttpWithFileNameCollectorIterator.TERMINATOR);
118
            } catch (InterruptedException e) {
119
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e);
120
            }
121

  
122
        }
123

  
124
        private boolean containsFilter(String meta){
125
            if (filterParam == null || filterParam.isEmpty())
126
                return false;
127
            String[] filter = filterParam.split(";");
128
            for(String item:filter){
129
                if (meta.contains(item))
130
                    return true;
131
            }
132
            return false;
133
        }
134

  
135
        private String addFilePath(String meta, String url, boolean isJson){
136
            String path = url.replace("metadata", "pdf");
137

  
138
            try {
139
                if(isJson)
140
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
141
                else {
142

  
143
                    if (meta.contains("<!DOCTYPE")) {
144
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
145
                        meta = meta.substring(meta.indexOf(">") + 1);
146
                    }
147
                    int index = meta.lastIndexOf("</");
148
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
149
                }
150
            } catch(Exception ex) {
151
                log.info("not file with extension .json or .xml");
152
            }
153

  
154

  
155
            if(isJson) {
156
                try {
157
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
158
                } catch(Exception e) {
159
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
160
                   // throw new RuntimeException();
161
                    final String junk = String.format(JUNK, url);
162
                    log.warn("returning " + junk);
163
                    return junk;
164
                }
165
            }
166
            return meta;
167
        }
168

  
169
        private void recurFolder(String text, String url){
170
            Document doc = Jsoup.parse(text);
171
            Elements links = doc.select("a");
172
            for(Element e:links){
173
                if (!e.text().equals("../")){
174
                    String file = e.attr("href");
175
                    if(file.endsWith(".json") || file.endsWith(".xml"))
176
                        metas.add(url+file);
177
                    else
178
                        urls.add(url+file);
179
                }
180
            }
181
        }
182

  
183

  
184
        @Override
185
        public void run() {
186
            fillQueue();
187
        }
188
    }
189

  
190
}
modules/dnet-collector-plugins/branches/gtr2_michele/pom.xml
1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2
	<modelVersion>4.0.0</modelVersion>
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet45-parent</artifactId>
6
		<version>1.0.0</version>
7
	</parent>
8
	<groupId>eu.dnetlib</groupId>
9
	<artifactId>dnet-collector-plugins</artifactId>
10
	<version>1.7.14-SNAPSHOT</version>
11
	<scm>
12
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/trunk</developerConnection>
13
	</scm>
14

  
15
	<build>
16
		<plugins>
17
			<plugin>
18
				<artifactId>maven-assembly-plugin</artifactId>
19
				<configuration>
20
					<archive>
21
						<manifest>
22
							<mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass>
23
						</manifest>
24
					</archive>
25
					<descriptorRefs>
26
						<descriptorRef>jar-with-dependencies</descriptorRef>
27
					</descriptorRefs>
28
				</configuration>
29
			</plugin>
30
		</plugins>
31
	</build>
32

  
33
	<dependencies>
34
		<dependency>
35
			<groupId>eu.dnetlib</groupId>
36
			<artifactId>dnet-modular-collector-service-rmi</artifactId>
37
			<version>[1.3.0,2.0.0)</version>
38
		</dependency>
39
		<dependency>
40
			<groupId>eu.dnetlib</groupId>
41
			<artifactId>dnet-modular-collector-service</artifactId>
42
			<version>[3.3.26,4.0.0)</version>
43
		</dependency>
44
		<dependency>
45
			<groupId>com.google.code.gson</groupId>
46
			<artifactId>gson</artifactId>
47
			<version>${google.gson.version}</version>
48
		</dependency>
49
		<dependency>
50
			<groupId>commons-io</groupId>
51
			<artifactId>commons-io</artifactId>
52
			<version>${commons.io.version}</version>
53
		</dependency>
54
		<dependency>
55
			<groupId>junit</groupId>
56
			<artifactId>junit</artifactId>
57
			<version>${junit.version}</version>
58
			<scope>test</scope>
59
		</dependency>
60
		<dependency>
61
			<groupId>org.apache.httpcomponents</groupId>
62
			<artifactId>httpclient</artifactId>
63
			<version>4.5</version>
64
		</dependency>
65
		<dependency>
66
			<groupId>eu.dnetlib</groupId>
67
			<artifactId>cnr-resultset-service</artifactId>
68
			<version>[2.0.0, 3.0.0)</version>
69
			<scope>provided</scope>
70
		</dependency>
71
		<dependency>
72
			<groupId>com.ximpleware</groupId>
73
			<artifactId>vtd-xml</artifactId>
74
			<version>[2.12, 3.0.0)</version>
75
		</dependency>
76
		<dependency>
77
			<groupId>joda-time</groupId>
78
			<artifactId>joda-time</artifactId>
79
			<version>2.9.2</version>
80
		</dependency>
81

  
82
		<dependency>
83
			<groupId>org.json</groupId>
84
			<artifactId>json</artifactId>
85
			<version>20180813</version>
86
		 <type>jar</type>
87
		</dependency>
88
		<dependency>
89
			<groupId>org.apache.commons</groupId>
90
			<artifactId>commons-lang3</artifactId>
91
			<version>3.5</version>
92
		</dependency>
93

  
94
		<dependency>
95
			<groupId>org.apache.poi</groupId>
96
			<artifactId>poi</artifactId>
97
			<version>3.16</version>
98
		</dependency>
99
		<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
100
		<dependency>
101
			<groupId>org.apache.poi</groupId>
102
			<artifactId>poi-ooxml</artifactId>
103
			<version>3.16</version>
104
		</dependency>
105
		<dependency>
106
			<groupId>org.jsoup</groupId>
107
			<artifactId>jsoup</artifactId>
108
			<version>1.11.2</version>
109
		</dependency>
110
		<dependency>
111
			<groupId>commons-lang</groupId>
112
			<artifactId>commons-lang</artifactId>
113
			<version>2.6</version>
114
			<scope>compile</scope>
115
		</dependency>
116
        <dependency>
117
            <groupId>org.mockito</groupId>
118
            <artifactId>mockito-core</artifactId>
119
            <version>3.3.3</version>
120
            <scope>test</scope>
121
        </dependency>
122
    </dependencies>
123
</project>
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/Connector.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import eu.dnetlib.data.collector.plugins.HttpConnector;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5

  
6

  
7
/**
8
 * Created by miriam on 07/05/2018.
9
 */
10
public class Connector extends HttpConnector implements ConnectorInterface  {
11
    private String response;
12

  
13
    @Override
14
    public void get(final String requestUrl) throws CollectorServiceException {
15
        response = getInputSource(requestUrl);
16
    }
17

  
18
    @Override
19
    public String getResponse() {
20
        return response;
21
    }
22

  
23
    @Override
24
    public boolean isStatusOk() {
25
        return (response != null);
26
    }
27

  
28
    @Override
29
    public boolean responseTypeContains(String string) {
30
        String responseType = getResponseType();
31
        if (responseType != null)
32
            return responseType.contains(string);
33
        return false;
34
    }
35

  
36

  
37
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/datasets/ElasticSearchResponse.java
1
package eu.dnetlib.data.collector.plugins.datasets;
2

  
3
import java.util.ArrayList;
4
import java.util.List;
5

  
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8

  
9
import com.google.gson.JsonArray;
10
import com.google.gson.JsonElement;
11
import com.google.gson.JsonObject;
12
import com.google.gson.JsonParser;
13

  
14
public class ElasticSearchResponse {
15

  
16
	/** The logger. */
17
	private static final Log log = LogFactory.getLog(ElasticSearchResponse.class);
18
	private long total;
19
	private List<String> xmlRecords;
20

  
21
	public static ElasticSearchResponse createNewResponse(final String response) {
22
		ElasticSearchResponse item = new ElasticSearchResponse();
23

  
24
		if (response == null) {
25
			log.fatal("Error: null elasticsearch reponse");
26
			return null;
27

  
28
		}
29
		JsonElement jElement = new JsonParser().parse(response);
30
		JsonObject jobject = jElement.getAsJsonObject();
31
		if (jobject.has("hits")) {
32

  
33
			item.setTotal(jobject.get("hits").getAsJsonObject().get("total").getAsLong());
34

  
35
			JsonElement hits = ((JsonObject) jobject.get("hits")).get("hits");
36

  
37
			JsonArray hitsObject = hits.getAsJsonArray();
38

  
39
			List<String> records = new ArrayList<String>();
40

  
41
			for (JsonElement elem : hitsObject) {
42
				JsonObject _source = (JsonObject) ((JsonObject) elem).get("_source");
43
				String xml = _source.get("xml").getAsString();
44
				records.add(xml);
45
			}
46
			item.setXmlRecords(records);
47
			return item;
48
		}
49
		return null;
50
	}
51

  
52
	/**
53
	 * @return the xmlRecords
54
	 */
55
	public List<String> getXmlRecords() {
56
		return xmlRecords;
57
	}
58

  
59
	/**
60
	 * @param xmlRecords
61
	 *            the xmlRecords to set
62
	 */
63
	public void setXmlRecords(final List<String> xmlRecords) {
64
		this.xmlRecords = xmlRecords;
65
	}
66

  
67
	/**
68
	 * @return the total
69
	 */
70
	public long getTotal() {
71
		return total;
72
	}
73

  
74
	/**
75
	 * @param total
76
	 *            the total to set
77
	 */
78
	public void setTotal(final long total) {
79
		this.total = total;
80
	}
81

  
82
}
modules/dnet-collector-plugins/branches/gtr2_michele/src/main/java/eu/dnetlib/data/collector/plugins/datasets/DatasetsIterator.java
1
package eu.dnetlib.data.collector.plugins.datasets;
2

  
3
import java.io.IOException;
4
import java.io.InputStream;
5
import java.util.Iterator;
6

  
7
import org.apache.commons.io.IOUtils;
8
import org.apache.commons.lang3.StringEscapeUtils;
9
import org.apache.commons.logging.Log;
10
import org.apache.commons.logging.LogFactory;
11
import org.apache.http.client.methods.CloseableHttpResponse;
12
import org.apache.http.client.methods.HttpPost;
13
import org.apache.http.entity.StringEntity;
14
import org.apache.http.impl.client.CloseableHttpClient;
15
import org.apache.http.impl.client.HttpClients;
16

  
17
import com.google.gson.Gson;
18
import com.google.gson.GsonBuilder;
19

  
20
/**
21
 * The Class JournalIterator.
22
 */
23
public class DatasetsIterator implements Iterable<String>, Iterator<String> {
24

  
25
	/** The logger. */
26
	private static final Log log = LogFactory.getLog(DatasetsIterator.class);
27

  
28
	/** The base url template. */
29
	private static String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d";
30

  
31
	/** The journal id. */
32
	private String journalId = "";
33

  
34
	/** The journal name. */
35
	private String journalName = "";
36

  
37
	/** The journal issn. */
38
	private String journalISSN = "";
39

  
40
	/** The openaire datasource. */
41
	private String openaireDatasource = "";
42

  
43
	/** The total. */
44
	private long total;
45

  
46
	/** The from. */
47
	private int from;
48

  
49
	/** The current iterator. */
50
	private int currentIterator;
51

  
52
	/** The current response. */
53
	private ElasticSearchResponse currentResponse;
54

  
55
	/** The request. */
56
	private RequestField request;
57

  
58
	/** The default size. */
59
	private static int DEFAULT_SIZE = 10;
60

  
61
	private String projectCordaId;
62

  
63
	private static String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>"
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff