Project

General

Profile

« Previous | Next » 

Revision 63319

Added by Michele Artini 2 months ago

[maven-release-plugin] copy for tag dnet-collector-plugins-1.8.1

View differences:

modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/HttpApiRepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
4

  
5
/**
 * Repository iterable type of the schema.org {@code httpapi} package.
 * <p>
 * The interface declares no members of its own; it only extends {@link RepositoryIterable}.
 * </p>
 */
public interface HttpApiRepositoryIterable extends RepositoryIterable {
6
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/excel/Read.java
1
package eu.dnetlib.data.collector.plugins.excel;
2

  
3
/**
4
 * Created by miriam on 10/05/2017.
5
 */
6
import java.io.File;
7
import java.io.FileInputStream;
8
import java.io.IOException;
9
import java.net.URL;
10
import java.util.ArrayList;
11
import java.util.HashMap;
12
import java.util.Iterator;
13

  
14
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
15
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
16
import org.apache.commons.lang3.StringUtils;
17
import org.apache.commons.logging.Log;
18
import org.apache.commons.logging.LogFactory;
19
import org.apache.poi.ss.usermodel.Cell;
20
import org.apache.poi.ss.usermodel.DataFormatter;
21
import org.apache.poi.ss.usermodel.Row;
22
import org.apache.poi.ss.usermodel.Sheet;
23
import org.apache.poi.ss.usermodel.Workbook;
24
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
25
import org.json.*;
26

  
27
import org.apache.commons.io.FileUtils;
28

  
29
public class Read {
30

  
31
	private static final Log log = LogFactory.getLog(Read.class);
32

  
33
	/** The descriptor. */
34
	private InterfaceDescriptor descriptor;
35

  
36

  
37
	/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
38
	private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
39
	 private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
40
	  "\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
41
			   + "\"col_currency\":10}"; */
42
	private Sheet sheet;
43
	private CSVFileWriter csv_writer = new CSVFileWriter();
44
	private HashMap<String,String> map_header = new HashMap<String,String>();
45
	private HashMap<String,String> map_body = new HashMap<String,String>();
46
	private int header_row;
47
	private String file_to_save ;
48
	private boolean replace_currency = false;
49
	private String from_currency, to_currency;
50
	private boolean remove_empty, remove_tmp_file;
51
	private String remove_id;
52
	private int column_id;
53
	private int currency_column;
54
	private int sheet_number;
55
	private String tmp_file;
56
	private String argument;
57
	private String identifier;
58

  
59
	private HttpCSVCollectorPlugin collector;
60

  
61
	public HttpCSVCollectorPlugin getCollector() {
62
		return collector;
63
	}
64

  
65
	public void setCollector(HttpCSVCollectorPlugin collector) {
66
		this.collector = collector;
67
	}
68

  
69
	public Read(InterfaceDescriptor descriptor){
70
		this.descriptor = descriptor;
71

  
72
	}
73

  
74
	private static String getCellValue( Cell cell)
75
	{
76
		DataFormatter formatter = new DataFormatter();
77
		String formattedCellValue = formatter.formatCellValue(cell);
78
		return formattedCellValue;
79

  
80
	}
81

  
82
	private void copyFile() throws IOException{
83
		FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
84

  
85
	}
86

  
87
	private void parseDescriptor(){
88
		HashMap<String, String> params = descriptor.getParams();
89
		argument = params.get("argument");
90
		header_row = Integer.parseInt(params.get("header_row"));
91
		tmp_file = params.get("tmp_file");
92
		remove_empty = (params.get("remove_empty_lines") == "yes");
93
		remove_id = params.get("remove_lines_with_id");
94
		column_id = Integer.parseInt(params.get("col_id"));
95
		remove_tmp_file = (params.get("remove_tmp_file") == "yes");
96
		sheet_number = Integer.parseInt(params.get("sheet_number"));
97
		file_to_save = params.get("file_to_save");
98
	}
99
	private void init() throws IOException{
100
		parseDescriptor();
101
		log.info("Parsing the arguments");
102
		parseArguments();
103
		log.info("Copying the file in temp local file");
104
		copyFile();
105
		log.info("Extracting the sheet " + sheet_number);
106
		FileInputStream fis = new FileInputStream(tmp_file);
107
		Workbook workbook = new XSSFWorkbook(fis);
108
		sheet = workbook.getSheetAt(sheet_number);
109
		fis.close();
110
		if(remove_tmp_file) {
111
			File f = new File(tmp_file);
112
			f.delete();
113
		}
114

  
115
	}
116

  
117
	private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
118
		try{
119
			final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
120
			for(Object entry: arr)
121
				map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
122
		}catch(Throwable e){
123
			log.error("Problems filling the map for " + elem);
124
			throw(e);
125
		}
126

  
127
	}
128

  
129

  
130

  
131
	private void parseArguments() {
132
		if (StringUtils.isNotEmpty(argument)){
133
			try{
134
				final JSONObject json = new JSONObject(argument);
135
				JSONObject tmp = json.getJSONObject("replace");
136
				if(tmp.has("header"))
137
					fillMap(json, map_header,"header");
138
				if(tmp.has("body"))
139
					fillMap(json,map_body,"body");
140
//				if(json.has("header"))
141
//					fillMap(json, map_header,"header");
142
//				if (json.has("body"))
143
//					fillMap(json,map_body,"body");
144

  
145
				if(json.has("replace_currency"))
146
				{
147
					replace_currency = true	;
148
					from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
149
					to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
150

  
151
				}
152

  
153
				if (json.has("col_currency"))
154
					currency_column = json.getInt("col_currency");
155
			}catch(Throwable e){
156
				log.error("Problems while parsing the argument parameter.");
157
				throw (e);
158
			}
159
		}
160

  
161

  
162

  
163
	}
164

  
165
	private String applyReplace(String row, HashMap<String,String>replace){
166
		for(String key: replace.keySet()){
167
			if(row.contains(key))
168
				row = row.replace(key, replace.get(key));
169
		}
170
		return row;
171
	}
172

  
173
	private void getHeader(){
174
		Row row = sheet.getRow(header_row);
175
		Iterator<Cell> cellIterator = row.cellIterator();
176
		Cell cell;
177
		String project = "";
178
		int count = 0;
179
		while (cellIterator.hasNext()){
180
			cell = cellIterator.next();
181
			final String stringCellValue = cell.getStringCellValue();
182
			project += applyReplace(stringCellValue,map_header) + ";";
183
			if(count++ == column_id) identifier = applyReplace(stringCellValue,map_header);
184
		}
185
		project = project.substring(0, project.length() -1 );
186
		csv_writer.setHeader(project.split(";"));
187

  
188
	}
189

  
190
	private void getData(){
191
		Row row;
192
		Cell cell;
193
		String tmp;
194
		Iterator<Cell>cellIterator;
195
		for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
196
			row = sheet.getRow(row_number);
197
			if (row != null) {
198
				cellIterator = row.cellIterator();
199

  
200
				int col_number = 0;
201

  
202
				boolean discard_row = false;
203
				ArrayList<String> al = new ArrayList<String>();
204
				while (cellIterator.hasNext() && !discard_row) {
205
					cell = cellIterator.next();
206
					tmp = getCellValue(cell).trim();
207
					tmp = tmp.replace("\n"," ");
208
					if (col_number == column_id &&
209
							((remove_empty && tmp.trim().equals("")) ||
210
									(!remove_id.equals("") && tmp.equals(remove_id))))
211
						discard_row = true;
212

  
213
					if (replace_currency && col_number == currency_column)
214
						tmp = tmp.replace(from_currency, to_currency);
215

  
216
					al.add(applyReplace(tmp, map_body));
217
					col_number++;
218
				}
219
				if (!discard_row) {
220
					csv_writer.addProject(al);
221

  
222
				}
223
			}
224
		}
225

  
226
	}
227

  
228
	private void writeCSVFile(){
229

  
230
		csv_writer.writeFile(file_to_save);
231
	}
232

  
233
	private InterfaceDescriptor prepareHTTPCSVDescriptor(){
234
		InterfaceDescriptor dex = new InterfaceDescriptor();
235
		dex.setBaseUrl("file://"+file_to_save);
236
		HashMap<String, String> params = new HashMap<String, String>();
237
		params.put("separator", descriptor.getParams().get("separator"));
238
		params.put("identifier",identifier);
239
		params.put("quote",descriptor.getParams().get("quote"));
240
		dex.setParams(params);
241
		return dex;
242
	}
243

  
244
	public Iterable<String> parseFile() throws Exception{
245

  
246

  
247
		init();
248
		log.info("Getting header elements");
249
		getHeader();
250
		log.info("Getting sheet data");
251
		getData();
252
		log.info("Writing the csv file");
253
		writeCSVFile();
254
		log.info("Preparing to parse csv");
255

  
256
		return collector.collect(prepareHTTPCSVDescriptor(),"","");
257

  
258
	}
259

  
260

  
261
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/excel/ReadExcelPlugin.java
1
package eu.dnetlib.data.collector.plugins.excel;
2

  
3

  
4
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
5
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
6
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
7
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.springframework.beans.factory.annotation.Autowired;
11
import org.springframework.beans.factory.annotation.Required;
12

  
13
/**
14
 * Created by miriam on 10/05/2017.
15
 */
16
public class ReadExcelPlugin extends AbstractCollectorPlugin{
17

  
18
	private static final Log log = LogFactory.getLog(ReadExcelPlugin.class);
19
	@Autowired
20
	HttpCSVCollectorPlugin httpCSVCollectorPlugin;
21

  
22

  
23

  
24
	@Override
25
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
26
			throws CollectorServiceException {
27
		Read r = new Read(interfaceDescriptor);
28
		r.setCollector(httpCSVCollectorPlugin);
29

  
30
		try {
31
			return r.parseFile();
32
		}catch(Exception e){
33
			log.error("Error importing excel file");
34
			throw new CollectorServiceException(e);
35
		}
36

  
37

  
38
	}
39
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/datasources/Re3DataCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.datasources;
2

  
3
import java.io.IOException;
4

  
5
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
6
import eu.dnetlib.data.collector.plugins.HttpConnector;
7
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
8
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
9
import org.apache.commons.io.IOUtils;
10
import org.springframework.beans.factory.annotation.Autowired;
11

  
12
/**
13
 * Plugin to collect metadata record about data repositories from re3data.
14
 * <p>
15
 * Documentation on re3data API: http://service.re3data.org/api/doc.
16
 * </p>
17
 * <p>
18
 * BaseURL: http://service.re3data.org
19
 * </p>
20
 * <p>
21
 * API to get the list of repos: baseURL + /api/v1/repositories
22
 * </p>
23
 * <p>
24
 * API to get a repository: baseURL + content of link/@href of the above list
25
 * </p>
26
 *
27
 * @author alessia
28
 *
29
 */
30
public class Re3DataCollectorPlugin extends AbstractCollectorPlugin {
31

  
32
	private String repositoryListPath = "/api/v1/repositories";
33

  
34
	@Autowired
35
	private HttpConnector httpConnector;
36

  
37
	@Override
38
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
39
			throws CollectorServiceException {
40
		String repositoryListURL = interfaceDescriptor.getBaseUrl() + repositoryListPath;
41
		String input;
42
		try {
43
			input = httpConnector.getInputSource(repositoryListURL);
44
			return new Re3DataRepositoriesIterator(IOUtils.toInputStream(input, "UTF-8"), interfaceDescriptor.getBaseUrl(), getHttpConnector());
45
		} catch (IOException e) {
46
			throw new CollectorServiceException(e);
47
		}
48

  
49
	}
50

  
51
	public String getRepositoryListPath() {
52
		return repositoryListPath;
53
	}
54

  
55
	public void setRepositoryListPath(final String repositoryListPath) {
56
		this.repositoryListPath = repositoryListPath;
57
	}
58

  
59
	public HttpConnector getHttpConnector() {
60
		return httpConnector;
61
	}
62

  
63
	public void setHttpConnector(final HttpConnector httpConnector) {
64
		this.httpConnector = httpConnector;
65
	}
66
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/sitemapindex/SitemapFileIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
2

  
3
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.io.IOUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9

  
10
import java.io.*;
11
import java.net.URL;
12
import java.nio.charset.Charset;
13
import java.util.*;
14

  
15
public class SitemapFileIterator implements Iterator<String> {
16
	private static final Log log = LogFactory.getLog(SitemapFileIterator.class);
17

  
18
	public static class Options {
19

  
20
		public enum SitemapFileType{
21
			Text,
22
			GZ
23
		}
24

  
25
		public enum SitemapSchemaType{
26
			Text,
27
			Xml
28
		}
29

  
30
		public Options(){}
31

  
32
		public Options(URL fileUrl, Charset charset, SitemapSchemaType schemaType, SitemapFileType fileType) {
33
			this.fileUrl = fileUrl;
34
			this.charset = charset;
35
			this.schemaType = schemaType;
36
			this.fileType = fileType;
37
		}
38

  
39
		private SitemapFileType fileType;
40
		private SitemapSchemaType schemaType;
41
		private URL fileUrl;
42
		private Charset charset;
43

  
44
		public Charset getCharset() {
45
			return charset;
46
		}
47

  
48
		public void setCharset(Charset charset) {
49
			this.charset = charset;
50
		}
51

  
52
		public URL getFileUrl() {
53
			return fileUrl;
54
		}
55

  
56
		public void setFileUrl(URL fileUrl) {
57
			this.fileUrl = fileUrl;
58
		}
59

  
60
		public SitemapFileType getFileType() {
61
			return fileType;
62
		}
63

  
64
		public void setFileType(SitemapFileType fileType) {
65
			this.fileType = fileType;
66
		}
67

  
68
		public SitemapSchemaType getSchemaType() {
69
			return schemaType;
70
		}
71

  
72
		public void setSchemaType(SitemapSchemaType schemaType) {
73
			this.schemaType = schemaType;
74
		}
75

  
76
		@Override
77
		public Object clone(){
78
			Options clone = new Options();
79
			clone.setCharset(this.getCharset());
80
			clone.setFileType(this.getFileType());
81
			clone.setFileUrl(this.getFileUrl());
82
			clone.setSchemaType(this.getSchemaType());
83
			return clone;
84
		}
85
	}
86

  
87
	private Options options;
88
	private File downloadedFile;
89
	private File contentFile;
90
	private Queue<String> locations;
91

  
92
	public SitemapFileIterator(Options options){
93
		this.options = options;
94
	}
95

  
96
	public void bootstrap() {
97
		LinkedList<String> endpoints = null;
98
		try {
99
			log.debug(String.format("bootstrapping sitemapindex file access for sitemapindex %s", this.options.getFileUrl()));
100
			this.downloadedFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
101
			this.downloadedFile.deleteOnExit();
102
			FileUtils.copyURLToFile(this.options.getFileUrl(), this.downloadedFile);
103
			log.debug(String.format("downloaded file: %s has size %d", this.downloadedFile.toString(), this.downloadedFile.length()));
104

  
105
			switch (this.options.getFileType()) {
106
				case Text: {
107
					this.contentFile = this.downloadedFile;
108
					break;
109
				}
110
				case GZ: {
111
					this.contentFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
112
					this.contentFile.deleteOnExit();
113
					Utils.decompressGZipTo(this.downloadedFile, this.contentFile);
114
					log.debug(String.format("extracted gz file: %s has size %d", this.contentFile.toString(), this.contentFile.length()));
115
					break;
116
				}
117
				default:
118
					throw new CollectorServiceException("unrecognized file type " + this.options.getFileType());
119
			}
120

  
121
			List<String> content = this.collectContentLocations();
122

  
123
			log.debug(String.format("extracted %d sitemapindex endpoints", content.size()));
124
			endpoints = new LinkedList<>(content);
125
		}catch(Exception ex){
126
			log.error(String.format("error processing sitemapindex %s. returning 0 endpoints",this.options.getFileUrl()), ex);
127
			endpoints = new LinkedList<>();
128
		}finally {
129
			if (this.contentFile != null) {
130
				this.contentFile.delete();
131
			}
132
			if (this.downloadedFile != null) {
133
				this.downloadedFile.delete();
134
			}
135
		}
136
		this.locations = endpoints;
137
	}
138

  
139
	private List<String> collectContentLocations() throws Exception{
140
		switch(this.options.getSchemaType()) {
141
			case Text:{
142
				return this.collectTextContentLocations();
143
			}
144
			case Xml:{
145
				return this.collectXmlContentLocations();
146
			}
147
			default: throw new CollectorServiceException("unrecognized file type "+this.options.getFileType());
148
		}
149
	}
150

  
151
	private List<String> collectTextContentLocations() throws Exception {
152
		log.debug(String.format("reading endpoint locations from text sitemapindex"));
153
		try (FileInputStream in = new FileInputStream(this.contentFile)) {
154
			return IOUtils.readLines(in, this.options.getCharset());
155
		}
156
	}
157

  
158
	private List<String> collectXmlContentLocations() throws Exception {
159
		log.debug(String.format("reading endpoint locations from xml sitemapindex"));
160
		return Utils.collectAsStrings(this.contentFile,"/urlset/url/loc/text()");
161
	}
162

  
163
	@Override
164
	public boolean hasNext() {
165
		return !this.locations.isEmpty();
166
	}
167

  
168
	@Override
169
	public String next() {
170
		return this.locations.poll();
171
	}
172
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryQueueIterator.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5

  
6
import java.util.Iterator;
7
import java.util.NoSuchElementException;
8
import java.util.concurrent.ArrayBlockingQueue;
9
import java.util.concurrent.TimeUnit;
10

  
11
public class RepositoryQueueIterator implements Iterator<String> {
12
	private static final Log log = LogFactory.getLog(RepositoryQueueIterator.class);
13

  
14
	public static class Options {
15
		private Boolean blockPolling;
16
		private long pollTimeout;
17
		private TimeUnit pollTimeoutUnit;
18

  
19
		public Boolean getBlockPolling() {
20
			return blockPolling;
21
		}
22

  
23
		public void setBlockPolling(Boolean blockPolling) {
24
			this.blockPolling = blockPolling;
25
		}
26

  
27
		public long getPollTimeout() {
28
			return pollTimeout;
29
		}
30

  
31
		public void setPollTimeout(long pollTimeout) {
32
			this.pollTimeout = pollTimeout;
33
		}
34

  
35
		public TimeUnit getPollTimeoutUnit() {
36
			return pollTimeoutUnit;
37
		}
38

  
39
		public void setPollTimeoutUnit(TimeUnit pollTimeoutUnit) {
40
			this.pollTimeoutUnit = pollTimeoutUnit;
41
		}
42
	}
43

  
44
	private ArrayBlockingQueue<String> queue;
45
	private Options options;
46
	private boolean hasTerminated;
47

  
48
	public RepositoryQueueIterator(Options options, ArrayBlockingQueue<String> queue) {
49
		this.options = options;
50
		this.queue = queue;
51
		this.hasTerminated = false;
52
	}
53

  
54
	@Override
55
	public boolean hasNext() {
56
		if(this.hasTerminated) return false;
57
		return true;
58
	}
59

  
60
	@Override
61
	public String next() {
62
		String next = this.poll();
63
		log.debug("next endpoint to process: " + next);
64
		if (next != null && next.equalsIgnoreCase(RepositoryIterable.TerminationHint)) {
65
			log.debug("no more endpoints to process");
66
			this.hasTerminated = true;
67
			next = null;
68
		}
69

  
70
		return next;
71
	}
72

  
73
	private String poll(){
74
		String item = null;
75
		log.debug("retrieving endpoint from queue");
76
		log.debug("queue size: " + queue.size());
77
		if(this.options.getBlockPolling()) {
78
			try {
79
				item = this.queue.poll(this.options.getPollTimeout(), this.options.getPollTimeoutUnit());
80
			} catch (InterruptedException ex) {
81
				log.warn(String.format("could not poll elements from queue for more than %s %s. throwing", this.options.getPollTimeout(), this.options.getPollTimeoutUnit()));
82
				throw new NoSuchElementException(ex.getMessage());
83
			}
84
		}
85
		else {
86
			item = this.queue.poll();
87
		}
88
		log.debug("retrieved endpoint from queue");
89
		log.debug("queue size: " + queue.size());
90
		return item;
91
	}
92
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/pom.xml
1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2
	<modelVersion>4.0.0</modelVersion>
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet45-parent</artifactId>
6
		<version>1.0.0</version>
7
	</parent>
8
	<groupId>eu.dnetlib</groupId>
9
	<artifactId>dnet-collector-plugins</artifactId>
10
	<version>1.8.1</version>
11
	<scm>
12
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1</developerConnection>
13
	</scm>
14

  
15
	<build>
16
		<plugins>
17
			<plugin>
18
				<artifactId>maven-assembly-plugin</artifactId>
19
				<configuration>
20
					<archive>
21
						<manifest>
22
							<mainClass>eu.dnetlib.data.collector.plugins.schemaorg.SchemaOrgMainReactome</mainClass>
23
						</manifest>
24
					</archive>
25
					<descriptorRefs>
26
						<descriptorRef>jar-with-dependencies</descriptorRef>
27
					</descriptorRefs>
28
				</configuration>
29
			</plugin>
30
		</plugins>
31
	</build>
32

  
33
	<dependencies>
34
		<dependency>
35
			<groupId>eu.dnetlib</groupId>
36
			<artifactId>dnet-modular-collector-service-rmi</artifactId>
37
			<version>[1.3.0,2.0.0)</version>
38
		</dependency>
39
		<dependency>
40
			<groupId>eu.dnetlib</groupId>
41
			<artifactId>dnet-modular-collector-service</artifactId>
42
			<version>[3.3.26,4.0.0)</version>
43
		</dependency>
44
		<dependency>
45
			<groupId>com.google.code.gson</groupId>
46
			<artifactId>gson</artifactId>
47
			<version>${google.gson.version}</version>
48
		</dependency>
49
		<dependency>
50
			<groupId>commons-io</groupId>
51
			<artifactId>commons-io</artifactId>
52
			<version>${commons.io.version}</version>
53
		</dependency>
54
		<dependency>
55
			<groupId>junit</groupId>
56
			<artifactId>junit</artifactId>
57
			<version>${junit.version}</version>
58
			<scope>test</scope>
59
		</dependency>
60
		<dependency>
61
			<groupId>org.apache.httpcomponents</groupId>
62
			<artifactId>httpclient</artifactId>
63
			<version>4.5</version>
64
		</dependency>
65
		<dependency>
66
			<groupId>eu.dnetlib</groupId>
67
			<artifactId>cnr-resultset-service</artifactId>
68
			<version>[2.0.0, 3.0.0)</version>
69
			<scope>provided</scope>
70
		</dependency>
71
		<dependency>
72
			<groupId>com.ximpleware</groupId>
73
			<artifactId>vtd-xml</artifactId>
74
			<version>[2.12, 3.0.0)</version>
75
		</dependency>
76
		<dependency>
77
			<groupId>joda-time</groupId>
78
			<artifactId>joda-time</artifactId>
79
			<version>2.9.2</version>
80
		</dependency>
81

  
82
		<dependency>
83
			<groupId>org.json</groupId>
84
			<artifactId>json</artifactId>
85
			<version>20180813</version>
86
		 <type>jar</type>
87
		</dependency>
88
		<dependency>
89
			<groupId>org.apache.commons</groupId>
90
			<artifactId>commons-lang3</artifactId>
91
			<version>3.5</version>
92
		</dependency>
93

  
94
		<dependency>
95
			<groupId>org.apache.poi</groupId>
96
			<artifactId>poi</artifactId>
97
			<version>3.16</version>
98
		</dependency>
99
		<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
100
		<dependency>
101
			<groupId>org.apache.poi</groupId>
102
			<artifactId>poi-ooxml</artifactId>
103
			<version>3.16</version>
104
		</dependency>
105
		<dependency>
106
			<groupId>org.jsoup</groupId>
107
			<artifactId>jsoup</artifactId>
108
			<version>1.11.2</version>
109
		</dependency>
110
		<dependency>
111
			<groupId>commons-lang</groupId>
112
			<artifactId>commons-lang</artifactId>
113
			<version>2.6</version>
114
			<scope>compile</scope>
115
		</dependency>
116
        <dependency>
117
            <groupId>org.mockito</groupId>
118
            <artifactId>mockito-core</artifactId>
119
            <version>3.3.3</version>
120
            <scope>test</scope>
121
        </dependency>
122
    </dependencies>
123
</project>
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/RepositoryIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import java.util.Iterator;
4

  
5
/**
 * An {@link Iterable} of strings that additionally exposes the termination marker shared by
 * the schema.org iterators.
 */
public interface RepositoryIterable extends Iterable<String> {

	/**
	 * Marker value signalling the end of the stream; {@code RepositoryQueueIterator} stops
	 * iterating when it reads this value from its queue.
	 */
	String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a";
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/gtr2/Gtr2Helper.java
1
package eu.dnetlib.data.collector.plugins.gtr2;
2

  
3
import org.apache.commons.lang.StringUtils;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.dom4j.Document;
7
import org.dom4j.DocumentHelper;
8
import org.joda.time.DateTime;
9
import org.joda.time.format.DateTimeFormat;
10
import org.joda.time.format.DateTimeFormatter;
11

  
12
import eu.dnetlib.data.collector.plugins.HttpConnector;
13
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
14

  
15
public class Gtr2Helper {
16

  
17
	private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM
18

  
19
	private static final HttpConnector connector = new HttpConnector();
20
	private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
21

  
22
	private static final int MAX_ATTEMPTS = 10;
23

  
24
	public static String cleanURL(final String url) {
25
		String cleaned = url;
26
		if (cleaned.contains("gtr.gtr")) {
27
			cleaned = cleaned.replace("gtr.gtr", "gtr");
28
		}
29
		if (cleaned.startsWith("http://")) {
30
			cleaned = cleaned.replaceFirst("http://", "https://");
31
		}
32
		return cleaned;
33
	}
34

  
35
	public static Document loadURL(final String url) {
36
		final String cleanUrl = cleanURL(url);
37
		return loadURL(cleanUrl, 0);
38
	}
39

  
40
	private static Document loadURL(final String cleanUrl, final int attempt) {
41
		try {
42
			log.debug("  * Downloading Url: " + cleanUrl);
43
			final byte[] bytes = connector.getInputSource(cleanUrl).getBytes("UTF-8");
44
			return DocumentHelper.parseText(new String(bytes));
45
		} catch (final Throwable e) {
46
			log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e);
47
			if (attempt < MAX_ATTEMPTS) {
48
				try {
49
					Thread.sleep(60000); // I wait for a minute
50
				} catch (final InterruptedException e1) {
51
					throw new CollectorServiceRuntimeException("Error dowloading url: " + cleanUrl, e);
52
				}
53
				return loadURL(cleanUrl, attempt + 1);
54
			} else {
55
				throw new CollectorServiceRuntimeException("Error dowloading url: " + cleanUrl, e);
56
			}
57
		}
58
	}
59

  
60
	public static DateTime parseDate(final String s) {
61
		return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
62
	}
63

  
64
	public static boolean isAfter(final String d, final DateTime fromDate) {
65
		return StringUtils.isNotBlank(d) && Gtr2Helper.parseDate(d).isAfter(fromDate);
66
	}
67
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/gtr2/AbstractGtr2CollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.gtr2;
2

  
3
import java.util.Iterator;
4

  
5
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
6
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
7
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
8

  
9
public abstract class AbstractGtr2CollectorPlugin extends AbstractCollectorPlugin {
10

  
11
	@Override
12
	public final Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
13
		throws CollectorServiceException {
14

  
15
		if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
16

  
17
		final String baseUrl = interfaceDescriptor.getBaseUrl();
18
		final String startPage = interfaceDescriptor.getParams().get("startPage");
19
		final String endPage = interfaceDescriptor.getParams().get("endPage");
20

  
21
		return () -> {
22
			try {
23
				return createIterator(baseUrl, fromDate, startPage, endPage);
24
			} catch (final CollectorServiceException e) {
25
				throw new RuntimeException(e);
26
			}
27
		};
28
	}
29

  
30
	protected abstract Iterator<String> createIterator(String baseUrl, final String fromDate, String startPage, String endPage)
31
		throws CollectorServiceException;
32

  
33
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/SchemaOrgIterable.java
1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

  
3
import org.apache.commons.logging.Log;
4
import org.apache.commons.logging.LogFactory;
5

  
6
import java.util.Iterator;
7
import java.util.concurrent.ArrayBlockingQueue;
8

  
9
public class SchemaOrgIterable implements Iterable<String> {
10
	private static final Log log = LogFactory.getLog(SchemaOrgIterable.class);
11

  
12
	public static class Options {
13
		private EndpointAccessIterator.Options endpointAccessOptions;
14
		private DatasetMappingIterator.Options datasetMappingOptions;
15

  
16
		public EndpointAccessIterator.Options getEndpointAccessOptions() {
17
			return endpointAccessOptions;
18
		}
19

  
20
		public void setEndpointAccessOptions(EndpointAccessIterator.Options endpointAccessOptions) {
21
			this.endpointAccessOptions = endpointAccessOptions;
22
		}
23

  
24
		public DatasetMappingIterator.Options getDatasetMappingOptions() {
25
			return datasetMappingOptions;
26
		}
27

  
28
		public void setDatasetMappingOptions(DatasetMappingIterator.Options datasetMappingOptions) {
29
			this.datasetMappingOptions = datasetMappingOptions;
30
		}
31
	}
32

  
33
	private Options options;
34
	private RepositoryIterable repository;
35

  
36
	public SchemaOrgIterable(Options options, RepositoryIterable repository){
37
		this.options = options;
38
		this.repository = repository;
39
	}
40

  
41
	@Override
42
	public Iterator<String> iterator() {
43
		Iterator<String> repositoryIterator = this.repository.iterator();
44
		EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessOptions(), repositoryIterator);
45
		DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingOptions(), endpointAccessIterator);
46

  
47
		return datasetMappingIterator;
48
	}
49
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorIterable.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import java.util.*;
4
import java.util.concurrent.ArrayBlockingQueue;
5
import java.util.concurrent.TimeUnit;
6

  
7
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.json.JSONObject;
11
import org.json.XML;
12
import org.jsoup.Jsoup;
13
import org.jsoup.nodes.Document;
14
import org.jsoup.nodes.Element;
15
import org.jsoup.select.Elements;
16

  
17
/**
18
 * Created by miriam on 04/05/2018.
19
 */
20
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
21

  
22
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
23

  
24
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
25
    public static final String APP_JSON = "application/json";
26
    public static final String APP_XML = "application/xml";
27
    public static final String TEXT_HTML = "text/html";
28
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
29

  
30

  
31

  
32

  
33
    private String filterParam;
34

  
35
    int total = 0;
36
    int filtered = 0;
37

  
38
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
39

  
40
        this.filterParam = filter;
41
        Thread ft = new Thread(new FillMetaQueue(startUrl) );
42
        ft.start();
43
    }
44

  
45

  
46
    @Override
47
    public Iterator<String> iterator() {
48
        return new HttpWithFileNameCollectorIterator(queue);
49
    }
50

  
51
    private class FillMetaQueue implements Runnable {
52
        final Connector c = new Connector();
53

  
54
        private final List<String> metas = Collections.synchronizedList(new ArrayList<String>());
55
        private final List<String> urls = Collections.synchronizedList(new ArrayList<>());
56

  
57
        public FillMetaQueue(String startUrl){
58
            if(!startUrl.isEmpty()){
59
                urls.add(startUrl);
60
            }
61
        }
62

  
63

  
64
        public void fillQueue() {
65
            String url;
66

  
67
            while((metas.size()>0 || urls.size() > 0 )) {
68
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
69
                if (metas.size() > 0) {
70
                    url = metas.remove(0);
71
                    try {
72
                        c.get(url);
73
                    } catch (CollectorServiceException e) {
74
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
75
                    }
76
                    if(c.isStatusOk()){
77
                        try {
78
                            String ret = c.getResponse();
79
                            if (ret != null && ret.length()>0) {
80
                                if (!containsFilter(ret))
81
                                    queue.put(addFilePath(ret, url, url.endsWith(".json")));
82
                                    //queue.offer(addFilePath(ret, url, url.endsWith(".json")), HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
83
                                else
84
                                    filtered++;
85
                                total++;
86
                            }
87
                        } catch (InterruptedException e) {
88
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
89

  
90
                        }
91
                    }
92
                } else {
93
                    url = urls.remove(0);
94
                    try {
95
                        c.get(url);
96
                    } catch (CollectorServiceException e) {
97
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
98
                    }
99
                    if(c.isStatusOk()) {
100
                        if (c.responseTypeContains(TEXT_HTML)){
101
                            recurFolder(c.getResponse(), url);
102
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
103
                            try {
104
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
105
                                //queue.offer(element, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
106
                                queue.put(element);
107
                            } catch (InterruptedException e) {
108
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
109
                            }
110
                        }
111
                    }
112
                }
113

  
114
            }
115
            try {
116
                //queue.offer(HttpWithFileNameCollectorIterator.TERMINATOR, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
117
                queue.put(HttpWithFileNameCollectorIterator.TERMINATOR);
118
            } catch (InterruptedException e) {
119
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e);
120
            }
121

  
122
        }
123

  
124
        private boolean containsFilter(String meta){
125
            if (filterParam == null || filterParam.isEmpty())
126
                return false;
127
            String[] filter = filterParam.split(";");
128
            for(String item:filter){
129
                if (meta.contains(item))
130
                    return true;
131
            }
132
            return false;
133
        }
134

  
135
        private String addFilePath(String meta, String url, boolean isJson){
136
            String path = url.replace("metadata", "pdf");
137

  
138
            try {
139
                if(isJson)
140
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
141
                else {
142

  
143
                    if (meta.contains("<!DOCTYPE")) {
144
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
145
                        meta = meta.substring(meta.indexOf(">") + 1);
146
                    }
147
                    int index = meta.lastIndexOf("</");
148
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
149
                }
150
            } catch(Exception ex) {
151
                log.info("not file with extension .json or .xml");
152
            }
153

  
154

  
155
            if(isJson) {
156
                try {
157
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
158
                } catch(Exception e) {
159
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
160
                   // throw new RuntimeException();
161
                    final String junk = String.format(JUNK, url);
162
                    log.warn("returning " + junk);
163
                    return junk;
164
                }
165
            }
166
            return meta;
167
        }
168

  
169
        private void recurFolder(String text, String url){
170
            Document doc = Jsoup.parse(text);
171
            Elements links = doc.select("a");
172
            for(Element e:links){
173
                if (!e.text().equals("../")){
174
                    String file = e.attr("href");
175
                    if(file.endsWith(".json") || file.endsWith(".xml"))
176
                        metas.add(url+file);
177
                    else
178
                        urls.add(url+file);
179
                }
180
            }
181
        }
182

  
183

  
184
        @Override
185
        public void run() {
186
            fillQueue();
187
        }
188
    }
189

  
190
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/datasets/DatasetsIterator.java
1
package eu.dnetlib.data.collector.plugins.datasets;
2

  
3
import java.io.IOException;
4
import java.io.InputStream;
5
import java.util.Iterator;
6

  
7
import org.apache.commons.io.IOUtils;
8
import org.apache.commons.lang3.StringEscapeUtils;
9
import org.apache.commons.logging.Log;
10
import org.apache.commons.logging.LogFactory;
11
import org.apache.http.client.methods.CloseableHttpResponse;
12
import org.apache.http.client.methods.HttpPost;
13
import org.apache.http.entity.StringEntity;
14
import org.apache.http.impl.client.CloseableHttpClient;
15
import org.apache.http.impl.client.HttpClients;
16

  
17
import com.google.gson.Gson;
18
import com.google.gson.GsonBuilder;
19

  
20
/**
21
 * The Class JournalIterator.
22
 */
23
public class DatasetsIterator implements Iterable<String>, Iterator<String> {
24

  
25
	/** The logger. */
26
	private static final Log log = LogFactory.getLog(DatasetsIterator.class);
27

  
28
	/** The base url template. */
29
	private static String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d";
30

  
31
	/** The journal id. */
32
	private String journalId = "";
33

  
34
	/** The journal name. */
35
	private String journalName = "";
36

  
37
	/** The journal issn. */
38
	private String journalISSN = "";
39

  
40
	/** The openaire datasource. */
41
	private String openaireDatasource = "";
42

  
43
	/** The total. */
44
	private long total;
45

  
46
	/** The from. */
47
	private int from;
48

  
49
	/** The current iterator. */
50
	private int currentIterator;
51

  
52
	/** The current response. */
53
	private ElasticSearchResponse currentResponse;
54

  
55
	/** The request. */
56
	private RequestField request;
57

  
58
	/** The default size. */
59
	private static int DEFAULT_SIZE = 10;
60

  
61
	private String projectCordaId;
62

  
63
	private static String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>"
64
			+ "<journal name='%s' issn='%s' datasourceid = '%s'/><metadata>%s</metadata></datasetsRecord>";
65

  
66
	/**
67
	 * Instantiates a new journal iterator.
68
	 * 
69
	 * @param request
70
	 *            the request
71
	 */
72
	public DatasetsIterator(final RequestField request, final String projectCordaId, final PangaeaJournalInfo info) {
73
		this.request = request;
74
		this.setProjectCordaId(projectCordaId);
75

  
76
		if (info != null) {
77
			this.setJournalId(info.getJournalId());
78
			this.setJournalName(StringEscapeUtils.escapeXml(info.getJournalName()));
79
			this.setJournalISSN(info.getJournalISSN());
80
			this.setOpenaireDatasource(info.getDatasourceId());
81
		}
82
		log.debug("Start Iterator");
83
	}
84

  
85
	/**
86
	 * Execute query.
87
	 * 
88
	 * @param from
89
	 *            the from
90
	 * @param size
91
	 *            the size
92
	 * @return the string
93
	 */
94
	private String executeQuery(final int from, final int size) {
95
		log.debug("executing query " + this.request.getQuery().getTerm());
96
		log.debug(String.format("from:%d size:%d", from, size));
97
		CloseableHttpResponse response = null;
98
		InputStream responseBody = null;
99
		CloseableHttpClient httpclient = HttpClients.createDefault();
100
		try {
101

  
102
			HttpPost post = new HttpPost(String.format(BASE_URL_TEMPLATE, size, from));
103
			Gson g = new GsonBuilder().disableHtmlEscaping().create();
104
			StringEntity entry = new StringEntity(g.toJson(this.request));
105
			post.setEntity(entry);
106
			long start = System.currentTimeMillis();
107
			response = httpclient.execute(post);
108
			int statusCode = response.getStatusLine().getStatusCode();
109
			if (statusCode == 200) {
110
				responseBody = response.getEntity().getContent();
111
				String s = IOUtils.toString(responseBody);
112
				log.debug("Request done in " + (System.currentTimeMillis() - start) + " ms");
113
				responseBody.close();
114
				return s;
115
			}
116
			return null;
117
		} catch (Exception e) {
118
			log.error("Error on executing query :" + request.getQuery().getTerm(), e);
119
			return null;
120
		} finally {
121
			try {
122
				responseBody.close();
123
				response.close();
124
				httpclient.close();
125
			} catch (IOException e) {
126
				log.error("Can't close connections gracefully", e);
127
			}
128
		}
129

  
130
	}
131

  
132
	/**
133
	 * Gets the journal id.
134
	 * 
135
	 * @return the journalId
136
	 */
137
	public String getJournalId() {
138
		return journalId;
139
	}
140

  
141
	/**
142
	 * Sets the journal id.
143
	 * 
144
	 * @param journalId
145
	 *            the journalId to set
146
	 */
147
	public void setJournalId(final String journalId) {
148
		this.journalId = journalId;
149
	}
150

  
151
	/*
152
	 * (non-Javadoc)
153
	 * 
154
	 * @see java.util.Iterator#hasNext()
155
	 */
156
	@Override
157
	public boolean hasNext() {
158
		return (from + currentIterator) < total;
159
	}
160

  
161
	/*
162
	 * (non-Javadoc)
163
	 * 
164
	 * @see java.util.Iterator#next()
165
	 */
166
	@Override
167
	public String next() {
168
		String xml = String.format(RECORD_TEMPLATE, this.projectCordaId, this.journalName, this.journalISSN, this.openaireDatasource, currentResponse
169
				.getXmlRecords().get(currentIterator));
170
		currentIterator++;
171
		if (currentIterator == DEFAULT_SIZE) {
172
			getNextItem();
173
		}
174
		return xml;
175
	}
176

  
177
	/*
178
	 * (non-Javadoc)
179
	 * 
180
	 * @see java.util.Iterator#remove()
181
	 */
182
	@Override
183
	public void remove() {
184
		throw new UnsupportedOperationException();
185

  
186
	}
187

  
188
	/*
189
	 * (non-Javadoc)
190
	 * 
191
	 * @see java.lang.Iterable#iterator()
192
	 */
193
	@Override
194
	public Iterator<String> iterator() {
195
		from = 0;
196
		total = 0;
197
		getNextItem();
198
		return this;
199
	}
200

  
201
	/**
202
	 * Gets the next item.
203
	 * 
204
	 * @return the next item
205
	 */
206
	private void getNextItem() {
207
		from += currentIterator;
208
		currentResponse = ElasticSearchResponse.createNewResponse(executeQuery(from, DEFAULT_SIZE));
209
		total = currentResponse == null ? 0 : currentResponse.getTotal();
210
		log.debug("from : " + from + " total of the request is " + total);
211
		currentIterator = 0;
212
	}
213

  
214
	/**
215
	 * @return the projectCordaId
216
	 */
217
	public String getProjectCordaId() {
218
		return projectCordaId;
219
	}
220

  
221
	/**
222
	 * @param projectCordaId
223
	 *            the projectCordaId to set
224
	 */
225
	public void setProjectCordaId(final String projectCordaId) {
226
		this.projectCordaId = projectCordaId;
227
	}
228

  
229
	/**
230
	 * @return the journalName
231
	 */
232
	public String getJournalName() {
233
		return journalName;
234
	}
235

  
236
	/**
237
	 * @param journalName
238
	 *            the journalName to set
239
	 */
240
	public void setJournalName(final String journalName) {
241
		this.journalName = journalName;
242
	}
243

  
244
	/**
245
	 * @return the journalISSN
246
	 */
247
	public String getJournalISSN() {
248
		return journalISSN;
249
	}
250

  
251
	/**
252
	 * @param journalISSN
253
	 *            the journalISSN to set
254
	 */
255
	public void setJournalISSN(final String journalISSN) {
256
		this.journalISSN = journalISSN;
257
	}
258

  
259
	/**
260
	 * @return the openaireDatasource
261
	 */
262
	public String getOpenaireDatasource() {
263
		return openaireDatasource;
264
	}
265

  
266
	/**
267
	 * @param openaireDatasource
268
	 *            the openaireDatasource to set
269
	 */
270
	public void setOpenaireDatasource(final String openaireDatasource) {
271
		this.openaireDatasource = openaireDatasource;
272
	}
273

  
274
}
modules/dnet-collector-plugins/tags/dnet-collector-plugins-1.8.1/src/main/java/eu/dnetlib/data/collector/plugins/gtr2/Gtr2Iterator.java
1
package eu.dnetlib.data.collector.plugins.gtr2;
2

  
3
import java.util.ArrayList;
4
import java.util.HashMap;
5
import java.util.Iterator;
6
import java.util.LinkedList;
7
import java.util.List;
8
import java.util.Map;
9
import java.util.Queue;
10
import java.util.function.Function;
11

  
12
import org.apache.commons.lang.math.NumberUtils;
13
import org.apache.commons.lang3.StringUtils;
14
import org.apache.commons.logging.Log;
15
import org.apache.commons.logging.LogFactory;
16
import org.dom4j.Document;
17
import org.dom4j.DocumentException;
18
import org.dom4j.DocumentHelper;
19
import org.dom4j.Element;
20
import org.joda.time.DateTime;
21

  
22
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
23
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
24

  
25
public abstract class Gtr2Iterator implements Iterator<String> {
26

  
27
	public static final int PAGE_SIZE = 20;
28

  
29
	private static final Log log = LogFactory.getLog(Gtr2Iterator.class);
30

  
31
	private final String baseUrl;
32
	private int currPage;
33
	private int endPage;
34
	private boolean incremental = false;
35
	private DateTime fromDate;
36

  
37
	private final Map<String, String> cache = new HashMap<>();
38

  
39
	private final Queue<String> queue = new LinkedList<>();
40

  
41
	private String nextElement;
42

  
43
	public Gtr2Iterator(final String baseUrl, final String fromDate, final String startPage, final String endPage)
44
		throws CollectorServiceException {
45

  
46
		this.baseUrl = baseUrl;
47
		this.currPage = NumberUtils.toInt(startPage, 1);
48
		this.endPage = NumberUtils.toInt(endPage, Integer.MAX_VALUE);
49
		this.incremental = StringUtils.isNotBlank(fromDate);
50

  
51
		if (this.incremental) {
52
			this.fromDate = Gtr2Helper.parseDate(fromDate);
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff