Project

General

Profile

« Previous | Next » 

Revision 49006

FTP metadata collector plugin now supports incremental harvesting

View differences:

FtpIterator.java
1 1
package eu.dnetlib.data.collector.plugins.ftp;
2 2

  
3
import java.io.IOException;
4
import java.io.OutputStream;
5
import java.net.MalformedURLException;
6
import java.net.URL;
7
import java.util.*;
8

  
3 9
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
4 10
import org.apache.commons.io.output.ByteArrayOutputStream;
11
import org.apache.commons.lang.StringUtils;
5 12
import org.apache.commons.logging.Log;
6 13
import org.apache.commons.logging.LogFactory;
7 14
import org.apache.commons.net.ftp.FTPClient;
8 15
import org.apache.commons.net.ftp.FTPFile;
9 16
import org.apache.commons.net.ftp.FTPReply;
17
import org.joda.time.DateTime;
18
import org.joda.time.format.DateTimeFormat;
19
import org.joda.time.format.DateTimeFormatter;
10 20

  
11
import java.io.IOException;
12
import java.io.OutputStream;
13
import java.net.MalformedURLException;
14
import java.net.URL;
15
import java.util.Iterator;
16
import java.util.LinkedList;
17
import java.util.Queue;
18
import java.util.Set;
19

  
20 21
/**
21 22
 *
22 23
 * @author Author: Andrea Mannocci
......
37 38
	private String password;
38 39
	private boolean isRecursive;
39 40
	private Set<String> extensionsSet;
41
	private boolean incremental;
42
	private DateTime fromDate = null;
43
	private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
40 44

  
41 45
	private Queue<String> queue;
42 46

  
43 47
	public FtpIterator(final String baseUrl, final String username, final String password, final boolean isRecursive,
44
			final Set<String> extensionsSet) {
48
			final Set<String> extensionsSet, String fromDate) {
45 49
		this.username = username;
46 50
		this.password = password;
47 51
		this.isRecursive = isRecursive;
48 52
		this.extensionsSet = extensionsSet;
53
		this.incremental = StringUtils.isNotBlank(fromDate);
54
		if (incremental) {
55
			//I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
56
			this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
57
			log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
58
		}
49 59
		try {
50 60
			URL server = new URL(baseUrl);
51 61
			this.ftpServerAddress = server.getHost();
......
112 122
			if ((subFiles != null) && (subFiles.length > 0)) {
113 123
				for (FTPFile aFile : subFiles) {
114 124
					String currentFileName = aFile.getName();
125

  
115 126
					if (currentFileName.equals(".") || currentFileName.equals("..")) {
116 127
						// skip parent directory and directory itself
117 128
						continue;
......
124 135
						// test the file for extensions compliance and, just in case, add it to the list.
125 136
						for (String ext : extensionsSet) {
126 137
							if (currentFileName.endsWith(ext)) {
127
								queue.add(dirToList + "/" + currentFileName);
138
								//incremental mode: let's check the last update date
139
								if(incremental){
140
									Calendar timestamp = aFile.getTimestamp();
141
									DateTime lastModificationDate = new DateTime(timestamp);
142
									if(lastModificationDate.isAfter(fromDate)){
143
										queue.add(dirToList + "/" + currentFileName);
144
										log.debug(currentFileName + " has changed and must be re-collected");
145
									} else {
146
										if (log.isDebugEnabled()) {
147
											log.debug(currentFileName + " has not changed since last collection");
148
										}
149
									}
150
								}
151
								else {
152
									//not incremental: just add it to the queue
153
									queue.add(dirToList + "/" + currentFileName);
154
								}
128 155
							}
129 156
						}
130 157
					}

Also available in: Unified diff