Revision 49006
Added by Alessia Bardi over 6 years ago
FtpIterator.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.collector.plugins.ftp; |
2 | 2 |
|
3 |
import java.io.IOException; |
|
4 |
import java.io.OutputStream; |
|
5 |
import java.net.MalformedURLException; |
|
6 |
import java.net.URL; |
|
7 |
import java.util.*; |
|
8 |
|
|
3 | 9 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
4 | 10 |
import org.apache.commons.io.output.ByteArrayOutputStream; |
11 |
import org.apache.commons.lang.StringUtils; |
|
5 | 12 |
import org.apache.commons.logging.Log; |
6 | 13 |
import org.apache.commons.logging.LogFactory; |
7 | 14 |
import org.apache.commons.net.ftp.FTPClient; |
8 | 15 |
import org.apache.commons.net.ftp.FTPFile; |
9 | 16 |
import org.apache.commons.net.ftp.FTPReply; |
17 |
import org.joda.time.DateTime; |
|
18 |
import org.joda.time.format.DateTimeFormat; |
|
19 |
import org.joda.time.format.DateTimeFormatter; |
|
10 | 20 |
|
11 |
import java.io.IOException; |
|
12 |
import java.io.OutputStream; |
|
13 |
import java.net.MalformedURLException; |
|
14 |
import java.net.URL; |
|
15 |
import java.util.Iterator; |
|
16 |
import java.util.LinkedList; |
|
17 |
import java.util.Queue; |
|
18 |
import java.util.Set; |
|
19 |
|
|
20 | 21 |
/** |
21 | 22 |
* |
22 | 23 |
* @author Author: Andrea Mannocci |
... | ... | |
37 | 38 |
private String password; |
38 | 39 |
private boolean isRecursive; |
39 | 40 |
private Set<String> extensionsSet; |
41 |
private boolean incremental; |
|
42 |
private DateTime fromDate = null; |
|
43 |
private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); |
|
40 | 44 |
|
41 | 45 |
private Queue<String> queue; |
42 | 46 |
|
43 | 47 |
public FtpIterator(final String baseUrl, final String username, final String password, final boolean isRecursive, |
44 |
final Set<String> extensionsSet) { |
|
48 |
final Set<String> extensionsSet, String fromDate) {
|
|
45 | 49 |
this.username = username; |
46 | 50 |
this.password = password; |
47 | 51 |
this.isRecursive = isRecursive; |
48 | 52 |
this.extensionsSet = extensionsSet; |
53 |
this.incremental = StringUtils.isNotBlank(fromDate); |
|
54 |
if (incremental) { |
|
55 |
//I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode . |
|
56 |
this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter); |
|
57 |
log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString()); |
|
58 |
} |
|
49 | 59 |
try { |
50 | 60 |
URL server = new URL(baseUrl); |
51 | 61 |
this.ftpServerAddress = server.getHost(); |
... | ... | |
112 | 122 |
if ((subFiles != null) && (subFiles.length > 0)) { |
113 | 123 |
for (FTPFile aFile : subFiles) { |
114 | 124 |
String currentFileName = aFile.getName(); |
125 |
|
|
115 | 126 |
if (currentFileName.equals(".") || currentFileName.equals("..")) { |
116 | 127 |
// skip parent directory and directory itself |
117 | 128 |
continue; |
... | ... | |
124 | 135 |
// test the file for extensions compliance and, just in case, add it to the list. |
125 | 136 |
for (String ext : extensionsSet) { |
126 | 137 |
if (currentFileName.endsWith(ext)) { |
127 |
queue.add(dirToList + "/" + currentFileName); |
|
138 |
//incremental mode: let's check the last update date |
|
139 |
if(incremental){ |
|
140 |
Calendar timestamp = aFile.getTimestamp(); |
|
141 |
DateTime lastModificationDate = new DateTime(timestamp); |
|
142 |
if(lastModificationDate.isAfter(fromDate)){ |
|
143 |
queue.add(dirToList + "/" + currentFileName); |
|
144 |
log.debug(currentFileName + " has changed and must be re-collected"); |
|
145 |
} else { |
|
146 |
if (log.isDebugEnabled()) { |
|
147 |
log.debug(currentFileName + " has not changed since last collection"); |
|
148 |
} |
|
149 |
} |
|
150 |
} |
|
151 |
else { |
|
152 |
//not incremental: just add it to the queue |
|
153 |
queue.add(dirToList + "/" + currentFileName); |
|
154 |
} |
|
128 | 155 |
} |
129 | 156 |
} |
130 | 157 |
} |
Also available in: Unified diff
FTP metadata collector plugin now supports incremental harvesting