Revision 59094
Added by Alessia Bardi over 4 years ago
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FilesystemCollectorPlugin.java | ||
---|---|---|
3 | 3 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
4 | 4 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
5 | 5 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
6 |
import org.apache.commons.lang3.StringUtils; |
|
6 | 7 |
|
7 | 8 |
/** |
8 | 9 |
* |
... | ... | |
17 | 18 |
|
18 | 19 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
19 | 20 |
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } |
20 |
return new FilesystemIterable(interfaceDescriptor); |
|
21 |
|
|
22 |
if (StringUtils.isNotBlank(fromDate) && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); } |
|
23 |
return new FilesystemIterable(interfaceDescriptor, fromDate); |
|
21 | 24 |
} |
22 | 25 |
|
23 | 26 |
} |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FilesystemIterable.java | ||
---|---|---|
51 | 51 |
|
52 | 52 |
private boolean setObjIdentifierFromFileName = false; |
53 | 53 |
|
54 |
private String fromDate; |
|
55 |
|
|
54 | 56 |
/** |
55 | 57 |
* Instantiates a new filesystem iterable. |
56 | 58 |
* |
57 | 59 |
* @param descriptor the descriptor |
58 | 60 |
* @throws CollectorServiceException the collector service exception |
59 | 61 |
*/ |
60 |
public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException { |
|
62 |
public FilesystemIterable(final InterfaceDescriptor descriptor, final String fromDate) throws CollectorServiceException {
|
|
61 | 63 |
try { |
62 | 64 |
final String baseUrl = descriptor.getBaseUrl(); |
63 | 65 |
URL basePath = new URL(baseUrl); |
... | ... | |
71 | 73 |
if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) { |
72 | 74 |
setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName")); |
73 | 75 |
} |
76 |
this.fromDate = fromDate; |
|
74 | 77 |
} catch (MalformedURLException e) { |
75 | 78 |
throw new CollectorServiceException("Filesystem collector failed! ", e); |
76 | 79 |
} |
... | ... | |
83 | 86 |
*/ |
84 | 87 |
@Override |
85 | 88 |
public Iterator<String> iterator() { |
86 |
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions); |
|
89 |
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions, fromDate);
|
|
87 | 90 |
return Iterators.transform(fsi, inputFileName -> { |
88 | 91 |
FileInputStream fileInputStream = null; |
89 | 92 |
try { |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FileSystemIterator.java | ||
---|---|---|
4 | 4 |
import java.nio.file.Files; |
5 | 5 |
import java.nio.file.Path; |
6 | 6 |
import java.nio.file.Paths; |
7 |
import java.nio.file.attribute.FileTime; |
|
8 |
import java.time.*; |
|
9 |
import java.time.format.DateTimeFormatter; |
|
10 |
import java.time.temporal.TemporalAccessor; |
|
7 | 11 |
import java.util.Iterator; |
8 | 12 |
import java.util.Set; |
9 | 13 |
|
14 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
15 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
|
10 | 16 |
import org.apache.commons.io.FilenameUtils; |
11 | 17 |
import org.apache.commons.lang3.StringUtils; |
12 | 18 |
import org.apache.commons.logging.Log; |
... | ... | |
30 | 36 |
private Iterator<Path> pathIterator; |
31 | 37 |
private String current; |
32 | 38 |
|
39 |
private boolean incremental = false; |
|
40 |
private LocalDate fromDate = null; |
|
41 |
private final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormatter |
|
42 |
.ofPattern("yyyy-MM-dd") |
|
43 |
.withZone(ZoneId.systemDefault()); |
|
44 |
|
|
33 | 45 |
public FileSystemIterator(final String baseDir, final String extensions) { |
46 |
this(baseDir,extensions, null); |
|
47 |
} |
|
48 |
|
|
49 |
public FileSystemIterator(final String baseDir, final String extensions, final String fromDate) { |
|
34 | 50 |
if(StringUtils.isNotBlank(extensions)) { |
35 | 51 |
this.extensions = Sets.newHashSet(extensions.split(",")); |
36 | 52 |
} |
53 |
this.incremental = StringUtils.isNotBlank(fromDate); |
|
54 |
if (incremental) { |
|
55 |
// fromDate is expected in the format 'yyyy-MM-dd'; see class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode. |
|
56 |
this.fromDate = LocalDate.parse(fromDate, simpleDateTimeFormatter); |
|
57 |
log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString()); |
|
58 |
} |
|
37 | 59 |
try { |
38 | 60 |
this.pathIterator = Files.newDirectoryStream(Paths.get(baseDir)).iterator(); |
39 | 61 |
this.current = walkTillNext(); |
40 |
} catch (IOException e) {
|
|
62 |
} catch (Exception e) { |
|
41 | 63 |
log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir); |
42 |
throw new RuntimeException("Filesystem collection error.", e); |
|
64 |
e.printStackTrace(); |
|
65 |
throw new CollectorServiceRuntimeException("Filesystem collection error.", e); |
|
43 | 66 |
} |
67 |
|
|
44 | 68 |
} |
45 | 69 |
|
46 | 70 |
@Override |
... | ... | |
79 | 103 |
} |
80 | 104 |
} else { |
81 | 105 |
if (extensions.isEmpty() || extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) { |
82 |
log.debug("Returning: " + nextFilePath.toString()); |
|
83 |
return nextFilePath.toString(); |
|
106 |
if(incremental){ |
|
107 |
try { |
|
108 |
final FileTime lastModifiedTime = Files.getLastModifiedTime(nextFilePath); |
|
109 |
if(lastModifiedTime.toInstant().isAfter(fromDate.atStartOfDay().toInstant(ZoneOffset.UTC))){ |
|
110 |
log.debug("Returning: " + nextFilePath.toString()); |
|
111 |
return nextFilePath.toString(); |
|
112 |
} |
|
113 |
else { |
|
114 |
log.debug("File "+nextFilePath.toString()+" has not changed."); |
|
115 |
} |
|
116 |
} catch (Exception e) { |
|
117 |
throw new CollectorServiceRuntimeException(e); |
|
118 |
} |
|
119 |
} |
|
120 |
else { |
|
121 |
log.debug("Returning: " + nextFilePath.toString()); |
|
122 |
return nextFilePath.toString(); |
|
123 |
} |
|
84 | 124 |
} |
85 | 125 |
} |
86 | 126 |
} |
Also available in: Unified diff
Supporting incremental harvesting with the filesystem (fs) collector plugin.