Project

General

Profile

« Previous | Next » 

Revision 59094

Supporting incremental harvesting with the filesystem (fs) collector plugin.

View differences:

modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FilesystemCollectorPlugin.java
3 3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4 4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5 5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6
import org.apache.commons.lang3.StringUtils;
6 7

  
7 8
/**
8 9
 *
......
17 18

  
18 19
		final String baseUrl = interfaceDescriptor.getBaseUrl();
19 20
		if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
20
		return new FilesystemIterable(interfaceDescriptor);
21

  
22
		if (StringUtils.isNotBlank(fromDate) && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
23
		return new FilesystemIterable(interfaceDescriptor, fromDate);
21 24
	}
22 25

  
23 26
}
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FilesystemIterable.java
51 51

  
52 52
	private boolean setObjIdentifierFromFileName = false;
53 53

  
54
	private String fromDate;
55

  
54 56
	/**
55 57
	 * Instantiates a new filesystem iterable.
56 58
	 *
57 59
	 * @param descriptor the descriptor
58 60
	 * @throws CollectorServiceException the collector service exception
59 61
	 */
60
	public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException {
62
	public FilesystemIterable(final InterfaceDescriptor descriptor, final String fromDate) throws CollectorServiceException {
61 63
		try {
62 64
			final String baseUrl = descriptor.getBaseUrl();
63 65
			URL basePath = new URL(baseUrl);
......
71 73
			if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) {
72 74
				setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName"));
73 75
			}
76
			this.fromDate = fromDate;
74 77
		} catch (MalformedURLException e) {
75 78
			throw new CollectorServiceException("Filesystem collector failed! ", e);
76 79
		}
......
83 86
	 */
84 87
	@Override
85 88
	public Iterator<String> iterator() {
86
		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions);
89
		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions, fromDate);
87 90
		return Iterators.transform(fsi, inputFileName -> {
88 91
			FileInputStream fileInputStream = null;
89 92
			try {
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FileSystemIterator.java
4 4
import java.nio.file.Files;
5 5
import java.nio.file.Path;
6 6
import java.nio.file.Paths;
7
import java.nio.file.attribute.FileTime;
8
import java.time.*;
9
import java.time.format.DateTimeFormatter;
10
import java.time.temporal.TemporalAccessor;
7 11
import java.util.Iterator;
8 12
import java.util.Set;
9 13

  
14
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
15
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
10 16
import org.apache.commons.io.FilenameUtils;
11 17
import org.apache.commons.lang3.StringUtils;
12 18
import org.apache.commons.logging.Log;
......
30 36
	private Iterator<Path> pathIterator;
31 37
	private String current;
32 38

  
39
	private boolean incremental = false;
40
	private LocalDate fromDate = null;
41
	private final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormatter
42
			.ofPattern("yyyy-MM-dd")
43
			.withZone(ZoneId.systemDefault());
44

  
33 45
	public FileSystemIterator(final String baseDir, final String extensions) {
46
		this(baseDir,extensions, null);
47
	}
48

  
49
	public FileSystemIterator(final String baseDir, final String extensions, final String fromDate) {
34 50
		if(StringUtils.isNotBlank(extensions)) {
35 51
			this.extensions = Sets.newHashSet(extensions.split(","));
36 52
		}
53
		this.incremental = StringUtils.isNotBlank(fromDate);
54
		if (incremental) {
55
			//I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
56
			this.fromDate = LocalDate.parse(fromDate, simpleDateTimeFormatter);
57
			log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
58
		}
37 59
		try {
38 60
			this.pathIterator = Files.newDirectoryStream(Paths.get(baseDir)).iterator();
39 61
			this.current = walkTillNext();
40
		} catch (IOException e) {
62
		} catch (Exception e) {
41 63
			log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir);
42
			throw new RuntimeException("Filesystem collection error.", e);
64
			e.printStackTrace();
65
			throw new CollectorServiceRuntimeException("Filesystem collection error.", e);
43 66
		}
67

  
44 68
	}
45 69

  
46 70
	@Override
......
79 103
				}
80 104
			} else {
81 105
				if (extensions.isEmpty() || extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) {
82
					log.debug("Returning: " + nextFilePath.toString());
83
					return nextFilePath.toString();
106
					if(incremental){
107
						try {
108
							final FileTime lastModifiedTime = Files.getLastModifiedTime(nextFilePath);
109
							if(lastModifiedTime.toInstant().isAfter(fromDate.atStartOfDay().toInstant(ZoneOffset.UTC))){
110
								log.debug("Returning: " + nextFilePath.toString());
111
								return nextFilePath.toString();
112
							}
113
							else {
114
								log.debug("File "+nextFilePath.toString()+" has not changed.");
115
							}
116
						} catch (Exception e) {
117
							throw new CollectorServiceRuntimeException(e);
118
						}
119
					}
120
					else {
121
						log.debug("Returning: " + nextFilePath.toString());
122
						return nextFilePath.toString();
123
					}
84 124
				}
85 125
			}
86 126
		}

Also available in: Unified diff