Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.filesystem;
2

    
3
import java.io.File;
4
import java.io.FileInputStream;
5
import java.io.IOException;
6
import java.net.MalformedURLException;
7
import java.net.URL;
8
import java.util.Iterator;
9
import java.util.List;
10

    
11
import com.google.common.collect.Iterators;
12
import com.google.common.collect.Lists;
13
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
14
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
15
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
16
import org.apache.commons.io.IOUtils;
17
import org.apache.commons.lang.StringUtils;
18
import org.apache.commons.logging.Log;
19
import org.apache.commons.logging.LogFactory;
20
import org.json.JSONObject;
21
import org.json.XML;
22

    
23
/**
24
 * The Class FilesystemIterable.
25
 *
26
 * @author Sandro, Michele, Andrea
27
 */
28
public class FilesystemIterable implements Iterable<String> {
29

    
30

    
31
	/** The Constant log. */
32
	private static final Log log = LogFactory.getLog(FilesystemIterable.class);
33

    
34
	/** The base dir. */
35
	private File baseDir;
36

    
37
	/** The extensions. */
38
	private String extensions;
39

    
40
	/** File format (json / xml) **/
41
	private String fileFormat = "xml";
42

    
43
	private List<String> supportedFormats = Lists.newArrayList("xml", "json");
44

    
45
	/**
46
	 * Instantiates a new filesystem iterable.
47
	 *
48
	 * @param descriptor
49
	 *            the descriptor
50
	 * @throws CollectorServiceException
51
	 *             the collector service exception
52
	 */
53
	public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException {
54
		try {
55
			final String baseUrl = descriptor.getBaseUrl();
56
			URL basePath = new URL(baseUrl);
57
			this.baseDir = new File(basePath.getPath());
58
			if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
59
			this.extensions = descriptor.getParams().get("extensions");
60
			if(descriptor.getParams().containsKey("fileFormat")) fileFormat = descriptor.getParams().get("fileFormat");
61
			if(!supportedFormats.contains(fileFormat)) throw new CollectorServiceException("File format "+fileFormat+" not supported. Supported formats are: "+ StringUtils
62
					.join(supportedFormats, ','));
63
		} catch (MalformedURLException e) {
64
			throw new CollectorServiceException("Filesystem collector failed! ", e);
65
		}
66
	}
67

    
68
	/**
69
	 * {@inheritDoc}
70
	 *
71
	 * @see java.lang.Iterable#iterator()
72
	 */
73
	@Override
74
	public Iterator<String> iterator() {
75
		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions);
76
		return Iterators.transform(fsi, inputFileName -> {
77
			FileInputStream fileInputStream = null;
78
			try {
79
				fileInputStream = new FileInputStream(inputFileName);
80
				final String s = IOUtils.toString(fileInputStream);
81
				if(fileFormat.equalsIgnoreCase("json")){
82
					JSONObject json = new JSONObject(s);
83
					return XML.toString(json, "record");
84
				}
85
				return XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
86
			} catch (Exception e) {
87
				log.error("Unable to read " + inputFileName);
88
				return "";
89
			} finally {
90
				if (fileInputStream != null) {
91
					try {
92
						fileInputStream.close();
93
					} catch (IOException e) {
94
						log.error("Unable to close inputstream for  " + inputFileName);
95
					}
96
				}
97
			}
98
		});
99
	}
100
}
(3-3/3)