Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.filesystem;
2

    
3
import java.io.*;
4
import java.net.MalformedURLException;
5
import java.net.URL;
6
import java.util.Iterator;
7
import java.util.List;
8

    
9
import com.google.common.collect.Iterators;
10
import com.google.common.collect.Lists;
11
import com.ximpleware.*;
12
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
13
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
14
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
15
import org.apache.commons.io.FilenameUtils;
16
import org.apache.commons.io.IOUtils;
17
import org.apache.commons.lang3.StringUtils;
18
import org.apache.commons.logging.Log;
19
import org.apache.commons.logging.LogFactory;
20
import org.json.JSONObject;
21
import org.json.XML;
22

    
23
/**
24
 * The Class FilesystemIterable.
25
 *
26
 * @author Sandro, Michele, Andrea
27
 */
28
public class FilesystemIterable implements Iterable<String> {
29

    
30
	/**
31
	 * The Constant log.
32
	 */
33
	private static final Log log = LogFactory.getLog(FilesystemIterable.class);
34

    
35
	/**
36
	 * The base dir.
37
	 */
38
	private File baseDir;
39

    
40
	/**
41
	 * The extensions.
42
	 */
43
	private String extensions;
44

    
45
	/**
46
	 * File format (json / xml)
47
	 **/
48
	private String fileFormat = "xml";
49

    
50
	private List<String> supportedFormats = Lists.newArrayList("xml", "json");
51

    
52
	private boolean setObjIdentifierFromFileName = false;
53

    
54
	private String fromDate;
55

    
56
	/**
57
	 * Instantiates a new filesystem iterable.
58
	 *
59
	 * @param descriptor the descriptor
60
	 * @throws CollectorServiceException the collector service exception
61
	 */
62
	public FilesystemIterable(final InterfaceDescriptor descriptor, final String fromDate) throws CollectorServiceException {
63
		try {
64
			final String baseUrl = descriptor.getBaseUrl();
65
			URL basePath = new URL(baseUrl);
66
			this.baseDir = new File(basePath.getPath());
67
			if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
68
			this.extensions = descriptor.getParams().get("extensions");
69
			if (descriptor.getParams().containsKey("fileFormat")) fileFormat = descriptor.getParams().get("fileFormat");
70
			if (!supportedFormats.contains(fileFormat))
71
				throw new CollectorServiceException("File format " + fileFormat + " not supported. Supported formats are: " + StringUtils
72
						.join(supportedFormats, ','));
73
			if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) {
74
				setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName"));
75
			}
76
			this.fromDate = fromDate;
77
		} catch (MalformedURLException e) {
78
			throw new CollectorServiceException("Filesystem collector failed! ", e);
79
		}
80
	}
81

    
82
	/**
83
	 * {@inheritDoc}
84
	 *
85
	 * @see java.lang.Iterable#iterator()
86
	 */
87
	@Override
88
	public Iterator<String> iterator() {
89
		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions, fromDate);
90
		return Iterators.transform(fsi, inputFileName -> {
91
			FileInputStream fileInputStream = null;
92
			try {
93
				fileInputStream = new FileInputStream(inputFileName);
94
				final String s = IOUtils.toString(fileInputStream);
95
				if (fileFormat.equalsIgnoreCase("json")) {
96
					JSONObject json = new JSONObject(s);
97
					JSONObject obj = new JSONObject();
98
					if (setObjIdentifierFromFileName) {
99
						obj.put("header", new JSONObject().put("objIdentifier", FilenameUtils.getBaseName(inputFileName)));
100
					}
101
					obj.put("metadata", json);
102
					log.debug(obj.toString());
103
					return XML.toString(obj, "record");
104
				}
105
				String cleanedXML = XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
106
				if (setObjIdentifierFromFileName) {
107
					return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName));
108
				} else return cleanedXML;
109
			} catch (VTDException e) {
110
				log.error("Cannot process with VTD to set the objIdentifier " + inputFileName);
111
				return "";
112
			} catch (Exception e) {
113
				log.error("Unable to read " + inputFileName);
114
				return "";
115
			} finally {
116
				if (fileInputStream != null) {
117
					try {
118
						fileInputStream.close();
119
					} catch (IOException e) {
120
						log.error("Unable to close inputstream for  " + inputFileName);
121
					}
122
				}
123
			}
124
		});
125
	}
126

    
127
	private String addObjIdentifier(String xml, String objidentifier) throws VTDException, IOException {
128
		VTDGen vg = new VTDGen(); // Instantiate VTDGen
129
		XMLModifier xm = new XMLModifier(); //Instantiate XMLModifier
130
		vg.setDoc(xml.getBytes("UTF-8"));
131
		vg.parse(false);
132
		VTDNav vn = vg.getNav();
133
		xm.bind(vn);
134
		if (vn.toElement(VTDNav.ROOT)) {
135
			xm.insertBeforeElement("<record><header><objIdentifier>" + objidentifier + "</objIdentifier></header><metadata>");
136
			xm.insertAfterElement("</metadata></record>");
137
		}
138
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
139
		xm.output(baos);
140
		return baos.toString("UTF-8");
141
	}
142
}
(3-3/3)