Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.filesystem;
2

    
3
import java.io.*;
4
import java.net.MalformedURLException;
5
import java.net.URL;
6
import java.util.Iterator;
7
import java.util.List;
8

    
9
import com.google.common.collect.Iterators;
10
import com.google.common.collect.Lists;
11
import com.ximpleware.*;
12
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
13
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
14
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
15
import org.apache.commons.io.FilenameUtils;
16
import org.apache.commons.io.IOUtils;
17
import org.apache.commons.lang.StringUtils;
18
import org.apache.commons.logging.Log;
19
import org.apache.commons.logging.LogFactory;
20
import org.json.JSONObject;
21
import org.json.XML;
22

    
23
/**
24
 * The Class FilesystemIterable.
25
 *
26
 * @author Sandro, Michele, Andrea
27
 */
28
public class FilesystemIterable implements Iterable<String> {
29

    
30
	/**
31
	 * The Constant log.
32
	 */
33
	private static final Log log = LogFactory.getLog(FilesystemIterable.class);
34

    
35
	/**
36
	 * The base dir.
37
	 */
38
	private File baseDir;
39

    
40
	/**
41
	 * The extensions.
42
	 */
43
	private String extensions;
44

    
45
	/**
46
	 * File format (json / xml)
47
	 **/
48
	private String fileFormat = "xml";
49

    
50
	private List<String> supportedFormats = Lists.newArrayList("xml", "json");
51

    
52
	private boolean setObjIdentifierFromFileName = false;
53

    
54
	/**
55
	 * Instantiates a new filesystem iterable.
56
	 *
57
	 * @param descriptor the descriptor
58
	 * @throws CollectorServiceException the collector service exception
59
	 */
60
	public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException {
61
		try {
62
			final String baseUrl = descriptor.getBaseUrl();
63
			URL basePath = new URL(baseUrl);
64
			this.baseDir = new File(basePath.getPath());
65
			if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
66
			this.extensions = descriptor.getParams().get("extensions");
67
			if (descriptor.getParams().containsKey("fileFormat")) fileFormat = descriptor.getParams().get("fileFormat");
68
			if (!supportedFormats.contains(fileFormat))
69
				throw new CollectorServiceException("File format " + fileFormat + " not supported. Supported formats are: " + StringUtils
70
						.join(supportedFormats, ','));
71
			if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) {
72
				setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName"));
73
			}
74
		} catch (MalformedURLException e) {
75
			throw new CollectorServiceException("Filesystem collector failed! ", e);
76
		}
77
	}
78

    
79
	/**
80
	 * {@inheritDoc}
81
	 *
82
	 * @see java.lang.Iterable#iterator()
83
	 */
84
	@Override
85
	public Iterator<String> iterator() {
86
		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions);
87
		return Iterators.transform(fsi, inputFileName -> {
88
			FileInputStream fileInputStream = null;
89
			try {
90
				fileInputStream = new FileInputStream(inputFileName);
91
				final String s = IOUtils.toString(fileInputStream);
92
				if (fileFormat.equalsIgnoreCase("json")) {
93
					JSONObject json = new JSONObject(s);
94
					JSONObject obj = new JSONObject();
95
					if (setObjIdentifierFromFileName) {
96
						obj.put("header", new JSONObject().put("objIdentifier", FilenameUtils.getBaseName(inputFileName)));
97
					}
98
					obj.put("metadata", json);
99
					log.debug(obj.toString());
100
					return XML.toString(obj, "record");
101
				}
102
				String cleanedXML = XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
103
				if (setObjIdentifierFromFileName) {
104
					return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName));
105
				} else return cleanedXML;
106
			}catch (VTDException e) {
107
				log.error("Cannot process with VTD to set the objIdentifier " + inputFileName);
108
				return "";
109
			}
110
			catch (Exception e) {
111
				log.error("Unable to read " + inputFileName);
112
				return "";
113
			} finally {
114
				if (fileInputStream != null) {
115
					try {
116
						fileInputStream.close();
117
					} catch (IOException e) {
118
						log.error("Unable to close inputstream for  " + inputFileName);
119
					}
120
				}
121
			}
122
		});
123
	}
124

    
125
	private String addObjIdentifier(String xml, String objidentifier) throws VTDException, IOException {
126
		VTDGen vg = new VTDGen(); // Instantiate VTDGen
127
		XMLModifier xm = new XMLModifier(); //Instantiate XMLModifier
128
		vg.setDoc(xml.getBytes("UTF-8"));
129
		vg.parse(false);
130
		VTDNav vn = vg.getNav();
131
		xm.bind(vn);
132
		if (vn.toElement(VTDNav.ROOT)) {
133
			xm.insertBeforeElement("<record><header><objIdentifier>" + objidentifier + "</objIdentifier></header><metadata>");
134
			xm.insertAfterElement("</metadata></record>");
135
		}
136
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
137
		xm.output(baos);
138
		return baos.toString("UTF-8");
139
	}
140
}
(3-3/3)