Project

General

Profile

1 53614 gpapanikos
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
2
3
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.io.IOUtils;
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
10
import java.io.*;
11
import java.net.URL;
12
import java.nio.charset.Charset;
13
import java.util.*;
14
15
public class SitemapFileIterator implements Iterator<String> {
16
	private static final Log log = LogFactory.getLog(SitemapFileIterator.class);
17
18
	public static class Options {
19
20
		public enum SitemapFileType{
21
			Text,
22
			GZ
23
		}
24
25
		public enum SitemapSchemaType{
26
			Text,
27
			Xml
28
		}
29
30
		public Options(){}
31
32
		public Options(URL fileUrl, Charset charset, SitemapSchemaType schemaType, SitemapFileType fileType) {
33
			this.fileUrl = fileUrl;
34
			this.charset = charset;
35
			this.schemaType = schemaType;
36
			this.fileType = fileType;
37
		}
38
39
		private SitemapFileType fileType;
40
		private SitemapSchemaType schemaType;
41
		private URL fileUrl;
42
		private Charset charset;
43
44
		public Charset getCharset() {
45
			return charset;
46
		}
47
48
		public void setCharset(Charset charset) {
49
			this.charset = charset;
50
		}
51
52
		public URL getFileUrl() {
53
			return fileUrl;
54
		}
55
56
		public void setFileUrl(URL fileUrl) {
57
			this.fileUrl = fileUrl;
58
		}
59
60
		public SitemapFileType getFileType() {
61
			return fileType;
62
		}
63
64
		public void setFileType(SitemapFileType fileType) {
65
			this.fileType = fileType;
66
		}
67
68
		public SitemapSchemaType getSchemaType() {
69
			return schemaType;
70
		}
71
72
		public void setSchemaType(SitemapSchemaType schemaType) {
73
			this.schemaType = schemaType;
74
		}
75
76
		@Override
77
		public Object clone(){
78
			Options clone = new Options();
79
			clone.setCharset(this.getCharset());
80
			clone.setFileType(this.getFileType());
81
			clone.setFileUrl(this.getFileUrl());
82
			clone.setSchemaType(this.getSchemaType());
83
			return clone;
84
		}
85
	}
86
87
	private Options options;
88
	private File downloadedFile;
89
	private File contentFile;
90
	private Queue<String> locations;
91
92
	public SitemapFileIterator(Options options){
93
		this.options = options;
94
	}
95
96
	public void bootstrap() {
97
		LinkedList<String> endpoints = null;
98
		try {
99
100
			String path = new java.io.File( "." ).getCanonicalPath();
101
102
			log.debug(String.format("bootstrapping sitemapindex file access for sitemapindex %s", this.options.getFileUrl()));
103
			this.downloadedFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
104
			this.downloadedFile.deleteOnExit();
105
			FileUtils.copyURLToFile(this.options.getFileUrl(), this.downloadedFile);
106
			log.debug(String.format("downloaded file: %s has size %d", this.downloadedFile.toString(), this.downloadedFile.length()));
107
108
			switch (this.options.getFileType()) {
109
				case Text: {
110
					this.contentFile = this.downloadedFile;
111
					break;
112
				}
113
				case GZ: {
114
					this.contentFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
115
					this.contentFile.deleteOnExit();
116
					Utils.decompressGZipTo(this.downloadedFile, this.contentFile);
117
					log.debug(String.format("extracted gz file: %s has size %d", this.contentFile.toString(), this.contentFile.length()));
118
					break;
119
				}
120
				default:
121
					throw new CollectorServiceException("unrecognized file type " + this.options.getFileType());
122
			}
123
124
			List<String> content = this.collectContentLocations();
125
126
			log.debug(String.format("extracted %d sitemapindex endpoints", content.size()));
127
			endpoints = new LinkedList<>(content);
128
		}catch(Exception ex){
129
			log.error(String.format("error processing sitemapindex %s. returning 0 endpoints",this.options.getFileUrl()), ex);
130
			endpoints = new LinkedList<>();
131
		}finally {
132
			if (this.contentFile != null) {
133
				this.contentFile.delete();
134
			}
135
			if (this.downloadedFile != null) {
136
				this.downloadedFile.delete();
137
			}
138
		}
139
		this.locations = endpoints;
140
	}
141
142
	private List<String> collectContentLocations() throws Exception{
143
		switch(this.options.getSchemaType()) {
144
			case Text:{
145
				return this.collectTextContentLocations();
146
			}
147
			case Xml:{
148
				return this.collectXmlContentLocations();
149
			}
150
			default: throw new CollectorServiceException("unrecognized file type "+this.options.getFileType());
151
		}
152
	}
153
154
	private List<String> collectTextContentLocations() throws Exception {
155
		log.debug(String.format("reading endpoint locations from text sitemapindex"));
156
		try (FileInputStream in = new FileInputStream(this.contentFile)) {
157
			return IOUtils.readLines(in, this.options.getCharset());
158
		}
159
	}
160
161
	private List<String> collectXmlContentLocations() throws Exception {
162
		log.debug(String.format("reading endpoint locations from xml sitemapindex"));
163
		return Utils.collectAsStrings(this.contentFile,"/urlset/url/loc/text()");
164
	}
165
166
	@Override
167
	public boolean hasNext() {
168
		return !this.locations.isEmpty();
169
	}
170
171
	@Override
172
	public String next() {
173
		return this.locations.poll();
174
	}
175
}