1 |
53614
|
gpapanikos
|
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
|
2 |
|
|
|
3 |
|
|
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
|
4 |
|
|
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
5 |
|
|
import org.apache.commons.io.FileUtils;
|
6 |
|
|
import org.apache.commons.io.IOUtils;
|
7 |
|
|
import org.apache.commons.logging.Log;
|
8 |
|
|
import org.apache.commons.logging.LogFactory;
|
9 |
|
|
|
10 |
|
|
import java.io.*;
|
11 |
|
|
import java.net.URL;
|
12 |
|
|
import java.nio.charset.Charset;
|
13 |
|
|
import java.util.*;
|
14 |
|
|
|
15 |
|
|
public class SitemapFileIterator implements Iterator<String> {
|
16 |
|
|
private static final Log log = LogFactory.getLog(SitemapFileIterator.class);
|
17 |
|
|
|
18 |
|
|
public static class Options {
|
19 |
|
|
|
20 |
|
|
public enum SitemapFileType{
|
21 |
|
|
Text,
|
22 |
|
|
GZ
|
23 |
|
|
}
|
24 |
|
|
|
25 |
|
|
public enum SitemapSchemaType{
|
26 |
|
|
Text,
|
27 |
|
|
Xml
|
28 |
|
|
}
|
29 |
|
|
|
30 |
|
|
public Options(){}
|
31 |
|
|
|
32 |
|
|
public Options(URL fileUrl, Charset charset, SitemapSchemaType schemaType, SitemapFileType fileType) {
|
33 |
|
|
this.fileUrl = fileUrl;
|
34 |
|
|
this.charset = charset;
|
35 |
|
|
this.schemaType = schemaType;
|
36 |
|
|
this.fileType = fileType;
|
37 |
|
|
}
|
38 |
|
|
|
39 |
|
|
private SitemapFileType fileType;
|
40 |
|
|
private SitemapSchemaType schemaType;
|
41 |
|
|
private URL fileUrl;
|
42 |
|
|
private Charset charset;
|
43 |
|
|
|
44 |
|
|
public Charset getCharset() {
|
45 |
|
|
return charset;
|
46 |
|
|
}
|
47 |
|
|
|
48 |
|
|
public void setCharset(Charset charset) {
|
49 |
|
|
this.charset = charset;
|
50 |
|
|
}
|
51 |
|
|
|
52 |
|
|
public URL getFileUrl() {
|
53 |
|
|
return fileUrl;
|
54 |
|
|
}
|
55 |
|
|
|
56 |
|
|
public void setFileUrl(URL fileUrl) {
|
57 |
|
|
this.fileUrl = fileUrl;
|
58 |
|
|
}
|
59 |
|
|
|
60 |
|
|
public SitemapFileType getFileType() {
|
61 |
|
|
return fileType;
|
62 |
|
|
}
|
63 |
|
|
|
64 |
|
|
public void setFileType(SitemapFileType fileType) {
|
65 |
|
|
this.fileType = fileType;
|
66 |
|
|
}
|
67 |
|
|
|
68 |
|
|
public SitemapSchemaType getSchemaType() {
|
69 |
|
|
return schemaType;
|
70 |
|
|
}
|
71 |
|
|
|
72 |
|
|
public void setSchemaType(SitemapSchemaType schemaType) {
|
73 |
|
|
this.schemaType = schemaType;
|
74 |
|
|
}
|
75 |
|
|
|
76 |
|
|
@Override
|
77 |
|
|
public Object clone(){
|
78 |
|
|
Options clone = new Options();
|
79 |
|
|
clone.setCharset(this.getCharset());
|
80 |
|
|
clone.setFileType(this.getFileType());
|
81 |
|
|
clone.setFileUrl(this.getFileUrl());
|
82 |
|
|
clone.setSchemaType(this.getSchemaType());
|
83 |
|
|
return clone;
|
84 |
|
|
}
|
85 |
|
|
}
|
86 |
|
|
|
87 |
|
|
private Options options;
|
88 |
|
|
private File downloadedFile;
|
89 |
|
|
private File contentFile;
|
90 |
|
|
private Queue<String> locations;
|
91 |
|
|
|
92 |
|
|
public SitemapFileIterator(Options options){
|
93 |
|
|
this.options = options;
|
94 |
|
|
}
|
95 |
|
|
|
96 |
|
|
public void bootstrap() {
|
97 |
|
|
LinkedList<String> endpoints = null;
|
98 |
|
|
try {
|
99 |
|
|
|
100 |
|
|
String path = new java.io.File( "." ).getCanonicalPath();
|
101 |
|
|
|
102 |
|
|
log.debug(String.format("bootstrapping sitemapindex file access for sitemapindex %s", this.options.getFileUrl()));
|
103 |
|
|
this.downloadedFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
|
104 |
|
|
this.downloadedFile.deleteOnExit();
|
105 |
|
|
FileUtils.copyURLToFile(this.options.getFileUrl(), this.downloadedFile);
|
106 |
|
|
log.debug(String.format("downloaded file: %s has size %d", this.downloadedFile.toString(), this.downloadedFile.length()));
|
107 |
|
|
|
108 |
|
|
switch (this.options.getFileType()) {
|
109 |
|
|
case Text: {
|
110 |
|
|
this.contentFile = this.downloadedFile;
|
111 |
|
|
break;
|
112 |
|
|
}
|
113 |
|
|
case GZ: {
|
114 |
|
|
this.contentFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
|
115 |
|
|
this.contentFile.deleteOnExit();
|
116 |
|
|
Utils.decompressGZipTo(this.downloadedFile, this.contentFile);
|
117 |
|
|
log.debug(String.format("extracted gz file: %s has size %d", this.contentFile.toString(), this.contentFile.length()));
|
118 |
|
|
break;
|
119 |
|
|
}
|
120 |
|
|
default:
|
121 |
|
|
throw new CollectorServiceException("unrecognized file type " + this.options.getFileType());
|
122 |
|
|
}
|
123 |
|
|
|
124 |
|
|
List<String> content = this.collectContentLocations();
|
125 |
|
|
|
126 |
|
|
log.debug(String.format("extracted %d sitemapindex endpoints", content.size()));
|
127 |
|
|
endpoints = new LinkedList<>(content);
|
128 |
|
|
}catch(Exception ex){
|
129 |
|
|
log.error(String.format("error processing sitemapindex %s. returning 0 endpoints",this.options.getFileUrl()), ex);
|
130 |
|
|
endpoints = new LinkedList<>();
|
131 |
|
|
}finally {
|
132 |
|
|
if (this.contentFile != null) {
|
133 |
|
|
this.contentFile.delete();
|
134 |
|
|
}
|
135 |
|
|
if (this.downloadedFile != null) {
|
136 |
|
|
this.downloadedFile.delete();
|
137 |
|
|
}
|
138 |
|
|
}
|
139 |
|
|
this.locations = endpoints;
|
140 |
|
|
}
|
141 |
|
|
|
142 |
|
|
private List<String> collectContentLocations() throws Exception{
|
143 |
|
|
switch(this.options.getSchemaType()) {
|
144 |
|
|
case Text:{
|
145 |
|
|
return this.collectTextContentLocations();
|
146 |
|
|
}
|
147 |
|
|
case Xml:{
|
148 |
|
|
return this.collectXmlContentLocations();
|
149 |
|
|
}
|
150 |
|
|
default: throw new CollectorServiceException("unrecognized file type "+this.options.getFileType());
|
151 |
|
|
}
|
152 |
|
|
}
|
153 |
|
|
|
154 |
|
|
private List<String> collectTextContentLocations() throws Exception {
|
155 |
|
|
log.debug(String.format("reading endpoint locations from text sitemapindex"));
|
156 |
|
|
try (FileInputStream in = new FileInputStream(this.contentFile)) {
|
157 |
|
|
return IOUtils.readLines(in, this.options.getCharset());
|
158 |
|
|
}
|
159 |
|
|
}
|
160 |
|
|
|
161 |
|
|
private List<String> collectXmlContentLocations() throws Exception {
|
162 |
|
|
log.debug(String.format("reading endpoint locations from xml sitemapindex"));
|
163 |
|
|
return Utils.collectAsStrings(this.contentFile,"/urlset/url/loc/text()");
|
164 |
|
|
}
|
165 |
|
|
|
166 |
|
|
@Override
|
167 |
|
|
public boolean hasNext() {
|
168 |
|
|
return !this.locations.isEmpty();
|
169 |
|
|
}
|
170 |
|
|
|
171 |
|
|
@Override
|
172 |
|
|
public String next() {
|
173 |
|
|
return this.locations.poll();
|
174 |
|
|
}
|
175 |
|
|
}
|