Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
2

    
3
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
4
import org.apache.commons.io.IOUtils;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7

    
8
import java.net.URL;
9
import java.nio.charset.Charset;
10
import java.util.*;
11

    
12
public class SitemapIndexIterator implements Iterator<String> {
13
	private static final Log log = LogFactory.getLog(SitemapIndexIterator.class);
14

    
15
	public static class Options {
16
		private URL indexUrl;
17
		private Charset charset;
18

    
19
		public Options(){}
20

    
21
		public Options(URL indexUrl, Charset charset){
22
			this.indexUrl = indexUrl;
23
			this.charset = charset;
24
		}
25

    
26
		public URL getIndexUrl() {
27
			return indexUrl;
28
		}
29

    
30
		public void setIndexUrl(URL indexUrl) {
31
			this.indexUrl = indexUrl;
32
		}
33

    
34
		public Charset getCharset() {
35
			return charset;
36
		}
37

    
38
		public void setCharset(Charset charset) {
39
			this.charset = charset;
40
		}
41
	}
42

    
43
	private Options options;
44
	private Queue<String> sitemapFiles;
45

    
46
	public SitemapIndexIterator(Options options) {
47
		this.options = options;
48
	}
49

    
50
	public void bootstrap() {
51
		List<String> files = null;
52
		try {
53
			log.debug("bootstrapping sitemapindex index access");
54
			String sitemapIndexPayload = IOUtils.toString(this.options.getIndexUrl(), this.options.getCharset());
55
			log.debug(String.format("sitemapindex payload is: %s", sitemapIndexPayload));
56
			files = Utils.collectAsStrings(sitemapIndexPayload, "/sitemapindex/sitemap/loc/text()");
57
			log.debug(String.format("extracted %d sitemapindex files", files.size()));
58
		}catch(Exception ex){
59
			log.error("problem bootstrapping sitemapindex index access. returning 0 files", ex);
60
			files = new ArrayList<>();
61
		}
62
		this.sitemapFiles = new PriorityQueue<String>(files);
63
	}
64

    
65
	@Override
66
	public boolean hasNext() {
67
		return !this.sitemapFiles.isEmpty();
68
	}
69

    
70
	@Override
71
	public String next() {
72
		return this.sitemapFiles.poll();
73
	}
74
}
(2-2/3)