Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import org.apache.commons.io.IOUtils;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.json.JSONObject;
7
import org.jsoup.Jsoup;
8
import org.jsoup.nodes.Document;
9
import org.jsoup.nodes.Element;
10
import org.jsoup.select.Elements;
11

    
12
import java.net.URL;
13
import java.nio.charset.Charset;
14
import java.util.Iterator;
15

    
16
public class EndpointAccessIterator implements Iterator<JSONObject> {
17
	private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
18

    
19
	public static class Options {
20

    
21
		private Charset charset;
22

    
23
		public Options(){}
24

    
25
		public Options(Charset charset) {
26
			this.charset = charset;
27
		}
28

    
29
		public Charset getCharset() {
30
			return charset;
31
		}
32

    
33
		public void setCharset(Charset charset) {
34
			this.charset = charset;
35
		}
36
	}
37

    
38
	private Options options;
39
	private Iterator<String> repositoryIterator;
40

    
41
	public EndpointAccessIterator(Options options, Iterator<String> repositoryIterator) {
42
		this.options = options;
43
		this.repositoryIterator = repositoryIterator;
44
	}
45

    
46
	@Override
47
	public boolean hasNext() {
48
		return this.repositoryIterator.hasNext();
49
	}
50

    
51
	@Override
52
	public JSONObject next() {
53
		String endpoint = this.repositoryIterator.next();
54
		if(endpoint == null) return null;
55

    
56
		log.debug(String.format("processing: %s", endpoint));
57

    
58
		JSONObject dataset = this.extractDatasetRecord(endpoint);
59

    
60
		return dataset;
61
	}
62

    
63
	private JSONObject extractDatasetRecord(String endpoint) {
64
		JSONObject datasetDocument = null;
65
		try {
66
			URL urlEndpoint = new URL(endpoint);
67
			String payload = IOUtils.toString(urlEndpoint, this.options.getCharset());
68

    
69
			Document doc = Jsoup.parse(payload);
70
			Elements scriptTags = doc.getElementsByTag("script");
71
			for (Element scriptTag : scriptTags) {
72
				if (!scriptTag.hasAttr("type")) continue;
73
				String scriptType = scriptTag.attr("type");
74
				if (!scriptType.equalsIgnoreCase("application/ld+json")) continue;
75

    
76
				String data = scriptTag.data();
77
				JSONObject schemaItem = new JSONObject(data);
78
				String context = schemaItem.optString("@context");
79
				String type = schemaItem.optString("@type");
80

    
81
				if (context == null || type == null) continue;
82

    
83
				Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org");
84
				Boolean isDataset = type.equalsIgnoreCase("dataset");
85

    
86
				if (!isSchemaOrgContext || !isDataset) continue;
87

    
88
				log.debug(String.format("discovered dataset document: %s", schemaItem.toString()));
89

    
90
				datasetDocument = schemaItem;
91
				break;
92
			}
93
		}catch(Exception ex){
94
			log.error("problem extracting dataset document. returning empty", ex);
95
			datasetDocument = null;
96
		}
97
		return datasetDocument;
98
	}
99
}
(3-3/11)