Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.schemaorg;
2

    
3
import org.apache.commons.io.IOUtils;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.json.JSONObject;
7
import org.jsoup.Jsoup;
8
import org.jsoup.nodes.Document;
9
import org.jsoup.nodes.Element;
10
import org.jsoup.select.Elements;
11

    
12
import java.net.URL;
13
import java.nio.charset.Charset;
14
import java.util.Iterator;
15

    
16
public class EndpointAccessIterator implements Iterator<JSONObject> {
17
	private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
18

    
19
	public static class Options {
20

    
21
		private Charset charset;
22

    
23
		public Options(){}
24

    
25
		public Options(Charset charset) {
26
			this.charset = charset;
27
		}
28

    
29
		public Charset getCharset() {
30
			return charset;
31
		}
32

    
33
		public void setCharset(Charset charset) {
34
			this.charset = charset;
35
		}
36
	}
37

    
38
	private Options options;
39
	private Iterator<String> repositoryIterator;
40

    
41
	public EndpointAccessIterator(Options options, Iterator<String> repositoryIterator) {
42
		this.options = options;
43
		this.repositoryIterator = repositoryIterator;
44
	}
45

    
46
	@Override
47
	public boolean hasNext() {
48
		return this.repositoryIterator.hasNext();
49
	}
50

    
51
	@Override
52
	public JSONObject next() {
53
		String endpoint = this.repositoryIterator.next();
54
		if(endpoint == null) return null;
55

    
56
		log.debug(String.format("processing: %s", endpoint));
57

    
58
		JSONObject dataset = this.extractDatasetRecord(endpoint);
59

    
60
		return dataset;
61
	}
62

    
63
	private JSONObject extractDatasetRecord(String endpoint) {
64
		JSONObject datasetDocument = null;
65
		try {
66
			URL urlEndpoint = new URL(endpoint);
67
			log.debug("downloading endpoint "+urlEndpoint);
68
			String payload = Utils.RemoteAccessWithRetry(3, 5000, urlEndpoint, this.options.getCharset());
69

    
70
			log.trace("downloaded payload id: "+payload);
71
			Document doc = Jsoup.parse(payload);
72
			Elements scriptTags = doc.getElementsByTag("script");
73
			for (Element scriptTag : scriptTags) {
74
				if (!scriptTag.hasAttr("type")) continue;
75
				String scriptType = scriptTag.attr("type");
76
				if (!scriptType.equalsIgnoreCase("application/ld+json")) continue;
77

    
78
				String data = scriptTag.data();
79
				JSONObject schemaItem = new JSONObject(data);
80
				String context = schemaItem.optString("@context");
81
				String type = schemaItem.optString("@type");
82

    
83
				if (context == null || type == null) continue;
84

    
85
				Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org");
86
				Boolean isDataset = type.equalsIgnoreCase("dataset");
87

    
88
				if (!isSchemaOrgContext || !isDataset) continue;
89

    
90
				log.debug(String.format("discovered dataset document: %s", schemaItem.toString()));
91

    
92
				datasetDocument = schemaItem;
93
				break;
94
			}
95
		}catch(Exception ex){
96
			log.error("problem extracting dataset document. returning empty", ex);
97
			datasetDocument = null;
98
		}
99
		if(datasetDocument == null){
100
			log.debug("did not find any dataset document in endpoint");
101
		}
102
		else{
103
			log.debug("found dataset document in endpoint :"+datasetDocument.toString());
104
		}
105
		return datasetDocument;
106
	}
107
}
(3-3/11)