Project

General

Profile

1 51970 miriam.bag
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2 51956 miriam.bag
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.json.JSONObject;
7
import org.json.XML;
8
import org.jsoup.Jsoup;
9
import org.jsoup.nodes.Document;
10
import org.jsoup.nodes.Element;
11
import org.jsoup.select.Elements;
12
13
import java.util.ArrayList;
14
import java.util.Iterator;
15
16
import java.util.concurrent.ArrayBlockingQueue;
17
import java.util.function.Consumer;
18
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22 51970 miriam.bag
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
24
25 51956 miriam.bag
    private final ArrayList<String> urls = new ArrayList<>();
26 52026 miriam.bag
    private final ArrayList<String> metas = new ArrayList<String>();
27 52054 miriam.bag
    private String filter;
28 51956 miriam.bag
29 52054 miriam.bag
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
30 51956 miriam.bag
31
        urls.add(startUrl);
32 52054 miriam.bag
        this.filter = filter;
33 51956 miriam.bag
    }
34
35 52054 miriam.bag
    private boolean containsFilter(String meta){
36
        if (filter == null || filter.isEmpty())
37
            return false;
38
        String[] filter = this.filter.split(";");
39
        for(String item:filter){
40
            if (meta.contains(item))
41
                return true;
42
        }
43
        return false;
44
    }
45
46 52026 miriam.bag
    private String addFilePath(String meta,String url, boolean isJson){
47 51956 miriam.bag
        String path = url.replace("metadata", "pdf");
48 52054 miriam.bag
49 51956 miriam.bag
        try {
50 52026 miriam.bag
            if(isJson)
51
                meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
52 52031 miriam.bag
            else{
53 52054 miriam.bag
54 52056 miriam.bag
                    if (meta.trim().startsWith("<!DOCTYPE"))
55 52054 miriam.bag
                        meta = meta.substring(meta.indexOf(">")+1);
56
                    int index = meta.lastIndexOf("</");
57
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
58
59
60 52031 miriam.bag
            }
61
62 51956 miriam.bag
        }catch(Exception ex){
63 52026 miriam.bag
            log.info("not file with extension .json or .xml");
64 51956 miriam.bag
        }
65
66 52026 miriam.bag
       //JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}");
67
        if(isJson){
68 52034 miriam.bag
            JSONObject jsonobj = new JSONObject("{'resource':" + meta + "}");
69 52026 miriam.bag
            return XML.toString(jsonobj);
70
        }
71
        return meta;
72 51956 miriam.bag
    }
73
74
    private void recurFolder(String text, String url){
75
        Document doc = Jsoup.parse(text);
76
        Elements links = doc.select("a");
77
        for(Element e:links){
78
            if (!e.text().equals("../")){
79
                String file = e.attr("href");
80 52026 miriam.bag
                if(file.endsWith(".json") || file.endsWith(".xml"))
81
                    metas.add(url+file);
82 51956 miriam.bag
                else
83
                    urls.add(url+file);
84
            }
85
        }
86
    }
87
88
89 51970 miriam.bag
    @Override
90
    public Iterator<String> iterator() {
91
        final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
92
93
94
        return new Iterator<String>(){
95
96
            public void fillQueue() {
97
                Connector c = new Connector();
98
                String url;
99 52026 miriam.bag
                while((metas.size()>0 || urls.size() > 0 ) && queue.size()<100){
100
                    if (metas.size() > 0){
101
                        url = metas.remove(0);
102 51956 miriam.bag
                        try {
103 51970 miriam.bag
                            c.get(url);
104
                        } catch (CollectorServiceException e) {
105
                            log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
106 51956 miriam.bag
                        }
107 51970 miriam.bag
                        if(c.isStatusOk()){
108
                            try {
109
                                String ret = c.getResponse();
110 52054 miriam.bag
                                if (ret != null && ret.length()>0 && !containsFilter(ret))
111 52026 miriam.bag
                                    queue.put(addFilePath(ret,url,url.endsWith(".json")));
112 51970 miriam.bag
                            } catch (InterruptedException e) {
113
                                log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
114
115
                            }
116
                        }
117
                    }else{
118
                        url = urls.remove(0);
119
                        try {
120
                            c.get(url);
121
                        } catch (CollectorServiceException e) {
122
                            log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
123
                        }
124
                        if(c.isStatusOk()){
125
                            if (c.responseTypeContains("text/html")){
126
                                recurFolder(c.getResponse(),url);
127
                            }
128 52026 miriam.bag
                            else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
129 51970 miriam.bag
                                try {
130 52026 miriam.bag
                                    queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json")));
131 51970 miriam.bag
                                } catch (InterruptedException e) {
132
                                    log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
133
                                }
134
                            }
135
                        }
136
137 51956 miriam.bag
                    }
138 51970 miriam.bag
139 51956 miriam.bag
                }
140
141
            }
142
            @Override
143
            public boolean hasNext() {
144
                if (queue.isEmpty()){
145
                    fillQueue();
146
                }
147
                return (!queue.isEmpty());
148
            }
149
150
            @Override
151
            public String next() {
152
                return queue.poll();
153
            }
154
155
            @Override
156
            public void remove() {
157
158
            }
159
160
            @Override
161
            public void forEachRemaining(Consumer<? super String> action) {
162
163
            }
164
        };
165
    }
166
167
168
}