Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

    
3
import java.util.*;
4
import java.util.concurrent.ArrayBlockingQueue;
5
import java.util.concurrent.TimeUnit;
6

    
7
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
8
import org.apache.commons.logging.Log;
9
import org.apache.commons.logging.LogFactory;
10
import org.json.JSONObject;
11
import org.json.XML;
12
import org.jsoup.Jsoup;
13
import org.jsoup.nodes.Document;
14
import org.jsoup.nodes.Element;
15
import org.jsoup.select.Elements;
16

    
17
/**
18
 * Created by miriam on 04/05/2018.
19
 */
20
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
21

    
22
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
23

    
24
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
25
    public static final String APP_JSON = "application/json";
26
    public static final String APP_XML = "application/xml";
27
    public static final String TEXT_HTML = "text/html";
28
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
29

    
30

    
31

    
32

    
33
    private String filterParam;
34

    
35
    int total = 0;
36
    int filtered = 0;
37

    
38
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
39

    
40
        this.filterParam = filter;
41
        Thread ft = new Thread(new FillMetaQueue(startUrl) );
42
        ft.start();
43
    }
44

    
45

    
46
    @Override
47
    public Iterator<String> iterator() {
48
        return new HttpWithFileNameCollectorIterator(queue);
49
    }
50

    
51
    private class FillMetaQueue implements Runnable {
52
        final Connector c = new Connector();
53

    
54
        private final List<String> metas = Collections.synchronizedList(new ArrayList<String>());
55
        private final List<String> urls = Collections.synchronizedList(new ArrayList<>());
56

    
57
        public FillMetaQueue(String startUrl){
58
            if(!startUrl.isEmpty()){
59
                urls.add(startUrl);
60
            }
61
        }
62

    
63

    
64
        public void fillQueue() {
65
            String url;
66

    
67
            while((metas.size()>0 || urls.size() > 0 )) {
68
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
69
                if (metas.size() > 0) {
70
                    url = metas.remove(0);
71
                    try {
72
                        c.get(url);
73
                    } catch (CollectorServiceException e) {
74
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
75
                    }
76
                    if(c.isStatusOk()){
77
                        try {
78
                            String ret = c.getResponse();
79
                            if (ret != null && ret.length()>0) {
80
                                if (!containsFilter(ret))
81
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
82
                                else
83
                                    filtered++;
84
                                total++;
85
                            }
86
                        } catch (InterruptedException e) {
87
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
88

    
89
                        }
90
                    }
91
                } else {
92
                    url = urls.remove(0);
93
                    try {
94
                        c.get(url);
95
                    } catch (CollectorServiceException e) {
96
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
97
                    }
98
                    if(c.isStatusOk()) {
99
                        if (c.responseTypeContains(TEXT_HTML)){
100
                            recurFolder(c.getResponse(), url);
101
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
102
                            try {
103
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
104
                                queue.offer(element, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
105
                            } catch (InterruptedException e) {
106
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
107
                            }
108
                        }
109
                    }
110
                }
111

    
112
            }
113
            try {
114
                queue.offer(HttpWithFileNameCollectorIterator.TERMINATOR, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
115
            } catch (InterruptedException e) {
116
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e);
117
            }
118

    
119
        }
120

    
121
        private boolean containsFilter(String meta){
122
            if (filterParam == null || filterParam.isEmpty())
123
                return false;
124
            String[] filter = filterParam.split(";");
125
            for(String item:filter){
126
                if (meta.contains(item))
127
                    return true;
128
            }
129
            return false;
130
        }
131

    
132
        private String addFilePath(String meta, String url, boolean isJson){
133
            String path = url.replace("metadata", "pdf");
134

    
135
            try {
136
                if(isJson)
137
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
138
                else {
139

    
140
                    if (meta.contains("<!DOCTYPE")) {
141
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
142
                        meta = meta.substring(meta.indexOf(">") + 1);
143
                    }
144
                    int index = meta.lastIndexOf("</");
145
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
146
                }
147
            } catch(Exception ex) {
148
                log.info("not file with extension .json or .xml");
149
            }
150

    
151

    
152
            if(isJson) {
153
                try {
154
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
155
                } catch(Exception e) {
156
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
157
                   // throw new RuntimeException();
158
                    final String junk = String.format(JUNK, url);
159
                    log.warn("returning " + junk);
160
                    return junk;
161
                }
162
            }
163
            return meta;
164
        }
165

    
166
        private void recurFolder(String text, String url){
167
            Document doc = Jsoup.parse(text);
168
            Elements links = doc.select("a");
169
            for(Element e:links){
170
                if (!e.text().equals("../")){
171
                    String file = e.attr("href");
172
                    if(file.endsWith(".json") || file.endsWith(".xml"))
173
                        metas.add(url+file);
174
                    else
175
                        urls.add(url+file);
176
                }
177
            }
178
        }
179

    
180

    
181
        @Override
182
        public void run() {
183
            fillQueue();
184
        }
185
    }
186

    
187
}
(3-3/5)