Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

    
3
import java.util.ArrayList;
4
import java.util.Iterator;
5
import java.util.NoSuchElementException;
6
import java.util.Objects;
7
import java.util.concurrent.ArrayBlockingQueue;
8
import java.util.concurrent.TimeUnit;
9

    
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.json.JSONObject;
14
import org.json.XML;
15
import org.jsoup.Jsoup;
16
import org.jsoup.nodes.Document;
17
import org.jsoup.nodes.Element;
18
import org.jsoup.select.Elements;
19

    
20
/**
21
 * Created by miriam on 04/05/2018.
22
 */
23
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
24

    
25
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
26
    private static final String TERMINATOR = "FINITO";
27
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
28
    public static final String APP_JSON = "application/json";
29
    public static final String APP_XML = "application/xml";
30
    public static final String TEXT_HTML = "text/html";
31
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
32

    
33
    private long waitTime = 60L;
34

    
35
    private final ArrayList<String> urls = new ArrayList<>();
36
    private final ArrayList<String> metas = new ArrayList<String>();
37
    private String filterParam;
38

    
39
    int total = 0;
40
    int filtered = 0;
41

    
42
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
43
        if (!startUrl.isEmpty())
44
            urls.add(startUrl);
45
        this.filterParam = filter;
46
        Thread ft = new Thread(new FillMetaQueue());
47
        ft.start();
48
    }
49

    
50

    
51
    @Override
52
    public Iterator<String> iterator() {
53
        return new Iterator<String>(){
54

    
55
            private String last = null;
56
            private boolean exec_next = true;
57

    
58
            @Override
59
            public boolean hasNext() {
60
                if(exec_next){
61
                    try {
62
                        last = queue.poll(waitTime, TimeUnit.SECONDS);
63
                        exec_next = false;
64
                    }catch(InterruptedException e){
65
                        log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
66
                        throw new NoSuchElementException(e.getMessage());
67
                    }
68
                }
69

    
70
                return !(Objects.equals(last, TERMINATOR));
71
            }
72

    
73
            @Override
74
            public String next() {
75
                exec_next = true;
76
                return last;
77
            }
78

    
79
        };
80
    }
81

    
82
    private class FillMetaQueue implements Runnable {
83

    
84
        final Connector c = new Connector();
85

    
86
        public void fillQueue() {
87
            String url;
88
            while((metas.size()>0 || urls.size() > 0 )) {
89
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
90
                if (metas.size() > 0) {
91
                    url = metas.remove(0);
92
                    try {
93
                        c.get(url);
94
                    } catch (CollectorServiceException e) {
95
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
96
                    }
97
                    if(c.isStatusOk()){
98
                        try {
99
                            String ret = c.getResponse();
100
                            if (ret != null && ret.length()>0) {
101
                                if (!containsFilter(ret))
102
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS);
103
                                else
104
                                    filtered++;
105
                                total++;
106
                            }
107
                        } catch (InterruptedException e) {
108
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
109

    
110
                        }
111
                    }
112
                } else {
113
                    url = urls.remove(0);
114
                    try {
115
                        c.get(url);
116
                    } catch (CollectorServiceException e) {
117
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
118
                    }
119
                    if(c.isStatusOk()) {
120
                        if (c.responseTypeContains(TEXT_HTML)){
121
                            recurFolder(c.getResponse(), url);
122
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
123
                            try {
124
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
125
                                queue.offer(element, waitTime, TimeUnit.SECONDS);
126
                            } catch (InterruptedException e) {
127
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
128
                            }
129
                        }
130
                    }
131
                }
132

    
133
            }
134
            try {
135
                queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS);
136
            } catch (InterruptedException e) {
137
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e);
138
            }
139

    
140
        }
141

    
142
        private boolean containsFilter(String meta){
143
            if (filterParam == null || filterParam.isEmpty())
144
                return false;
145
            String[] filter = filterParam.split(";");
146
            for(String item:filter){
147
                if (meta.contains(item))
148
                    return true;
149
            }
150
            return false;
151
        }
152

    
153
        private String addFilePath(String meta, String url, boolean isJson){
154
            String path = url.replace("metadata", "pdf");
155

    
156
            try {
157
                if(isJson)
158
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
159
                else {
160

    
161
                    if (meta.contains("<!DOCTYPE")) {
162
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
163
                        meta = meta.substring(meta.indexOf(">") + 1);
164
                    }
165
                    int index = meta.lastIndexOf("</");
166
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
167
                }
168
            } catch(Exception ex) {
169
                log.info("not file with extension .json or .xml");
170
            }
171

    
172

    
173
            if(isJson) {
174
                try {
175
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
176
                } catch(Exception e) {
177
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
178
                   // throw new RuntimeException();
179
                    final String junk = String.format(JUNK, url);
180
                    log.warn("returning " + junk);
181
                    return junk;
182
                }
183
            }
184
            return meta;
185
        }
186

    
187
        private void recurFolder(String text, String url){
188
            Document doc = Jsoup.parse(text);
189
            Elements links = doc.select("a");
190
            for(Element e:links){
191
                if (!e.text().equals("../")){
192
                    String file = e.attr("href");
193
                    if(file.endsWith(".json") || file.endsWith(".xml"))
194
                        metas.add(url+file);
195
                    else
196
                        urls.add(url+file);
197
                }
198
            }
199
        }
200

    
201

    
202
        @Override
203
        public void run() {
204
            fillQueue();
205
        }
206
    }
207

    
208
}
(3-3/4)