Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

    
3
import java.util.ArrayList;
4
import java.util.Iterator;
5
import java.util.NoSuchElementException;
6
import java.util.Objects;
7
import java.util.concurrent.ArrayBlockingQueue;
8
import java.util.concurrent.TimeUnit;
9

    
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.json.JSONObject;
14
import org.json.XML;
15
import org.jsoup.Jsoup;
16
import org.jsoup.nodes.Document;
17
import org.jsoup.nodes.Element;
18
import org.jsoup.select.Elements;
19

    
20
/**
21
 * Created by miriam on 04/05/2018.
22
 */
23
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
24

    
25
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
26
    private static final String TERMINATOR = "FINITO";
27
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
28
    public static final String APP_JSON = "application/json";
29
    public static final String APP_XML = "application/xml";
30
    public static final String TEXT_HTML = "text/html";
31
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
32

    
33
    private long waitTime = 60L;
34

    
35
    private final ArrayList<String> urls = new ArrayList<>();
36
    private final ArrayList<String> metas = new ArrayList<String>();
37
    private String filterParam;
38

    
39
    int total = 0;
40
    int filtered = 0;
41

    
42
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
43
        if (!startUrl.isEmpty())
44
            urls.add(startUrl);
45
        this.filterParam = filter;
46
        Thread ft = new Thread(new FillMetaQueue());
47
        ft.start();
48
    }
49

    
50

    
51
    @Override
52
    public Iterator<String> iterator() {
53
        return new Iterator<String>(){
54

    
55
            private String last = null;
56
            private boolean exec_next = true;
57

    
58
            @Override
59
            public boolean hasNext() {
60
                if(exec_next){
61
                    try {
62
                        last = queue.poll(waitTime, TimeUnit.SECONDS);
63
                        exec_next = false;
64
                    }catch(InterruptedException e){
65
                        log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
66
                        throw new NoSuchElementException(e.getMessage());
67
                    }
68
                }
69

    
70
                return !(Objects.equals(last, TERMINATOR) || Objects.equals(last,null));
71
            }
72

    
73
            @Override
74
            public String next() {
75
                exec_next = true;
76
                return last;
77
            }
78

    
79
//            @Override
80
//            public boolean hasNext() {
81
//
82
//                return !Objects.equals(last, TERMINATOR);
83
//            }
84
//
85
//            @Override
86
//            public String next() {
87
//                try {
88
//                    last = queue.poll(waitTime, TimeUnit.SECONDS);
89
//                    if (Objects.equals(last, TERMINATOR)) {
90
//                        log.info("found terminator, omg!");
91
//                    }
92
//                } catch (InterruptedException e) {
93
//                    log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
94
//                    throw new NoSuchElementException(e.getMessage());
95
//                }
96
//                return last;
97
//            }
98

    
99
        };
100
    }
101

    
102
    private class FillMetaQueue implements Runnable {
103

    
104
        final Connector c = new Connector();
105

    
106
        public void fillQueue() {
107
            String url;
108
            while((metas.size()>0 || urls.size() > 0 )) {
109
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
110
                if (metas.size() > 0) {
111
                    url = metas.remove(0);
112
                    try {
113
                        c.get(url);
114
                    } catch (CollectorServiceException e) {
115
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
116
                    }
117
                    if(c.isStatusOk()){
118
                        try {
119
                            String ret = c.getResponse();
120
                            if (ret != null && ret.length()>0) {
121
                                if (!containsFilter(ret))
122
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS);
123
                                else
124
                                    filtered++;
125
                                total++;
126
                            }
127
                        } catch (InterruptedException e) {
128
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
129

    
130
                        }
131
                    }
132
                } else {
133
                    url = urls.remove(0);
134
                    try {
135
                        c.get(url);
136
                    } catch (CollectorServiceException e) {
137
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
138
                    }
139
                    if(c.isStatusOk()) {
140
                        if (c.responseTypeContains(TEXT_HTML)){
141
                            recurFolder(c.getResponse(), url);
142
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
143
                            try {
144
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
145
                                queue.offer(element, waitTime, TimeUnit.SECONDS);
146
                            } catch (InterruptedException e) {
147
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
148
                            }
149
                        }
150
                    }
151
                }
152

    
153
            }
154
            try {
155
                queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS);
156
            } catch (InterruptedException e) {
157
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e);
158
            }
159

    
160
        }
161

    
162
        private boolean containsFilter(String meta){
163
            if (filterParam == null || filterParam.isEmpty())
164
                return false;
165
            String[] filter = filterParam.split(";");
166
            for(String item:filter){
167
                if (meta.contains(item))
168
                    return true;
169
            }
170
            return false;
171
        }
172

    
173
        private String addFilePath(String meta, String url, boolean isJson){
174
            String path = url.replace("metadata", "pdf");
175

    
176
            try {
177
                if(isJson)
178
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
179
                else {
180

    
181
                    if (meta.contains("<!DOCTYPE")) {
182
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
183
                        meta = meta.substring(meta.indexOf(">") + 1);
184
                    }
185
                    int index = meta.lastIndexOf("</");
186
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
187
                }
188
            } catch(Exception ex) {
189
                log.info("not file with extension .json or .xml");
190
            }
191

    
192

    
193
            if(isJson) {
194
                try {
195
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
196
                } catch(Exception e) {
197
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
198
                   // throw new RuntimeException();
199
                    final String junk = String.format(JUNK, url);
200
                    log.warn("returning " + junk);
201
                    return junk;
202
                }
203
            }
204
            return meta;
205
        }
206

    
207
        private void recurFolder(String text, String url){
208
            Document doc = Jsoup.parse(text);
209
            Elements links = doc.select("a");
210
            for(Element e:links){
211
                if (!e.text().equals("../")){
212
                    String file = e.attr("href");
213
                    if(file.endsWith(".json") || file.endsWith(".xml"))
214
                        metas.add(url+file);
215
                    else
216
                        urls.add(url+file);
217
                }
218
            }
219
        }
220

    
221

    
222
        @Override
223
        public void run() {
224
            fillQueue();
225
        }
226
    }
227

    
228
}
(3-3/4)