Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

    
3
import java.util.ArrayList;
4
import java.util.Iterator;
5
import java.util.Objects;
6
import java.util.concurrent.ArrayBlockingQueue;
7
import java.util.concurrent.TimeUnit;
8

    
9
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12
import org.json.JSONObject;
13
import org.json.XML;
14
import org.jsoup.Jsoup;
15
import org.jsoup.nodes.Document;
16
import org.jsoup.nodes.Element;
17
import org.jsoup.select.Elements;
18

    
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
24
    private static final String TERMINATOR = "FINITO";
25
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
26
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
27

    
28
    private long waitTime = 60L;
29

    
30
    private final ArrayList<String> urls = new ArrayList<>();
31
    private final ArrayList<String> metas = new ArrayList<String>();
32
    private String filterParam;
33

    
34
    int total = 0;
35
    int filtered = 0;
36

    
37
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
38
        if (!startUrl.isEmpty())
39
            urls.add(startUrl);
40
        this.filterParam = filter;
41
        Thread ft = new Thread(new FillMetaQueue());
42
        ft.start();
43
    }
44

    
45

    
46
    @Override
47
    public Iterator<String> iterator() {
48
        return new Iterator<String>(){
49

    
50
            private String current;
51

    
52
            @Override
53
            public boolean hasNext() {
54
                try {
55
                    current = queue.poll(waitTime, TimeUnit.SECONDS);
56
                } catch (InterruptedException e) {
57
                    log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
58
                    return false;
59
                }
60
                return !Objects.equals(current, TERMINATOR);
61
            }
62

    
63
            @Override
64
            public String next() {
65
                return current;
66
            }
67

    
68
        };
69
    }
70

    
71
    private class FillMetaQueue implements Runnable {
72

    
73
        final Connector c = new Connector();
74

    
75
        public void fillQueue() {
76
            String url;
77
            while((metas.size()>0 || urls.size() > 0 )) {
78
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
79
                if (metas.size() > 0) {
80
                    url = metas.remove(0);
81
                    try {
82
                        c.get(url);
83
                    } catch (CollectorServiceException e) {
84
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
85
                    }
86
                    if(c.isStatusOk()){
87
                        try {
88
                            String ret = c.getResponse();
89
                            if (ret != null && ret.length()>0) {
90
                                if (!containsFilter(ret))
91
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS);
92
                                else
93
                                    filtered++;
94
                                total++;
95
                            }
96
                        } catch (InterruptedException e) {
97
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
98

    
99
                        }
100
                    }
101
                } else {
102
                    url = urls.remove(0);
103
                    try {
104
                        c.get(url);
105
                    } catch (CollectorServiceException e) {
106
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
107
                    }
108
                    if(c.isStatusOk()) {
109
                        if (c.responseTypeContains("text/html")){
110
                            recurFolder(c.getResponse(), url);
111
                        }
112
                        else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
113
                            try {
114
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains("application/json"));
115
                                queue.offer(element, waitTime, TimeUnit.SECONDS);
116
                            } catch (InterruptedException e) {
117
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
118
                            }
119
                        }
120
                    }
121

    
122
                }
123

    
124
            }
125
            try {
126
                queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS);
127
            } catch (InterruptedException e) {
128
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e);
129
            }
130

    
131
        }
132

    
133
        private boolean containsFilter(String meta){
134
            if (filterParam == null || filterParam.isEmpty())
135
                return false;
136
            String[] filter = filterParam.split(";");
137
            for(String item:filter){
138
                if (meta.contains(item))
139
                    return true;
140
            }
141
            return false;
142
        }
143

    
144
        private String addFilePath(String meta, String url, boolean isJson){
145
            String path = url.replace("metadata", "pdf");
146

    
147
            try {
148
                if(isJson)
149
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
150
                else {
151

    
152
                    if (meta.contains("<!DOCTYPE")) {
153
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
154
                        meta = meta.substring(meta.indexOf(">") + 1);
155
                    }
156
                    int index = meta.lastIndexOf("</");
157
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
158
                }
159
            } catch(Exception ex) {
160
                log.info("not file with extension .json or .xml");
161
            }
162

    
163

    
164
            if(isJson) {
165
                try {
166
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
167
                } catch(Exception e) {
168
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
169
                   // throw new RuntimeException();
170
                    final String junk = String.format(JUNK, url);
171
                    log.warn("returning " + junk);
172
                    return junk;
173
                }
174
            }
175
            return meta;
176
        }
177

    
178
        private void recurFolder(String text, String url){
179
            Document doc = Jsoup.parse(text);
180
            Elements links = doc.select("a");
181
            for(Element e:links){
182
                if (!e.text().equals("../")){
183
                    String file = e.attr("href");
184
                    if(file.endsWith(".json") || file.endsWith(".xml"))
185
                        metas.add(url+file);
186
                    else
187
                        urls.add(url+file);
188
                }
189
            }
190
        }
191

    
192

    
193
        @Override
194
        public void run() {
195
            fillQueue();
196
        }
197
    }
198

    
199
}
(3-3/4)