Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

    
3
import java.util.ArrayList;
4
import java.util.Iterator;
5
import java.util.Objects;
6
import java.util.concurrent.ArrayBlockingQueue;
7
import java.util.concurrent.TimeUnit;
8

    
9
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12
import org.json.JSONObject;
13
import org.json.XML;
14
import org.jsoup.Jsoup;
15
import org.jsoup.nodes.Document;
16
import org.jsoup.nodes.Element;
17
import org.jsoup.select.Elements;
18

    
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
24
    private static final String TERMINATOR = "FINITO";
25
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
26

    
27
    private long waitTime = 60L;
28

    
29
    private final ArrayList<String> urls = new ArrayList<>();
30
    private final ArrayList<String> metas = new ArrayList<String>();
31
    private String filterParam;
32

    
33
    int total = 0;
34
    int filtered = 0;
35

    
36
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
37
        if (!startUrl.isEmpty())
38
            urls.add(startUrl);
39
        this.filterParam = filter;
40
        Thread ft = new Thread(new FillMetaQueue());
41
        ft.start();
42
    }
43

    
44

    
45
    @Override
46
    public Iterator<String> iterator() {
47
        return new Iterator<String>(){
48

    
49
            private String current;
50

    
51
            @Override
52
            public boolean hasNext() {
53
                try {
54
                    current = queue.poll(waitTime, TimeUnit.SECONDS);
55
                } catch (InterruptedException e) {
56
                    log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
57
                    return false;
58
                }
59
                return !Objects.equals(current, TERMINATOR);
60
            }
61

    
62
            @Override
63
            public String next() {
64
                return current;
65
            }
66

    
67
        };
68
    }
69

    
70
    private class FillMetaQueue implements Runnable {
71

    
72
        final Connector c = new Connector();
73

    
74
        public void fillQueue() {
75
            String url;
76
            while((metas.size()>0 || urls.size() > 0 )) {
77
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
78
                if (metas.size() > 0) {
79
                    url = metas.remove(0);
80
                    try {
81
                        c.get(url);
82
                    } catch (CollectorServiceException e) {
83
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
84
                    }
85
                    if(c.isStatusOk()){
86
                        try {
87
                            String ret = c.getResponse();
88
                            if (ret != null && ret.length()>0) {
89
                                if (!containsFilter(ret))
90
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS);
91
                                else
92
                                    filtered++;
93
                                total++;
94
                            }
95
                        } catch (InterruptedException e) {
96
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
97

    
98
                        }
99
                    }
100
                } else {
101
                    url = urls.remove(0);
102
                    try {
103
                        c.get(url);
104
                    } catch (CollectorServiceException e) {
105
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
106
                    }
107
                    if(c.isStatusOk()) {
108
                        if (c.responseTypeContains("text/html")){
109
                            recurFolder(c.getResponse(), url);
110
                        }
111
                        else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
112
                            try {
113
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains("application/json"));
114
                                queue.offer(element, waitTime, TimeUnit.SECONDS);
115
                            } catch (InterruptedException e) {
116
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
117
                            }
118
                        }
119
                    }
120

    
121
                }
122

    
123
            }
124
            try {
125
                queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS);
126
            } catch (InterruptedException e) {
127
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e);
128
            }
129

    
130
        }
131

    
132
        private boolean containsFilter(String meta){
133
            if (filterParam == null || filterParam.isEmpty())
134
                return false;
135
            String[] filter = filterParam.split(";");
136
            for(String item:filter){
137
                if (meta.contains(item))
138
                    return true;
139
            }
140
            return false;
141
        }
142

    
143
        private String addFilePath(String meta,String url, boolean isJson){
144
            String path = url.replace("metadata", "pdf");
145

    
146
            try {
147
                if(isJson)
148
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
149
                else {
150

    
151
                    if (meta.contains("<!DOCTYPE")) {
152
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
153
                        meta = meta.substring(meta.indexOf(">") + 1);
154
                    }
155
                    int index = meta.lastIndexOf("</");
156
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
157
                }
158
            } catch(Exception ex) {
159
                log.info("not file with extension .json or .xml");
160
            }
161

    
162

    
163
            if(isJson) {
164
                JSONObject jsonobj = null;
165
                try {
166
                    jsonobj = new JSONObject("{'resource':" + meta + "}");
167

    
168
                    return XML.toString(jsonobj);
169
                } catch(Exception e) {
170
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
171
                   // throw new RuntimeException();
172
                    jsonobj = new JSONObject("{'resource':{'DOI':'JUNK','url':'"+path+"'}}");
173
                    meta = XML.toString(jsonobj);
174
                }
175
            }
176
            return meta;
177
        }
178

    
179
        private void recurFolder(String text, String url){
180
            Document doc = Jsoup.parse(text);
181
            Elements links = doc.select("a");
182
            for(Element e:links){
183
                if (!e.text().equals("../")){
184
                    String file = e.attr("href");
185
                    if(file.endsWith(".json") || file.endsWith(".xml"))
186
                        metas.add(url+file);
187
                    else
188
                        urls.add(url+file);
189
                }
190
            }
191
        }
192

    
193

    
194
        @Override
195
        public void run() {
196
            fillQueue();
197
        }
198
    }
199

    
200
}
(3-3/4)