Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

    
3
import eu.dnetlib.data.collector.plugins.projects.gtr2.Gtr2ProjectsIterable;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.json.JSONObject;
8
import org.json.XML;
9
import org.jsoup.Jsoup;
10
import org.jsoup.nodes.Document;
11
import org.jsoup.nodes.Element;
12
import org.jsoup.select.Elements;
13

    
14
import java.util.ArrayList;
15
import java.util.Iterator;
16

    
17
import java.util.concurrent.ArrayBlockingQueue;
18
import java.util.function.Consumer;
19

    
20
/**
21
 * Created by miriam on 04/05/2018.
22
 */
23
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
24
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
25
    private final String TERMINATOR = "FINITO";
26
    final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
27

    
28
    private final ArrayList<String> urls = new ArrayList<>();
29
    private final ArrayList<String> metas = new ArrayList<String>();
30
    private String filterParam;
31

    
32
    int total = 0;
33
    int filtered = 0;
34

    
35
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
36
        if (!startUrl.isEmpty())
37
            urls.add(startUrl);
38
        this.filterParam = filter;
39
        Thread ft = new Thread(new FillMetaQueue());
40
        ft.start();
41
    }
42

    
43

    
44
    @Override
45
    public Iterator<String> iterator() {
46

    
47

    
48

    
49
        return new Iterator<String>(){
50

    
51
            @Override
52
            public boolean hasNext() {
53
                while (queue.isEmpty());
54
                if (queue.peek().equals(TERMINATOR))
55
                    log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered));
56
                return !queue.peek().equals(TERMINATOR);
57

    
58
            }
59

    
60
            @Override
61
            public String next() {
62
                return queue.poll(); 
63
            }
64

    
65
        };
66
    }
67

    
68
    private class FillMetaQueue implements Runnable{
69

    
70

    
71
        Connector c = new Connector();
72

    
73
        public void fillQueue() {
74
            String url;
75
            while((metas.size()>0 || urls.size() > 0 )){
76
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
77
                if (metas.size() > 0){
78
                    url = metas.remove(0);
79
                    try {
80
                        c.get(url);
81
                    } catch (CollectorServiceException e) {
82
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
83
                    }
84
                    if(c.isStatusOk()){
85
                        try {
86
                            String ret = c.getResponse();
87
                            if (ret != null && ret.length()>0) {
88
                                if (!containsFilter(ret))
89
                                    queue.put(addFilePath(ret, url, url.endsWith(".json")));
90
                                else
91
                                    filtered++;
92
                                total++;
93
                            }
94
                        } catch (InterruptedException e) {
95
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
96

    
97

    
98
                        }
99
                    }
100
                }else{
101
                    url = urls.remove(0);
102
                    try {
103
                        c.get(url);
104
                    } catch (CollectorServiceException e) {
105
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
106
                    }
107
                    if(c.isStatusOk()){
108
                        if (c.responseTypeContains("text/html")){
109
                            recurFolder(c.getResponse(),url);
110
                        }
111
                        else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
112
                            try {
113
                                queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json")));
114
                            } catch (InterruptedException e) {
115
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
116
                            }
117
                        }
118
                    }
119

    
120
                }
121

    
122
            }
123
            try {
124
                queue.put(TERMINATOR);
125
            } catch (InterruptedException e) {
126
                e.printStackTrace();
127
            }
128

    
129
        }
130

    
131
        private boolean containsFilter(String meta){
132
            if (filterParam == null || filterParam.isEmpty())
133
                return false;
134
            String[] filter = filterParam.split(";");
135
            for(String item:filter){
136
                if (meta.contains(item))
137
                    return true;
138
            }
139
            return false;
140
        }
141

    
142
        private String addFilePath(String meta,String url, boolean isJson){
143
            String path = url.replace("metadata", "pdf");
144

    
145
            try {
146
                if(isJson)
147
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
148
                else{
149

    
150
                    if (meta.contains("<!DOCTYPE")) {
151
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
152
                        meta = meta.substring(meta.indexOf(">") + 1);
153
                    }
154
                    int index = meta.lastIndexOf("</");
155
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
156

    
157

    
158
                }
159

    
160
            }catch(Exception ex){
161
                log.info("not file with extension .json or .xml");
162
            }
163

    
164

    
165
            if(isJson){
166
                JSONObject jsonobj = null;
167
                try {
168
                    jsonobj = new JSONObject("{'resource':" + meta + "}");
169

    
170
                    return XML.toString(jsonobj);
171
                }catch(Exception e){
172
                    log.fatal("Impossible to transform json object to xml \n" + jsonobj + "\n " + e.getMessage() + "\n" + url);
173
                   // throw new RuntimeException();
174
                }
175
            }
176
            return meta;
177
        }
178

    
179
        private void recurFolder(String text, String url){
180
            Document doc = Jsoup.parse(text);
181
            Elements links = doc.select("a");
182
            for(Element e:links){
183
                if (!e.text().equals("../")){
184
                    String file = e.attr("href");
185
                    if(file.endsWith(".json") || file.endsWith(".xml"))
186
                        metas.add(url+file);
187
                    else
188
                        urls.add(url+file);
189
                }
190
            }
191
        }
192

    
193

    
194
        @Override
195
        public void run() {
196
            fillQueue();
197
        }
198
    }
199

    
200

    
201

    
202
}
(3-3/4)