Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

    
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.Iterator;
import java.util.NoSuchElementException;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.function.Consumer;
18

    
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
24

    
25
    private final ArrayList<String> urls = new ArrayList<>();
26
    private final ArrayList<String> metas = new ArrayList<String>();
27
    private String filter;
28

    
29
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
30

    
31
        urls.add(startUrl);
32
        this.filter = filter;
33
    }
34

    
35
    private boolean containsFilter(String meta){
36
        if (filter == null || filter.isEmpty())
37
            return false;
38
        String[] filter = this.filter.split(";");
39
        for(String item:filter){
40
            if (meta.contains(item))
41
                return true;
42
        }
43
        return false;
44
    }
45

    
46
    private String addFilePath(String meta,String url, boolean isJson){
47
        String path = url.replace("metadata", "pdf");
48

    
49
        try {
50
            if(isJson)
51
                meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
52
            else{
53

    
54
                    if (meta.contains("<!DOCTYPE")) {
55
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
56
                        meta = meta.substring(meta.indexOf(">") + 1);
57
                    }
58
                    int index = meta.lastIndexOf("</");
59
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
60

    
61

    
62
            }
63

    
64
        }catch(Exception ex){
65
            log.info("not file with extension .json or .xml");
66
        }
67

    
68

    
69
        if(isJson){
70
            JSONObject jsonobj = null;
71
            try {
72
                jsonobj = new JSONObject("{'resource':" + meta + "}");
73

    
74
                return XML.toString(jsonobj);
75
            }catch(Exception e){
76
                log.fatal("Impossible to transform json object to xml \n" + jsonobj + "\n " + e.getMessage() + "\n" + url);
77
                throw new RuntimeException();
78
            }
79
        }
80
        return meta;
81
    }
82

    
83
    private void recurFolder(String text, String url){
84
        Document doc = Jsoup.parse(text);
85
        Elements links = doc.select("a");
86
        for(Element e:links){
87
            if (!e.text().equals("../")){
88
                String file = e.attr("href");
89
                if(file.endsWith(".json") || file.endsWith(".xml"))
90
                    metas.add(url+file);
91
                else
92
                    urls.add(url+file);
93
            }
94
        }
95
    }
96

    
97

    
98
    @Override
99
    public Iterator<String> iterator() {
100
        final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
101

    
102

    
103
        return new Iterator<String>(){
104
            int total = 0;
105
            int filtered = 0;
106
            public void fillQueue() {
107
                Connector c = new Connector();
108
                String url;
109
                while((metas.size()>0 || urls.size() > 0 ) && queue.size()<100){
110
                    log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
111
                    if (metas.size() > 0){
112
                        url = metas.remove(0);
113
                        try {
114
                            c.get(url);
115
                        } catch (CollectorServiceException e) {
116
                            log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
117
                        }
118
                        if(c.isStatusOk()){
119
                            try {
120
                                String ret = c.getResponse();
121
                                if (ret != null && ret.length()>0) {
122
                                    if (!containsFilter(ret))
123
                                        queue.put(addFilePath(ret, url, url.endsWith(".json")));
124
                                    else
125
                                        filtered++;
126
                                    total++;
127
                                }
128
                            } catch (InterruptedException e) {
129
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
130

    
131

    
132
                            }
133
                        }
134
                    }else{
135
                        url = urls.remove(0);
136
                        try {
137
                            c.get(url);
138
                        } catch (CollectorServiceException e) {
139
                            log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
140
                        }
141
                        if(c.isStatusOk()){
142
                            if (c.responseTypeContains("text/html")){
143
                                recurFolder(c.getResponse(),url);
144
                            }
145
                            else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
146
                                try {
147
                                    queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json")));
148
                                } catch (InterruptedException e) {
149
                                    log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
150
                                }
151
                            }
152
                        }
153

    
154
                    }
155

    
156
                }
157

    
158
            }
159
            @Override
160
            public boolean hasNext() {
161
                if (queue.isEmpty()){
162
                    fillQueue();
163
                }
164
                if(queue.isEmpty()){
165
                    log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered));
166
                    return false;
167
                }
168

    
169
                return true;
170
            }
171

    
172
            @Override
173
            public String next() {
174
                return queue.poll(); 
175
            }
176

    
177
//            @Override
178
//            public void remove() {
179
//
180
//            }
181
//
182
//            @Override
183
//            public void forEachRemaining(Consumer<? super String> action) {
184
//
185
//            }
186
        };
187
    }
188

    
189

    
190
}
(3-3/4)