Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

    
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.function.Consumer;
18

    
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
24

    
25
    private final ArrayList<String> urls = new ArrayList<>();
26
    private final ArrayList<String> metas = new ArrayList<String>();
27
    private String filter;
28

    
29
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
30

    
31
        urls.add(startUrl);
32
        this.filter = filter;
33
    }
34

    
35
    private boolean containsFilter(String meta){
36
        if (filter == null || filter.isEmpty())
37
            return false;
38
        String[] filter = this.filter.split(";");
39
        for(String item:filter){
40
            if (meta.contains(item))
41
                return true;
42
        }
43
        return false;
44
    }
45

    
46
    private String addFilePath(String meta,String url, boolean isJson){
47
        String path = url.replace("metadata", "pdf");
48

    
49
        try {
50
            if(isJson)
51
                meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
52
            else{
53

    
54
                    if (meta.trim().startsWith("<!DOCTYPE"))
55
                        meta = meta.substring(meta.indexOf(">")+1);
56
                    int index = meta.lastIndexOf("</");
57
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
58

    
59

    
60
            }
61

    
62
        }catch(Exception ex){
63
            log.info("not file with extension .json or .xml");
64
        }
65

    
66
       //JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}");
67
        if(isJson){
68
            JSONObject jsonobj = new JSONObject("{'resource':" + meta + "}");
69
            return XML.toString(jsonobj);
70
        }
71
        return meta;
72
    }
73

    
74
    private void recurFolder(String text, String url){
75
        Document doc = Jsoup.parse(text);
76
        Elements links = doc.select("a");
77
        for(Element e:links){
78
            if (!e.text().equals("../")){
79
                String file = e.attr("href");
80
                if(file.endsWith(".json") || file.endsWith(".xml"))
81
                    metas.add(url+file);
82
                else
83
                    urls.add(url+file);
84
            }
85
        }
86
    }
87

    
88

    
89
    @Override
90
    public Iterator<String> iterator() {
91
        final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
92

    
93

    
94
        return new Iterator<String>(){
95

    
96
            public void fillQueue() {
97
                Connector c = new Connector();
98
                String url;
99
                while((metas.size()>0 || urls.size() > 0 ) && queue.size()<100){
100
                    if (metas.size() > 0){
101
                        url = metas.remove(0);
102
                        try {
103
                            c.get(url);
104
                        } catch (CollectorServiceException e) {
105
                            log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
106
                        }
107
                        if(c.isStatusOk()){
108
                            try {
109
                                String ret = c.getResponse();
110
                                if (ret != null && ret.length()>0 && !containsFilter(ret))
111
                                    queue.put(addFilePath(ret,url,url.endsWith(".json")));
112
                            } catch (InterruptedException e) {
113
                                log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
114

    
115
                            }
116
                        }
117
                    }else{
118
                        url = urls.remove(0);
119
                        try {
120
                            c.get(url);
121
                        } catch (CollectorServiceException e) {
122
                            log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
123
                        }
124
                        if(c.isStatusOk()){
125
                            if (c.responseTypeContains("text/html")){
126
                                recurFolder(c.getResponse(),url);
127
                            }
128
                            else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
129
                                try {
130
                                    queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json")));
131
                                } catch (InterruptedException e) {
132
                                    log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
133
                                }
134
                            }
135
                        }
136

    
137
                    }
138

    
139
                }
140

    
141
            }
142
            @Override
143
            public boolean hasNext() {
144
                if (queue.isEmpty()){
145
                    fillQueue();
146
                }
147
                return (!queue.isEmpty());
148
            }
149

    
150
            @Override
151
            public String next() {
152
                return queue.poll(); 
153
            }
154

    
155
            @Override
156
            public void remove() {
157

    
158
            }
159

    
160
            @Override
161
            public void forEachRemaining(Consumer<? super String> action) {
162

    
163
            }
164
        };
165
    }
166

    
167

    
168
}
(3-3/4)