Project

General

Profile

1
package eu.dnetlib.data.collector.plugins.resourcesynck;
2

    
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.Iterator;
import java.util.NoSuchElementException;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.function.Consumer;
18

    
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22
public class RSCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(RSCollectorIterable.class);
24
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
25
    private final ArrayList<String> urls = new ArrayList<>();
26
    private final ArrayList<String> jsons = new ArrayList<String>();
27

    
28

    
29
    public RSCollectorIterable(String startUrl){
30

    
31
        urls.add(startUrl);
32
        fillQueue();
33
    }
34

    
35
    private String addFilePath(String json,String url){
36
        String path = url.replace("metadata", "pdf");
37
        try {
38
            json = json.substring(0, json.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
39
        }catch(Exception ex){
40
            log.info("not file with extension .json");
41
        }
42

    
43
        JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}");
44

    
45
        return XML.toString(jsonobj);
46
    }
47

    
48
    private void recurFolder(String text, String url){
49
        Document doc = Jsoup.parse(text);
50
        Elements links = doc.select("a");
51
        for(Element e:links){
52
            if (!e.text().equals("../")){
53
                String file = e.attr("href");
54
                if(file.endsWith(".json"))
55
                    jsons.add(url+file);
56
                else
57
                    urls.add(url+file);
58
            }
59
        }
60
    }
61

    
62
    private void fillQueue() {
63
        Connector c = new Connector();
64
        String url;
65
        while((jsons.size()>0 || urls.size() > 0 ) && queue.size()<100){
66
            if (jsons.size() > 0){
67
                url = jsons.remove(0);
68
                try {
69
                    c.get(url);
70
                } catch (CollectorServiceException e) {
71
                    log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
72
                }
73
                if(c.isStatusOk()){
74
                    try {
75
                        String ret = c.getResponse();
76
                        if (ret != null && ret.length()>0)
77
                            queue.put(addFilePath(ret,url));
78
                    } catch (InterruptedException e) {
79
                        log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
80

    
81
                    }
82
                }
83
            }else{
84
                url = urls.remove(0);
85
                try {
86
                    c.get(url);
87
                } catch (CollectorServiceException e) {
88
                    log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
89
                }
90
                if(c.isStatusOk()){
91
                    if (c.responseTypeContains("text/html")){
92
                        recurFolder(c.getResponse(),url);
93
                    }
94
                    else if(c.responseTypeContains("application/json")){
95
                        try {
96
                            queue.put(addFilePath(c.getResponse(),url));
97
                        } catch (InterruptedException e) {
98
                            log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
99
                        }
100
                    }
101
                }
102

    
103
            }
104

    
105
        }
106

    
107
    }
108

    
109
    @Override
110
    public Iterator<String> iterator() {
111

    
112
        return new Iterator<String>(){
113

    
114
            @Override
115
            public boolean hasNext() {
116
                if (queue.isEmpty()){
117
                    fillQueue();
118
                }
119
                return (!queue.isEmpty());
120
            }
121

    
122
            @Override
123
            public String next() {
124
                return queue.poll(); 
125
            }
126

    
127
            @Override
128
            public void remove() {
129

    
130
            }
131

    
132
            @Override
133
            public void forEachRemaining(Consumer<? super String> action) {
134

    
135
            }
136
        };
137
    }
138

    
139

    
140
}
(3-3/4)