Project

General

Profile

« Previous | Next » 

Revision 51956

plugin for collecting metadata from files mapped to URLs (related to #3236)

View differences:

modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/RSCollectorIterable.java
1
package eu.dnetlib.data.collector.plugins.resourcesynck;
2

  
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.json.JSONObject;
7
import org.json.XML;
8
import org.jsoup.Jsoup;
9
import org.jsoup.nodes.Document;
10
import org.jsoup.nodes.Element;
11
import org.jsoup.select.Elements;
12

  
13
import java.util.ArrayList;
14
import java.util.Iterator;
import java.util.NoSuchElementException;
15

  
16
import java.util.concurrent.ArrayBlockingQueue;
17
import java.util.function.Consumer;
18

  
19
/**
20
 * Created by miriam on 04/05/2018.
21
 */
22
public class RSCollectorIterable implements Iterable<String> {
23
    private static final Log log = LogFactory.getLog(RSCollectorIterable.class);
24
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
25
    private final ArrayList<String> urls = new ArrayList<>();
26
    private final ArrayList<String> jsons = new ArrayList<String>();
27

  
28

  
29
    public RSCollectorIterable(String startUrl){
30

  
31
        urls.add(startUrl);
32
        fillQueue();
33
    }
34

  
35
    private String addFilePath(String json,String url){
36
        String path = url.replace("metadata", "pdf");
37
        try {
38
            json = json.substring(0, json.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
39
        }catch(Exception ex){
40
            log.info("not file with extension .json");
41
        }
42

  
43
        JSONObject jsonobj = new JSONObject("{'metadata':"+json+"}");
44

  
45
        return XML.toString(jsonobj);
46
    }
47

  
48
    private void recurFolder(String text, String url){
49
        Document doc = Jsoup.parse(text);
50
        Elements links = doc.select("a");
51
        for(Element e:links){
52
            if (!e.text().equals("../")){
53
                String file = e.attr("href");
54
                if(file.endsWith(".json"))
55
                    jsons.add(url+file);
56
                else
57
                    urls.add(url+file);
58
            }
59
        }
60
    }
61

  
62
    private void fillQueue() {
63
        Connector c = new Connector();
64
        String url;
65
        while((jsons.size()>0 || urls.size() > 0 ) && queue.size()<100){
66
            if (jsons.size() > 0){
67
                url = jsons.remove(0);
68
                try {
69
                    c.get(url);
70
                } catch (CollectorServiceException e) {
71
                    log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
72
                }
73
                if(c.isStatusOk()){
74
                    try {
75
                        String ret = c.getResponse();
76
                        if (ret != null && ret.length()>0)
77
                            queue.put(addFilePath(ret,url));
78
                    } catch (InterruptedException e) {
79
                        log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
80

  
81
                    }
82
                }
83
            }else{
84
                url = urls.remove(0);
85
                try {
86
                    c.get(url);
87
                } catch (CollectorServiceException e) {
88
                    log.error("Impossible to collect url: " + url + " error: " + e.getMessage());
89
                }
90
                if(c.isStatusOk()){
91
                    if (c.responseTypeContains("text/html")){
92
                        recurFolder(c.getResponse(),url);
93
                    }
94
                    else if(c.responseTypeContains("application/json")){
95
                        try {
96
                            queue.put(addFilePath(c.getResponse(),url));
97
                        } catch (InterruptedException e) {
98
                            log.error("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
99
                        }
100
                    }
101
                }
102

  
103
            }
104

  
105
        }
106

  
107
    }
108

  
109
    @Override
110
    public Iterator<String> iterator() {
111

  
112
        return new Iterator<String>(){
113

  
114
            @Override
115
            public boolean hasNext() {
116
                if (queue.isEmpty()){
117
                    fillQueue();
118
                }
119
                return (!queue.isEmpty());
120
            }
121

  
122
            @Override
123
            public String next() {
124
                return queue.poll(); 
125
            }
126

  
127
            @Override
128
            public void remove() {
129

  
130
            }
131

  
132
            @Override
133
            public void forEachRemaining(Consumer<? super String> action) {
134

  
135
            }
136
        };
137
    }
138

  
139

  
140
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/Connector.java
1
package eu.dnetlib.data.collector.plugins.resourcesynck;
2

  
3
import eu.dnetlib.data.collector.plugins.HttpConnector;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5

  
6

  
7
/**
8
 * Created by miriam on 07/05/2018.
9
 */
10
public class Connector extends HttpConnector implements ConnectorInterface  {
11
    private String response;
12

  
13
    @Override
14
    public void get(final String requestUrl) throws CollectorServiceException {
15
        response = getInputSource(requestUrl);
16
    }
17

  
18
    @Override
19
    public String getResponse() {
20
        return response;
21
    }
22

  
23
    @Override
24
    public boolean isStatusOk() {
25
        return (response != null);
26
    }
27

  
28
    @Override
29
    public boolean responseTypeContains(String string) {
30
        String responseType = getResponseType();
31
        if (responseType != null)
32
            return getResponseType().contains(string);
33
        return false;
34
    }
35

  
36

  
37
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/RSCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.resourcesynck;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6

  
7
/**
8
 * Created by miriam on 04/05/2018.
9
 */
10
public class RSCollectorPlugin extends AbstractCollectorPlugin {
11

  
12
    @Override
13
    public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException {
14
        return new RSCollectorIterable(interfaceDescriptor.getBaseUrl());
15
    }
16
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/resourcesynck/ConnectorInterface.java
1
package eu.dnetlib.data.collector.plugins.resourcesynck;
2

  
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4

  
5
/**
6
 * Created by miriam on 07/05/2018.
7
 */
8
public interface ConnectorInterface {
9

  
10
    public void get(final String requestUrl) throws CollectorServiceException;
11

  
12
    public String getResponse();
13

  
14
    public boolean isStatusOk();
15

  
16

  
17
    public boolean responseTypeContains(String string);
18

  
19
}

Also available in: Unified diff