Project

General

Profile

« Previous | Next » 

Revision 52518

fix for package name (HTTPWithFileName -> httpfilename) and fixed issue on iterator for HTTPWithFileNameCollectorIterable

View differences:

modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameTest.java
1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2
import java.util.Iterator;
3

  
4
import org.junit.Ignore;
5
import org.junit.Test;
6

  
7

  
8
/**
9
 * Created by miriam on 07/05/2018.
10
 */
11
@Ignore
12
public class HTTPWithFileNameTest {
13

  
14
    private void iterate(Iterator<String> iterator, boolean exit){
15
        try{
16
            while (iterator.hasNext()){
17

  
18
                System.out.println(iterator.next());
19
                if(exit)
20
                    System.exit(0);
21

  
22

  
23
            }
24

  
25
        }catch(Exception ex){
26
            ex.printStackTrace();
27
        }
28
    }
29

  
30
    @Test
31
    @Ignore
32
    public void testRSCollectorFrontiers()
33
    {
34
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Frontiers/data/Frontiers/metadata/000/",null);
35
        iterate(rsc.iterator(),false);
36

  
37
    }
38

  
39
    @Test
40
    @Ignore
41
    public void testRSCollectorPLOSCount()
42
    {
43
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/354/","article-type=\"correction\"");
44
        Iterator<String> iterator = rsc.iterator();
45
        int count = 0;
46
        int body = 0;
47
        int corrections = 0;
48
        try{
49
            while (iterator.hasNext()){
50

  
51
                String meta = iterator.next();
52
                if (!meta.contains("article-type=\"correction\"")){
53
                    count++;
54
                    int index = meta.indexOf("<body>");
55
                    if(meta.substring(index).contains("<sec"))
56
                        body++;
57
                    else {
58
                        System.out.println(meta);
59
                        System.out.println(count);
60
                    }
61

  
62
                }else
63
                    corrections++;
64

  
65
            }
66
            System.out.println(count + "       "  + body + "                  " + corrections);
67
        }catch(Exception ex){
68
            ex.printStackTrace();
69
        }
70
    }
71

  
72
    @Test
73
    @Ignore
74
    public void testRSCollectorPLOS()
75
    {
76
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/400/","article-type=\"correction\"");
77

  
78

  
79
        iterate(rsc.iterator(),false);
80
    }
81

  
82
    @Test
83
    @Ignore
84
    public void testRSCollectorSpringer()
85
    {
86
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Springer-OA/data/Springer-OA/metadata/8a0/",null);
87

  
88
        iterate(rsc.iterator(),false);
89

  
90
    }
91

  
92
    @Test
93
    public void testEmptyCollection()
94
    {
95
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("",null);
96

  
97
        iterate(rsc.iterator(),true);
98
    }
99

  
100
}
modules/dnet-collector-plugins/trunk/src/test/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameTest.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2
import java.util.Iterator;
3

  
4
import org.junit.Ignore;
5
import org.junit.Test;
6

  
7

  
8
/**
9
 * Created by miriam on 07/05/2018.
10
 */
11
@Ignore
12
public class HTTPWithFileNameTest {
13

  
14
    private void iterate(Iterator<String> iterator, boolean exit){
15
        try{
16
            while (iterator.hasNext()){
17

  
18
                System.out.println(iterator.next());
19
                if(exit)
20
                    System.exit(0);
21

  
22

  
23
            }
24

  
25
        }catch(Exception ex){
26
            ex.printStackTrace();
27
        }
28
    }
29

  
30
    @Test
31
    @Ignore
32
    public void testRSCollectorFrontiers()
33
    {
34
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Frontiers/data/Frontiers/metadata/000/",null);
35
        iterate(rsc.iterator(),false);
36

  
37
    }
38

  
39
    @Test
40
    @Ignore
41
    public void testRSCollectorPLOSCount()
42
    {
43
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/354/","article-type=\"correction\"");
44
        Iterator<String> iterator = rsc.iterator();
45
        int count = 0;
46
        int body = 0;
47
        int corrections = 0;
48
        try{
49
            while (iterator.hasNext()){
50

  
51
                String meta = iterator.next();
52
                if (!meta.contains("article-type=\"correction\"")){
53
                    count++;
54
                    int index = meta.indexOf("<body>");
55
                    if(meta.substring(index).contains("<sec"))
56
                        body++;
57
                    else {
58
                        System.out.println(meta);
59
                        System.out.println(count);
60
                    }
61

  
62
                }else
63
                    corrections++;
64

  
65
            }
66
            System.out.println(count + "       "  + body + "                  " + corrections);
67
        }catch(Exception ex){
68
            ex.printStackTrace();
69
        }
70
    }
71

  
72
    @Test
73
    @Ignore
74
    public void testRSCollectorPLOS()
75
    {
76
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/PLOS/data/public_library_of_science/metadata/400/","article-type=\"correction\"");
77

  
78

  
79
        iterate(rsc.iterator(),false);
80
    }
81

  
82
    @Test
83
    @Ignore
84
    public void testRSCollectorSpringer()
85
    {
86
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("https://dev-openaire.d4science.org/RS/Springer-OA/data/Springer-OA/metadata/8a0/",null);
87

  
88
        iterate(rsc.iterator(),false);
89

  
90
    }
91

  
92
    @Test
93
    public void testEmptyCollection()
94
    {
95
        HTTPWithFileNameCollectorIterable rsc = new HTTPWithFileNameCollectorIterable("",null);
96

  
97
        iterate(rsc.iterator(),true);
98
    }
99

  
100
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6

  
7
/**
8
 * Created by miriam on 04/05/2018.
9
 */
10
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin {
11

  
12
    @Override
13
    public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException {
14
        return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl(), interfaceDescriptor.getParams().get("filter"));
15
    }
16
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/Connector.java
1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

  
3
import eu.dnetlib.data.collector.plugins.HttpConnector;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5

  
6

  
7
/**
8
 * Created by miriam on 07/05/2018.
9
 */
10
public class Connector extends HttpConnector implements ConnectorInterface  {
11
    private String response;
12

  
13
    @Override
14
    public void get(final String requestUrl) throws CollectorServiceException {
15
        response = getInputSource(requestUrl);
16
    }
17

  
18
    @Override
19
    public String getResponse() {
20
        return response;
21
    }
22

  
23
    @Override
24
    public boolean isStatusOk() {
25
        return (response != null);
26
    }
27

  
28
    @Override
29
    public boolean responseTypeContains(String string) {
30
        String responseType = getResponseType();
31
        if (responseType != null)
32
            return responseType.contains(string);
33
        return false;
34
    }
35

  
36

  
37
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/ConnectorInterface.java
1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

  
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4

  
5
/**
6
 * Created by miriam on 07/05/2018.
7
 */
8
public interface ConnectorInterface {
9

  
10
    public void get(final String requestUrl) throws CollectorServiceException;
11

  
12
    public String getResponse();
13

  
14
    public boolean isStatusOk();
15

  
16

  
17
    public boolean responseTypeContains(String string);
18

  
19
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorIterable.java
1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2

  
3
import java.util.ArrayList;
4
import java.util.Iterator;
5
import java.util.NoSuchElementException;
6
import java.util.Objects;
7
import java.util.concurrent.ArrayBlockingQueue;
8
import java.util.concurrent.TimeUnit;
9

  
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.json.JSONObject;
14
import org.json.XML;
15
import org.jsoup.Jsoup;
16
import org.jsoup.nodes.Document;
17
import org.jsoup.nodes.Element;
18
import org.jsoup.select.Elements;
19

  
20
/**
21
 * Created by miriam on 04/05/2018.
22
 */
23
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
24

  
25
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
26
    private static final String TERMINATOR = "FINITO";
27
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
28
    public static final String APP_JSON = "application/json";
29
    public static final String APP_XML = "application/xml";
30
    public static final String TEXT_HTML = "text/html";
31
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
32

  
33
    private long waitTime = 60L;
34

  
35
    private final ArrayList<String> urls = new ArrayList<>();
36
    private final ArrayList<String> metas = new ArrayList<String>();
37
    private String filterParam;
38

  
39
    int total = 0;
40
    int filtered = 0;
41

  
42
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
43
        if (!startUrl.isEmpty())
44
            urls.add(startUrl);
45
        this.filterParam = filter;
46
        Thread ft = new Thread(new FillMetaQueue());
47
        ft.start();
48
    }
49

  
50

  
51
    @Override
52
    public Iterator<String> iterator() {
53
        return new Iterator<String>(){
54

  
55
            private String last = null;
56

  
57
            @Override
58
            public boolean hasNext() {
59
                return !Objects.equals(last, TERMINATOR);
60
            }
61

  
62
            @Override
63
            public String next() {
64
                try {
65
                    last = queue.poll(waitTime, TimeUnit.SECONDS);
66
                    if (Objects.equals(last, TERMINATOR)) {
67
                        log.info("found terminator, omg!");
68
                    }
69
                } catch (InterruptedException e) {
70
                    log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
71
                    throw new NoSuchElementException(e.getMessage());
72
                }
73
                return last;
74
            }
75

  
76
        };
77
    }
78

  
79
    private class FillMetaQueue implements Runnable {
80

  
81
        final Connector c = new Connector();
82

  
83
        public void fillQueue() {
84
            String url;
85
            while((metas.size()>0 || urls.size() > 0 )) {
86
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
87
                if (metas.size() > 0) {
88
                    url = metas.remove(0);
89
                    try {
90
                        c.get(url);
91
                    } catch (CollectorServiceException e) {
92
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
93
                    }
94
                    if(c.isStatusOk()){
95
                        try {
96
                            String ret = c.getResponse();
97
                            if (ret != null && ret.length()>0) {
98
                                if (!containsFilter(ret))
99
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS);
100
                                else
101
                                    filtered++;
102
                                total++;
103
                            }
104
                        } catch (InterruptedException e) {
105
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
106

  
107
                        }
108
                    }
109
                } else {
110
                    url = urls.remove(0);
111
                    try {
112
                        c.get(url);
113
                    } catch (CollectorServiceException e) {
114
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
115
                    }
116
                    if(c.isStatusOk()) {
117
                        if (c.responseTypeContains(TEXT_HTML)){
118
                            recurFolder(c.getResponse(), url);
119
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
120
                            try {
121
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
122
                                queue.offer(element, waitTime, TimeUnit.SECONDS);
123
                            } catch (InterruptedException e) {
124
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
125
                            }
126
                        }
127
                    }
128
                }
129

  
130
            }
131
            try {
132
                queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS);
133
            } catch (InterruptedException e) {
134
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e);
135
            }
136

  
137
        }
138

  
139
        private boolean containsFilter(String meta){
140
            if (filterParam == null || filterParam.isEmpty())
141
                return false;
142
            String[] filter = filterParam.split(";");
143
            for(String item:filter){
144
                if (meta.contains(item))
145
                    return true;
146
            }
147
            return false;
148
        }
149

  
150
        private String addFilePath(String meta, String url, boolean isJson){
151
            String path = url.replace("metadata", "pdf");
152

  
153
            try {
154
                if(isJson)
155
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
156
                else {
157

  
158
                    if (meta.contains("<!DOCTYPE")) {
159
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
160
                        meta = meta.substring(meta.indexOf(">") + 1);
161
                    }
162
                    int index = meta.lastIndexOf("</");
163
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
164
                }
165
            } catch(Exception ex) {
166
                log.info("not file with extension .json or .xml");
167
            }
168

  
169

  
170
            if(isJson) {
171
                try {
172
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
173
                } catch(Exception e) {
174
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
175
                   // throw new RuntimeException();
176
                    final String junk = String.format(JUNK, url);
177
                    log.warn("returning " + junk);
178
                    return junk;
179
                }
180
            }
181
            return meta;
182
        }
183

  
184
        private void recurFolder(String text, String url){
185
            Document doc = Jsoup.parse(text);
186
            Elements links = doc.select("a");
187
            for(Element e:links){
188
                if (!e.text().equals("../")){
189
                    String file = e.attr("href");
190
                    if(file.endsWith(".json") || file.endsWith(".xml"))
191
                        metas.add(url+file);
192
                    else
193
                        urls.add(url+file);
194
                }
195
            }
196
        }
197

  
198

  
199
        @Override
200
        public void run() {
201
            fillQueue();
202
        }
203
    }
204

  
205
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorPlugin.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
6

  
7
/**
8
 * Created by miriam on 04/05/2018.
9
 */
10
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin {
11

  
12
    @Override
13
    public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException {
14
        return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl(), interfaceDescriptor.getParams().get("filter"));
15
    }
16
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/Connector.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import eu.dnetlib.data.collector.plugins.HttpConnector;
4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
5

  
6

  
7
/**
8
 * Created by miriam on 07/05/2018.
9
 */
10
public class Connector extends HttpConnector implements ConnectorInterface  {
11
    private String response;
12

  
13
    @Override
14
    public void get(final String requestUrl) throws CollectorServiceException {
15
        response = getInputSource(requestUrl);
16
    }
17

  
18
    @Override
19
    public String getResponse() {
20
        return response;
21
    }
22

  
23
    @Override
24
    public boolean isStatusOk() {
25
        return (response != null);
26
    }
27

  
28
    @Override
29
    public boolean responseTypeContains(String string) {
30
        String responseType = getResponseType();
31
        if (responseType != null)
32
            return responseType.contains(string);
33
        return false;
34
    }
35

  
36

  
37
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/ConnectorInterface.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4

  
5
/**
6
 * Created by miriam on 07/05/2018.
7
 */
8
public interface ConnectorInterface {
9

  
10
    void get(final String requestUrl) throws CollectorServiceException;
11

  
12
    String getResponse();
13

  
14
    boolean isStatusOk();
15

  
16

  
17
    boolean responseTypeContains(String string);
18

  
19
}
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/httpfilename/HTTPWithFileNameCollectorIterable.java
1
package eu.dnetlib.data.collector.plugins.httpfilename;
2

  
3
import java.util.ArrayList;
4
import java.util.Iterator;
5
import java.util.NoSuchElementException;
6
import java.util.Objects;
7
import java.util.concurrent.ArrayBlockingQueue;
8
import java.util.concurrent.TimeUnit;
9

  
10
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
11
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.json.JSONObject;
14
import org.json.XML;
15
import org.jsoup.Jsoup;
16
import org.jsoup.nodes.Document;
17
import org.jsoup.nodes.Element;
18
import org.jsoup.select.Elements;
19

  
20
/**
21
 * Created by miriam on 04/05/2018.
22
 */
23
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
24

  
25
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
26
    private static final String TERMINATOR = "FINITO";
27
    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
28
    public static final String APP_JSON = "application/json";
29
    public static final String APP_XML = "application/xml";
30
    public static final String TEXT_HTML = "text/html";
31
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
32

  
33
    private long waitTime = 60L;
34

  
35
    private final ArrayList<String> urls = new ArrayList<>();
36
    private final ArrayList<String> metas = new ArrayList<String>();
37
    private String filterParam;
38

  
39
    int total = 0;
40
    int filtered = 0;
41

  
42
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
43
        if (!startUrl.isEmpty())
44
            urls.add(startUrl);
45
        this.filterParam = filter;
46
        Thread ft = new Thread(new FillMetaQueue());
47
        ft.start();
48
    }
49

  
50

  
51
    @Override
52
    public Iterator<String> iterator() {
53
        return new Iterator<String>(){
54

  
55
            private String last = null;
56
            private boolean exec_next = true;
57

  
58
            @Override
59
            public boolean hasNext() {
60
                if(exec_next){
61
                    try {
62
                        last = queue.poll(waitTime, TimeUnit.SECONDS);
63
                        exec_next = false;
64
                    }catch(InterruptedException e){
65
                        log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
66
                        throw new NoSuchElementException(e.getMessage());
67
                    }
68
                }
69

  
70
                return !Objects.equals(last, TERMINATOR);
71
            }
72

  
73
            @Override
74
            public String next() {
75
                exec_next = true;
76
                return last;
77
            }
78

  
79
//            @Override
80
//            public boolean hasNext() {
81
//
82
//                return !Objects.equals(last, TERMINATOR);
83
//            }
84
//
85
//            @Override
86
//            public String next() {
87
//                try {
88
//                    last = queue.poll(waitTime, TimeUnit.SECONDS);
89
//                    if (Objects.equals(last, TERMINATOR)) {
90
//                        log.info("found terminator, omg!");
91
//                    }
92
//                } catch (InterruptedException e) {
93
//                    log.warn(String.format("could not find elements to consume for more than %s%s", waitTime, TimeUnit.SECONDS));
94
//                    throw new NoSuchElementException(e.getMessage());
95
//                }
96
//                return last;
97
//            }
98

  
99
        };
100
    }
101

  
102
    private class FillMetaQueue implements Runnable {
103

  
104
        final Connector c = new Connector();
105

  
106
        public void fillQueue() {
107
            String url;
108
            while((metas.size()>0 || urls.size() > 0 )) {
109
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
110
                if (metas.size() > 0) {
111
                    url = metas.remove(0);
112
                    try {
113
                        c.get(url);
114
                    } catch (CollectorServiceException e) {
115
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
116
                    }
117
                    if(c.isStatusOk()){
118
                        try {
119
                            String ret = c.getResponse();
120
                            if (ret != null && ret.length()>0) {
121
                                if (!containsFilter(ret))
122
                                    queue.offer(addFilePath(ret, url, url.endsWith(".json")), waitTime, TimeUnit.SECONDS);
123
                                else
124
                                    filtered++;
125
                                total++;
126
                            }
127
                        } catch (InterruptedException e) {
128
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
129

  
130
                        }
131
                    }
132
                } else {
133
                    url = urls.remove(0);
134
                    try {
135
                        c.get(url);
136
                    } catch (CollectorServiceException e) {
137
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
138
                    }
139
                    if(c.isStatusOk()) {
140
                        if (c.responseTypeContains(TEXT_HTML)){
141
                            recurFolder(c.getResponse(), url);
142
                        } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
143
                            try {
144
                                final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
145
                                queue.offer(element, waitTime, TimeUnit.SECONDS);
146
                            } catch (InterruptedException e) {
147
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
148
                            }
149
                        }
150
                    }
151
                }
152

  
153
            }
154
            try {
155
                queue.offer(TERMINATOR, waitTime, TimeUnit.SECONDS);
156
            } catch (InterruptedException e) {
157
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", waitTime, TimeUnit.SECONDS), e);
158
            }
159

  
160
        }
161

  
162
        private boolean containsFilter(String meta){
163
            if (filterParam == null || filterParam.isEmpty())
164
                return false;
165
            String[] filter = filterParam.split(";");
166
            for(String item:filter){
167
                if (meta.contains(item))
168
                    return true;
169
            }
170
            return false;
171
        }
172

  
173
        private String addFilePath(String meta, String url, boolean isJson){
174
            String path = url.replace("metadata", "pdf");
175

  
176
            try {
177
                if(isJson)
178
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
179
                else {
180

  
181
                    if (meta.contains("<!DOCTYPE")) {
182
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
183
                        meta = meta.substring(meta.indexOf(">") + 1);
184
                    }
185
                    int index = meta.lastIndexOf("</");
186
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
187
                }
188
            } catch(Exception ex) {
189
                log.info("not file with extension .json or .xml");
190
            }
191

  
192

  
193
            if(isJson) {
194
                try {
195
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
196
                } catch(Exception e) {
197
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
198
                   // throw new RuntimeException();
199
                    final String junk = String.format(JUNK, url);
200
                    log.warn("returning " + junk);
201
                    return junk;
202
                }
203
            }
204
            return meta;
205
        }
206

  
207
        private void recurFolder(String text, String url){
208
            Document doc = Jsoup.parse(text);
209
            Elements links = doc.select("a");
210
            for(Element e:links){
211
                if (!e.text().equals("../")){
212
                    String file = e.attr("href");
213
                    if(file.endsWith(".json") || file.endsWith(".xml"))
214
                        metas.add(url+file);
215
                    else
216
                        urls.add(url+file);
217
                }
218
            }
219
        }
220

  
221

  
222
        /** Entry point of the thread started by the enclosing class' constructor: delegates to fillQueue(). */
        @Override
223
        public void run() {
224
            fillQueue();
225
        }
226
    }
227

  
228
}
modules/dnet-collector-plugins/trunk/src/main/resources/eu/dnetlib/data/collector/plugins/applicationContext-dnet-modular-collector-plugins.xml
32 32
		</property>
33 33
	</bean>
34 34

  
35
	<bean id="HTTPWithFileNamePlugin" class="eu.dnetlib.data.collector.plugins.HTTPWithFileName.HTTPWithFileNameCollectorPlugin">
35
	<bean id="HTTPWithFileNamePlugin" class="eu.dnetlib.data.collector.plugins.httpfilename.HTTPWithFileNameCollectorPlugin">
36 36
		<property name="protocolDescriptor">
37 37

  
38 38
			<bean class="eu.dnetlib.data.collector.rmi.ProtocolDescriptor" p:name="HTTPWithFileName">

Also available in: Unified diff