Project

General

Profile

« Previous | Next » 

Revision 52235

Changed the implementation of data gathering.

View differences:

HTTPWithFileNameCollectorIterable.java
1 1
package eu.dnetlib.data.collector.plugins.HTTPWithFileName;
2 2

  
3
import eu.dnetlib.data.collector.plugins.projects.gtr2.Gtr2ProjectsIterable;
3 4
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
4 5
import org.apache.commons.logging.Log;
5 6
import org.apache.commons.logging.LogFactory;
......
21 22
 */
22 23
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
23 24
    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
25
    private final String TERMINATOR = "FINITO";
26
    final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
24 27

  
25 28
    private final ArrayList<String> urls = new ArrayList<>();
26 29
    private final ArrayList<String> metas = new ArrayList<String>();
27
    private String filter;
30
    private String filterParam;
28 31

  
32
    int total = 0;
33
    int filtered = 0;
34

  
29 35
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
30

  
31
        urls.add(startUrl);
32
        this.filter = filter;
36
        if (!startUrl.isEmpty())
37
            urls.add(startUrl);
38
        this.filterParam = filter;
39
        Thread ft = new Thread(new FillMetaQueue());
40
        ft.start();
33 41
    }
34 42

  
35
    private boolean containsFilter(String meta){
43
  /*  private boolean containsFilter(String meta){
36 44
        if (filter == null || filter.isEmpty())
37 45
            return false;
38 46
        String[] filter = this.filter.split(";");
......
92 100
                    urls.add(url+file);
93 101
            }
94 102
        }
95
    }
103
    }*/
96 104

  
97 105

  
98 106
    @Override
99 107
    public Iterator<String> iterator() {
100
        final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
101 108

  
102 109

  
110

  
103 111
        return new Iterator<String>(){
104
            int total = 0;
112
    /*        int total = 0;
105 113
            int filtered = 0;
106 114
            public void fillQueue() {
107 115
                Connector c = new Connector();
......
155 163

  
156 164
                }
157 165

  
158
            }
166
            }*/
159 167
            @Override
160 168
            public boolean hasNext() {
161
                if (queue.isEmpty()){
169
                while (queue.isEmpty());
170
                if (queue.peek().equals(TERMINATOR))
171
                    log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered));
172
                return !queue.peek().equals(TERMINATOR);
173
                /*if (queue.isEmpty()){
162 174
                    fillQueue();
163
                }
164
                if(queue.isEmpty()){
165
                    log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered));
166
                    return false;
167
                }
175
                }*/
176
               // if(queue.isEmpty()){
177
                    //log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered));
178
               //     return false;
179
                //}
168 180

  
169
                return true;
181
                //return true;
170 182
            }
171 183

  
172 184
            @Override
......
174 186
                return queue.poll(); 
175 187
            }
176 188

  
177
//            @Override
178
//            public void remove() {
179
//
180
//            }
181
//
182
//            @Override
183
//            public void forEachRemaining(Consumer<? super String> action) {
184
//
185
//            }
186 189
        };
187 190
    }
188 191

  
192
    private class FillMetaQueue implements Runnable{
189 193

  
194

  
195
        Connector c = new Connector();
196

  
197
        public void fillQueue() {
198
            String url;
199
            while((metas.size()>0 || urls.size() > 0 )){
200
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
201
                if (metas.size() > 0){
202
                    url = metas.remove(0);
203
                    try {
204
                        c.get(url);
205
                    } catch (CollectorServiceException e) {
206
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
207
                    }
208
                    if(c.isStatusOk()){
209
                        try {
210
                            String ret = c.getResponse();
211
                            if (ret != null && ret.length()>0) {
212
                                if (!containsFilter(ret))
213
                                    queue.put(addFilePath(ret, url, url.endsWith(".json")));
214
                                else
215
                                    filtered++;
216
                                total++;
217
                            }
218
                        } catch (InterruptedException e) {
219
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
220

  
221

  
222
                        }
223
                    }
224
                }else{
225
                    url = urls.remove(0);
226
                    try {
227
                        c.get(url);
228
                    } catch (CollectorServiceException e) {
229
                        log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
230
                    }
231
                    if(c.isStatusOk()){
232
                        if (c.responseTypeContains("text/html")){
233
                            recurFolder(c.getResponse(),url);
234
                        }
235
                        else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){
236
                            try {
237
                                queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json")));
238
                            } catch (InterruptedException e) {
239
                                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
240
                            }
241
                        }
242
                    }
243

  
244
                }
245

  
246
            }
247
            try {
248
                queue.put(TERMINATOR);
249
            } catch (InterruptedException e) {
250
                e.printStackTrace();
251
            }
252

  
253
        }
254

  
255
        private boolean containsFilter(String meta){
256
            if (filterParam == null || filterParam.isEmpty())
257
                return false;
258
            String[] filter = filterParam.split(";");
259
            for(String item:filter){
260
                if (meta.contains(item))
261
                    return true;
262
            }
263
            return false;
264
        }
265

  
266
        private String addFilePath(String meta,String url, boolean isJson){
267
            String path = url.replace("metadata", "pdf");
268

  
269
            try {
270
                if(isJson)
271
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
272
                else{
273

  
274
                    if (meta.contains("<!DOCTYPE")) {
275
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
276
                        meta = meta.substring(meta.indexOf(">") + 1);
277
                    }
278
                    int index = meta.lastIndexOf("</");
279
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
280

  
281

  
282
                }
283

  
284
            }catch(Exception ex){
285
                log.info("not file with extension .json or .xml");
286
            }
287

  
288

  
289
            if(isJson){
290
                JSONObject jsonobj = null;
291
                try {
292
                    jsonobj = new JSONObject("{'resource':" + meta + "}");
293

  
294
                    return XML.toString(jsonobj);
295
                }catch(Exception e){
296
                    log.fatal("Impossible to transform json object to xml \n" + jsonobj + "\n " + e.getMessage() + "\n" + url);
297
                    throw new RuntimeException();
298
                }
299
            }
300
            return meta;
301
        }
302

  
303
        private void recurFolder(String text, String url){
304
            Document doc = Jsoup.parse(text);
305
            Elements links = doc.select("a");
306
            for(Element e:links){
307
                if (!e.text().equals("../")){
308
                    String file = e.attr("href");
309
                    if(file.endsWith(".json") || file.endsWith(".xml"))
310
                        metas.add(url+file);
311
                    else
312
                        urls.add(url+file);
313
                }
314
            }
315
        }
316

  
317

  
318
        @Override
319
        public void run() {
320
            fillQueue();
321
        }
322
    }
323

  
324

  
325

  
190 326
}

Also available in: Unified diff