Revision 52237
Added by Miriam Baglioni almost 6 years ago
modules/dnet-collector-plugins/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HTTPWithFileName/HTTPWithFileNameCollectorIterable.java | ||
---|---|---|
40 | 40 |
ft.start(); |
41 | 41 |
} |
42 | 42 |
|
43 |
/* private boolean containsFilter(String meta){ |
|
44 |
if (filter == null || filter.isEmpty()) |
|
45 |
return false; |
|
46 |
String[] filter = this.filter.split(";"); |
|
47 |
for(String item:filter){ |
|
48 |
if (meta.contains(item)) |
|
49 |
return true; |
|
50 |
} |
|
51 |
return false; |
|
52 |
} |
|
53 | 43 |
|
54 |
private String addFilePath(String meta,String url, boolean isJson){ |
|
55 |
String path = url.replace("metadata", "pdf"); |
|
56 |
|
|
57 |
try { |
|
58 |
if(isJson) |
|
59 |
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}"; |
|
60 |
else{ |
|
61 |
|
|
62 |
if (meta.contains("<!DOCTYPE")) { |
|
63 |
meta = meta.substring(meta.indexOf("<!DOCTYPE")); |
|
64 |
meta = meta.substring(meta.indexOf(">") + 1); |
|
65 |
} |
|
66 |
int index = meta.lastIndexOf("</"); |
|
67 |
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index); |
|
68 |
|
|
69 |
|
|
70 |
} |
|
71 |
|
|
72 |
}catch(Exception ex){ |
|
73 |
log.info("not file with extension .json or .xml"); |
|
74 |
} |
|
75 |
|
|
76 |
|
|
77 |
if(isJson){ |
|
78 |
JSONObject jsonobj = null; |
|
79 |
try { |
|
80 |
jsonobj = new JSONObject("{'resource':" + meta + "}"); |
|
81 |
|
|
82 |
return XML.toString(jsonobj); |
|
83 |
}catch(Exception e){ |
|
84 |
log.fatal("Impossible to transform json object to xml \n" + jsonobj + "\n " + e.getMessage() + "\n" + url); |
|
85 |
throw new RuntimeException(); |
|
86 |
} |
|
87 |
} |
|
88 |
return meta; |
|
89 |
} |
|
90 |
|
|
91 |
private void recurFolder(String text, String url){ |
|
92 |
Document doc = Jsoup.parse(text); |
|
93 |
Elements links = doc.select("a"); |
|
94 |
for(Element e:links){ |
|
95 |
if (!e.text().equals("../")){ |
|
96 |
String file = e.attr("href"); |
|
97 |
if(file.endsWith(".json") || file.endsWith(".xml")) |
|
98 |
metas.add(url+file); |
|
99 |
else |
|
100 |
urls.add(url+file); |
|
101 |
} |
|
102 |
} |
|
103 |
}*/ |
|
104 |
|
|
105 |
|
|
106 | 44 |
@Override |
107 | 45 |
public Iterator<String> iterator() { |
108 | 46 |
|
109 | 47 |
|
110 | 48 |
|
111 | 49 |
return new Iterator<String>(){ |
112 |
/* int total = 0; |
|
113 |
int filtered = 0; |
|
114 |
public void fillQueue() { |
|
115 |
Connector c = new Connector(); |
|
116 |
String url; |
|
117 |
while((metas.size()>0 || urls.size() > 0 ) && queue.size()<100){ |
|
118 |
log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size()); |
|
119 |
if (metas.size() > 0){ |
|
120 |
url = metas.remove(0); |
|
121 |
try { |
|
122 |
c.get(url); |
|
123 |
} catch (CollectorServiceException e) { |
|
124 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
125 |
} |
|
126 |
if(c.isStatusOk()){ |
|
127 |
try { |
|
128 |
String ret = c.getResponse(); |
|
129 |
if (ret != null && ret.length()>0) { |
|
130 |
if (!containsFilter(ret)) |
|
131 |
queue.put(addFilePath(ret, url, url.endsWith(".json"))); |
|
132 |
else |
|
133 |
filtered++; |
|
134 |
total++; |
|
135 |
} |
|
136 |
} catch (InterruptedException e) { |
|
137 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
138 | 50 |
|
139 |
|
|
140 |
} |
|
141 |
} |
|
142 |
}else{ |
|
143 |
url = urls.remove(0); |
|
144 |
try { |
|
145 |
c.get(url); |
|
146 |
} catch (CollectorServiceException e) { |
|
147 |
log.info("Impossible to collect url: " + url + " error: " + e.getMessage()); |
|
148 |
} |
|
149 |
if(c.isStatusOk()){ |
|
150 |
if (c.responseTypeContains("text/html")){ |
|
151 |
recurFolder(c.getResponse(),url); |
|
152 |
} |
|
153 |
else if(c.responseTypeContains("application/json") || c.responseTypeContains("application/xml")){ |
|
154 |
try { |
|
155 |
queue.put(addFilePath(c.getResponse(),url, c.responseTypeContains("application/json"))); |
|
156 |
} catch (InterruptedException e) { |
|
157 |
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() ); |
|
158 |
} |
|
159 |
} |
|
160 |
} |
|
161 |
|
|
162 |
} |
|
163 |
|
|
164 |
} |
|
165 |
|
|
166 |
}*/ |
|
167 | 51 |
@Override |
168 | 52 |
public boolean hasNext() { |
169 | 53 |
while (queue.isEmpty()); |
170 | 54 |
if (queue.peek().equals(TERMINATOR)) |
171 | 55 |
log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered)); |
172 | 56 |
return !queue.peek().equals(TERMINATOR); |
173 |
/*if (queue.isEmpty()){ |
|
174 |
fillQueue(); |
|
175 |
}*/ |
|
176 |
// if(queue.isEmpty()){ |
|
177 |
//log.info(String.format("Total number of metadata %d, Number of metadata filtered %d", total, filtered)); |
|
178 |
// return false; |
|
179 |
//} |
|
180 | 57 |
|
181 |
//return true; |
|
182 | 58 |
} |
183 | 59 |
|
184 | 60 |
@Override |
... | ... | |
294 | 170 |
return XML.toString(jsonobj); |
295 | 171 |
}catch(Exception e){ |
296 | 172 |
log.fatal("Impossible to transform json object to xml \n" + jsonobj + "\n " + e.getMessage() + "\n" + url); |
297 |
throw new RuntimeException(); |
|
173 |
// throw new RuntimeException();
|
|
298 | 174 |
} |
299 | 175 |
} |
300 | 176 |
return meta; |
Also available in: Unified diff
code cleaning