Revision 58415
Added by Antonis Lempesis almost 4 years ago

a ton of fixes. Close to running smoothly in all cases

PiwikStatsDB.java

package eu.dnetlib.usagestats.export;

import java.io.*;
import java.net.URLDecoder;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class PiwikStatsDB {

    private String logPath;
    private String logRepoPath;
    private String logPortalPath;

    private Statement stmt = null;

    private final Logger log = Logger.getLogger(this.getClass());
    private String CounterRobotsURL;
    private ArrayList robotsList;

    public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception {
        this.logRepoPath = logRepoPath;
        this.logPortalPath = logPortalPath;
        this.createTables();
        this.createTmpTables();
    }
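
    // Illustrative usage sketch (editor's addition, not part of this revision): how a caller might
    // drive this class end to end. The HDFS paths and the COUNTER robots URL below are placeholders;
    // the real values come from the workflow configuration that wires up ConnectDB and the log export.
    private static void exampleUsageSketch() throws Exception {
        PiwikStatsDB piwikstatsdb = new PiwikStatsDB("/tmp/repolog/", "/tmp/portallog/"); // hypothetical HDFS directories
        piwikstatsdb.setCounterRobotsURL("https://example.org/COUNTER_Robots_list.json");  // placeholder URL
        piwikstatsdb.processLogs();     // import and clean the logs, compute views/downloads, update prod tables
        piwikstatsdb.finalizeStats();   // build indexes and the combined usage_stats table
    }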
|

    public void foo() {
        // simple smoke test: counts the elements of a small array through a parallel stream
        Stream<String> s = Arrays.stream(new String[]{"a", "b", "c", "d"});

        System.out.println(s.parallel().count());
    }

    public ArrayList getRobotsList() {
        return robotsList;
    }

    public void setRobotsList(ArrayList robotsList) {
        this.robotsList = robotsList;
    }

    public String getCounterRobotsURL() {
        return CounterRobotsURL;
    }

    public void setCounterRobotsURL(String CounterRobotsURL) {
        this.CounterRobotsURL = CounterRobotsURL;
    }

    private void createTables() throws Exception {
        try {
            stmt = ConnectDB.getConnection().createStatement();
            String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS piwiklog(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
            String sqlcreateRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
                    + " ON INSERT TO piwiklog "
                    + " WHERE (EXISTS ( SELECT piwiklog.source, piwiklog.id_visit,"
                    + "piwiklog.action, piwiklog.\"timestamp\", piwiklog.entity_id "
                    + "FROM piwiklog "
                    + "WHERE piwiklog.source = new.source AND piwiklog.id_visit = new.id_visit AND piwiklog.action = new.action AND piwiklog.entity_id = new.entity_id AND piwiklog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
            String sqlCreateRuleIndexPiwikLog = "create index if not exists piwiklog_rule on piwiklog(source, id_visit, action, entity_id, \"timestamp\");";
            stmt.executeUpdate(sqlCreateTablePiwikLog);
            stmt.executeUpdate(sqlcreateRulePiwikLog);
            stmt.executeUpdate(sqlCreateRuleIndexPiwikLog);

            String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS process_portal_log(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, timestamp));";
            String sqlcreateRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
                    + " ON INSERT TO process_portal_log "
                    + " WHERE (EXISTS ( SELECT process_portal_log.source, process_portal_log.id_visit,"
                    + "process_portal_log.\"timestamp\" "
                    + "FROM process_portal_log "
                    + "WHERE process_portal_log.source = new.source AND process_portal_log.id_visit = new.id_visit AND process_portal_log.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
            String sqlCreateRuleIndexPortalLog = "create index if not exists process_portal_log_rule on process_portal_log(source, id_visit, \"timestamp\");";
            stmt.executeUpdate(sqlCreateTablePortalLog);
            stmt.executeUpdate(sqlcreateRulePortalLog);
            stmt.executeUpdate(sqlCreateRuleIndexPortalLog);

            stmt.close();
            ConnectDB.getConnection().close();
            log.info("Usage Tables Created");

        } catch (Exception e) {
            log.error("Failed to create tables: " + e);
            throw new Exception("Failed to create tables: " + e.toString(), e);
        }
    }
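
    // Illustrative sketch (editor's addition): what the ignore_duplicate_inserts rule buys us.
    // Inserting the same (source, id_visit, action, timestamp, entity_id) row twice would normally
    // violate the primary key and abort the statement; with the PostgreSQL rule in place the second
    // INSERT is silently skipped instead. All values below are made up; assumes the schema created above.
    private static void exampleDuplicateInsertSketch() throws Exception {
        Statement st = ConnectDB.getConnection().createStatement();
        String row = "(1, 'v1', 'GR', 'action', 'http://example.org', 'oai:example.org:1', 'repItem', '2020-01-01 00:00:00', 'ref', 'agent')";
        st.executeUpdate("INSERT INTO piwiklog VALUES " + row + ";");
        st.executeUpdate("INSERT INTO piwiklog VALUES " + row + ";"); // no-op thanks to the rule
        st.close();
    }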
|

    private void createTmpTables() throws Exception {
        try {
            Statement stmt = ConnectDB.getConnection().createStatement();
            String sqlCreateTmpTablePiwikLog = "CREATE TABLE IF NOT EXISTS piwiklogtmp(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
            String sqlcreateTmpRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
                    + " ON INSERT TO piwiklogtmp "
                    + " WHERE (EXISTS ( SELECT piwiklogtmp.source, piwiklogtmp.id_visit,"
                    + "piwiklogtmp.action, piwiklogtmp.\"timestamp\", piwiklogtmp.entity_id "
                    + "FROM piwiklogtmp "
                    + "WHERE piwiklogtmp.source = new.source AND piwiklogtmp.id_visit = new.id_visit AND piwiklogtmp.action = new.action AND piwiklogtmp.entity_id = new.entity_id AND piwiklogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
            stmt.executeUpdate(sqlCreateTmpTablePiwikLog);
            stmt.executeUpdate(sqlcreateTmpRulePiwikLog);

            //String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
            //stmt.executeUpdate(sqlCopyPublicPiwiklog);
            String sqlCreateTmpTablePortalLog = "CREATE TABLE IF NOT EXISTS process_portal_log_tmp(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, timestamp));";
            String sqlcreateTmpRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS "
                    + " ON INSERT TO process_portal_log_tmp "
                    + " WHERE (EXISTS ( SELECT process_portal_log_tmp.source, process_portal_log_tmp.id_visit,"
                    + "process_portal_log_tmp.\"timestamp\" "
                    + "FROM process_portal_log_tmp "
                    + "WHERE process_portal_log_tmp.source = new.source AND process_portal_log_tmp.id_visit = new.id_visit AND process_portal_log_tmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
            stmt.executeUpdate(sqlCreateTmpTablePortalLog);
            stmt.executeUpdate(sqlcreateTmpRulePortalLog);

            stmt.close();
            log.info("Usage Tmp Tables Created");

        } catch (Exception e) {
            log.error("Failed to create tmptables: " + e);
            throw new Exception("Failed to create tmp tables: " + e.toString(), e);
            //System.exit(0);
        }
    }

    public void processLogs() throws Exception {
        try {
            ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL());
            this.robotsList = counterRobots.getRobotsPatterns();

            processRepositoryLog();
            log.info("repository process done");
            removeDoubleClicks();
            log.info("removing double clicks done");
            cleanOAI();
            log.info("cleaning oai done");

            viewsStats();
            downloadsStats();

            processPortalLog();
            log.info("portal process done");

            portalStats();
            log.info("portal usagestats done");

            updateProdTables();
            log.info("updateProdTables done");

        } catch (Exception e) {
            log.error("Failed to process logs: " + e);
            throw new Exception("Failed to process logs: " + e.toString(), e);
        }
    }

//    public void usageStats() throws Exception {
//        try {
//            viewsStats();
//            downloadsStats();
//            log.info("stat tables and views done");
//        } catch (Exception e) {
//            log.error("Failed to create usage usagestats: " + e);
//            throw new Exception("Failed to create usage usagestats: " + e.toString(), e);
//        }
//    }

    public void processRepositoryLog() throws Exception {
        Statement stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

        ArrayList<String> jsonFiles = listHdfsDir(this.logRepoPath);
//        File dir = new File(this.logRepoPath);
//        File[] jsonFiles = dir.listFiles();

        PreparedStatement prepStatem = ConnectDB.getConnection().prepareStatement("INSERT INTO piwiklogtmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
        int batch_size = 0;
        JSONParser parser = new JSONParser();
        for (String jsonFile : jsonFiles) {
            System.out.println(jsonFile);
            JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
            for (Object aJsonArray : jsonArray) {
                JSONObject jsonObjectRow = (JSONObject) aJsonArray;
                int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
                String idVisit = jsonObjectRow.get("idVisit").toString();
                String country = jsonObjectRow.get("country").toString();
                String referrerName = jsonObjectRow.get("referrerName").toString();
                String agent = jsonObjectRow.get("browser").toString();
                boolean botFound = false;
                Iterator it = robotsList.iterator();
                while (it.hasNext()) {
                    // Create a Pattern object
                    Pattern r = Pattern.compile(it.next().toString());
                    // Now create matcher object.
                    Matcher m = r.matcher(agent);
                    if (m.find()) {
                        //System.out.println("Found value: " + m.group(0));
                        botFound = true;
                        break;
                    }
                }
                if (botFound == false) {
                    String sourceItemType = "repItem";

                    JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
                    for (Object actionDetail : actionDetails) {
                        JSONObject actionDetailsObj = (JSONObject) actionDetail;

                        if (actionDetailsObj.get("customVariables") != null) {
                            SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                            simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
                            Timestamp timestamp = new Timestamp(Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
                            String url = actionDetailsObj.get("url").toString();
                            String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1")).get("customVariablePageValue1").toString();
                            String action = actionDetailsObj.get("type").toString();

                            prepStatem.setInt(1, idSite);
                            prepStatem.setString(2, idVisit);
                            prepStatem.setString(3, country);
                            prepStatem.setString(4, action);
                            prepStatem.setString(5, url);
                            prepStatem.setString(6, oaipmh);
                            prepStatem.setString(7, sourceItemType);
                            prepStatem.setString(8, simpleDateFormat.format(timestamp));
                            prepStatem.setString(9, referrerName);
                            prepStatem.setString(10, agent);
                            prepStatem.addBatch();
                            batch_size++;
                            if (batch_size == 10000) {
                                prepStatem.executeBatch();
                                ConnectDB.getConnection().commit();
                                batch_size = 0;
                            }
                        }
                    }
                }
            }
        }
        prepStatem.executeBatch();
        ConnectDB.getConnection().commit();
        stmt.close();
    }
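
    // Illustrative sketch (editor's addition): the shape of the exported Matomo/Piwik JSON that
    // processRepositoryLog() expects, plus the bot filter and epoch-to-UTC conversion it applies.
    // The concrete values (idSite 13, the agent string, the robot pattern) are made-up examples.
    private static void exampleRepositoryLogRecordSketch() throws Exception {
        String sample = "[{\"idSite\":\"13\",\"idVisit\":\"123\",\"country\":\"gr\",\"referrerName\":\"google\","
                + "\"browser\":\"Chrome\",\"actionDetails\":[{\"type\":\"action\",\"url\":\"https://repo.example.org/handle/1\","
                + "\"timestamp\":\"1583064000\",\"customVariables\":{\"1\":{\"customVariablePageValue1\":\"oai:repo.example.org:1\"}}}]}]";
        JSONObject row = (JSONObject) ((JSONArray) new JSONParser().parse(sample)).get(0);

        // COUNTER robots list entries are regular expressions matched against the browser/agent field
        boolean isBot = Pattern.compile("(?i)bot|crawl|spider").matcher(row.get("browser").toString()).find();

        // action timestamps arrive as epoch seconds; the importer stores them as UTC "yyyy-MM-dd HH:mm:ss"
        JSONObject action = (JSONObject) ((JSONArray) row.get("actionDetails")).get(0);
        Timestamp ts = new Timestamp(Long.parseLong(action.get("timestamp").toString()) * 1000);
        SimpleDateFormat utc = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        utc.setTimeZone(TimeZone.getTimeZone("UTC"));
        System.out.println(isBot + " " + utc.format(ts)); // false 2020-03-01 12:00:00
    }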
|

    public void removeDoubleClicks() throws Exception {
        Statement stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

        //clean download double clicks
        String sql = "DELETE FROM piwiklogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp FROM piwiklogtmp p1, piwiklogtmp p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<30 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
        stmt.executeUpdate(sql);
        stmt.close();
        ConnectDB.getConnection().commit();

        stmt = ConnectDB.getConnection().createStatement();

        //clean view double clicks
        sql = "DELETE FROM piwiklogtmp p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from piwiklogtmp p1, piwiklogtmp p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<10 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
        stmt.executeUpdate(sql);
        stmt.close();
        ConnectDB.getConnection().commit();
    }
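
    // Illustrative sketch (editor's addition): the "double click" rule above deletes the earlier of two
    // identical events from the same visit when they fall less than 30 seconds apart for downloads, or
    // less than 10 seconds apart for views. The two timestamps below are invented to show the check.
    private static void exampleDoubleClickWindowSketch() {
        Timestamp first = Timestamp.valueOf("2020-03-01 12:00:05");
        Timestamp second = Timestamp.valueOf("2020-03-01 12:00:20");
        long secondsApart = (second.getTime() - first.getTime()) / 1000;
        System.out.println(secondsApart < 30); // true -> the earlier download row would be removed
    }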
|

    public void viewsStats() throws Exception {
        Statement stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

        //String sql = "CREATE OR REPLACE VIEW result_views_monthly AS SELECT entity_id AS id, COUNT(entity_id) as views, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        String sql = "CREATE OR REPLACE VIEW result_views_monthly_tmp AS SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklogtmp where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        stmt.executeUpdate(sql);

//        sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire INTO views_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        sql = "CREATE TABLE IF NOT EXISTS views_stats_tmp AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire FROM result_views_monthly_tmp p, public.datasource d, public.result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        stmt.executeUpdate(sql);

//        sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count INTO pageviews_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        sql = "CREATE TABLE IF NOT EXISTS pageviews_stats_tmp AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count FROM result_views_monthly_tmp p, public.datasource d, public.result_oids ro where p.source='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        stmt.executeUpdate(sql);

        sql = "DROP VIEW IF EXISTS result_views_monthly_tmp;";
        stmt.executeUpdate(sql);

        stmt.close();
        ConnectDB.getConnection().commit();
        ConnectDB.getConnection().close();
    }

//    public void viewsStats(String piwikid) throws Exception {
//        stmt = ConnectDB.getConnection().createStatement();
//        ConnectDB.getConnection().setAutoCommit(false);
//
//        //String sql = "CREATE OR REPLACE VIEW result_views_monthly AS SELECT entity_id AS id, COUNT(entity_id) as views, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
//        String sql = "CREATE OR REPLACE VIEW result_views_monthly" + piwikid + " AS SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog" + piwikid + " where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
//        stmt.executeUpdate(sql);
//
//        // sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire INTO views_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
//        sql = "CREATE TABLE IF NOT EXISTS views_stats" + piwikid + " AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire FROM result_views_monthly" + piwikid + " p, datasource d, result_oids ro where p.source!='109' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
//        stmt.executeUpdate(sql);
//
////        sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count INTO pageviews_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
//        sql = "CREATE TABLE IF NOT EXISTS pageviews_stats" + piwikid + " AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count FROM result_views_monthly" + piwikid + " p, datasource d, result_oids ro where p.source='109' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
//        stmt.executeUpdate(sql);
//
//        sql = "DROP VIEW IF EXISTS result_views_monthly" + piwikid + ";";
//        stmt.executeUpdate(sql);
//
//        stmt.close();
//        ConnectDB.getConnection().commit();
//        ConnectDB.getConnection().close();
//    }

    private void downloadsStats() throws Exception {
        Statement stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

        //String sql = "CREATE OR REPLACE VIEW result_downloads_monthly as select entity_id AS id, COUNT(entity_id) as downloads, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        String sql = "CREATE OR REPLACE VIEW result_downloads_monthly_tmp as select entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklogtmp where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        stmt.executeUpdate(sql);

        //sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
//        sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count, max(openaire_referrer) AS openaire INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        sql = "CREATE TABLE IF NOT EXISTS downloads_stats_tmp AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count, max(openaire_referrer) AS openaire FROM result_downloads_monthly_tmp p, public.datasource d, public.result_oids ro where p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        stmt.executeUpdate(sql);

        sql = "DROP VIEW IF EXISTS result_downloads_monthly_tmp;";
        stmt.executeUpdate(sql);

        stmt.close();
        ConnectDB.getConnection().commit();
        ConnectDB.getConnection().close();
    }

    public void finalizeStats() throws Exception {
        stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

        Calendar startCalendar = Calendar.getInstance();
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
        Calendar endCalendar = Calendar.getInstance();
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);

//        String sql = "SELECT to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS full_date INTO full_dates FROM generate_series(0, " + diffMonth + ", 1) AS offs;";
        String sql = "CREATE TABLE IF NOT EXISTS full_dates AS SELECT to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS full_date FROM generate_series(0, " + diffMonth + ", 1) AS offs;";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS full_dates_full_date ON full_dates USING btree(full_date);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS views_stats_source ON views_stats USING btree(source);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS views_stats_repository_id ON views_stats USING btree(repository_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS views_stats_result_id ON views_stats USING btree(result_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS views_stats_date ON views_stats USING btree(date);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS pageviews_stats_source ON pageviews_stats USING btree(source);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS pageviews_stats_repository_id ON pageviews_stats USING btree(repository_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS pageviews_stats_result_id ON pageviews_stats USING btree(result_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS pageviews_stats_date ON pageviews_stats USING btree(date);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS downloads_stats_source ON downloads_stats USING btree(source);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS downloads_stats_repository_id ON downloads_stats USING btree(repository_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS downloads_stats_result_id ON downloads_stats USING btree(result_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS downloads_stats_date ON downloads_stats USING btree(date);";
        stmt.executeUpdate(sql);

        sql = "CREATE TABLE IF NOT EXISTS usage_stats AS SELECT coalesce(ds.source, vs.source) as source, coalesce(ds.repository_id, vs.repository_id) as repository_id, coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, coalesce(ds.openaire, 0) as openaire_downloads, coalesce(vs.openaire, 0) as openaire_views FROM downloads_stats AS ds FULL OUTER JOIN views_stats AS vs ON ds.source=vs.source AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date;";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS usage_stats_source ON usage_stats USING btree(source);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS usage_stats_repository_id ON usage_stats USING btree(repository_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS usage_stats_result_id ON usage_stats USING btree(result_id);";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX IF NOT EXISTS usage_stats_date ON usage_stats USING btree(date);";
        stmt.executeUpdate(sql);

        stmt.close();
        ConnectDB.getConnection().commit();
        ConnectDB.getConnection().close();
    }
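
    // Illustrative sketch (editor's addition): how the diffMonth value used by finalizeStats() and the
    // repository stats methods below is derived. The end date is pinned to 2020-03-15 only so the
    // arithmetic is reproducible; the real code uses the current date.
    private static void exampleMonthOffsetSketch() throws Exception {
        Calendar start = Calendar.getInstance();
        start.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
        Calendar end = Calendar.getInstance();
        end.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2020-03-15"));
        int diffYear = end.get(Calendar.YEAR) - start.get(Calendar.YEAR);                     // 4
        int diffMonth = diffYear * 12 + end.get(Calendar.MONTH) - start.get(Calendar.MONTH);  // 4*12 + 2 - 0 = 50
        System.out.println(diffMonth); // 50, so generate_series(0, 50) spans the 51 months from 2016/01 to 2020/03
    }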
|

    //Create repository Views statistics
    private void repositoryViewsStats() throws Exception {
        stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

//        String sql = "SELECT entity_id AS id , COUNT(entity_id) AS number_of_views, timestamp::date AS date, source INTO repo_view_stats FROM piwiklog WHERE source!='5' AND action=\'action\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
        String sql = "CREATE TABLE IF NOT EXISTS repo_view_stats AS SELECT entity_id AS id , COUNT(entity_id) AS number_of_views, timestamp::date AS date, source FROM piwiklog WHERE source!='5' AND action=\'action\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_id ON repo_view_stats USING btree (id)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_date ON repo_view_stats USING btree(date)";
        stmt.executeUpdate(sql);

//        sql = "SELECT roid.id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source INTO repo_view_stats_monthly_clean FROM repo_view_stats rvs, result_oids roid where rvs.id=roid.orid group by roid.id, month, source;";
        sql = "CREATE TABLE IF NOT EXISTS repo_view_stats_monthly_clean AS SELECT roid.id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source FROM repo_view_stats rvs, result_oids roid where rvs.id=roid.orid group by roid.id, month, source;";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_monthly_clean_id ON repo_view_stats_monthly_clean USING btree (id)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_monthly_clean_month ON repo_view_stats_monthly_clean USING btree(month)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_monthly_clean_source ON repo_view_stats_monthly_clean USING btree(source)";
        stmt.executeUpdate(sql);

        Calendar startCalendar = Calendar.getInstance();
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
        Calendar endCalendar = Calendar.getInstance();
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);

        //sql="CREATE OR REPLACE view repo_view_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth +", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
//        sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source INTO repo_view_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
        sql = "CREATE TABLE IF NOT EXISTS repo_view_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_monthly_id ON repo_view_stats_monthly USING btree (id)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_monthly_month ON repo_view_stats_monthly USING btree(month)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_view_stats_monthly_source ON repo_view_stats_monthly USING btree(source)";
        stmt.executeUpdate(sql);

        sql = "CREATE OR REPLACE view repo_view_stats_monthly_sushi AS SELECT id, sum(number_of_views), extract('year' from date) ||'-'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM repo_view_stats group by id, month, source;";
        stmt.executeUpdate(sql);

        stmt.close();
        ConnectDB.getConnection().commit();
        ConnectDB.getConnection().close();
    }

    //Create repository downloads statistics
    private void repositoryDownloadsStats() throws Exception {
        stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

//        String sql = "SELECT entity_id AS id, COUNT(entity_id) AS number_of_downloads, timestamp::date AS date, source INTO repo_download_stats FROM piwiklog WHERE source!='5' AND action=\'download\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
        String sql = "CREATE TABLE IF NOT EXISTS repo_download_stats AS SELECT entity_id AS id, COUNT(entity_id) AS number_of_downloads, timestamp::date AS date, source FROM piwiklog WHERE source!='5' AND action=\'download\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_id ON repo_download_stats USING btree (id)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_date ON repo_download_stats USING btree(date)";
        stmt.executeUpdate(sql);

//        sql = "SELECT roid.id, sum(number_of_downloads), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source INTO repo_download_stats_monthly_clean FROM repo_download_stats rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY roid.id, month, source;";
        sql = "CREATE TABLE IF NOT EXISTS repo_download_stats_monthly_clean AS SELECT roid.id, sum(number_of_downloads), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source FROM repo_download_stats rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY roid.id, month, source;";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_monthly_clean_id ON repo_download_stats_monthly_clean USING btree (id)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_monthly_clean_month ON repo_download_stats_monthly_clean USING btree(month)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_monthly_clean_source ON repo_download_stats_monthly_clean USING btree(source)";
        stmt.executeUpdate(sql);

        Calendar startCalendar = Calendar.getInstance();
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
        Calendar endCalendar = Calendar.getInstance();
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);

        //sql="CREATE OR REPLACE view repo_download_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth +", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
//        sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source INTO repo_download_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
        sql = "CREATE TABLE IF NOT EXISTS repo_download_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_monthly_id ON repo_download_stats_monthly USING btree (id)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_monthly_month ON repo_download_stats_monthly USING btree(month)";
        stmt.executeUpdate(sql);

        sql = "CREATE INDEX repo_download_stats_monthly_source ON repo_download_stats_monthly USING btree(source)";
        stmt.executeUpdate(sql);

        sql = "CREATE OR REPLACE view repo_download_stats_monthly_sushi AS SELECT id, sum(number_of_downloads), extract('year' from date) ||'-'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM repo_download_stats group by id, month, source;";
        stmt.executeUpdate(sql);

        stmt.close();
        ConnectDB.getConnection().commit();
        ConnectDB.getConnection().close();
    }

    // Import OPENAIRE Logs to DB
    public void processPortalLog() throws Exception {
        Statement stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);

        ArrayList<String> jsonFiles = listHdfsDir(this.logPortalPath);
//        File folder = new File(this.logPortalPath);
//        File[] jsonFiles = folder.listFiles();

        PreparedStatement prepStatem = ConnectDB.getConnection().prepareStatement("INSERT INTO process_portal_log_tmp (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
        int batch_size = 0;
        JSONParser parser = new JSONParser();
        for (String jsonFile : jsonFiles) {
            JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));

            for (Object aJsonArray : jsonArray) {
                JSONObject jsonObjectRow = (JSONObject) aJsonArray;
                int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
                String idVisit = jsonObjectRow.get("idVisit").toString();
                String country = jsonObjectRow.get("country").toString();
                String referrerName = jsonObjectRow.get("referrerName").toString();
                String agent = jsonObjectRow.get("browser").toString();
                boolean botFound = false;
                Iterator it = robotsList.iterator();
                while (it.hasNext()) {
                    // Create a Pattern object
                    Pattern r = Pattern.compile(it.next().toString());
                    // Now create matcher object.
                    Matcher m = r.matcher(agent);
                    if (m.find()) {
                        botFound = true;
                        break;
                    }
                }
                if (botFound == false) {
                    JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
                    for (Object actionDetail : actionDetails) {
                        JSONObject actionDetailsObj = (JSONObject) actionDetail;

                        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                        simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
                        Timestamp timestamp = new Timestamp(Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);

                        String action = actionDetailsObj.get("type").toString();
                        String url = actionDetailsObj.get("url").toString();

                        String entityID = processPortalURL(url);
                        String sourceItemType = "";

                        if (entityID.indexOf("|") > 0) {
                            sourceItemType = entityID.substring(0, entityID.indexOf("|"));
                            entityID = entityID.substring(entityID.indexOf("|") + 1);
                        }

                        prepStatem.setInt(1, idSite);
                        prepStatem.setString(2, idVisit);
                        prepStatem.setString(3, country);
                        prepStatem.setString(4, action);
                        prepStatem.setString(5, url);
                        prepStatem.setString(6, entityID);
                        prepStatem.setString(7, sourceItemType);
                        prepStatem.setString(8, simpleDateFormat.format(timestamp));
                        prepStatem.setString(9, referrerName);
                        prepStatem.setString(10, agent);

                        prepStatem.addBatch();
                        batch_size++;
                        if (batch_size == 10000) {
                            prepStatem.executeBatch();
                            ConnectDB.getConnection().commit();
                            batch_size = 0;
                        }
                    }
                }
            }
        }
        prepStatem.executeBatch();
        ConnectDB.getConnection().commit();

        stmt.close();
        ConnectDB.getConnection().close();
    }

    public void portalStats() throws SQLException {
        Connection con = ConnectDB.getConnection();
        Statement stmt = con.createStatement();
        con.setAutoCommit(false);

        String sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'oaItem\', timestamp, referrer_name, agent FROM process_portal_log_tmp, result_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;";
        stmt.executeUpdate(sql);
        stmt.close();
//        con.commit();

        stmt = con.createStatement();
        sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'datasource\', timestamp, referrer_name, agent FROM process_portal_log_tmp, public.datasource_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;";
        stmt.executeUpdate(sql);
        stmt.close();
//        con.commit();

        stmt = con.createStatement();
        sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'organization\', timestamp, referrer_name, agent FROM process_portal_log_tmp, organization_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;";
        stmt.executeUpdate(sql);
        stmt.close();
//        con.commit();

        stmt = con.createStatement();
        sql = "INSERT INTO piwiklogtmp SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'project\', timestamp, referrer_name, agent FROM process_portal_log_tmp, project_oids roid WHERE entity_id IS NOT null AND entity_id=roid.orid AND roid.orid IS NOT null;";
        stmt.executeUpdate(sql);
        stmt.close();
//        con.commit();

        con.close();
    }
|

    private void cleanOAI() throws Exception {
        ConnectDB.getConnection().setAutoCommit(false);

        // Normalise OAI identifiers harvested with a '/' instead of a ':' after the repository
        // hostname, e.g. oai:repositorio.chlc.min-saude.pt/123 -> oai:repositorio.chlc.min-saude.pt:123.
        // Each hostname gets its own UPDATE statement and commit.
        String[] tmpTableHosts = {
            "repositorio.chlc.min-saude.pt", "repositorio.hospitaldebraga.pt", "bibliotecadigital.ipb.pt",
            "repositorio.uac.pt", "repositorio.insa.pt", "repositorio.ipcb.pt", "repositorio.ispa.pt",
            "repositorio.chporto.pt", "repositorio.ucp.pt", "rihuc.huc.min-saude.pt", "repositorio.ipv.pt",
            "www.repository.utl.pt", "run.unl.pt", "sapientia.ualg.pt", "repositorio.ipsantarem.pt",
            "arca.igc.gulbenkian.pt", "ubibliorum.ubi.pt", "digituma.uma.pt", "repositorio.ul.pt",
            "repositorio.hff.min-saude.pt", "repositorium.sdum.uminho.pt", "recipp.ipp.pt",
            "bdigital.ufp.pt", "repositorio.lneg.pt", "iconline.ipleiria.pt", "comum.rcaap.pt"
        };
        for (String host : tmpTableHosts) {
            fixOaiPrefix("piwiklogtmp", host);
        }

        // These three repositories are fixed on the production piwiklog table rather than piwiklogtmp.
        String[] prodTableHosts = {"repositorio.ipl.pt", "repositorio.ismai.pt", "repositorioaberto.uab.pt"};
        for (String host : prodTableHosts) {
            fixOaiPrefix("piwiklog", host);
        }

        ConnectDB.getConnection().close();
    }

    // Rewrites 'oai:<host>/' prefixes to 'oai:<host>:' in the entity_id column of the given log table.
    private void fixOaiPrefix(String table, String host) throws Exception {
        stmt = ConnectDB.getConnection().createStatement();
        String sql = "UPDATE " + table + " SET entity_id = regexp_replace(entity_id, '^oai:" + host + "/','oai:" + host + ":') WHERE entity_id LIKE 'oai:" + host + "/%';";
        stmt.executeUpdate(sql);
        stmt.close();
        ConnectDB.getConnection().commit();
    }
|

    private String processPortalURL(String url) {

        if (url.indexOf("explore.openaire.eu") > 0) {
            try {
                url = URLDecoder.decode(url, "UTF-8");
            } catch (Exception e) {
                log.info(url);
            }
            if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) {
                url = "datasource|" + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59);
            } else if (url.indexOf("datasource=") > 0 && url.substring(url.indexOf("datasource=") + 11).length() >= 46) {
                url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57);
            } else if (url.indexOf("datasourceFilter=") > 0 && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) {
                url = "datasource|" + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63);
            } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) {
                url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56);
            } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) {
                url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56);
            } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 && !url.contains("oai:dnet:corda")) {
                url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56);
            } else if (url.indexOf("organizationId=") > 0 && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) {
                url = "organization|" + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61);
            } else {
                url = "";
            }
        } else {
            url = "";
        }

        return url;
    }
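
    // Illustrative sketch (editor's addition): what processPortalURL() returns for a portal URL.
    // The identifier below is a made-up 46-character value; the method always copies exactly 46
    // characters after the query parameter, which appears to match the length of OpenAIRE entity ids.
    private void exampleProcessPortalUrlSketch() {
        String id = "dedup_wf_001::0123456789abcdef0123456789abcdef";               // hypothetical 46-char id
        String url = "https://explore.openaire.eu/search/publication?articleId=" + id;
        System.out.println(processPortalURL(url));                                  // result|dedup_wf_001::0123456789abcdef0123456789abcdef
        System.out.println(processPortalURL("https://example.org/other"));          // "" (non-portal URLs are dropped)
    }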
|

    private void updateProdTables() throws SQLException {
        Statement stmt = ConnectDB.getConnection().createStatement();
        ConnectDB.getConnection().setAutoCommit(false);
        String sql = "insert into piwiklog select * from piwiklogtmp;";
        stmt.executeUpdate(sql);

        sql = "insert into views_stats select * from views_stats_tmp;";
        stmt.executeUpdate(sql);

        sql = "insert into downloads_stats select * from downloads_stats_tmp;";
        stmt.executeUpdate(sql);

        sql = "insert into pageviews_stats select * from pageviews_stats_tmp;";
        stmt.executeUpdate(sql);

        sql = "DROP TABLE IF EXISTS piwiklogtmp;";
        stmt.executeUpdate(sql);

        sql = "DROP TABLE IF EXISTS views_stats_tmp;";
        stmt.executeUpdate(sql);

        sql = "DROP TABLE IF EXISTS downloads_stats_tmp;";
        stmt.executeUpdate(sql);

        sql = "DROP TABLE IF EXISTS pageviews_stats_tmp;";
        stmt.executeUpdate(sql);

        stmt.close();
        ConnectDB.getConnection().commit();
        ConnectDB.getConnection().close();

        log.info("updateProdTables done");
    }
|

    private ArrayList<String> listHdfsDir(String dir) throws Exception {

        FileSystem hdfs = FileSystem.get(new Configuration());
        RemoteIterator<LocatedFileStatus> Files;
        ArrayList<String> fileNames = new ArrayList<>();

        try {
            Path exportPath = new Path(hdfs.getUri() + dir);
            Files = hdfs.listFiles(exportPath, false);
            while (Files.hasNext()) {
                String fileName = Files.next().getPath().toString();
                fileNames.add(fileName);
            }

            hdfs.close();
        } catch (Exception e) {
            log.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath));
            throw new Exception("HDFS file path with exported data does not exist : " + logPath, e);
        }

        return fileNames;
    }

    private String readHDFSFile(String filename) throws Exception {
        String result;
        try {

            FileSystem fs = FileSystem.get(new Configuration());
            //log.info("reading file : " + filename);

            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));

            StringBuilder sb = new StringBuilder();
            String line = br.readLine();

            while (line != null) {
                if (!line.equals("[]")) {
                    sb.append(line);
                }
                //sb.append(line);
                line = br.readLine();
            }
            result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
            if (result.equals("")) {
                result = "[]";
            }

            //fs.close();
        } catch (Exception e) {
            log.error(e);
            throw new Exception(e);
        }

        return result;
    }
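
    // Illustrative sketch (editor's addition): why readHDFSFile() replaces "][{\"idSite\"" with
    // ",{\"idSite\"". Each exported file can hold several JSON arrays back to back; the replacement
    // splices them into one parseable array. The two-record input below is invented.
    private static void exampleConcatenatedExportSketch() throws Exception {
        String export = "[{\"idSite\":\"13\",\"idVisit\":\"1\"}][{\"idSite\":\"13\",\"idVisit\":\"2\"}]";
        String merged = export.replace("][{\"idSite\"", ",{\"idSite\"");
        JSONArray rows = (JSONArray) new JSONParser().parse(merged);
        System.out.println(rows.size()); // 2
    }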
|

    private Connection getConnection() throws SQLException {
        return ConnectDB.getConnection();
    }
}