Project

General

Profile

1
package eu.dnetlib.usagestats.export;
2

    
3
import java.io.*;
4
import java.net.URLDecoder;
5
import java.sql.Connection;
6
import java.sql.DriverManager;
7
import java.sql.PreparedStatement;
8
import java.sql.Statement;
9
import java.sql.Timestamp;
10
import java.text.SimpleDateFormat;
11
import java.util.*;
12

    
13
import org.apache.hadoop.conf.Configuration;
14
import org.apache.hadoop.fs.LocatedFileStatus;
15
import org.apache.hadoop.fs.Path;
16
import org.apache.hadoop.fs.FileSystem;
17
import org.apache.hadoop.fs.RemoteIterator;
18
import org.apache.log4j.Logger;
19
import org.json.simple.JSONArray;
20
import org.json.simple.JSONObject;
21
import org.json.simple.parser.JSONParser;
22

    
23
public class PiwikStatsDB {
    // JDBC connection settings for the usage-statistics database.
    private final String dbUrl;
    // Target schema; hard-coded to "shadow" in the constructor.
    private final String dbSchema;
    private final String dbUserName;
    private final String dbPassword;
    // Base HDFS path under which the piwik log files live (e.g. <logPath>repolog).
    private final String logPath;

    // Shared JDBC handles. Methods close them when done and re-open via
    // connectDB() after checking conn.isClosed().
    private Connection conn = null;
    private Statement stmt = null;

    private final Logger log = Logger.getLogger(this.getClass());
34

    
35
    /**
     * Creates the exporter: opens a connection against the fixed "shadow"
     * schema and ensures the usage tables exist.
     *
     * @param dbUrl      JDBC url of the statistics database
     * @param dbUserName database user
     * @param dbPassword database password
     * @param logPath    base HDFS path containing the piwik logs
     * @throws Exception if connecting to the database or creating tables fails
     */
    public PiwikStatsDB(String dbUrl, String dbUserName, String dbPassword, String logPath) throws Exception {
        this.dbUrl = dbUrl;
        this.dbSchema = "shadow";
        this.dbUserName = dbUserName;
        this.dbPassword = dbPassword;
        this.logPath = logPath;

        connectDB();
        createTables();
    }
45

    
46
    private void connectDB() throws Exception {
47
        try {
48
            Class.forName("org.postgresql.Driver");
49
            conn = DriverManager.getConnection(dbUrl, dbUserName, dbPassword);
50

    
51
            stmt = conn.createStatement();
52
            String sqlSetSearchPath = "SET search_path TO " + dbSchema + ";";
53
            stmt.executeUpdate(sqlSetSearchPath);
54

    
55
            //log.info("Opened database successfully");
56
        } catch (Exception e) {
57
            log.error("Connect to db failed: " + e);
58
            throw new Exception("Failed to connect to db: " + e.toString(), e);
59
        }
60
    }
61

    
62
    private void createTables() throws Exception {
63
        try {
64
            stmt = conn.createStatement();
65
            String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS piwiklog(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));";
66
            String sqlcreateRulePiwikLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +
67
                    " ON INSERT TO piwiklog " +
68
                    " WHERE (EXISTS ( SELECT piwiklog.source, piwiklog.id_visit," +
69
                    "piwiklog.action, piwiklog.\"timestamp\", piwiklog.entity_id " +
70
                    "FROM piwiklog " +
71
                    "WHERE piwiklog.source = new.source AND piwiklog.id_visit = new.id_visit AND piwiklog.action = new.action AND piwiklog.entity_id = new.entity_id AND piwiklog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
72
            stmt.executeUpdate(sqlCreateTablePiwikLog);
73
            stmt.executeUpdate(sqlcreateRulePiwikLog);
74

    
75
            String sqlCopyPublicPiwiklog="insert into piwiklog select * from public.piwiklog;";
76
            stmt.executeUpdate(sqlCopyPublicPiwiklog);
77

    
78
            String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS process_portal_log(source INTEGER, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, timestamp));";
79
            String sqlcreateRulePortalLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +
80
                    " ON INSERT TO process_portal_log " +
81
                    " WHERE (EXISTS ( SELECT process_portal_log.source, process_portal_log.id_visit," +
82
                    "process_portal_log.\"timestamp\" " +
83
                    "FROM process_portal_log " +
84
                    "WHERE process_portal_log.source = new.source AND process_portal_log.id_visit = new.id_visit AND process_portal_log.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;";
85
            stmt.executeUpdate(sqlCreateTablePortalLog);
86
            stmt.executeUpdate(sqlcreateRulePortalLog);
87

    
88
            stmt.close();
89
            conn.close();
90
            log.info("Usage Tables Created");
91

    
92
        } catch (Exception e) {
93
            log.error("Failed to create tables: " + e);
94
            throw new Exception("Failed to create tables: " + e.toString(), e);
95
            //System.exit(0);
96
        }
97
    }
98

    
99
    /**
     * Runs the full log-processing pipeline: imports repository logs, removes
     * double clicks, normalizes OAI identifiers, then imports portal logs and
     * derives portal statistics.
     *
     * @throws Exception if any stage of the pipeline fails
     */
    protected void processLogs() throws Exception {
        try {
            processRepositoryLog();
            log.info("repository process done");
            removeDoubleClicks();
            log.info("removing double clicks done");
            cleanOAI();
            log.info("cleaning oai done");

            processPortalLog();
            log.info("portal process done");
            portalStats();
            log.info("portal stats done");
        } catch (Exception e) {
            log.error("Failed to process logs: " + e);
            throw new Exception("Failed to process logs: " + e.toString(), e);
        }
    }
117

    
118
    /**
     * Builds the statistics tables from the imported logs. Only the views and
     * downloads statistics are currently generated; the other steps are
     * intentionally disabled (kept below for reference).
     *
     * @throws Exception if creating the statistics fails
     */
    protected void usageStats() throws Exception {
        try {
            //resultStats();
            //dataSourceStats();
            //organizationsStats();
            //projectsStats();
            //repositoryViewsStats();
            //repositoryDownloadsStats();
            viewsStats();
            downloadsStats();
            log.info("stat tables and views done");
        } catch (Exception e) {
            log.error("Failed to create usage stats: " + e);
            throw new Exception("Failed to create usage stats: " + e.toString(), e);
        }
    }
134

    
135
    // Import repository Logs to DB
136
    private void processRepositoryLog() throws Exception {
137
        if (conn.isClosed())
138
            connectDB();
139

    
140
        stmt = conn.createStatement();
141
        conn.setAutoCommit(false);
142

    
143
        ArrayList<String> jsonFiles = listHdfsDir(logPath + "repolog");
144

    
145
        PreparedStatement prepStatem = conn.prepareStatement("INSERT INTO piwiklog (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
146
        int batch_size = 0;
147
        for (String jsonFile : jsonFiles) {
148

    
149
            log.debug("Processing log file: " + jsonFile);
150

    
151
            JSONParser parser = new JSONParser();
152
            JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
153

    
154
            for (Object aJsonArray : jsonArray) {
155
                JSONObject jsonObjectRow = (JSONObject) aJsonArray;
156
                int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
157
                String idVisit = jsonObjectRow.get("idVisit").toString();
158
                String country = jsonObjectRow.get("country").toString();
159
                String referrerName = jsonObjectRow.get("referrerName").toString();
160
                String agent = jsonObjectRow.get("browser").toString();
161

    
162
                String sourceItemType = "repItem";
163

    
164
                JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
165
                for (Object actionDetail : actionDetails) {
166
                    JSONObject actionDetailsObj = (JSONObject) actionDetail;
167

    
168
                    if (actionDetailsObj.get("customVariables") != null) {
169
                        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
170
                        simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
171
                        Timestamp timestamp = new Timestamp(Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
172
                        String url = actionDetailsObj.get("url").toString();
173
                        String oaipmh = ((JSONObject) ((JSONObject) actionDetailsObj.get("customVariables")).get("1")).get("customVariablePageValue1").toString();
174
                        String action = actionDetailsObj.get("type").toString();
175

    
176
                        prepStatem.setInt(1, idSite);
177
                        prepStatem.setString(2, idVisit);
178
                        prepStatem.setString(3, country);
179
                        prepStatem.setString(4, action);
180
                        prepStatem.setString(5, url);
181
                        prepStatem.setString(6, oaipmh);
182
                        prepStatem.setString(7, sourceItemType);
183
                        prepStatem.setString(8, simpleDateFormat.format(timestamp));
184
                        prepStatem.setString(9, referrerName);
185
                        prepStatem.setString(10, agent);
186
                        prepStatem.addBatch();
187
                        batch_size++;
188
                        if (batch_size == 10000) {
189
                            prepStatem.executeBatch();
190
                            conn.commit();
191
                            batch_size = 0;
192
                        }
193
                    }
194
                }
195
            }
196
        }
197
        prepStatem.executeBatch();
198
        conn.commit();
199
        stmt.close();
200
        conn.close();
201
    }
202

    
203
    /**
     * Deletes "double click" rows from {@code piwiklog}: a download repeated
     * within 30 seconds, or a view ("action") repeated within 10 seconds, of an
     * earlier identical event from the same visit. Rows from source '5' (the
     * portal) are excluded; the earlier of each duplicate pair is removed.
     *
     * @throws Exception on database errors
     */
    private void removeDoubleClicks() throws Exception {
        if (conn.isClosed())
            connectDB();

        stmt = conn.createStatement();
        conn.setAutoCommit(false);

        //clean download double clicks
        String sql = "DELETE FROM piwiklog p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp FROM piwiklog p1, piwiklog p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<30 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
        stmt.executeUpdate(sql);
        stmt.close();
        conn.commit();

        stmt = conn.createStatement();

        //clean view double clicks
        sql = "DELETE FROM piwiklog p WHERE EXISTS (SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp from piwiklog p1, piwiklog p2 WHERE p1.source!='5' AND p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id AND p1.action=p2.action AND p1.action='action' AND p1.timestamp!=p2.timestamp AND p1.timestamp<p2.timestamp AND extract(EPOCH FROM p2.timestamp::timestamp-p1.timestamp::timestamp)<10 AND p.source=p1.source AND p.id_visit=p1.id_visit AND p.action=p1.action AND p.entity_id=p1.entity_id AND p.timestamp=p1.timestamp);";
        stmt.executeUpdate(sql);
        stmt.close();
        conn.commit();
        conn.close();
    }
225

    
226
    /**
     * Builds the view-count statistics tables. A temporary monthly-aggregate
     * view over {@code piwiklog} feeds {@code views_stats} (repository sources,
     * i.e. source != '5') and {@code pageviews_stats} (portal, source = '5');
     * the helper view is dropped afterwards, so statement order matters here.
     *
     * @throws Exception on database errors
     */
    private void viewsStats() throws Exception {
        if (conn.isClosed())
            connectDB();

        stmt = conn.createStatement();
        conn.setAutoCommit(false);

        //String sql = "CREATE OR REPLACE VIEW result_views_monthly AS SELECT entity_id AS id, COUNT(entity_id) as views, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        // Monthly view counts per result, also counting openaire-referred views.
        String sql = "CREATE OR REPLACE VIEW result_views_monthly AS SELECT entity_id AS id, COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        stmt.executeUpdate(sql);

        // sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire INTO views_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        sql = "CREATE TABLE views_stats AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count, max(openaire_referrer) AS openaire FROM result_views_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        stmt.executeUpdate(sql);

//        sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count INTO pageviews_stats FROM result_views_monthly p, datasource d, result_oids ro where p.source='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        sql = "CREATE TABLE pageviews_stats AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count FROM result_views_monthly p, datasource d, result_oids ro where p.source='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        stmt.executeUpdate(sql);

        // Helper view no longer needed once the tables are materialized.
        sql = "DROP VIEW IF EXISTS result_views_monthly;";
        stmt.executeUpdate(sql);

        stmt.close();
        conn.commit();
        conn.close();
    }
252

    
253
    /**
     * Builds the download-count statistics table {@code downloads_stats} from a
     * temporary monthly-aggregate view over {@code piwiklog} (repository
     * sources only, source != '5'); the helper view is dropped afterwards, so
     * statement order matters here.
     *
     * @throws Exception on database errors
     */
    private void downloadsStats() throws Exception {
        if (conn.isClosed())
            connectDB();

        stmt = conn.createStatement();
        conn.setAutoCommit(false);

        //String sql = "CREATE OR REPLACE VIEW result_downloads_monthly as select entity_id AS id, COUNT(entity_id) as downloads, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        // Monthly download counts per result, also counting openaire-referred downloads.
        String sql = "CREATE OR REPLACE VIEW result_downloads_monthly as select entity_id AS id, COUNT(entity_id) as downloads, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, extract('year' from timestamp::date) ||'/'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') AS month, source FROM piwiklog where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
        stmt.executeUpdate(sql);

        //sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
//        sql = "SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count, max(openaire_referrer) AS openaire INTO downloads_stats FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        sql = "CREATE TABLE downloads_stats AS SELECT 'OpenAIRE'::TEXT as source, d.id as repository_id, ro.id as result_id, month as date, max(downloads) AS count, max(openaire_referrer) AS openaire FROM result_downloads_monthly p, datasource d, result_oids ro where p.source!='5' AND p.source=d.piwik_id and p.id=ro.orid group by repository_id, result_id, date ORDER BY repository_id, result_id, date;";
        stmt.executeUpdate(sql);

        // Helper view no longer needed once the table is materialized.
        sql = "DROP VIEW IF EXISTS result_downloads_monthly;";
        stmt.executeUpdate(sql);

        stmt.close();
        conn.commit();
        conn.close();
    }
276

    
277
    public void finalizeStats() throws Exception {
278
        if (conn.isClosed())
279
            connectDB();
280

    
281
        stmt = conn.createStatement();
282
        conn.setAutoCommit(false);
283

    
284
        Calendar startCalendar = Calendar.getInstance();
285
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
286
        Calendar endCalendar = Calendar.getInstance();
287
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
288
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
289

    
290
//        String sql = "SELECT to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS full_date INTO full_dates FROM generate_series(0, " + diffMonth + ", 1) AS offs;";
291
        String sql = "CREATE TABLE full_dates AS SELECT to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS full_date FROM generate_series(0, " + diffMonth + ", 1) AS offs;";
292
        stmt.executeUpdate(sql);
293

    
294
        sql = "CREATE INDEX full_dates_full_date ON full_dates USING btree(full_date);";
295
        stmt.executeUpdate(sql);
296

    
297

    
298
        sql = "CREATE INDEX views_stats_source ON views_stats USING btree(source);";
299
        stmt.executeUpdate(sql);
300

    
301
        sql = "CREATE INDEX views_stats_repository_id ON views_stats USING btree(repository_id);";
302
        stmt.executeUpdate(sql);
303

    
304
        sql = "CREATE INDEX views_stats_result_id ON views_stats USING btree(result_id);";
305
        stmt.executeUpdate(sql);
306

    
307
        sql = "CREATE INDEX views_stats_date ON views_stats USING btree(date);";
308
        stmt.executeUpdate(sql);
309

    
310

    
311
        sql = "CREATE INDEX pageviews_stats_source ON pageviews_stats USING btree(source);";
312
        stmt.executeUpdate(sql);
313

    
314
        sql = "CREATE INDEX pageviews_stats_repository_id ON pageviews_stats USING btree(repository_id);";
315
        stmt.executeUpdate(sql);
316

    
317
        sql = "CREATE INDEX pageviews_stats_result_id ON pageviews_stats USING btree(result_id);";
318
        stmt.executeUpdate(sql);
319

    
320
        sql = "CREATE INDEX pageviews_stats_date ON pageviews_stats USING btree(date);";
321
        stmt.executeUpdate(sql);
322

    
323

    
324
        sql = "CREATE INDEX downloads_stats_source ON downloads_stats USING btree(source);";
325
        stmt.executeUpdate(sql);
326

    
327
        sql = "CREATE INDEX downloads_stats_repository_id ON downloads_stats USING btree(repository_id);";
328
        stmt.executeUpdate(sql);
329

    
330
        sql = "CREATE INDEX downloads_stats_result_id ON downloads_stats USING btree(result_id);";
331
        stmt.executeUpdate(sql);
332

    
333
        sql = "CREATE INDEX downloads_stats_date ON downloads_stats USING btree(date);";
334
        stmt.executeUpdate(sql);
335

    
336
//        sql = "SELECT coalesce(ds.source, vs.source) as source, coalesce(ds.repository_id, vs.repository_id) as repository_id, coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, coalesce(ds.openaire, 0) as openaire_downloads, coalesce(vs.openaire, 0) as openaire_views INTO usage_stats FROM downloads_stats AS ds FULL OUTER JOIN views_stats AS vs ON ds.source=vs.source AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date;";
337
        sql = "CREATE TABLE usage_stats AS SELECT coalesce(ds.source, vs.source) as source, coalesce(ds.repository_id, vs.repository_id) as repository_id, coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, coalesce(ds.openaire, 0) as openaire_downloads, coalesce(vs.openaire, 0) as openaire_views FROM downloads_stats AS ds FULL OUTER JOIN views_stats AS vs ON ds.source=vs.source AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date;";
338
        stmt.executeUpdate(sql);
339

    
340
        sql = "CREATE INDEX usage_stats_source ON usage_stats USING btree(source);";
341
        stmt.executeUpdate(sql);
342

    
343
        sql = "CREATE INDEX usage_stats_repository_id ON usage_stats USING btree(repository_id);";
344
        stmt.executeUpdate(sql);
345

    
346
        sql = "CREATE INDEX usage_stats_result_id ON usage_stats USING btree(result_id);";
347
        stmt.executeUpdate(sql);
348

    
349
        sql = "CREATE INDEX usage_stats_date ON usage_stats USING btree(date);";
350
        stmt.executeUpdate(sql);
351

    
352
        stmt.close();
353
        conn.commit();
354
        conn.close();
355
    }
356

    
357
    //views stats
358
//    private void viewsStatsOLD() throws Exception {
359
//        if (conn.isClosed())
360
//            connectDB();
361
//
362
//        stmt = conn.createStatement();
363
//        conn.setAutoCommit(false);
364
//
365
//        Calendar startCalendar = Calendar.getInstance();
366
//        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
367
//        Calendar endCalendar = Calendar.getInstance();
368
//        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
369
//        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
370
//
371
//        String sql = "CREATE OR REPLACE VIEW result_views_monthly as select entity_id AS id, COUNT(entity_id) as views, extract('year' from timestamp::date) ||'-'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM piwiklog where action='action' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
372
//        stmt.executeUpdate(sql);
373
//
374
//        sql = "SELECT d.id, d.new_date AS month, CASE when rdm.views IS NULL THEN 0 ELSE rdm.views END, d.source INTO result_views_sushi FROM (SELECT distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date, rdsm.source FROM generate_series(0, " + diffMonth + ", 1) AS offs, result_views_monthly rdsm) d LEFT JOIN (SELECT id, month, views, source FROM result_views_monthly) rdm ON d.new_date=rdm.month AND d.id=rdm.id AND d.source=rdm.source ORDER BY d.source, d.id, d.new_date;";
375
//        stmt.executeUpdate(sql);
376
//
377
//        sql = "CREATE INDEX result_views_sushi_id ON result_views_sushi USING btree (id);";
378
//        stmt.executeUpdate(sql);
379
//
380
//        sql = "CREATE INDEX result_views_sushi_month ON result_views_sushi USING btree (month);";
381
//        stmt.executeUpdate(sql);
382
//
383
//        sql = "CREATE INDEX result_views_sushi_source ON result_views_sushi USING btree (source);";
384
//        stmt.executeUpdate(sql);
385
//
386
//        sql = "SELECT roid.id, extract('year' from month::date) ||'/'|| LPAD(CAST(extract('month' from month::date) AS VARCHAR), 2, '0') as month, max(views) as views, source INTO result_views FROM result_views_sushi rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY source, roid.id, month ORDER BY source, roid.id, month;";
387
//        stmt.executeUpdate(sql);
388
//
389
//        sql = "CREATE INDEX result_views_id ON result_views USING btree (id);";
390
//        stmt.executeUpdate(sql);
391
//
392
//        sql = "CREATE INDEX result_views_month ON result_views USING btree (month);";
393
//        stmt.executeUpdate(sql);
394
//
395
//        sql = "CREATE INDEX result_views_source ON result_views USING btree (source);";
396
//        stmt.executeUpdate(sql);
397
//
398
//
399
//        sql = "DROP VIEW IF EXISTS result_views_monthly;";
400
//        stmt.executeUpdate(sql);
401
//
402
//        stmt.close();
403
//        conn.commit();
404
//        conn.close();
405
//    }
406

    
407
    //downloads stats
408
//    private void downloadsStatsOLD() throws Exception {
409
//        if (conn.isClosed())
410
//            connectDB();
411
//
412
//        stmt = conn.createStatement();
413
//        conn.setAutoCommit(false);
414
//
415
//        Calendar startCalendar = Calendar.getInstance();
416
//        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
417
//        Calendar endCalendar = Calendar.getInstance();
418
//        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
419
//        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
420
//
421
//        String sql = "CREATE OR REPLACE VIEW result_downloads_monthly as select entity_id AS id, COUNT(entity_id) as downloads, extract('year' from timestamp::date) ||'-'|| LPAD(CAST(extract('month' from timestamp::date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM piwiklog where action='download' and (source_item_type='oaItem' or source_item_type='repItem') group by id, month, source order by source, id, month;";
422
//        stmt.executeUpdate(sql);
423
//
424
//        sql = "SELECT d.id, d.new_date AS month, CASE when rdm.downloads IS NULL THEN 0 ELSE rdm.downloads END, d.source INTO result_downloads_sushi FROM (SELECT distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date, rdsm.source FROM generate_series(0, " + diffMonth + ", 1) AS offs, result_downloads_monthly rdsm) d LEFT JOIN (SELECT id, month, downloads, source FROM result_downloads_monthly) rdm ON d.new_date=rdm.month AND d.id=rdm.id AND d.source=rdm.source ORDER BY d.source, d.id, d.new_date;";
425
//        stmt.executeUpdate(sql);
426
//
427
//        sql = "CREATE INDEX result_downloads_sushi_id ON result_downloads_sushi USING btree (id);";
428
//        stmt.executeUpdate(sql);
429
//
430
//        sql = "CREATE INDEX result_downloads_sushi_month ON result_downloads_sushi USING btree (month);";
431
//        stmt.executeUpdate(sql);
432
//
433
//        sql = "CREATE INDEX result_downloads_sushi_source ON result_downloads_sushi USING btree (source);";
434
//        stmt.executeUpdate(sql);
435
//
436
//        sql = "SELECT roid.id, extract('year' from month::date) ||'/'|| LPAD(CAST(extract('month' from month::date) AS VARCHAR), 2, '0') as month, downloads, source INTO result_downloads FROM result_downloads_sushi rvs, result_oids roid WHERE rvs.id=roid.orid ORDER BY source, roid.id, month;";
437
//        stmt.executeUpdate(sql);
438
//
439
//        sql = "CREATE INDEX result_downloads_id ON result_downloads USING btree (id);";
440
//        stmt.executeUpdate(sql);
441
//
442
//        sql = "CREATE INDEX result_downloads_month ON result_downloads USING btree (month);";
443
//        stmt.executeUpdate(sql);
444
//
445
//        sql = "CREATE INDEX result_downloads_source ON result_downloads USING btree (source);";
446
//        stmt.executeUpdate(sql);
447
//
448
//
449
//        sql = "DROP VIEW IF EXISTS result_downloads_monthly;";
450
//        stmt.executeUpdate(sql);
451
//
452
//        stmt.close();
453
//        conn.commit();
454
//        conn.close();
455
//    }
456

    
457
    //Create repository Views statistics
458
    private void repositoryViewsStats() throws Exception {
459
        if (conn.isClosed())
460
            connectDB();
461

    
462
        stmt = conn.createStatement();
463
        conn.setAutoCommit(false);
464

    
465
//        String sql = "SELECT entity_id AS id , COUNT(entity_id) AS number_of_views, timestamp::date AS date, source INTO repo_view_stats FROM piwiklog WHERE source!='5' AND action=\'action\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
466
        String sql = "CREATE TABLE repo_view_stats AS SELECT entity_id AS id , COUNT(entity_id) AS number_of_views, timestamp::date AS date, source FROM piwiklog WHERE source!='5' AND action=\'action\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
467
        stmt.executeUpdate(sql);
468

    
469
        sql = "CREATE INDEX repo_view_stats_id ON repo_view_stats USING btree (id)";
470
        stmt.executeUpdate(sql);
471

    
472
        sql = "CREATE INDEX repo_view_stats_date ON repo_view_stats USING btree(date)";
473
        stmt.executeUpdate(sql);
474

    
475
//        sql = "SELECT roid.id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source INTO repo_view_stats_monthly_clean FROM repo_view_stats rvs, result_oids roid where rvs.id=roid.orid group by roid.id, month, source;";
476
        sql = "CREATE TABLE repo_view_stats_monthly_clean AS SELECT roid.id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source FROM repo_view_stats rvs, result_oids roid where rvs.id=roid.orid group by roid.id, month, source;";
477
        stmt.executeUpdate(sql);
478

    
479
        sql = "CREATE INDEX repo_view_stats_monthly_clean_id ON repo_view_stats_monthly_clean USING btree (id)";
480
        stmt.executeUpdate(sql);
481

    
482
        sql = "CREATE INDEX repo_view_stats_monthly_clean_month ON repo_view_stats_monthly_clean USING btree(month)";
483
        stmt.executeUpdate(sql);
484

    
485
        sql = "CREATE INDEX repo_view_stats_monthly_clean_source ON repo_view_stats_monthly_clean USING btree(source)";
486
        stmt.executeUpdate(sql);
487

    
488
        Calendar startCalendar = Calendar.getInstance();
489
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
490
        Calendar endCalendar = Calendar.getInstance();
491
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
492
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
493

    
494
        //sql="CREATE OR REPLACE view repo_view_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth +", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
495
//        sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source INTO repo_view_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
496
        sql = "CREATE TABLE repo_view_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_view_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_view_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
497
        stmt.executeUpdate(sql);
498

    
499
        sql = "CREATE INDEX repo_view_stats_monthly_id ON repo_view_stats_monthly USING btree (id)";
500
        stmt.executeUpdate(sql);
501

    
502
        sql = "CREATE INDEX repo_view_stats_monthly_month ON repo_view_stats_monthly USING btree(month)";
503
        stmt.executeUpdate(sql);
504

    
505
        sql = "CREATE INDEX repo_view_stats_monthly_source ON repo_view_stats_monthly USING btree(source)";
506
        stmt.executeUpdate(sql);
507

    
508
        sql = "CREATE OR REPLACE view repo_view_stats_monthly_sushi AS SELECT id, sum(number_of_views), extract('year' from date) ||'-'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM repo_view_stats group by id, month, source;";
509
        stmt.executeUpdate(sql);
510

    
511
        stmt.close();
512
        conn.commit();
513
        conn.close();
514
    }
515

    
516
    //Create repository downloads statistics
517
    private void repositoryDownloadsStats() throws Exception {
518
        if (conn.isClosed())
519
            connectDB();
520

    
521
        stmt = conn.createStatement();
522
        conn.setAutoCommit(false);
523

    
524
//        String sql = "SELECT entity_id AS id, COUNT(entity_id) AS number_of_downloads, timestamp::date AS date, source INTO repo_download_stats FROM piwiklog WHERE source!='5' AND action=\'download\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
525
        String sql = "CREATE TABLE repo_download_stats AS SELECT entity_id AS id, COUNT(entity_id) AS number_of_downloads, timestamp::date AS date, source FROM piwiklog WHERE source!='5' AND action=\'download\' AND source_item_type=\'repItem\' GROUP BY entity_id, date, source ORDER BY entity_id, date ASC, COUNT(entity_id) DESC;";
526
        stmt.executeUpdate(sql);
527

    
528
        sql = "CREATE INDEX repo_download_stats_id ON repo_download_stats USING btree (id)";
529
        stmt.executeUpdate(sql);
530

    
531
        sql = "CREATE INDEX repo_download_stats_date ON repo_download_stats USING btree(date)";
532
        stmt.executeUpdate(sql);
533

    
534
//        sql = "SELECT roid.id, sum(number_of_downloads), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source INTO repo_download_stats_monthly_clean FROM repo_download_stats rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY roid.id, month, source;";
535
        sql = "CREATE TABLE repo_download_stats_monthly_clean AS SELECT roid.id, sum(number_of_downloads), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month, source FROM repo_download_stats rvs, result_oids roid WHERE rvs.id=roid.orid GROUP BY roid.id, month, source;";
536
        stmt.executeUpdate(sql);
537

    
538
        sql = "CREATE INDEX repo_download_stats_monthly_clean_id ON repo_download_stats_monthly_clean USING btree (id)";
539
        stmt.executeUpdate(sql);
540

    
541
        sql = "CREATE INDEX repo_download_stats_monthly_clean_month ON repo_download_stats_monthly_clean USING btree(month)";
542
        stmt.executeUpdate(sql);
543

    
544
        sql = "CREATE INDEX repo_download_stats_monthly_clean_source ON repo_download_stats_monthly_clean USING btree(source)";
545
        stmt.executeUpdate(sql);
546

    
547
        Calendar startCalendar = Calendar.getInstance();
548
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
549
        Calendar endCalendar = Calendar.getInstance();
550
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
551
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
552

    
553
        //sql="CREATE OR REPLACE view repo_download_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth +", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
554
        // sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source INTO repo_download_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
555
        sql = "CREATE TABLE repo_download_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end, d.source from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date, rdsm.source from generate_series(0, " + diffMonth + ", 1) AS offs, repo_download_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum, source from repo_download_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id and d.source=rdm.source order by d.id, d.new_date";
556
        stmt.executeUpdate(sql);
557

    
558
        sql = "CREATE INDEX repo_download_stats_monthly_id ON repo_download_stats_monthly USING btree (id)";
559
        stmt.executeUpdate(sql);
560

    
561
        sql = "CREATE INDEX repo_download_stats_monthly_month ON repo_download_stats_monthly USING btree(month)";
562
        stmt.executeUpdate(sql);
563

    
564
        sql = "CREATE INDEX repo_download_stats_monthly_source ON repo_download_stats_monthly USING btree(source)";
565
        stmt.executeUpdate(sql);
566

    
567

    
568
        sql = "CREATE OR REPLACE view repo_download_stats_monthly_sushi AS SELECT id, sum(number_of_downloads), extract('year' from date) ||'-'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') ||'-01' AS month, source FROM repo_download_stats group by id, month, source;";
569
        stmt.executeUpdate(sql);
570

    
571
        stmt.close();
572
        conn.commit();
573
        conn.close();
574
    }
575

    
576
    // Import OPENAIRE Logs to DB
577
    private void processPortalLog() throws Exception {
578

    
579
        if (conn.isClosed())
580
            connectDB();
581

    
582
        stmt = conn.createStatement();
583
        conn.setAutoCommit(false);
584

    
585
        ArrayList<String> jsonFiles = listHdfsDir(logPath + "portallog");
586
        PreparedStatement prepStatem = conn.prepareStatement("INSERT INTO process_portal_log (source, id_visit, country, action, url, entity_id, source_item_type, timestamp, referrer_name, agent) VALUES (?,?,?,?,?,?,?,?,?,?)");
587
        int batch_size = 0;
588
        JSONParser parser = new JSONParser();
589
        for (String jsonFile : jsonFiles) {
590
            JSONArray jsonArray = (JSONArray) parser.parse(readHDFSFile(jsonFile));
591

    
592
            for (Object aJsonArray : jsonArray) {
593
                JSONObject jsonObjectRow = (JSONObject) aJsonArray;
594
                int idSite = Integer.parseInt(jsonObjectRow.get("idSite").toString());
595
                String idVisit = jsonObjectRow.get("idVisit").toString();
596
                String country = jsonObjectRow.get("country").toString();
597
                String referrerName = jsonObjectRow.get("referrerName").toString();
598
                String agent = jsonObjectRow.get("browser").toString();
599

    
600
                JSONArray actionDetails = (JSONArray) jsonObjectRow.get(("actionDetails"));
601
                for (Object actionDetail : actionDetails) {
602
                    JSONObject actionDetailsObj = (JSONObject) actionDetail;
603

    
604
                    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
605
                    simpleDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
606
                    Timestamp timestamp = new Timestamp(Long.parseLong(actionDetailsObj.get("timestamp").toString()) * 1000);
607

    
608
                    String action = actionDetailsObj.get("type").toString();
609
                    String url = actionDetailsObj.get("url").toString();
610

    
611
                    String entityID = processPortalURL(url);
612
                    String sourceItemType = "";
613

    
614
                    if (entityID.indexOf("|") > 0) {
615
                        sourceItemType = entityID.substring(0, entityID.indexOf("|"));
616
                        entityID = entityID.substring(entityID.indexOf("|") + 1);
617
                    }
618

    
619
                    prepStatem.setInt(1, idSite);
620
                    prepStatem.setString(2, idVisit);
621
                    prepStatem.setString(3, country);
622
                    prepStatem.setString(4, action);
623
                    prepStatem.setString(5, url);
624
                    prepStatem.setString(6, entityID);
625
                    prepStatem.setString(7, sourceItemType);
626
                    prepStatem.setString(8, simpleDateFormat.format(timestamp));
627
                    prepStatem.setString(9, referrerName);
628
                    prepStatem.setString(10, agent);
629

    
630
                    prepStatem.addBatch();
631
                    batch_size++;
632
                    if (batch_size == 10000) {
633
                        prepStatem.executeBatch();
634
                        conn.commit();
635
                        batch_size = 0;
636
                    }
637
                }
638
            }
639
        }
640
        prepStatem.executeBatch();
641
        conn.commit();
642

    
643
        stmt.close();
644
        conn.close();
645
    }
646

    
647
    private void portalStats() throws Exception {
648
        if (conn.isClosed())
649
            connectDB();
650

    
651
        stmt = conn.createStatement();
652
        conn.setAutoCommit(false);
653

    
654
        String sql = "INSERT INTO piwiklog SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'oaItem\', timestamp, referrer_name, agent FROM process_portal_log, result_oids roid WHERE entity_id IS NOT null AND entity_id=roid.id AND roid.orid IS NOT null;";
655
        stmt.executeUpdate(sql);
656
        stmt.close();
657
        conn.commit();
658

    
659
        stmt = conn.createStatement();
660
        sql = "INSERT INTO piwiklog SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'datasource\', timestamp, referrer_name, agent FROM process_portal_log, datasource_oids roid WHERE entity_id IS NOT null AND entity_id=roid.id AND roid.orid IS NOT null;";
661
        stmt.executeUpdate(sql);
662
        stmt.close();
663
        conn.commit();
664

    
665
        stmt = conn.createStatement();
666
        sql = "INSERT INTO piwiklog SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'organization\', timestamp, referrer_name, agent FROM process_portal_log, organization_oids roid WHERE entity_id IS NOT null AND entity_id=roid.id AND roid.orid IS NOT null;";
667
        stmt.executeUpdate(sql);
668
        stmt.close();
669
        conn.commit();
670

    
671
        stmt = conn.createStatement();
672
        sql = "INSERT INTO piwiklog SELECT DISTINCT source, id_visit, country, action, url, roid.orid, \'project\', timestamp, referrer_name, agent FROM process_portal_log, project_oids roid WHERE entity_id IS NOT null AND entity_id=roid.id AND roid.orid IS NOT null;";
673
        stmt.executeUpdate(sql);
674
        stmt.close();
675
        conn.commit();
676

    
677
        stmt = conn.createStatement();
678
        sql = "DROP TABLE process_portal_log;";
679
        stmt.executeUpdate(sql);
680
        stmt.close();
681
        conn.commit();
682

    
683
        conn.close();
684
    }
685

    
686
    private void cleanOAI() throws Exception {
687
        if (conn.isClosed())
688
            connectDB();
689

    
690
        conn.setAutoCommit(false);
691

    
692
        stmt = conn.createStatement();
693
        String sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/','oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%';";
694
        stmt.executeUpdate(sql);
695
        stmt.close();
696
        conn.commit();
697

    
698
        stmt = conn.createStatement();
699
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/','oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%';";
700
        stmt.executeUpdate(sql);
701
        stmt.close();
702
        conn.commit();
703

    
704
        stmt = conn.createStatement();
705
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/','oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%';";
706
        stmt.executeUpdate(sql);
707
        stmt.close();
708
        conn.commit();
709

    
710
        stmt = conn.createStatement();
711
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/','oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%';";
712
        stmt.executeUpdate(sql);
713
        stmt.close();
714
        conn.commit();
715

    
716
        stmt = conn.createStatement();
717
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/','oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%';";
718
        stmt.executeUpdate(sql);
719
        stmt.close();
720
        conn.commit();
721

    
722
        stmt = conn.createStatement();
723
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/','oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%';";
724
        stmt.executeUpdate(sql);
725
        stmt.close();
726
        conn.commit();
727

    
728
        stmt = conn.createStatement();
729
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.uac.pt/','oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%';";
730
        stmt.executeUpdate(sql);
731
        stmt.close();
732
        conn.commit();
733

    
734
        stmt = conn.createStatement();
735
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/','oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%';";
736
        stmt.executeUpdate(sql);
737
        stmt.close();
738
        conn.commit();
739

    
740
        stmt = conn.createStatement();
741
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/','oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%';";
742
        stmt.executeUpdate(sql);
743
        stmt.close();
744
        conn.commit();
745

    
746
        stmt = conn.createStatement();
747
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/','oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%';";
748
        stmt.executeUpdate(sql);
749
        stmt.close();
750
        conn.commit();
751

    
752
        stmt = conn.createStatement();
753
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/','oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%';";
754
        stmt.executeUpdate(sql);
755
        stmt.close();
756
        conn.commit();
757

    
758
        stmt = conn.createStatement();
759
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/','oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%';";
760
        stmt.executeUpdate(sql);
761
        stmt.close();
762
        conn.commit();
763

    
764
        stmt = conn.createStatement();
765
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/','oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%';";
766
        stmt.executeUpdate(sql);
767
        stmt.close();
768
        conn.commit();
769

    
770
        stmt = conn.createStatement();
771
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/','oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%';";
772
        stmt.executeUpdate(sql);
773
        stmt.close();
774
        conn.commit();
775

    
776
        stmt = conn.createStatement();
777
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/','oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%';";
778
        stmt.executeUpdate(sql);
779
        stmt.close();
780
        conn.commit();
781

    
782
        stmt = conn.createStatement();
783
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/','oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%';";
784
        stmt.executeUpdate(sql);
785
        stmt.close();
786
        conn.commit();
787

    
788
        stmt = conn.createStatement();
789
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/','oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%';";
790
        stmt.executeUpdate(sql);
791
        stmt.close();
792
        conn.commit();
793

    
794
        stmt = conn.createStatement();
795
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/','oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%';";
796
        stmt.executeUpdate(sql);
797
        stmt.close();
798
        conn.commit();
799

    
800
        stmt = conn.createStatement();
801
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/','oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%';";
802
        stmt.executeUpdate(sql);
803
        stmt.close();
804
        conn.commit();
805

    
806
        stmt = conn.createStatement();
807
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/','oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%';";
808
        stmt.executeUpdate(sql);
809
        stmt.close();
810
        conn.commit();
811

    
812
        stmt = conn.createStatement();
813
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/','oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%';";
814
        stmt.executeUpdate(sql);
815
        stmt.close();
816
        conn.commit();
817

    
818
        stmt = conn.createStatement();
819
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/','oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%';";
820
        stmt.executeUpdate(sql);
821
        stmt.close();
822
        conn.commit();
823

    
824
        stmt = conn.createStatement();
825
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/','oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%';";
826
        stmt.executeUpdate(sql);
827
        stmt.close();
828
        conn.commit();
829

    
830
        stmt = conn.createStatement();
831
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/','oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%';";
832
        stmt.executeUpdate(sql);
833
        stmt.close();
834
        conn.commit();
835

    
836
        stmt = conn.createStatement();
837
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/','oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%';";
838
        stmt.executeUpdate(sql);
839
        stmt.close();
840
        conn.commit();
841

    
842
        stmt = conn.createStatement();
843
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/','oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%';";
844
        stmt.executeUpdate(sql);
845
        stmt.close();
846
        conn.commit();
847

    
848
        stmt = conn.createStatement();
849
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/','oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%';";
850
        stmt.executeUpdate(sql);
851
        stmt.close();
852
        conn.commit();
853

    
854
        stmt = conn.createStatement();
855
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/','oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%';";
856
        stmt.executeUpdate(sql);
857
        stmt.close();
858
        conn.commit();
859

    
860
        stmt = conn.createStatement();
861
        sql = "UPDATE piwiklog SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/','oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%';";
862
        stmt.executeUpdate(sql);
863
        stmt.close();
864
        conn.commit();
865

    
866
        conn.close();
867
    }
868

    
869
    //Create OpenAIRE's portal datasource statistics
870
    private void dataSourceStats() throws Exception {
871
        if (conn.isClosed())
872
            connectDB();
873

    
874
        stmt = conn.createStatement();
875
        conn.setAutoCommit(false);
876

    
877
        // String sql = "SELECT  orgid AS id, max(viewcount) AS number_of_views, date INTO datasource_stats FROM (select entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date as date FROM piwiklog, datasource_oids ooid WHERE entity_id=ooid.orid AND source_item_type='datasource' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
878
        String sql = "CREATE TABLE datasource_stats AS SELECT  orgid AS id, max(viewcount) AS number_of_views, date FROM (select entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date as date FROM piwiklog, datasource_oids ooid WHERE entity_id=ooid.orid AND source_item_type='datasource' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
879
        stmt.executeUpdate(sql);
880

    
881
        sql = "CREATE INDEX datasource_stats_id ON datasource_stats USING btree (id)";
882
        stmt.executeUpdate(sql);
883

    
884
        sql = "CREATE INDEX datasource_stats_date ON datasource_stats USING btree(date)";
885
        stmt.executeUpdate(sql);
886

    
887
        // sql = "SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month INTO datasource_stats_monthly_clean FROM datasource_stats GROUP BY id, month;";
888
        sql = "CREATE TABLE datasource_stats_monthly_clean AS SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month FROM datasource_stats GROUP BY id, month;";
889
        stmt.executeUpdate(sql);
890

    
891
        sql = "CREATE INDEX datasource_stats_monthly_clean_id ON datasource_stats_monthly_clean USING btree (id)";
892
        stmt.executeUpdate(sql);
893

    
894
        sql = "CREATE INDEX datasource_stats_monthly_clean_month ON datasource_stats_monthly_clean USING btree(month)";
895
        stmt.executeUpdate(sql);
896

    
897
        Calendar startCalendar = Calendar.getInstance();
898
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
899
        Calendar endCalendar = Calendar.getInstance();
900
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
901
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
902

    
903
        //sql="CREATE OR REPLACE view datasource_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date from generate_series(0, " + diffMonth +", 1) AS offs, datasource_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from datasource_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
904
        //sql = "select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end INTO datasource_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, datasource_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from datasource_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
905
        sql = "CREATE TABLE datasource_stats_monthly AS select d.id, d.new_date AS month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, datasource_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from datasource_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
906
        stmt.executeUpdate(sql);
907

    
908
        sql = "CREATE INDEX datasource_stats_monthly_id ON datasource_stats_monthly USING btree (id)";
909
        stmt.executeUpdate(sql);
910

    
911
        sql = "CREATE INDEX datasource_stats_monthly_month ON datasource_stats_monthly USING btree(month)";
912
        stmt.executeUpdate(sql);
913

    
914
        stmt.close();
915
        conn.commit();
916
        conn.close();
917
    }
918

    
919
    //Create OpenAIRE's portal results statistics
920
    private void resultStats() throws Exception {
921
        if (conn.isClosed())
922
            connectDB();
923

    
924
        stmt = conn.createStatement();
925
        conn.setAutoCommit(false);
926

    
927
        // String sql = "SELECT orgid AS id, max(viewcount) AS number_of_views, date INTO result_stats FROM (SELECT entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date as date FROM piwiklog, result_oids ooid WHERE entity_id=ooid.orid AND source_item_type='oaItem' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
928
        String sql = "CREATE TABLE result_stats AS SELECT orgid AS id, max(viewcount) AS number_of_views, date  FROM (SELECT entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date as date FROM piwiklog, result_oids ooid WHERE entity_id=ooid.orid AND source_item_type='oaItem' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
929
        stmt.executeUpdate(sql);
930

    
931
        sql = "CREATE INDEX result_stats_id ON result_stats USING btree (id)";
932
        stmt.executeUpdate(sql);
933

    
934
        sql = "CREATE INDEX result_stats_date ON result_stats USING btree(date)";
935
        stmt.executeUpdate(sql);
936

    
937
//        sql = "SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month INTO result_stats_monthly_clean FROM result_stats group by id, month;";
938
        sql = "CREATE TABLE IF NOT EXISTS result_stats_monthly_clean AS SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month FROM result_stats group by id, month;";
939
        stmt.executeUpdate(sql);
940

    
941
        sql = "CREATE INDEX result_stats_monthly_clean_id ON result_stats_monthly_clean USING btree (id)";
942
        stmt.executeUpdate(sql);
943

    
944
        sql = "CREATE INDEX result_stats_monthly_clean_month ON result_stats_monthly_clean USING btree(month)";
945
        stmt.executeUpdate(sql);
946

    
947
        Calendar startCalendar = Calendar.getInstance();
948
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
949
        Calendar endCalendar = Calendar.getInstance();
950
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
951
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
952

    
953
        //sql="CREATE OR REPLACE view result_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date from generate_series(0, " + diffMonth +", 1) AS offs, result_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from result_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
954
//        sql = "select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end INTO result_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, result_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from result_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
955
        sql = "CREATE TABLE IF NOT EXISTS result_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, result_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from result_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
956
        stmt.executeUpdate(sql);
957

    
958
        sql = "CREATE INDEX result_stats_monthly_id ON result_stats_monthly USING btree (id)";
959
        stmt.executeUpdate(sql);
960

    
961
        sql = "CREATE INDEX result_stats_monthly_month ON result_stats_monthly USING btree(month)";
962
        stmt.executeUpdate(sql);
963

    
964
        stmt.close();
965
        conn.commit();
966
        conn.close();
967
    }
968

    
969
    //Create OpenAIRE's portal organization statistics
970
    private void organizationsStats() throws Exception {
971
        if (conn.isClosed())
972
            connectDB();
973

    
974
        stmt = conn.createStatement();
975
        conn.setAutoCommit(false);
976

    
977
        // String sql = "SELECT orgid AS id, max(viewcount) AS number_of_views, date INTO organization_stats FROM (SELECT entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date AS date FROM piwiklog, organization_oids ooid WHERE entity_id=ooid.orid AND source_item_type='organization' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
978
        String sql = "CREATE TABLE  organization_stats AS SELECT orgid AS id, max(viewcount) AS number_of_views, date FROM (SELECT entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date AS date FROM piwiklog, organization_oids ooid WHERE entity_id=ooid.orid AND source_item_type='organization' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
979
        stmt.executeUpdate(sql);
980

    
981
        sql = "CREATE INDEX organization_stats_id ON organization_stats USING btree (id)";
982
        stmt.executeUpdate(sql);
983

    
984
        sql = "CREATE INDEX organization_stats_date ON organization_stats USING btree(date)";
985
        stmt.executeUpdate(sql);
986

    
987
        // sql = "SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month INTO organization_stats_monthly_clean FROM organization_stats group by id, month;";
988
        sql = "CREATE TABLE  organization_stats_monthly_clean AS SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month FROM organization_stats group by id, month;";
989
        stmt.executeUpdate(sql);
990

    
991
        sql = "CREATE INDEX organization_stats_monthly_clean_id ON organization_stats_monthly_clean USING btree (id)";
992
        stmt.executeUpdate(sql);
993

    
994
        sql = "CREATE INDEX organization_stats_monthly_clean_month ON organization_stats_monthly_clean USING btree(month)";
995
        stmt.executeUpdate(sql);
996

    
997
        Calendar startCalendar = Calendar.getInstance();
998
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
999
        Calendar endCalendar = Calendar.getInstance();
1000
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
1001
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
1002

    
1003
        //sql="CREATE OR REPLACE view organization_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date from generate_series(0, " + diffMonth +", 1) AS offs, organization_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from organization_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
1004
        // sql = "select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end INTO organization_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, organization_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from organization_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
1005
        sql = "CREATE TABLE  organization_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY-MM-DD') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, organization_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from organization_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
1006
        stmt.executeUpdate(sql);
1007

    
1008
        sql = "CREATE INDEX organization_stats_monthly_id ON organization_stats_monthly USING btree (id)";
1009
        stmt.executeUpdate(sql);
1010

    
1011
        sql = "CREATE INDEX organization_stats_monthly_month ON organization_stats_monthly USING btree(month)";
1012
        stmt.executeUpdate(sql);
1013

    
1014
        stmt.close();
1015
        conn.commit();
1016
        conn.close();
1017
    }
1018

    
1019
    //Create OpenAIRE's portal projects statistics
1020
    private void projectsStats() throws Exception {
1021
        if (conn.isClosed())
1022
            connectDB();
1023

    
1024
        stmt = conn.createStatement();
1025
        conn.setAutoCommit(false);
1026

    
1027
        // String sql = "SELECT orgid AS id, max(viewcount) AS number_of_views, date INTO project_stats FROM (SELECT entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date AS date FROM piwiklog, project_oids ooid WHERE entity_id=ooid.orid AND source_item_type='project' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
1028
        String sql = "CREATE TABLE  project_stats AS SELECT orgid AS id, max(viewcount) AS number_of_views, date FROM (SELECT entity_id, ooid.id AS orgid, count(entity_id) viewcount, timestamp::date AS date FROM piwiklog, project_oids ooid WHERE entity_id=ooid.orid AND source_item_type='project' GROUP BY entity_id, ooid.id, date) AS temp GROUP BY orgid, date ORDER BY orgid, date ASC, max(viewcount) DESC;";
1029
        stmt.executeUpdate(sql);
1030

    
1031
        sql = "CREATE INDEX project_stats_id ON project_stats USING btree (id)";
1032
        stmt.executeUpdate(sql);
1033

    
1034
        sql = "CREATE INDEX project_stats_date ON project_stats USING btree(date)";
1035
        stmt.executeUpdate(sql);
1036

    
1037
        // sql = "SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month INTO project_stats_monthly_clean FROM project_stats group by id, month;";
1038
        sql = "CREATE TABLE  project_stats_monthly_clean AS SELECT id, sum(number_of_views), extract('year' from date) ||'/'|| LPAD(CAST(extract('month' from date) AS VARCHAR), 2, '0') AS month FROM project_stats group by id, month;";
1039
        stmt.executeUpdate(sql);
1040

    
1041
        sql = "CREATE INDEX project_stats_monthly_clean_id ON project_stats_monthly_clean USING btree (id)";
1042
        stmt.executeUpdate(sql);
1043

    
1044
        sql = "CREATE INDEX project_stats_monthly_clean_month ON project_stats_monthly_clean USING btree(month)";
1045
        stmt.executeUpdate(sql);
1046

    
1047
        Calendar startCalendar = Calendar.getInstance();
1048
        startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01"));
1049
        Calendar endCalendar = Calendar.getInstance();
1050
        int diffYear = endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR);
1051
        int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH);
1052

    
1053
        // sql="CREATE OR REPLACE view project_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date from generate_series(0, " + diffMonth +", 1) AS offs, project_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from project_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
1054
        // sql = "select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end INTO project_stats_monthly from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, project_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from project_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
1055
        sql = "CREATE TABLE  project_stats_monthly AS select d.id, d.new_date as month, case when rdm.sum is null then 0 else rdm.sum end from (select distinct rdsm.id, to_char(date_trunc('month', ('2016-01-01'::date + interval '1 month'*offs)), 'YYYY/MM') AS new_date from generate_series(0, " + diffMonth + ", 1) AS offs, project_stats_monthly_clean rdsm) d LEFT JOIN (select id, month, sum from project_stats_monthly_clean) rdm ON d.new_date=rdm.month and d.id=rdm.id order by d.id, d.new_date";
1056
        stmt.executeUpdate(sql);
1057

    
1058
        sql = "CREATE INDEX project_stats_monthly_id ON project_stats_monthly USING btree (id)";
1059
        stmt.executeUpdate(sql);
1060

    
1061
        sql = "CREATE INDEX project_stats_monthly_month ON project_stats_monthly USING btree(month)";
1062
        stmt.executeUpdate(sql);
1063

    
1064
        stmt.close();
1065
        conn.commit();
1066
        conn.close();
1067
    }
1068

    
1069
    private String processPortalURL(String url) {
1070

    
1071
        if (url.indexOf("openaire.eu") > 0) {
1072
            try {
1073
                url = URLDecoder.decode(url, "UTF-8");
1074
            } catch (Exception e) {
1075
                log.info(url);
1076
            }
1077
            if (url.indexOf("datasourceId=") > 0 && url.substring(url.indexOf("datasourceId=") + 13).length() >= 46) {
1078
                url = "datasource|" + url.substring(url.indexOf("datasourceId=") + 13, url.indexOf("datasourceId=") + 59);
1079
            } else if (url.indexOf("datasource=") > 0 && url.substring(url.indexOf("datasource=") + 11).length() >= 46) {
1080
                url = "datasource|" + url.substring(url.indexOf("datasource=") + 11, url.indexOf("datasource=") + 57);
1081
            } else if (url.indexOf("datasourceFilter=") > 0 && url.substring(url.indexOf("datasourceFilter=") + 17).length() >= 46) {
1082
                url = "datasource|" + url.substring(url.indexOf("datasourceFilter=") + 17, url.indexOf("datasourceFilter=") + 63);
1083
            } else if (url.indexOf("articleId=") > 0 && url.substring(url.indexOf("articleId=") + 10).length() >= 46) {
1084
                url = "result|" + url.substring(url.indexOf("articleId=") + 10, url.indexOf("articleId=") + 56);
1085
            } else if (url.indexOf("datasetId=") > 0 && url.substring(url.indexOf("datasetId=") + 10).length() >= 46) {
1086
                url = "result|" + url.substring(url.indexOf("datasetId=") + 10, url.indexOf("datasetId=") + 56);
1087
            } else if (url.indexOf("projectId=") > 0 && url.substring(url.indexOf("projectId=") + 10).length() >= 46 && !url.contains("oai:dnet:corda")) {
1088
                url = "project|" + url.substring(url.indexOf("projectId=") + 10, url.indexOf("projectId=") + 56);
1089
            } else if (url.indexOf("organizationId=") > 0 && url.substring(url.indexOf("organizationId=") + 15).length() >= 46) {
1090
                url = "organization|" + url.substring(url.indexOf("organizationId=") + 15, url.indexOf("organizationId=") + 61);
1091
            } else {
1092
                url = "";
1093
            }
1094
        } else {
1095
            url = "";
1096
        }
1097

    
1098
        return url;
1099
    }
1100

    
1101
    private ArrayList<String> listHdfsDir(String dir) throws Exception {
1102

    
1103
        FileSystem hdfs = FileSystem.get(new Configuration());
1104
        RemoteIterator<LocatedFileStatus> Files;
1105
        ArrayList<String> fileNames = new ArrayList<>();
1106

    
1107
        try {
1108
            Path exportPath = new Path(hdfs.getUri() + dir);
1109
            Files = hdfs.listFiles(exportPath, false);
1110
            while (Files.hasNext()) {
1111
                String fileName = Files.next().getPath().toString();
1112
                //log.info("Found hdfs file " + fileName);
1113
                fileNames.add(fileName);
1114
            }
1115
            //hdfs.close();
1116
        } catch (Exception e) {
1117
            log.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath));
1118
            throw new Exception("HDFS file path with exported data does not exist :   " + logPath, e);
1119
        }
1120

    
1121
        return fileNames;
1122
    }
1123

    
1124
    private String readHDFSFile(String filename) throws Exception {
1125
        String result;
1126
        try {
1127

    
1128
            FileSystem fs = FileSystem.get(new Configuration());
1129
            //log.info("reading file : " + filename);
1130

    
1131
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename))));
1132

    
1133
            StringBuilder sb = new StringBuilder();
1134
            String line = br.readLine();
1135

    
1136
            while (line != null) {
1137
                if (!line.equals("[]")) {
1138
                    sb.append(line);
1139
                }
1140
                //sb.append(line);
1141
                line = br.readLine();
1142
            }
1143
            result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\"");
1144
            if (result.equals("")) {
1145
                result = "[]";
1146
            }
1147

    
1148
            //fs.close();
1149
        } catch (Exception e) {
1150
            log.error(e);
1151
            throw new Exception(e);
1152
        }
1153

    
1154
        return result;
1155
    }
1156
}
(3-3/5)