Revision 34513
Added by Marek Horst over 9 years ago
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/oozie_app/workflow.xml | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
<!-- Oozie workflow with one action: start goes to communities_builder, which goes to "end" on success and to the "fail" kill node on error. -->
<workflow-app xmlns="uri:oozie:workflow:0.4" name="websiteusage_document_communities_builder"> |
|
3 |
|
|
4 |
<!-- Declared workflow inputs. Neither property carries a default value here, so both have to be provided by the submitting job configuration. -->
<parameters> |
|
5 |
<property> |
|
6 |
<name>input_logs</name> |
|
7 |
<description>input logs file</description> |
|
8 |
</property> |
|
9 |
<property> |
|
10 |
<name>output</name> |
|
11 |
<description>output publication to communities mappings</description> |
|
12 |
</property> |
|
13 |
</parameters> |
|
14 |
|
|
15 |
<start to="communities_builder" /> |
|
16 |
|
|
17 |
<!-- The only action of the workflow: a Hadoop streaming map-reduce job. -->
<action name="communities_builder"> |
|
18 |
<map-reduce> |
|
19 |
<!-- jobTracker and nameNode are not listed in <parameters>; they must come from the job configuration. -->
<job-tracker>${jobTracker}</job-tracker> |
|
20 |
<name-node>${nameNode}</name-node> |
|
21 |
<!-- Oozie runs the prepare step before launching the job: the output path is deleted so the job can write it afresh. -->
<prepare> |
|
22 |
<delete path="${nameNode}${output}"/> |
|
23 |
</prepare> |
|
24 |
<streaming> |
|
25 |
<!-- Mapper command line: mexec.py receives the SQL script through -f and communities.db through -w. NOTE(review): -w presumably names the madIS working database; confirm against mexec.py. -->
<mapper>scripts/madis/mexec.py -f scripts/createCommunities.sql -w communities.db</mapper> |
|
26 |
</streaming> |
|
27 |
<configuration> |
|
28 |
<!-- Output and input format classes from the com.cloudera.science.avro.streaming package (AvroAsJSON*). NOTE(review): presumably the mapper exchanges records as JSON text; confirm against that library. -->
<property> |
|
29 |
<name>mapred.output.format.class</name> |
|
30 |
<value>com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat</value> |
|
31 |
</property> |
|
32 |
<property> |
|
33 |
<name>mapred.input.format.class</name> |
|
34 |
<value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value> |
|
35 |
</property> |
|
36 |
<!-- Zero reduce tasks: the job is map-only, so the mapper output is the job output. -->
<property> |
|
37 |
<name>mapred.reduce.tasks</name> |
|
38 |
<value>0</value> |
|
39 |
</property> |
|
40 |
<!-- Job input directory and the record class named for the input side. -->
<property> |
|
41 |
<name>mapred.input.dir</name> |
|
42 |
<value>${input_logs}</value> |
|
43 |
</property> |
|
44 |
<property> |
|
45 |
<name>eu.dnetlib.iis.avro.input.class</name> |
|
46 |
<value>eu.dnetlib.iis.websiteusage.schemas.LogEntry</value> |
|
47 |
</property> |
|
48 |
<!-- Job output directory (the same ${output} that the prepare step deletes) and the record class named for the output side. -->
<property> |
|
49 |
<name>mapred.output.dir</name> |
|
50 |
<value>${output}</value> |
|
51 |
</property> |
|
52 |
<property> |
|
53 |
<name>eu.dnetlib.iis.avro.output.class</name> |
|
54 |
<value>eu.dnetlib.iis.websiteusage.schemas.DocumentToCommunity</value> |
|
55 |
</property> |
|
56 |
<!-- The same timeout is set under both the mapred.* and the mapreduce.* key. Hadoop documents this setting in milliseconds, which makes 1800000 equal to 30 minutes. -->
<property> |
|
57 |
<name>mapred.task.timeout</name> |
|
58 |
<value>1800000</value> |
|
59 |
</property> |
|
60 |
<property> |
|
61 |
<name>mapreduce.task.timeout</name> |
|
62 |
<value>1800000</value> |
|
63 |
</property> |
|
64 |
|
|
65 |
</configuration> |
|
66 |
<!-- The element below is disabled. It refers to input_dataset_db, which is not declared in <parameters>, so re-enabling it also requires declaring that parameter. -->
<!-- |
|
67 |
<file>${input_dataset_db}#dataset.db</file> |
|
68 |
--> |
|
69 |
</map-reduce> |
|
70 |
<ok to="end"/> |
|
71 |
<error to="fail"/> |
|
72 |
</action> |
|
73 |
|
|
74 |
<!-- Failure terminal: kills the workflow and reports the error message of the last node that failed. -->
<kill name="fail"> |
|
75 |
<message>Unfortunately, the process failed -- error message: |
|
76 |
[${wf:errorMessage(wf:lastErrorNode())}] |
|
77 |
</message> |
|
78 |
</kill> |
|
79 |
|
|
80 |
<end name="end"/> |
|
81 |
</workflow-app> |
|
82 | 0 |
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/oozie_app/lib/scripts/import.txt | ||
---|---|---|
1 |
## This is a classpath-based import file (this header is required) |
|
2 |
madis classpath eu/dnetlib/iis/3rdparty/scripts/madis |
|
3 | 0 |
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/oozie_app/lib/scripts/createCommunities.sql | ||
---|---|---|
1 |
-- Rebuild the raw log table from scratch on every run.
drop table if exists piwiklog; |
|
2 |
create table piwiklog (datetime, action,user,session,data); |
|
3 |
-- Load the rows read from stdinput(): column c1 is passed through stripchars(c1, ',') and then
-- jdictsplit, which is asked for the keys timestamp, action, user, session and data.
-- setschema names the five resulting columns, so the 'timestamp' key ends up in the
-- datetime column by position.
insert into piwiklog select * from (setschema 'datetime, action,user,session,data' select jdictsplit(stripchars(c1,','),'timestamp','action','user','session','data') from stdinput()); |
|
4 |
create index piwiklog_index on piwiklog(action,data,session,user,datetime); |
|
5 |
|
|
6 |
|
|
7 |
-- Output query, described from the innermost subquery outwards:
--   1. From piwiklog, replace data with jlengthiest(jdictvals(urlquery2jdict(data), ...)) over the
--      keys personId, projectId, publicationId, datasourceId and datasetId.
--      NOTE(review): presumably this keeps one entity id per log row; confirm against the madIS docs.
--   2. Keep only rows whose action is one of the six view* actions, group them by session and
--      build jcombinations(jsort(jset(jgroup(data))), 2); setschema names the result columns
--      session, pgroup1 and pgroup2.
--   3. Count the rows per (node1, node2) pair and keep the pairs with frequency > 4.
--   4. Feed those pairs to graphcliques(node1, node2); the outer select reads cliqueid and nodeid
--      from its result and emits one jdict per row with the keys cid and SimilarDocid,
--      skipping rows where nodeid is null.
-- NOTE(review): the action filter accepts 'viewOrganization', but no organization id key is
-- listed in the jdictvals call of step 1; confirm whether that is intended.
select jdict('cid',cliqueid,'SimilarDocid',nodeid) from |
|
8 |
(select graphcliques(node1,node2) from |
|
9 |
(select pgroup1 as node1, pgroup2 as node2, count(*) as frequency from |
|
10 |
(setschema 'session,pgroup1,pgroup2' select session,jcombinations(jsort(jset(jgroup(data))),2) as pgroup from |
|
11 |
(select datetime, action,user,session,jlengthiest(jdictvals(urlquery2jdict(data),'personId','projectId','publicationId','datasourceId','datasetId')) as data from piwiklog) |
|
12 |
where (action='viewPerson' or action='viewPublication' or action='viewProject' or action='viewDatasource' or action='viewOrganization' or action='viewDataset') |
|
13 |
group by session) |
|
14 |
group by node1,node2 having frequency>4) |
|
15 |
) |
|
16 |
where nodeid is not null;
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/job.properties | ||
---|---|---|
1 |
# Value for the workflow's input_logs parameter (described there as "input logs file").
input_logs=/share/import/logs/2014-11-03 |
|
2 |
# Value for the workflow's output parameter. workingDir is not defined in this file and has to be provided elsewhere.
output=${workingDir}/out |
|
3 | 0 |
Also available in: Unified diff
#118 renaming communities_builder to community_builder