Project

General

Profile

« Previous | Next » 

Revision 34513

Added by Marek Horst over 9 years ago

#118 renaming communities_builder to community_builder

View differences:

modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/oozie_app/workflow.xml
1
<?xml version="1.0"?>
2
<workflow-app xmlns="uri:oozie:workflow:0.4" name="websiteusage_document_communities_builder">
3
	
4
	<!-- Workflow inputs. Neither property declares a default value here.
	     input_logs is used below as mapred.input.dir; output is used as
	     mapred.output.dir and is deleted in the action's prepare step. -->
	<parameters>
5
		<property>
6
			<name>input_logs</name>
7
			<description>input logs file</description>
8
		</property>
9
		<property>
10
			<name>output</name>
11
			<description>output publication to communities mappings</description>
12
		</property>
13
	</parameters>
14

  
15
	<start to="communities_builder" />
16

  
17
    <!-- Streaming map-reduce action with zero reduce tasks (see mapred.reduce.tasks):
         the mapper command runs scripts/createCommunities.sql through mexec.py.
         Goes to "end" on success and to the "fail" kill node on error. -->
    <action name="communities_builder">
18
        <map-reduce>
19
            <job-tracker>${jobTracker}</job-tracker>
20
            <name-node>${nameNode}</name-node>
21
            <!-- Delete the output path before the job starts. -->
            <prepare>
22
                <delete path="${nameNode}${output}"/>
23
            </prepare>
24
            <streaming>
25
                <!-- NOTE(review): "-w communities.db" presumably names a working database file for madis; confirm against mexec.py. -->
                <mapper>scripts/madis/mexec.py -f scripts/createCommunities.sql -w communities.db</mapper>
26
            </streaming>
27
            <configuration>
28
                <!-- Input and output formats are the AvroAsJSON streaming classes. -->
                <property>
29
                    <name>mapred.output.format.class</name>
30
                    <value>com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat</value>
31
                </property>
32
                <property>
33
                    <name>mapred.input.format.class</name>
34
                    <value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value>
35
                </property>
36
                <property>
37
                    <name>mapred.reduce.tasks</name>
38
                    <value>0</value>
39
                </property>
40
                <property>
41
                    <name>mapred.input.dir</name>
42
                    <value>${input_logs}</value>
43
                </property>
44
                <property>
45
                    <name>eu.dnetlib.iis.avro.input.class</name>
46
                    <value>eu.dnetlib.iis.websiteusage.schemas.LogEntry</value>
47
                </property>
48
                <property>
49
                    <name>mapred.output.dir</name>
50
                    <value>${output}</value>
51
                </property>          
52
                <property>
53
                    <name>eu.dnetlib.iis.avro.output.class</name>
54
                    <value>eu.dnetlib.iis.websiteusage.schemas.DocumentToCommunity</value>
55
                </property>
56
				<!-- The same timeout value is set under two property names.
				     NOTE(review): presumably the old and new Hadoop names for the task timeout,
				     in milliseconds (1800000 would be 30 minutes); confirm against the Hadoop docs. -->
				<property>
57
					<name>mapred.task.timeout</name>
58
					<value>1800000</value>
59
				</property>
60
				<property>
61
					<name>mapreduce.task.timeout</name>
62
					<value>1800000</value>
63
				</property>
64
				
65
            </configuration>
66
            <!-- 
67
            <file>${input_dataset_db}#dataset.db</file>
68
             -->
69
        </map-reduce>
70
        <ok to="end"/>
71
        <error to="fail"/>
72
    </action>
73
    
74
    <!-- Kill node reached through the action's error transition; the message embeds
         the error message of the last node that failed. -->
    <kill name="fail">
75
        <message>Unfortunately, the process failed -- error message:
76
        			[${wf:errorMessage(wf:lastErrorNode())}]
77
        		</message>
78
    </kill>
79

  
80
    <end name="end"/>
81
</workflow-app>
82 0

  
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/oozie_app/lib/scripts/import.txt
1
## This is a classpath-based import file (this header is required)
2
madis classpath eu/dnetlib/iis/3rdparty/scripts/madis
3 0

  
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/oozie_app/lib/scripts/createCommunities.sql
1
-- Rebuild the piwiklog staging table from scratch on every run.
drop table if exists piwiklog;
2
-- Five columns, declared without types.
create table piwiklog (datetime, action,user,session,data);
3
-- Load the table from stdinput(). setschema names the five result columns, so the
-- value extracted under the 'timestamp' key ends up in the column called datetime.
-- NOTE(review): jdictsplit presumably extracts the listed keys from each record in c1,
-- and stripchars(c1,',') presumably trims stray commas around it; confirm against the madis docs.
insert into piwiklog select * from (setschema 'datetime, action,user,session,data' select jdictsplit(stripchars(c1,','),'timestamp','action','user','session','data') from stdinput());
4
-- Single composite index over all five columns, led by action.
create index piwiklog_index on piwiklog(action,data,session,user,datetime);
5

  
6

  
7
-- Final output: one jdict per row with keys 'cid' (cliqueid) and 'SimilarDocid' (nodeid),
-- skipping rows whose nodeid is null. Reading the nesting from the inside out:
--   1. every piwiklog row has its data column replaced by
--      jlengthiest(jdictvals(urlquery2jdict(data), <five id keys>));
--   2. rows are kept only for the six view* actions, then grouped by session, and
--      jcombinations(jsort(jset(jgroup(data))),2) yields the pgroup1/pgroup2 pairs;
--   3. pairs are counted over all rows and kept only when the count is greater than 4;
--   4. the surviving pairs are passed to graphcliques.
-- NOTE(review): cliqueid and nodeid are presumably the output columns of graphcliques; confirm.
-- NOTE(review): 'viewOrganization' passes the action filter, but no 'organizationId' key is
-- given to jdictvals; confirm whether organization views are meant to contribute.
-- NOTE(review): the threshold 4 is hard-coded; its rationale is not visible here.
select jdict('cid',cliqueid,'SimilarDocid',nodeid) from 
8
    (select graphcliques(node1,node2) from 
9
        (select pgroup1 as node1, pgroup2 as node2, count(*) as frequency from 
10
            (setschema 'session,pgroup1,pgroup2' select session,jcombinations(jsort(jset(jgroup(data))),2) as pgroup from 
11
                (select datetime, action,user,session,jlengthiest(jdictvals(urlquery2jdict(data),'personId','projectId','publicationId','datasourceId','datasetId')) as data from piwiklog) 
12
                where (action='viewPerson' or action='viewPublication' or action='viewProject' or action='viewDatasource' or action='viewOrganization' or action='viewDataset') 
13
            group by session) 
14
        group by node1,node2 having frequency>4)
15
    ) 
16
where nodeid is not null;
modules/uoa-iis-websiteusage/trunk/src/main/resources/eu/dnetlib/iis/websiteusage/document/communities_builder/job.properties
1
# Value for the workflow's input_logs parameter; hard-coded to a single dated directory.
input_logs=/share/import/logs/2014-11-03
2
# Value for the workflow's output parameter.
# NOTE(review): workingDir is not defined in this file; presumably supplied by the runner -- confirm.
output=${workingDir}/out
3 0

  

Also available in: Unified diff