<?xml version="1.0"?>
<!-- Note that documentation placed in comments in this file uses the
"markdown" syntax (along with its way of dividing text into sections). -->
<workflow-app xmlns="uri:oozie:workflow:0.3" name="test-core_examples_hadoopstreaming_cloner_without_reducer_with_explicit_schema_file">
    <start to="data_producer" />
    <action name="data_producer">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- Any data left over from a previous run of this node is
            removed here before the node is executed. -->
            <prepare>
                <delete path="${nameNode}${workingDir}/data_producer" />
                <mkdir path="${nameNode}${workingDir}/data_producer" />
            </prepare>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <!-- This is a simple wrapper for the Java code -->
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <!-- The business Java class that actually gets executed -->
            <arg>eu.dnetlib.iis.core.examples.java.SampleDataProducer</arg>
            <!-- All input and output ports have to be bound to paths in HDFS.
            One way to inspect the records written to such a path is sketched
            in the comment after this action. -->
            <arg>-Operson=${workingDir}/data_producer/person</arg>
            <arg>-Odocument=${workingDir}/data_producer/document</arg>
        </java>
        <ok to="python_cloner" />
        <error to="fail" />
    </action>
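    <!-- For illustration only: one way to peek at the Avro records written
    to an output port (e.g. the "person" port above) is to copy a part file
    from HDFS to the local machine and read it with the standard "avro"
    Python package. This is a minimal sketch, not part of the workflow;
    the file name is hypothetical:

        from avro.datafile import DataFileReader
        from avro.io import DatumReader

        # Hypothetical local copy of one output file of the "person" port.
        reader = DataFileReader(open("part-00000.avro", "rb"), DatumReader())
        for record in reader:
            print(record)  # each record is a Python dict
        reader.close()
    -->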
    <action name="python_cloner">
        <map-reduce>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <prepare>
                <delete path="${nameNode}${workingDir}/python_cloner"/>
                <mkdir path="${nameNode}${workingDir}/python_cloner"/>
            </prepare>
            <streaming>
                <!-- Here we give the relative path to the script and pass it
                the parameters of this workflow node. The script is kept in a
                directory with the same name as the workflow node.

                The parameters should be passed as **named** arguments; this
                convention makes the code more readable and maintainable.
                A sketch of such a script is given after this element. -->
                <mapper>scripts/python_cloner/cloner.py --copies 3</mapper>
            </streaming>
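            <!-- For illustration only, here is a minimal sketch of what a
            streaming mapper such as cloner.py might look like. It is an
            assumption, not the actual script shipped with this workflow: it
            assumes that the AvroAsJSON input format feeds the script one
            JSON-encoded record per line on standard input, and that records
            written to standard output in the same form are picked up by the
            output format.

                #!/usr/bin/env python
                import sys

                def parse_named_args(argv):
                    # Collect "name value" pairs; on the command line each
                    # name is prefixed with two dashes, as in the <mapper>
                    # element above.
                    args = {}
                    key = None
                    for token in argv:
                        if token.startswith("-"):
                            key = token.lstrip("-")
                        elif key is not None:
                            args[key] = token
                            key = None
                    return args

                def main():
                    args = parse_named_args(sys.argv[1:])
                    copies = int(args.get("copies", "1"))
                    for line in sys.stdin:
                        record = line.rstrip("\n")
                        # Emit each input record the requested number of times.
                        for _ in range(copies):
                            sys.stdout.write(record + "\n")

                if __name__ == "__main__":
                    main()
            -->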
            <configuration>
                <!-- # Standard settings for our framework -->
                <property>
                    <name>mapred.output.format.class</name>
                    <value>com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat</value>
                </property>
                <property>
                    <name>mapred.input.format.class</name>
                    <value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value>
                </property>
                <!-- # Custom settings for this workflow node -->
                <!-- We do not use any reducers, so we set their number to 0 -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>0</value>
                </property>
                <property>
                    <name>mapred.input.dir</name>
                    <value>${workingDir}/data_producer/person</value>
                </property>
                <!-- Path to the input schema. This is held in the same
                directory as the script. -->
                <property>
                    <name>input.schema.url</name>
                    <value>${wf:appPath()}/lib/scripts/python_cloner/Person.avsc</value>
                </property>
                <property>
                    <name>mapred.output.dir</name>
                    <value>${workingDir}/python_cloner/output</value>
                </property>
                <!-- Path to the output schema. This is held in the same
                directory as the script. -->
                <property>
                    <name>output.schema.url</name>
                    <value>${wf:appPath()}/lib/scripts/python_cloner/Person.avsc</value>
                </property>
            </configuration>
        </map-reduce>
        <ok to="cloner"/>
        <error to="fail"/>
    </action>
    <action name="cloner">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- Any data left over from a previous run of this node is
            removed here before the node is executed. -->
            <prepare>
                <delete path="${nameNode}${workingDir}/cloner" />
                <mkdir path="${nameNode}${workingDir}/cloner" />
            </prepare>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <!-- This is a simple wrapper for the Java code -->
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <!-- The business Java class that actually gets executed -->
            <arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
            <!-- All input and output ports have to be bound to paths in HDFS -->
            <arg>-Iperson=${workingDir}/python_cloner/output</arg>
            <arg>-Operson=${workingDir}/cloner/person</arg>
        </java>
        <ok to="end" />
        <error to="fail" />
    </action>
    <kill name="fail">
        <message>Unfortunately, the process failed -- error message:
        [${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="end"/>
</workflow-app>
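<!-- For illustration only: the properties referenced above (jobTracker,
nameNode, queueName, workingDir) are supplied at submission time, e.g. in a
job.properties file passed to the Oozie command-line client. A hypothetical
invocation, assuming a local Oozie server:

    oozie job -oozie http://localhost:11000/oozie -config job.properties -run
-->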