<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.3" name="test-core_examples_javamapreduce_person_by_age_splitter">
	<!--
		This example writes to multiple datastores: 2 sequence files with persons of odd and even age.
		Then another MR job reads both datastores and clones all persons.
	-->
	<start to="data_producer" />

	<!-- Produces the sample person/document datastores consumed by the downstream nodes. -->
	<action name="data_producer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/data_producer" />
				<mkdir path="${nameNode}${workingDir}/data_producer" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets to be executed -->
			<arg>eu.dnetlib.iis.core.examples.java.SampleDataProducer</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Operson=${workingDir}/data_producer/person</arg>
			<arg>-Odocument=${workingDir}/data_producer/document</arg>
		</java>
		<ok to="mr_splitter" />
		<error to="fail" />
	</action>

	<!-- Splits the person datastore into two named outputs: persons of even and of odd age. -->
	<action name="mr_splitter">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/mr_splitter" />
				<!-- multiple output, no need to create mr_splitter, will be created by mapred
				<mkdir path="${nameNode}${workingDir}/mr_splitter" />
				-->
			</prepare>
			<configuration>
				<!-- This is required for new api usage -->
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- Standard stuff for our framework -->
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.io.BytesWritable</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.hadoop.io.BytesWritable</value>
				</property>
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat</value>
				</property>
				<property>
					<name>mapreduce.outputformat.class</name>
					<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat</value>
				</property>
				<!-- Other stuff -->
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.core.examples.javamapreduce.PersonByAgeSplitter</value>
				</property>
				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/data_producer/person</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/mr_splitter</value>
				</property>
				<!-- multiple output definitions -->
				<property>
					<name>named.output.person.age.even</name>
					<value>${outputAgeEven}</value>
				</property>
				<property>
					<name>named.output.person.age.odd</name>
					<value>${outputAgeOdd}</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="mr_cloner_with_multiple_input" />
		<error to="fail" />
	</action>

	<!-- cloner works on split data, reads from both datastores -->
	<action name="mr_cloner_with_multiple_input">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/mr_cloner_with_multiple_input" />
				<mkdir path="${nameNode}${workingDir}/mr_cloner_with_multiple_input" />
			</prepare>
			<configuration>
				<!-- This is required for new api usage -->
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- Standard stuff for our framework -->
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.io.BytesWritable</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.hadoop.io.BytesWritable</value>
				</property>
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat</value>
				</property>
				<property>
					<name>mapreduce.outputformat.class</name>
					<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat</value>
				</property>
				<!-- This is an outdated property -->
				<!-- <property>
					<name>mapred.map.tasks</name>
					<value>1</value>
				</property> -->

				<!-- Other stuff -->
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.core.examples.javamapreduce.PersonClonerMapper</value>
				</property>
				<property>
					<name>mapred.input.dir</name>
					<!-- simply setting multiple inputs as CSV -->
					<value>${workingDir}/mr_splitter/${outputAgeEven},${workingDir}/mr_splitter/${outputAgeOdd}</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/mr_cloner_with_multiple_input/person</value>
				</property>
				<!-- Workflow node parameters -->
				<property>
					<name>copiesCount</name>
					<value>2</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="cloner" />
		<error to="fail" />
	</action>

	<!-- Plain-Java cloner that consumes the merged output of the MR cloner above. -->
	<action name="cloner">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/cloner" />
				<mkdir path="${nameNode}${workingDir}/cloner" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets to be executed -->
			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Iperson=${workingDir}/mr_cloner_with_multiple_input/person</arg>
			<arg>-Operson=${workingDir}/cloner/person</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>

	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end" />
</workflow-app>