<workflow-app xmlns="uri:oozie:workflow:0.3"
	name="test-javamapreduce_cloner_with_multiple_output_without_reducer_with_empty_input">
	<start to="producer"/>
	<action name="producer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/producer" />
				<mkdir path="${nameNode}${workingDir}/producer" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that is to be executed -->
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- Specification of the output ports -->
			<arg>-C{person,
				eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person,
				eu/dnetlib/iis/core/examples/data/empty.json}</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Operson=${workingDir}/producer/person</arg>
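			<!-- Note (added for clarity, based on how these arguments are used in this
				workflow): the -C{port, schema, resource} argument appears to declare a
				port named "person" with the given Avro schema and the content of the
				given JSON resource (an empty file here), while -Operson=... binds that
				output port to a concrete HDFS path under ${workingDir}. -->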
		</java>
		<ok to="cloner" />
		<error to="fail" />
	</action>
	<action name="cloner">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in the previous run is 
				deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/cloner" />
			</prepare>
			<!-- This is a multiple-output MapReduce job, so there is no need to 
				create the cloner directory; it will be created by MapReduce. -->
			<configuration>

				<!-- # Standard set of options that stay the same regardless 
					of the concrete definition of the map-reduce job -->

				<!-- ## Various options -->

				<!-- This property does not seem to be needed -->
				<!--<property> <name>mapred.job.queue.name</name> <value>${queueName}</value> 
					</property> -->
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
				</property>
				<!-- The output format is not needed since there is no Reduce phase -->
				<!-- <property>
					<name>mapreduce.outputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyOutputFormat</value>
				</property>-->
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization
					</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB
					</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>

				<!-- ## This is required for new MapReduce API usage -->

				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- # Job-specific options -->

				<!-- Since there is no reduce phase, there should be no 
					reduce tasks -->
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>

				<!-- ## Names of all output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs</name>
					<value>person age</value>
				</property>
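				<!-- Note (added for clarity): each named output listed above becomes a
					separate data store written under mapred.output.dir, i.e.
					${workingDir}/cloner/person and ${workingDir}/cloner/age, which is
					where the consumer node below reads from. -->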

				<!-- ## Output classes for all output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.person.format
					</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.age.format
					</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>

				<!-- ## Classes of mapper and reducer -->

				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.core.examples.javamapreduce.MultipleOutputPersonClonerMapper</value>
				</property>
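				<!-- Note (added for clarity; an assumption based on the class name and
					the schemas configured below): this mapper presumably writes Person
					records to the "person" named output and PersonAge records to the
					"age" named output, and with the empty input used in this test it
					emits nothing. -->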

				<!-- No reducer -->

				<!-- ## Schemas -->

				<!-- ### Schema of the data ingested by the mapper. To be more precise, 
					it's the schema of the Avro data passed as the template parameter of 
					the AvroKey object handed to the mapper. -->
				<property>
					<name>eu.dnetlib.iis.avro.input.class</name>
					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person</value>
				</property>

				<!-- ### Schemas of the data produced by the mapper -->

				<!-- #### Schema of the key produced by the mapper. To be more precise, 
					it's the schema of the Avro data produced by the mapper and passed 
					forward as the template parameter of the AvroKey object. -->

				<!-- As a convention, we're setting "null" values here, 
					since the mapper does not produce any standard data in this example 
					(probably any other valid Avro schema would be OK as well). -->
				<property>
					<name>eu.dnetlib.iis.avro.map.output.key.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>

				<!-- #### Schema of the value produced by the mapper. To be more precise, 
					it's the schema of the Avro data produced by the mapper and passed 
					forward as the template parameter of the AvroValue object. -->

				<!-- As a convention, we're setting "null" values here, 
					since the mapper does not produce any standard data in this example 
					(probably any other valid Avro schema would be OK as well). -->
				<property>
					<name>eu.dnetlib.iis.avro.map.output.value.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>

				<!-- ### Schema of multiple output ports. -->

				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.person
					</name>
					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person</value>
				</property>

				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.age
					</name>
					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.PersonAge</value>
				</property>

				<!-- ## Specification of the input and output data stores -->

				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/producer/person</value>
				</property>
				<!-- This directory does not correspond to a single data store; it 
					contains multiple data stores, one per named output. It has to 
					be set to the name of the workflow node. -->
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/cloner</value>
				</property>

				<!-- ## Workflow node parameters -->

				<property>
					<name>copiesCount</name>
					<value>2</value>
				</property>
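				<!-- Note (added for clarity; an assumption based on the parameter name):
					copiesCount presumably tells the cloner mapper how many copies of each
					input record to produce. -->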
			</configuration>
		</map-reduce>
		<ok to="consumer" />
		<error to="fail" />
	</action>
	<action name="consumer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that is to be executed -->
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.TestingConsumer</arg>
			<!-- Specification of the input ports -->
			<arg>-C{person,
				eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person,
				eu/dnetlib/iis/core/examples/data/empty.json}</arg>
			<arg>-C{age,
				eu.dnetlib.iis.core.examples.schemas.documentandauthor.PersonAge,
				eu/dnetlib/iis/core/examples/data/empty.json}</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Iperson=${workingDir}/cloner/person</arg>
			<arg>-Iage=${workingDir}/cloner/age</arg>
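			<!-- Note (added for clarity; an assumption based on the class name and the
				-C arguments above): TestingConsumer presumably compares the content of
				each bound input port with the referenced JSON file, so this test expects
				both the "person" and "age" data stores produced by the cloner to be
				empty. -->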
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]
		</message>
	</kill>
	<end name="end" />
</workflow-app>