<workflow-app xmlns="uri:oozie:workflow:0.3"
	name="test-core_examples_javamapreduce_cloner_with_multiple_output_without_reducer_with_explicit_schema">
	<!-- This example writes to two datastores: person and age.
		The class responsible for writing multiple datastores is
		eu.dnetlib.iis.core.examples.javamapreduce.MultipleOutputPersonClonerMapper. -->
	<start to="data_producer" />
	<action name="data_producer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/data_producer" />
				<mkdir path="${nameNode}${workingDir}/data_producer" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets executed -->
			<arg>eu.dnetlib.iis.core.examples.java.SampleDataProducer</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Operson=${workingDir}/data_producer/person</arg>
			<arg>-Odocument=${workingDir}/data_producer/document</arg>
		</java>
		<ok to="mr_cloner" />
		<error to="fail" />
	</action>
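	<!-- For reference only (this comment is not required by the workflow): each
		output port bound above becomes a directory of Avro data files in HDFS.
		A minimal standalone dump of one file from the person datastore could look
		roughly like the sketch below; it assumes the Avro-generated Person class
		from the schema used later in this workflow, and the class name
		PersonDatastoreDump is made up for illustration.

		import org.apache.avro.file.DataFileReader;
		import org.apache.avro.mapred.FsInput;
		import org.apache.avro.specific.SpecificDatumReader;
		import org.apache.hadoop.conf.Configuration;
		import org.apache.hadoop.fs.Path;
		import eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person;

		public class PersonDatastoreDump {
			public static void main(String[] args) throws Exception {
				// args[0]: path to a single Avro file inside the person datastore directory
				Path avroFile = new Path(args[0]);
				Configuration conf = new Configuration();
				try (DataFileReader<Person> reader = new DataFileReader<Person>(
						new FsInput(avroFile, conf), new SpecificDatumReader<Person>(Person.class))) {
					while (reader.hasNext()) {
						// prints each Person record in its JSON-like toString form
						System.out.println(reader.next());
					}
				}
			}
		}
	-->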
	<action name="mr_cloner">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is
				deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/mr_cloner" />
			</prepare>
			<!-- This is a multiple-output MapReduce job, so there is no need to
				create the mr_cloner directory; it will be created by MapReduce. -->
			<configuration>

				<!-- # Standard set of options that stays the same regardless
					of the concrete definition of the map-reduce job -->

				<!-- ## Various options -->

				<!-- This property does not seem to be needed -->
				<!-- <property> <name>mapred.job.queue.name</name> <value>${queueName}</value>
					</property> -->
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
				</property>
				<!-- The output format is not needed since there is no reduce phase -->
				<!-- <property>
					<name>mapreduce.outputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyOutputFormat</value>
				</property> -->
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>

				<!-- ## This is required for new MapReduce API usage -->

				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- # Job-specific options -->

				<!-- Since there is no reduce phase, there should be no
					reduce tasks -->
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>

				<!-- ## Names of all output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs</name>
					<value>person age</value>
				</property>

				<!-- ## Output classes for all output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.person.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.age.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>

				<!-- ## Classes of mapper and reducer -->

				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.core.examples.javamapreduce.MultipleOutputPersonClonerMapper</value>
				</property>

				<!-- No reducer -->
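				<!-- The sketch below shows, for illustration only, roughly what a mapper
					writing to these two named outputs could look like when built on the
					stock org.apache.avro.mapreduce.AvroMultipleOutputs class. The actual
					MultipleOutputPersonClonerMapper may differ (for instance, it may rely
					on classes from the eu.dnetlib.iis.core.javamapreduce.hack package),
					and the use of the generated Person/PersonAge classes is assumed.

					import java.io.IOException;
					import org.apache.avro.mapred.AvroKey;
					import org.apache.avro.mapred.AvroValue;
					import org.apache.avro.mapreduce.AvroMultipleOutputs;
					import org.apache.hadoop.io.NullWritable;
					import org.apache.hadoop.mapreduce.Mapper;
					import eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person;
					import eu.dnetlib.iis.core.examples.schemas.documentandauthor.PersonAge;

					public class MultipleOutputPersonClonerMapperSketch
							extends Mapper<AvroKey<Person>, NullWritable, AvroKey<?>, AvroValue<?>> {

						private AvroMultipleOutputs outputs;
						private int copiesCount;

						@Override
						protected void setup(Context context) {
							outputs = new AvroMultipleOutputs(context);
							// "copiesCount" is the workflow node parameter defined at the end of this configuration
							copiesCount = context.getConfiguration().getInt("copiesCount", 1);
						}

						@Override
						protected void map(AvroKey<Person> key, NullWritable ignore, Context context)
								throws IOException, InterruptedException {
							Person person = key.datum();
							for (int i = 0; i < copiesCount; i++) {
								// the names "person" and "age" match avro.mapreduce.multipleoutputs above
								outputs.write("person", new AvroKey<Person>(person));
								outputs.write("age", new AvroKey<PersonAge>(
										PersonAge.newBuilder().setAge(person.getAge()).build()));
							}
						}

						@Override
						protected void cleanup(Context context) throws IOException, InterruptedException {
							outputs.close();
						}
					}
				-->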

				<!-- ## Schemas -->

				<!-- ### Schema of the data ingested by the mapper. To be more precise,
					it's the schema of the Avro data passed as the type parameter of the
					AvroKey object passed to the mapper. -->
				<property>
					<name>avro.schema.input.key</name>
					<value>{
						"type" : "record",
						"name" : "Person",
						"namespace" : "eu.dnetlib.iis.core.examples.schemas.documentandauthor",
						"fields" : [ {
						"name" : "id",
						"type" : "int"
						}, {
						"name" : "name",
						"type" : "string"
						}, {
						"name" : "age",
						"type" : "int"
						} ]
						}
					</value>
				</property>

				<!-- ### Schemas of the data produced by the mapper -->

				<!-- #### Schema of the key produced by the mapper. To be more precise,
					it's the schema of the Avro data produced by the mapper and passed
					forward as the type parameter of the AvroKey object (it has to have the
					same value for the "*.reader.schema" and "*.writer.schema" properties). -->

				<!-- As a convention, we're setting "null" values here,
					since the mapper does not produce any standard data in this example
					(probably any other valid Avro schema would do as well). -->
				<property>
					<name>avro.serialization.key.reader.schema</name>
					<value>"null"</value>
				</property>
				<property>
					<name>avro.serialization.key.writer.schema</name>
					<value>"null"</value>
				</property>

				<!-- #### Schema of the value produced by the mapper. To be more precise,
					it's the schema of the Avro data produced by the mapper and passed
					forward as the type parameter of the AvroValue object (it has to have the
					same value for the "*.reader.schema" and "*.writer.schema" properties). -->

				<!-- As a convention, we're setting "null" values here,
					since the mapper does not produce any standard data in this example
					(probably any other valid Avro schema would do as well). -->
				<property>
					<name>avro.serialization.value.reader.schema</name>
					<value>"null"</value>
				</property>
				<property>
					<name>avro.serialization.value.writer.schema</name>
					<value>"null"</value>
				</property>

				<!-- ### Schemas of the multiple output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.person.keyschema</name>
					<value>{
						"type" : "record",
						"name" : "Person",
						"namespace" : "eu.dnetlib.iis.core.examples.schemas.documentandauthor",
						"fields" : [ {
						"name" : "id",
						"type" : "int"
						}, {
						"name" : "name",
						"type" : "string"
						}, {
						"name" : "age",
						"type" : "int"
						} ]
						}
					</value>
				</property>

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.age.keyschema</name>
					<value>{
							"type" : "record",
							"name" : "PersonAge",
							"namespace" : "eu.dnetlib.iis.core.examples.schemas.documentandauthor",
							"fields" : [ {
							  "name" : "age",
							  "type" : "int"
							} ]
							}
					</value>
				</property>

				<!-- ## Specification of the input and output data stores -->

				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/data_producer/person</value>
				</property>
				<!-- This directory does not correspond to a data store. In fact,
					this directory only contains multiple data stores. It has to
					be set to the name of the workflow node. -->
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/mr_cloner</value>
				</property>
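				<!-- Note: each named output listed in avro.mapreduce.multipleoutputs above
					ends up as its own datastore under this directory (here, presumably
					${workingDir}/mr_cloner/person and ${workingDir}/mr_cloner/age), which is
					why the next workflow node reads ${workingDir}/mr_cloner/person. -->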

				<!-- ## Workflow node parameters -->

				<property>
					<name>copiesCount</name>
					<value>2</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="cloner" />
		<error to="fail" />
	</action>

	<!-- The cloner works on the duplicated data -->
	<action name="cloner">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/cloner" />
				<mkdir path="${nameNode}${workingDir}/cloner" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets executed -->
			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Iperson=${workingDir}/mr_cloner/person</arg>
			<arg>-Operson=${workingDir}/cloner/person</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]
		</message>
	</kill>
	<end name="end" />
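	<!-- To run this workflow, the application directory containing this workflow.xml
		(and its lib/ directory with the required jars) is uploaded to HDFS and the job
		is submitted with the standard Oozie command-line client, for example (host and
		file names are only placeholders):

			oozie job -oozie http://localhost:11000/oozie -config job.properties -run

		where job.properties supplies at least nameNode, jobTracker, queueName,
		workingDir and oozie.wf.application.path. -->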
</workflow-app>