<workflow-app xmlns="uri:oozie:workflow:0.3"
	name="test-core_examples_javamapreduce_cloner_with_multiple_output_without_reducer_with_explicit_schema">
	<!-- This example writes to two datastores: person and age.
		The class responsible for writing multiple datastores is
		eu.dnetlib.iis.core.examples.javamapreduce.MultipleOutputPersonClonerMapper. -->
	<start to="data_producer" />
	<action name="data_producer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/data_producer" />
				<mkdir path="${nameNode}${workingDir}/data_producer" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets executed -->
			<arg>eu.dnetlib.iis.core.examples.java.SampleDataProducer</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Operson=${workingDir}/data_producer/person</arg>
			<arg>-Odocument=${workingDir}/data_producer/document</arg>
		</java>
		<ok to="mr_cloner" />
		<error to="fail" />
	</action>
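	<!-- For reference only (this comment is not required by the workflow): each
		output port bound above becomes a directory of Avro data files in HDFS.
		A minimal standalone dump of one file from the person datastore could look
		roughly like the sketch below; it assumes the Avro-generated Person class
		from the schema used later in this workflow, and the class name
		PersonDatastoreDump is made up for illustration.

		import org.apache.avro.file.DataFileReader;
		import org.apache.avro.mapred.FsInput;
		import org.apache.avro.specific.SpecificDatumReader;
		import org.apache.hadoop.conf.Configuration;
		import org.apache.hadoop.fs.Path;
		import eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person;

		public class PersonDatastoreDump {
			public static void main(String[] args) throws Exception {
				// args[0]: path to a single Avro file inside the person datastore directory
				Path avroFile = new Path(args[0]);
				Configuration conf = new Configuration();
				try (DataFileReader<Person> reader = new DataFileReader<Person>(
						new FsInput(avroFile, conf), new SpecificDatumReader<Person>(Person.class))) {
					while (reader.hasNext()) {
						// prints each Person record in its JSON-like toString form
						System.out.println(reader.next());
					}
				}
			}
		}
	-->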
	<action name="mr_cloner">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is
				deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/mr_cloner" />
			</prepare>
			<!-- This is a multiple-output MapReduce job, so there is no need to
				create the mr_cloner directory; it will be created by MapReduce. -->
			<configuration>

				<!-- # Standard set of options that stays the same regardless
					of the concrete definition of the map-reduce job -->

				<!-- ## Various options -->

				<!-- This property does not seem to be needed -->
				<!-- <property> <name>mapred.job.queue.name</name> <value>${queueName}</value>
					</property> -->
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
				</property>
				<!-- The output format is not needed since there is no reduce phase -->
				<!-- <property>
					<name>mapreduce.outputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyOutputFormat</value>
				</property> -->
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>

				<!-- ## This is required for new MapReduce API usage -->

				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- # Job-specific options -->

				<!-- Since there is no reduce phase, there should be no
					reduce tasks -->
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>

				<!-- ## Names of all output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs</name>
					<value>person age</value>
				</property>

				<!-- ## Output classes for all output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.person.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.age.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>

				<!-- ## Classes of mapper and reducer -->

				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.core.examples.javamapreduce.MultipleOutputPersonClonerMapper</value>
				</property>

				<!-- No reducer -->
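				<!-- The sketch below shows, for illustration only, roughly what a mapper
					writing to these two named outputs could look like when built on the
					stock org.apache.avro.mapreduce.AvroMultipleOutputs class. The actual
					MultipleOutputPersonClonerMapper may differ (for instance, it may rely
					on classes from the eu.dnetlib.iis.core.javamapreduce.hack package),
					and the use of the generated Person/PersonAge classes is assumed.

					import java.io.IOException;
					import org.apache.avro.mapred.AvroKey;
					import org.apache.avro.mapred.AvroValue;
					import org.apache.avro.mapreduce.AvroMultipleOutputs;
					import org.apache.hadoop.io.NullWritable;
					import org.apache.hadoop.mapreduce.Mapper;
					import eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person;
					import eu.dnetlib.iis.core.examples.schemas.documentandauthor.PersonAge;

					public class MultipleOutputPersonClonerMapperSketch
							extends Mapper<AvroKey<Person>, NullWritable, AvroKey<?>, AvroValue<?>> {

						private AvroMultipleOutputs outputs;
						private int copiesCount;

						@Override
						protected void setup(Context context) {
							outputs = new AvroMultipleOutputs(context);
							// "copiesCount" is the workflow node parameter defined at the end of this configuration
							copiesCount = context.getConfiguration().getInt("copiesCount", 1);
						}

						@Override
						protected void map(AvroKey<Person> key, NullWritable ignore, Context context)
								throws IOException, InterruptedException {
							Person person = key.datum();
							for (int i = 0; i < copiesCount; i++) {
								// the names "person" and "age" match avro.mapreduce.multipleoutputs above
								outputs.write("person", new AvroKey<Person>(person));
								outputs.write("age", new AvroKey<PersonAge>(
										PersonAge.newBuilder().setAge(person.getAge()).build()));
							}
						}

						@Override
						protected void cleanup(Context context) throws IOException, InterruptedException {
							outputs.close();
						}
					}
				-->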

				<!-- ## Schemas -->

				<!-- ### Schema of the data ingested by the mapper. To be more precise,
					it's the schema of the Avro data passed as the type parameter of the
					AvroKey object passed to the mapper. -->
				<property>
					<name>avro.schema.input.key</name>
					<value>{
						"type" : "record",
						"name" : "Person",
						"namespace" : "eu.dnetlib.iis.core.examples.schemas.documentandauthor",
						"fields" : [ {
						"name" : "id",
						"type" : "int"
						}, {
						"name" : "name",
						"type" : "string"
						}, {
						"name" : "age",
						"type" : "int"
						} ]
						}
					</value>
				</property>

				<!-- ### Schemas of the data produced by the mapper -->

				<!-- #### Schema of the key produced by the mapper. To be more precise,
					it's the schema of the Avro data produced by the mapper and passed
					forward as the type parameter of the AvroKey object (it has to have the
					same value for the "*.reader.schema" and "*.writer.schema" properties). -->

				<!-- As a convention, we're setting "null" values here,
					since the mapper does not produce any standard data in this example
					(probably any other valid Avro schema would do as well). -->
				<property>
					<name>avro.serialization.key.reader.schema</name>
					<value>"null"</value>
				</property>
				<property>
					<name>avro.serialization.key.writer.schema</name>
					<value>"null"</value>
				</property>

				<!-- #### Schema of the value produced by the mapper. To be more precise,
					it's the schema of the Avro data produced by the mapper and passed
					forward as the type parameter of the AvroValue object (it has to have the
					same value for the "*.reader.schema" and "*.writer.schema" properties). -->

				<!-- As a convention, we're setting "null" values here,
					since the mapper does not produce any standard data in this example
					(probably any other valid Avro schema would do as well). -->
				<property>
					<name>avro.serialization.value.reader.schema</name>
					<value>"null"</value>
				</property>
				<property>
					<name>avro.serialization.value.writer.schema</name>
					<value>"null"</value>
				</property>

				<!-- ### Schemas of the multiple output ports -->

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.person.keyschema</name>
					<value>{
						"type" : "record",
						"name" : "Person",
						"namespace" : "eu.dnetlib.iis.core.examples.schemas.documentandauthor",
						"fields" : [ {
						"name" : "id",
						"type" : "int"
						}, {
						"name" : "name",
						"type" : "string"
						}, {
						"name" : "age",
						"type" : "int"
						} ]
						}
					</value>
				</property>

				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.age.keyschema</name>
					<value>{
							"type" : "record",
							"name" : "PersonAge",
							"namespace" : "eu.dnetlib.iis.core.examples.schemas.documentandauthor",
							"fields" : [ {
							  "name" : "age",
							  "type" : "int"
							} ]
							}
					</value>
				</property>

				<!-- ## Specification of the input and output data stores -->

				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/data_producer/person</value>
				</property>
				<!-- This directory does not correspond to a data store. In fact,
					this directory only contains multiple data stores. It has to
					be set to the name of the workflow node. -->
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/mr_cloner</value>
				</property>
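				<!-- Note: each named output listed in avro.mapreduce.multipleoutputs above
					ends up as its own datastore under this directory (here, presumably
					${workingDir}/mr_cloner/person and ${workingDir}/mr_cloner/age), which is
					why the next workflow node reads ${workingDir}/mr_cloner/person. -->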

				<!-- ## Workflow node parameters -->

				<property>
					<name>copiesCount</name>
					<value>2</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="cloner" />
		<error to="fail" />
	</action>

	<!-- The cloner works on the duplicated data -->
	<action name="cloner">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node in a previous run is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/cloner" />
				<mkdir path="${nameNode}${workingDir}/cloner" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets executed -->
			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
			<!-- All input and output ports have to be bound to paths in HDFS -->
			<arg>-Iperson=${workingDir}/mr_cloner/person</arg>
			<arg>-Operson=${workingDir}/cloner/person</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]
		</message>
	</kill>
	<end name="end" />
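	<!-- To run this workflow, the application directory containing this workflow.xml
		(and its lib/ directory with the required jars) is uploaded to HDFS and the job
		is submitted with the standard Oozie command-line client, for example (host and
		file names are only placeholders):

			oozie job -oozie http://localhost:11000/oozie -config job.properties -run

		where job.properties supplies at least nameNode, jobTracker, queueName,
		workingDir and oozie.wf.application.path. -->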
</workflow-app>