
Revision 28252

Added by Eri Katsari over 10 years ago

modules/dnet-openaire-stats/trunk/src/test/java/eu/dnetlib/data/mapreduce/hbase/statsExport/utils/StatsOozieWorkflow.xml
-<workflow-app name="test-core_examples_javamapreduce_cloner_with_multiple_output"
+<workflow-app
+	name="test-core_examples_javamapreduce_cloner_with_multiple_output"
 	xmlns="uri:oozie:workflow:0.3">
-	<!-- This example writes to 2 datastores: person and documents. The class responsible
-		for writing multiple datastores is: eu.dnetlib.iis.core.examples.javamapreduce.PersonClonerMapperMultipleOutput.
-		-->
-	<start to="mr_exporter"/>
+	<!-- map reduce job that exports hbase data and prepares them for import
+		to the relation database used for statistics generation -->
+	<start to="mr_exporter" />
 	<action name="mr_exporter">
 		<map-reduce>
 			<job-tracker>${jobTracker}</job-tracker>
 			<name-node>${nameNode}</name-node>
-			<!-- The data generated by this node in the previous run is deleted in this section
-				-->
+			<!-- The data generated by this node in the previous run is deleted in
+				this section -->
 			<prepare>
-				<delete path="${nameNode}${workingDir}/mr_cloner" />
+				<delete path="${nameNode}${workingDir}/${outputPath}" />
 			</prepare>
-			<!-- That's a multiple output MapReduce job, so no need to create mr_cloner directory,
-				since it will be created by MapReduce /> -->
+			<!-- That's a multiple output MapReduce job, so no need to create mr_cloner
+				directory, since it will be created by MapReduce /> -->
 			<configuration>
-
-				<!-- # Standard set of options that stays the same regardless of a concrete definition
-					of map-reduce job -->
-
+
+				<!-- # Standard set of options that stays the same regardless of a concrete
+					definition of map-reduce job -->
+
 				<!-- ## Various options -->
-
-				<!--This property seems to not be needed -->
-				<!--<property> <name>mapred.job.queue.name</name> <value>${queueName}</value> </property>
-					-->
+				<!-- <PARAM name="index.entity.links" required="true" description="entity
+					joiner configuration" /> -->
+				<!-- <PARAM name="contextmap" required="true" description="context map
+					(ContextDSResources)" /> -->
+
 				<property>
 					<name>mapreduce.inputformat.class</name>
-					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
+					<value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
 				</property>
 				<property>
 					<name>mapred.mapoutput.key.class</name>
-					<value>org.apache.avro.mapred.AvroKey</value>
+					<value>org.apache.hadoop.io.Text</value>
 				</property>
 				<property>
 					<name>mapred.mapoutput.value.class</name>
-					<value>org.apache.avro.mapred.AvroValue</value>
+					<value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
 				</property>
 				<property>
 					<name>mapred.output.key.class</name>
-					<value>org.apache.avro.mapred.AvroKey</value>
+					<value>org.apache.hadoop.io.Text</value>
 				</property>
 				<property>
 					<name>mapred.output.value.class</name>
-					<value>org.apache.avro.mapred.AvroValue</value>
-				</property>
-				<property>
-					<name>mapred.output.key.comparator.class</name>
-					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
-				</property>
-				<property>
-					<name>io.serializations</name>
-					<value>
-						org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization
+					<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
 					</value>
 				</property>
+				<!-- ## This is required for new MapReduce API usage -->
+
 				<property>
-					<name>mapred.output.value.groupfn.class</name>
-					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
-				</property>
-				<property>
-					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
-					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
-				</property>
-
-				<!-- ## This is required for new MapReduce API usage -->
-
-				<property>
 					<name>mapred.mapper.new-api</name>
 					<value>true</value>
 				</property>
......
 					<name>mapred.reducer.new-api</name>
 					<value>true</value>
 				</property>
-
+
 				<!-- # Job-specific options -->
-
-				<!-- ## Names of all output ports -->
-
+
 				<property>
-					<name>avro.mapreduce.multipleoutputs</name>
-					<value>person age</value>
+					<name>dfs.blocksize</name>
+					<value>32M</value>
 				</property>
-
-				<!-- ## Output classes for all output ports -->
-
 				<property>
-					<name>avro.mapreduce.multipleoutputs.namedOutput.person.format</name>
-					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
+					<name>mapred.output.compress</name>
+					<value>false</value>
 				</property>
+
+
 				<property>
-					<name>avro.mapreduce.multipleoutputs.namedOutput.age.format</name>
-					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
+					<name>mapred.reduce.tasks.speculative.execution</name>
+					<value>false</value>
 				</property>
-
-				<!-- ## Classes of mapper and reducer -->
-
+
 				<property>
-					<name>mapreduce.map.class</name>
-					<value>eu.dnetlib.iis.core.examples.javamapreduce.PersonClonerMapper</value>
+					<name>mapred.reduce.tasks.speculative.execution</name>
+					<value>false</value>
 				</property>
 				<property>
-					<name>mapreduce.reduce.class</name>
-					<value>eu.dnetlib.iis.core.examples.javamapreduce.MultipleOutputPersonClonerReducer</value>
+					<name>mapreduce.map.speculative</name>
+					<value>false</value>
 				</property>
-
-				<!-- ## Schemas -->
-
-				<!-- ### Schema of the data ingested by the mapper. To be more precise, it's the schema
-					of Avro data passed as template parameter of the AvroKey object passed to mapper.
-					-->
+
+				<!-- I/O FORMAT -->
 				<property>
-					<name>eu.dnetlib.iis.avro.input.class</name>
-					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person</value>
+					<name>mapreduce.outputformat.class</name>
+					<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
+					</value>
 				</property>
-
-				<!-- ### Schemas of the data produced by the mapper -->
-
-				<!-- #### Schema of the key produced by the mapper. To be more precise, it's the schema
-					of Avro data produced by the mapper and passed forward as template paramter of
-					AvroKey object. -->
-
+
+
+				<!-- ## Names of all output ports -->
+
 				<property>
-					<name>eu.dnetlib.iis.avro.map.output.key.class</name>
-					<value>org.apache.avro.Schema.Type.STRING</value>
+					<name>mapreduce.multipleoutputs</name>
+					<value>datasourceLanguage datasource
+<!-- 					      project  -->
+<!-- 					    result organization datasourceOrganization -->
+<!-- 					    datasourceTopic projectOrganization  -->
+<!-- 					    resultClaim resultClassification resultConcept  -->
+<!-- 					    resultLanguage resultOrganization  -->
+<!-- 					    resultResult resultProject resultResult resultTopic category claim concept  -->
+<!-- 					      resultLanguage resultDatasource -->
+					    	</property>
+
+				<!-- ## Output classes for all output ports -->
+
+				 <property>
+					<name>mapreduce.multipleoutputs.namedOutput.datasource.format </name>
+					<value>TextOutputFormat.class</value>
 				</property>
-
-
-				<!-- #### Schema of the value produced by the mapper. To be more precise, it's the
-					schema of Avro data produced by the mapper and passed forward as template paramter
-					of AvroValue object. -->
-
 				<property>
-					<name>eu.dnetlib.iis.avro.map.output.value.class</name>
-					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person</value>
+					<name>avro.mapreduce.multipleoutputs.namedOutput.datasourceLanguage.format</name>
+				<value>TextOutputFormat.class</value>
 				</property>
-
-				<!-- ### Shema of multiple output ports. -->
-
+
+				<!-- ## Classes of mapper and reducer -->
+
 				<property>
-					<name>eu.dnetlib.iis.avro.multipleoutputs.class.person</name>
-					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.Person</value>
+					<name>mapreduce.map.class</name>
+					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsMapper
+					</value>
 				</property>
-
 				<property>
-					<name>eu.dnetlib.iis.avro.multipleoutputs.class.age</name>
-					<value>eu.dnetlib.iis.core.examples.schemas.documentandauthor.PersonAge</value>
+					<name>mapreduce.reduce.class</name>
+					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsReducer
+					</value>
 				</property>
-
+
+
 				<!-- ## Specification of the input and output data store -->
-
+
+				<!--delim character used to seperate fields in hdfs dump files -->
 				<property>
-					<name>mapred.input.dir</name>
-					<value>${workingDir}/data_producer/person</value>
+					<name>mapred.output.delim</name>
+					<value>${Stats.delimCharacter}</value>
 				</property>
-				<!-- This directory does not correspond to a data store. In fact, this directory only
-					contains multiple data stores. It has to be set to the name of the workflow node.-->
+				<!--default string for Null String Values -->
 				<property>
-					<name>mapred.output.dir</name>
-					<value>${workingDir}/mr_cloner</value>
+					<name>mapred.output.nullString</name>
+					<value>${Stats.nullStringField}</value>
 				</property>
-
-				<!-- ## Workflow node parameters -->
-
+
+				<!--default string for Null Numeric Values -->
 				<property>
-					<name>copiesCount</name>
-					<value>2</value>
+					<name>mapred.output.nullNum</name>
+					<value>${Stats.nullNumericField}</value>
 				</property>
+				<!--source hbase table -->
 				<property>
-					<name>reducerCopiesCount</name>
-					<value>3</value>
+					<name>hbase.mapreduce.inputtable</name>
+					<value>${Stats.HbaseSourceTable}</value>
 				</property>
-			</configuration>
-		</map-reduce>
-		<ok to="db_prepare" />
-		<error to="fail" />
-	</action>
-
-	<!-- cloner works on duplicated data -->
-	<action name="db_prepare">
-		<java>
-			<job-tracker>${jobTracker}</job-tracker>
-			<name-node>${nameNode}</name-node>
-			<!-- The data generated by this node is deleted in this section -->
-			<prepare>
-				<delete path="${nameNode}${workingDir}/cloner" />
-				<mkdir path="${nameNode}${workingDir}/cloner" />
-			</prepare>
-			<configuration>
+
+				<!-- This directory does not correspond to a data store. In fact, this
+					directory only contains multiple data stores. It has to be set to the name
+					of the workflow node. -->
 				<property>
-					<name>mapred.job.queue.name</name>
-					<value>${queueName}</value>
+					<name>mapred.output.dir</name>
+					<value>${nameNode}${workingDir}/${Stats.outputPath}</value>
 				</property>
-			</configuration>
-			<!-- This is simple wrapper for the Java code -->
-			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
-			<!-- The business Java code that gets to be executed -->
-			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
-			<!-- All input and output ports have to be bound to paths in HDFS, working directory
-				has to be specified as well -->
-			<arg>-SworkingDir=${workingDir}/cloner/working_dir</arg>
-			<arg>-Iperson=${workingDir}/mr_cloner/person</arg>
-			<arg>-Operson=${workingDir}/cloner/person</arg>
-		</java>
-		<ok to="mr_SqoopImport" />
-		<error to="fail" />
-	</action>
-
-
-	<action name="mr_SqoopImport">
-		<java>
-			<job-tracker>${jobTracker}</job-tracker>
-			<name-node>${nameNode}</name-node>
-			<!-- The data generated by this node is deleted in this section -->
-			<prepare>
-				<delete path="${nameNode}${workingDir}/cloner" />
-				<mkdir path="${nameNode}${workingDir}/cloner" />
-			</prepare>
-			<configuration>
+
+				<!-- ## Workflow node parameters -->
+
 				<property>
-					<name>mapred.job.queue.name</name>
-					<value>${queueName}</value>
+					<name>copiesCount</name>
+					<value>1</value>
 				</property>
-			</configuration>
-			<!-- This is simple wrapper for the Java code -->
-			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
-			<!-- The business Java code that gets to be executed -->
-			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
-			<!-- All input and output ports have to be bound to paths in HDFS, working directory
-				has to be specified as well -->
-			<arg>-SworkingDir=${workingDir}/cloner/working_dir</arg>
-			<arg>-Iperson=${workingDir}/mr_cloner/person</arg>
-			<arg>-Operson=${workingDir}/cloner/person</arg>
-		</java>
-		<ok to="db_finalize" />
-		<error to="fail" />
-	</action>
-
-	<action name="db_finalize">
-		<java>
-			<job-tracker>${jobTracker}</job-tracker>
-			<name-node>${nameNode}</name-node>
-			<!-- The data generated by this node is deleted in this section -->
-			<prepare>
-				<delete path="${nameNode}${workingDir}/cloner" />
-				<mkdir path="${nameNode}${workingDir}/cloner" />
-			</prepare>
-			<configuration>
 				<property>
-					<name>mapred.job.queue.name</name>
-					<value>${queueName}</value>
+					<name>reducerCopiesCount</name>
+					<value>10</value>
 				</property>
 			</configuration>
-			<!-- This is simple wrapper for the Java code -->
-			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
-			<!-- The business Java code that gets to be executed -->
-			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
-			<!-- All input and output ports have to be bound to paths in HDFS, working directory
-				has to be specified as well -->
-			<arg>-SworkingDir=${workingDir}/cloner/working_dir</arg>
-			<arg>-Iperson=${workingDir}/mr_cloner/person</arg>
-			<arg>-Operson=${workingDir}/cloner/person</arg>
-		</java>
+		</map-reduce>
 		<ok to="end" />
 		<error to="fail" />
 	</action>
-
-
-
-
+
+	<!-- cloner works on duplicated data -->
+<!-- 	<action name="db_prepare"> -->
+<!-- 		<java> -->
+<!-- 			<job-tracker>${jobTracker}</job-tracker> -->
+<!-- 			<name-node>${nameNode}</name-node> -->
+<!-- 			<!-- The data generated by this node is deleted in this section --> -->
+<!-- 			<prepare> -->
+<!-- 				<delete path="${nameNode}${workingDir}/cloner" /> -->
+<!-- 				<mkdir path="${nameNode}${workingDir}/cloner" /> -->
+<!-- 			</prepare> -->
+<!-- 			<configuration> -->
+<!-- 				<property> -->
+<!-- 					<name>mapred.job.queue.name</name> -->
+<!-- 					<value>${queueName}</value> -->
+<!-- 				</property> -->
+<!-- 			</configuration> -->
+<!-- 			<!-- This is simple wrapper for the Java code --> -->
+<!-- 			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class> -->
+<!-- 			<!-- The business Java code that gets to be executed --> -->
+<!-- 			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg> -->
+<!-- 			<!-- All input and output ports have to be bound to paths in HDFS, working  -->
+<!-- 				directory has to be specified as well --> -->
+<!-- 			<arg>-SworkingDir=${workingDir}/cloner/working_dir</arg> -->
+<!-- 			<arg>-Iperson=${workingDir}/mr_cloner/person</arg> -->
+<!-- 			<arg>-Operson=${workingDir}/cloner/person</arg> -->
+<!-- 		</java> -->
+<!-- 		<ok to="mr_SqoopImport" /> -->
+<!-- 		<error to="fail" /> -->
+<!-- 	</action> -->
+
+
+<!-- 	<action name="mr_SqoopImport"> -->
+<!-- 		<java> -->
+<!-- 			<job-tracker>${jobTracker}</job-tracker> -->
+<!-- 			<name-node>${nameNode}</name-node> -->
+<!-- 			<!-- The data generated by this node is deleted in this section --> -->
+<!-- 			<prepare> -->
+<!-- 				<delete path="${nameNode}${workingDir}/cloner" /> -->
+<!-- 				<mkdir path="${nameNode}${workingDir}/cloner" /> -->
+<!-- 			</prepare> -->
+<!-- 			<configuration> -->
+<!-- 				<property> -->
+<!-- 					<name>mapred.job.queue.name</name> -->
+<!-- 					<value>${queueName}</value> -->
+<!-- 				</property> -->
+<!-- 			</configuration> -->
+<!-- 			<!-- This is simple wrapper for the Java code --> -->
+<!-- 			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class> -->
+<!-- 			<!-- The business Java code that gets to be executed --> -->
+<!-- 			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg> -->
+<!-- 			<!-- All input and output ports have to be bound to paths in HDFS, working  -->
+<!-- 				directory has to be specified as well --> -->
+<!-- 			<arg>-SworkingDir=${workingDir}/cloner/working_dir</arg> -->
+<!-- 			<arg>-Iperson=${workingDir}/mr_cloner/person</arg> -->
+<!-- 			<arg>-Operson=${workingDir}/cloner/person</arg> -->
+<!-- 		</java> -->
+<!-- 		<ok to="db_finalize" /> -->
+<!-- 		<error to="fail" /> -->
+<!-- 	</action> -->
+
+<!-- 	<action name="db_finalize"> -->
+<!-- 		<java> -->
+<!-- 			<job-tracker>${jobTracker}</job-tracker> -->
+<!-- 			<name-node>${nameNode}</name-node> -->
+<!-- 			<!-- The data generated by this node is deleted in this section --> -->
+<!-- 			<prepare> -->
+<!-- 				<delete path="${nameNode}${workingDir}/cloner" /> -->
+<!-- 				<mkdir path="${nameNode}${workingDir}/cloner" /> -->
+<!-- 			</prepare> -->
+<!-- 			<configuration> -->
+<!-- 				<property> -->
+<!-- 					<name>mapred.job.queue.name</name> -->
+<!-- 					<value>${queueName}</value> -->
+<!-- 				</property> -->
+<!-- 			</configuration> -->
+<!-- 			<!-- This is simple wrapper for the Java code --> -->
+<!-- 			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class> -->
+<!-- 			<!-- The business Java code that gets to be executed --> -->
+<!-- 			<arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg> -->
+<!-- 			<!-- All input and output ports have to be bound to paths in HDFS, working  -->
+<!-- 				directory has to be specified as well --> -->
+<!-- 			<arg>-SworkingDir=${workingDir}/cloner/working_dir</arg> -->
+<!-- 			<arg>-Iperson=${workingDir}/mr_cloner/person</arg> -->
+<!-- 			<arg>-Operson=${workingDir}/cloner/person</arg> -->
+<!-- 		</java> -->
+<!-- 		<ok to="end" /> -->
+<!-- 		<error to="fail" /> -->
+<!-- 	</action> -->
+
+
+
+
 	<kill name="fail">
 		<message>
-			Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]
+			Unfortunately, the process failed -- error message:
+			[${wf:errorMessage(wf:lastErrorNode())}]
 		</message>
 	</kill>
 	<end name="end" />
