<?xml version="1.0" encoding="UTF-8"?>
<workflow-app name="lod_interlinking" xmlns="uri:oozie:workflow:0.4">
    <!-- Map-reduce pipeline that exports HBase data and prepares it for import
         into lod_generation: preProcessing (multi-input dump), build (blocking),
         linkage (LIMES-based interlinking). -->

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapred.job.queue.name</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.sqoop.log.level</name>
                <value>DEBUG</value>
            </property>
        </configuration>
    </global>

    <!-- NOTE(review): only 'preProcessing' is reachable — its <ok> transition goes
         straight to 'end'. The 'build' and 'linkage' actions are dead code as wired.
         If the full pipeline should run, change <ok to="end"/> below to
         <ok to="build"/> (build already chains to linkage). -->
    <start to="preProcessing"/>

    <action name="preProcessing">
        <map-reduce>
            <configuration>

                <!-- ZOOKEEPER -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>${zookeeperQuorum}</value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>${zookeeperZnode}</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <!-- NOTE(review): the parameter name 'zookeepeClientPort' is missing
                         an 'r'. Kept as-is because the matching job.properties key must
                         use the same spelling; rename in both places together. -->
                    <value>${zookeepeClientPort}</value>
                </property>

                <!-- MR I/O FOR MULTIPLE INPUTS -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
                </property>
                <property>
                    <name>mapreduce.map.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
                </property>

                <!-- Per-directory input formats / mappers. Values are kept on a single
                     line: surrounding indentation whitespace would otherwise become part
                     of the configured paths and class names. -->
                <property>
                    <name>mapred.input.dir.formats</name>
                    <value>${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat</value>
                </property>
                <property>
                    <name>mapred.input.dir.mappers</name>
                    <value>${nameNode}${sourceInput};eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.SourceMapper,${nameNode}${targetInput};eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.TargetMapper</value>
                </property>

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- FIX(review): was org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat,
                     which is an OutputFormat, not a Writable value class. Set to Text to
                     match the reducer's text output — confirm against DatasetReducer's
                     declared output value type. -->
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- Required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <!-- Compress output -->
                <property>
                    <name>mapred.output.compress</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.output.compression.type</name>
                    <value>BLOCK</value>
                </property>
                <property>
                    <name>mapred.output.compression.codec</name>
                    <value>org.apache.hadoop.io.compress.GzipCodec</value>
                </property>

                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.DatasetReducer</value>
                </property>

                <!-- I/O FORMAT -->
                <!-- IMPORTANT: sets the default delimiter used by the text output writer.
                     Required to fix an issue with a trailing tab added between id and
                     value in multiple outputs. -->
                <property>
                    <name>mapred.textoutputformat.separator</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- Names of all named (multiple) outputs -->
                <property>
                    <name>mapreduce.multipleoutputs</name>
                    <value>${out1} ${out2} ${out3} ${out4} ${out5}</value>
                </property>

                <!-- datasource -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- result -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- project -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out3}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out3}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out3}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- person -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out4}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out4}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out4}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- organization -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out5}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out5}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out5}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- Custom config -->
                <!-- Delimiter character used to separate fields in HDFS dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.sourceMappings</name>
                    <value>${lod_sourceMappings}</value>
                </property>
                <property>
                    <name>lod.targetMappings</name>
                    <value>${lod_targetMappings}</value>
                </property>

                <!-- This directory does not correspond to a data store; it only
                     contains multiple data stores. It has to be set to the name of
                     the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_output}</value>
                </property>

                <!-- Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <action name="build">
        <map-reduce>
            <configuration>

                <!-- ZOOKEEPER -->
                <!-- NOTE(review): unlike 'preProcessing', this action hard-codes the
                     quorum/znode/port instead of using ${zookeeperQuorum} etc. —
                     consider parameterizing for consistency (not changed here to
                     avoid altering a working deployment). -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu</value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                </property>

                <!-- MR I/O FOR MULTIPLE INPUTS -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
                </property>
                <property>
                    <name>mapreduce.map.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
                </property>
                <property>
                    <name>mapred.input.dir.formats</name>
                    <value>${nameNode}${sourceBuildInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetBuildInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat</value>
                </property>
                <property>
                    <name>mapred.input.dir.mappers</name>
                    <value>${nameNode}${sourceBuildInput};eu.dnetlib.data.mapreduce.hbase.lodExport.build.SourceBuildMapper,${nameNode}${targetBuildInput};eu.dnetlib.data.mapreduce.hbase.lodExport.build.TargetBuildMapper</value>
                </property>

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- FIX(review): was SequenceFileOutputFormat (an OutputFormat, not a
                     Writable value class) — set to Text; confirm against BlockReducer. -->
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- Required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <!-- Compress output -->
                <property>
                    <name>mapred.output.compress</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.output.compression.type</name>
                    <value>BLOCK</value>
                </property>
                <property>
                    <name>mapred.output.compression.codec</name>
                    <value>org.apache.hadoop.io.compress.GzipCodec</value>
                </property>

                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.build.BlockReducer</value>
                </property>

                <!-- I/O FORMAT -->
                <!-- The custom text-output separator is deliberately NOT set in this
                     action (it was commented out in the original); the default tab
                     separator applies here. -->
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- Custom config -->
                <!-- Delimiter character used to separate fields in HDFS dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.sourceMappings</name>
                    <value>${lod_sourceMappings}</value>
                </property>
                <property>
                    <name>lod.redisHost</name>
                    <value>${lod_redisHost}</value>
                </property>
                <property>
                    <name>lod.redisPort</name>
                    <value>${lod_redisPort}</value>
                </property>
                <property>
                    <name>lod.targetMappings</name>
                    <value>${lod_targetMappings}</value>
                </property>

                <!-- This directory does not correspond to a data store; it only
                     contains multiple data stores. It has to be set to the name of
                     the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_block_output}</value>
                </property>

                <!-- Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>
        </map-reduce>
        <ok to="linkage"/>
        <error to="fail"/>
    </action>

    <action name="linkage">
        <map-reduce>
            <configuration>

                <!-- ZOOKEEPER (hard-coded; see note on the 'build' action) -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu</value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                </property>

                <!-- FIX(review): was 'mapreduce.mapper.class', which no other action
                     uses; renamed to 'mapreduce.map.class' for consistency with the
                     rest of this workflow — verify the LinkageMapper is picked up. -->
                <property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.linkage.LinkageMapper</value>
                </property>
                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.linkage.LimesReducer</value>
                </property>
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat</value>
                </property>

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- FIX(review): was SequenceFileOutputFormat (an OutputFormat, not a
                     Writable value class) — set to Text; confirm against LimesReducer. -->
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- Required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <!-- Output compression disabled for this action; the BLOCK/BZip2
                     settings that were commented out in the original are omitted. -->
                <property>
                    <name>mapred.output.compress</name>
                    <value>false</value>
                </property>

                <!-- I/O FORMAT -->
                <!-- IMPORTANT: sets the default delimiter used by the text output writer.
                     Required to fix an issue with a trailing tab added between id and
                     value in multiple outputs. -->
                <property>
                    <name>mapred.textoutputformat.separator</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- Custom config -->
                <!-- Delimiter character used to separate fields in HDFS dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.sourceMappings</name>
                    <value>${lod_sourceMappings}</value>
                </property>
                <property>
                    <name>lod.redisHost</name>
                    <value>${lod_redisHost}</value>
                </property>
                <property>
                    <name>lod.redisPort</name>
                    <value>${lod_redisPort}</value>
                </property>
                <property>
                    <name>lod.targetMappings</name>
                    <value>${lod_targetMappings}</value>
                </property>

                <!-- This directory does not correspond to a data store; it only
                     contains multiple data stores. It has to be set to the name of
                     the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_final_output}</value>
                </property>
                <property>
                    <name>mapred.input.dir</name>
                    <value>${lod_block_output}*</value>
                </property>

                <property>
                    <name>lod.configXML</name>
                    <value>${lod_configXML}</value>
                </property>
                <property>
                    <name>lod.limesDTD</name>
                    <value>${lod_limesDTD}</value>
                </property>

                <!-- Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <kill name="fail">
        <message>
            Unfortunately, the process failed -- error message:
            [${wf:errorMessage(wf:lastErrorNode())}]
        </message>
    </kill>
    <end name="end"/>
</workflow-app>