<workflow-app name="lod_generation" xmlns="uri:oozie:workflow:0.4">
    <!-- MapReduce job that exports HBase data and prepares it for import
        into lod_generation -->

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapred.job.queue.name</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.sqoop.log.level</name>
                <value>DEBUG</value>
            </property>
        </configuration>
    </global>

    <start to='datasetProcessing'/>

    <action name="datasetProcessing">
        <map-reduce>
            <!-- <prepare>
                <delete path="${nameNode}${lod_output}"/>
            </prepare>
-->
            <configuration>

                <!-- ZOOKEEPER -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>
                        namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu
                    </value>
                    <!-- <value>quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
                        </value> -->
                </property>

                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>

                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                </property>
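
                <!-- The three properties above are what the HBase client needs to
                    locate the cluster over ZooKeeper: the quorum host list, the znode
                    under which the root region server is registered, and the
                    ZooKeeper client port. -->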

                <!-- MR IO FOR MULTIPLE INPUTS -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
                </property>

                <property>
                    <name>mapreduce.map.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
                </property>

                <property>
                    <name>mapreduce.input.multipleinputs.dir.formats</name>
                    <value>${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat,${nameNode}${targetInput};org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat</value>
                </property>
                <property>
                    <name>mapreduce.input.multipleinputs.dir.mappers</name>
                    <value>${nameNode}${sourceInput};eu.dnetlib.data.mapreduce.hbase.lodExport.SourceMapper,${nameNode}${targetInput};eu.dnetlib.data.mapreduce.hbase.lodExport.TargetMapper</value>
                </property>
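
                <!-- Each of the two multipleinputs values above is a comma-separated
                    list of "path;class" pairs: every input directory is read with
                    KeyValueTextInputFormat and routed by DelegatingMapper to its own
                    mapper class. A minimal sketch of a mapper compatible with this
                    wiring is shown below; the real SourceMapper/TargetMapper logic is
                    not shown here, and the "source" prefix is only an illustrative
                    assumption:

                    import java.io.IOException;
                    import org.apache.hadoop.io.Text;
                    import org.apache.hadoop.mapreduce.Mapper;

                    public class SourceMapper extends Mapper<Text, Text, Text, Text> {
                        @Override
                        protected void map(Text key, Text value, Context context)
                                throws IOException, InterruptedException {
                            // tag each record with its origin so the reducer can
                            // tell source records apart from target records
                            context.write(key, new Text("source\t" + value));
                        }
                    }
                -->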

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- ## This is required when using the new MapReduce API -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- # Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.output.compress</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>

                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>
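
                <!-- Speculative execution is disabled for both map and reduce tasks,
                    presumably so that duplicate task attempts cannot write the same
                    export records twice. -->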

                <!-- I/O FORMAT -->
                <!-- IMPORTANT: sets the default delimiter used by the text output writer. Required
                    to fix an issue with a trailing tab added between id and value in multiple outputs -->
                <property>
                    <name>mapred.textoutputformat.separator</name>
                    <value>${lod_delim}</value>
                </property>
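
                <!-- With this separator, each line written by the text output writer
                    has the form key${lod_delim}value. For example, with a hypothetical
                    lod_delim of "," a record would come out as

                        result123,serializedRecord

                    rather than in the default tab-separated form. -->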

                <!-- ## Names of all output ports -->
<!--
                <property>
                    <name>mapreduce.output</name>
                    <value>
                        ${out1}
                    </value>
                </property>
-->

                <!-- ## Classes of mapper and reducer -->
                <!--<property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.LodMapper</value>
                </property>
                -->
                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.DatasetReducer</value>
                </property>
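
                <!-- A sketch of the reducer contract implied by this configuration:
                    DatasetReducer receives, grouped per key, the values emitted by
                    both mappers and writes Text/Text pairs that the text output
                    writer joins with the separator configured above. The identity
                    pass-through body below is illustrative only, not the actual
                    eu.dnetlib implementation:

                    import java.io.IOException;
                    import org.apache.hadoop.io.Text;
                    import org.apache.hadoop.mapreduce.Reducer;

                    public class DatasetReducer extends Reducer<Text, Text, Text, Text> {
                        @Override
                        protected void reduce(Text key, Iterable<Text> values, Context context)
                                throws IOException, InterruptedException {
                            // pass every grouped value through unchanged
                            for (Text value : values) {
                                context.write(key, value);
                            }
                        }
                    }
                -->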

                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- ## Custom config -->

                <!-- delimiter character used to separate fields in the hdfs dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>

                <!-- This directory does not correspond to a single data store; it
                    contains multiple data stores. It has to be set to the name
                    of the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_output}${out}</value>
                </property>
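
                <!-- The output directory is the plain concatenation of the two
                    parameters: with hypothetical values lod_output=/tmp/lod/ and
                    out=datasets, the job writes to /tmp/lod/datasets. -->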

                <!-- ## Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>

        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

<!--
    <action name='blocking'>
        <java>
            <prepare>
            </prepare>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <main-class>eu.dnetlib.iis.core.workflows.lodexport.ClearGraph</main-class>
            <arg>${lod_relationsGraph}</arg>
        </java>
        <ok to="end"/>
        <error to="fail"/>
    </action>
-->

    <!--
    <action name="cleanUpHDFS">
        <fs>
            <delete path="${lod_output}test"/>
        </fs>
        <ok to="end"/>
        <error to="fail"/>
    </action>
-->

    <kill name="fail">
        <message>
            Unfortunately, the process failed -- error message:
            [${wf:errorMessage(wf:lastErrorNode())}]
        </message>
    </kill>
    <end name="end"/>
</workflow-app>