<workflow-app name="lod_csv_export" xmlns="uri:oozie:workflow:0.4">
    <!-- map-reduce jobs that export hbase data and prepare it for import into the lod_generation -->

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapred.job.queue.name</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.sqoop.log.level</name>
                <value>DEBUG</value>
            </property>
        </configuration>
    </global>
    <start to='rdf_import'/>
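    <!-- NOTE: as wired, <start> jumps straight to 'rdf_import', and both 'csv_export' and
         'cleanUpHDFS' transition directly to 'end' without being referenced by any other
         transition, so only 'rdf_import' actually runs in this version of the workflow. -->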
    <action name="csv_export">
        <map-reduce>
            <prepare>
                <delete path="${nameNode}${lod_output}"/>
            </prepare>
            <configuration>
                <property>
                    <name>hbase.mapreduce.scan</name>
                    <value>${wf:actionData('get-scanner')['scan']}</value>
                </property>
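                <!-- NOTE: the scan definition is read from the action data of a node named
                     'get-scanner', which is not defined in this workflow; presumably it is
                     produced by a preceding step of the surrounding (meta-)workflow. -->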
                <property>
                    <name>hbase.rootdir</name>
                    <value>${nameNode}/hbase</value>
                </property>
                <property>
                    <name>hbase.security.authentication</name>
                    <value>simple</value>
                </property>

                <!-- ZOOKEEPER -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>
                        namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu
                    </value>
                    <!-- <value>quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
                     </value>-->
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                    <!--<value>2182</value> -->
                </property>

                <!-- MR IO -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
                </property>
                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- NOTE: SequenceFileOutputFormat is an OutputFormat, not a value class, so the
                     original property name 'mapred.output.value.class' was presumably a typo for
                     the new-API output format key used below. -->
                <property>
                    <name>mapreduce.outputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat</value>
                </property>
                <!-- ## This is required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- # Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.output.compress</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>
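                <!-- Speculative execution is disabled above; this is the usual practice for jobs
                     whose tasks have external side effects or non-idempotent output, since a
                     duplicate speculative attempt could otherwise emit the same records twice. -->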
                <!-- I/O FORMAT -->
                <!-- IMPORTANT: sets the default delimiter used by the text output writer. Required to fix
                     an issue with a trailing tab added between id and value in multiple outputs -->
                <property>
                    <name>mapred.textoutputformat.separator</name>
                    <value>${lod_delim}</value>
                </property>
                <!-- ## Names of all output ports -->
                <property>
                    <name>mapreduce.multipleoutputs</name>
                    <value>
                        ${out1} ${out2}
                    </value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>
                <!-- datasourceLanguage -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>
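                <!-- These keys follow the naming scheme that
                     org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.addNamedOutput() writes
                     into the job configuration ('mapreduce.multipleoutputs' plus
                     'mapreduce.multipleoutputs.namedOutput.<name>.{key,value,format}'), so setting
                     them here should be equivalent to registering the named outputs in driver code. -->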

                <!-- ## Classes of mapper and reducer -->
                <property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.LodMapper</value>
                </property>
                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.LodReducer</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- ## Custom config -->
                <!-- delimiter character used to separate fields in hdfs dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.enclosing</name>
                    <value>${lod_enclosing}</value>
                </property>

                <!-- source hbase table -->
                <property>
                    <name>hbase.mapreduce.inputtable</name>
                    <value>${lod_hbase_table}</value>
                </property>
                <property>
                    <name>hbase.mapred.inputtable</name>
                    <value>${lod_hbase_table}</value>
                </property>

                <!-- This directory does not correspond to a single data store; it contains
                     multiple data stores, and it has to be set to the name of the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_output}</value>
                </property>
                <property>
                    <name>index.conf</name>
                    <value>${lod_indexConf}</value>
                </property>
                <!-- ## Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>
            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>
    <action name="rdf_import">
        <map-reduce>
            <prepare>
            </prepare>
            <configuration>
                <property>
                    <name>hbase.security.authentication</name>
                    <value>simple</value>
                </property>

                <!-- ZOOKEEPER -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <!--<value>
                        namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu
                    </value>-->
                    <value>
                        quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
                    </value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                    <!--<value>2182</value> -->
                </property>

                <!-- CSV PROPS GO HERE -->
                <property>
                    <name>CSVLineRecordReader.FORMAT_DELIMITER</name>
                    <value>"</value>
                </property>
                <property>
                    <name>CSVLineRecordReader.FORMAT_SEPARATOR</name>
                    <value>,</value>
                </property>
                <property>
                    <name>CSVNLineInputFormat.LINES_PER_MAP</name>
                    <value>40000</value>
                </property>
                <property>
                    <name>CSVLineRecordReader.IS_ZIPFILE</name>
                    <value>false</value>
                </property>
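                <!-- These four keys presumably configure a custom CSV input format
                     (CSVNLineInputFormat / CSVLineRecordReader): the quote character, the field
                     separator, how many CSV lines each map task receives, and whether the input
                     files are zip-compressed. Note that the job below still declares the stock
                     TextInputFormat, matching the TODO about switching to the CSV format. -->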

                <!-- MR IO -->
                <!-- TODO here: add csv input format to mapper -->
                <property>
                    <name>mapred.input.dir</name>
                    <value>${lod_EntitiesInputFile}</value>
                </property>
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.TextInputFormat</value>
                </property>
                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>java.util.List</value>
                </property>
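                <!-- NOTE: java.util.List is not a Writable and is not covered by the
                     WritableSerialization configured below, so this value class looks suspect;
                     whatever type the mapper actually emits needs a matching serialization. -->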
                <!-- <property>
                     <name>mapred.output.key.class</name>
                     <value>org.apache.hadoop.io.Text</value>
                 </property>-->
                <!-- NOTE: as in csv_export, NullOutputFormat is an OutputFormat, so the original
                     property name 'mapred.output.value.class' was presumably meant to be the
                     new-API output format key. -->
                <property>
                    <name>mapreduce.outputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.NullOutputFormat</value>
                </property>

                <!-- ## This is required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- # Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.output.compress</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <property>
                    <name>map.output.key.field.separator</name>
                    <value>${lod_delim}</value>
                </property>

                <!-- ## Classes of mapper and reducer -->
                <property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodImport.LodImportMapper</value>
                </property>
                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodImport.LodImportReducer</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- ## Custom config -->
                <!-- delimiter character used to separate fields in hdfs dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.enclosing</name>
                    <value>${lod_enclosing}</value>
                </property>
                <property>
                    <name>lod.lastExecutionDate</name>
                    <value>${lod_lastExecutionDate}</value>
                </property>
                <property>
                    <name>lod.conLine</name>
                    <value>${lod_conLine}</value>
                </property>
                <property>
                    <name>lod.username</name>
                    <value>${lod_username}</value>
                </property>
                <property>
                    <name>lod.password</name>
                    <value>${lod_password}</value>
                </property>
                <property>
                    <name>lod_minCpart</name>
                    <value>${lod_minCpart}</value>
                </property>
                <property>
                    <name>lod_maxCpart</name>
                    <value>${lod_maxCpart}</value>
                </property>
                <property>
                    <name>lod.part</name>
                    <value>${lod_part}</value>
                </property>
                <property>
                    <name>lod.jsonRels</name>
                    <value>${lod_jsonRels}</value>
                </property>
                <property>
                    <name>lod.jsonEntities</name>
                    <value>${lod_jsonEntities}</value>
                </property>
                <property>
                    <name>lod.defaultGraph</name>
                    <value>${lod_defaultGraph}</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>
                <property>
                    <name>index.conf</name>
                    <value>${lod_indexConf}</value>
                </property>
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_output}test</value>
                </property>
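                <!-- NOTE: the output goes to '${lod_output}test', the same path that the
                     'cleanUpHDFS' action deletes, so this directory is apparently treated as
                     throwaway output of this NullOutputFormat job. -->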
            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <!-- removes the temporary output directory produced by rdf_import -->
    <action name="cleanUpHDFS">
        <fs>
            <delete path="${lod_output}test"/>
        </fs>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <kill name="fail">
        <message>
            Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]
        </message>
    </kill>
    <end name="end"/>
</workflow-app>