<?xml version="1.0" encoding="UTF-8"?>
<workflow-app name="lod_interlinking" xmlns="uri:oozie:workflow:0.4">
    <!-- Map-reduce pipeline that exports HBase data and prepares it for import
         into lod_generation: preProcessing (multi-input dump), build (blocking),
         linkage (LIMES-based interlinking). -->

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapred.job.queue.name</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.sqoop.log.level</name>
                <value>DEBUG</value>
            </property>
        </configuration>
    </global>

    <!-- NOTE(review): only 'preProcessing' is reachable — its <ok> transition goes
         straight to 'end'. The 'build' and 'linkage' actions are dead code as wired.
         If the full pipeline should run, change <ok to="end"/> below to
         <ok to="build"/> (build already chains to linkage). -->
    <start to="preProcessing"/>

    <action name="preProcessing">
        <map-reduce>
            <configuration>

                <!-- ZOOKEEPER -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>${zookeeperQuorum}</value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>${zookeeperZnode}</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <!-- NOTE(review): the parameter name 'zookeepeClientPort' is missing
                         an 'r'. Kept as-is because the matching job.properties key must
                         use the same spelling; rename in both places together. -->
                    <value>${zookeepeClientPort}</value>
                </property>

                <!-- MR I/O FOR MULTIPLE INPUTS -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
                </property>
                <property>
                    <name>mapreduce.map.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
                </property>

                <!-- Per-directory input formats / mappers. Values are kept on a single
                     line: surrounding indentation whitespace would otherwise become part
                     of the configured paths and class names. -->
                <property>
                    <name>mapred.input.dir.formats</name>
                    <value>${nameNode}${sourceInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat</value>
                </property>
                <property>
                    <name>mapred.input.dir.mappers</name>
                    <value>${nameNode}${sourceInput};eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.SourceMapper,${nameNode}${targetInput};eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.TargetMapper</value>
                </property>

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- FIX(review): was org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat,
                     which is an OutputFormat, not a Writable value class. Set to Text to
                     match the reducer's text output — confirm against DatasetReducer's
                     declared output value type. -->
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- Required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <!-- Compress output -->
                <property>
                    <name>mapred.output.compress</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.output.compression.type</name>
                    <value>BLOCK</value>
                </property>
                <property>
                    <name>mapred.output.compression.codec</name>
                    <value>org.apache.hadoop.io.compress.GzipCodec</value>
                </property>

                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing.DatasetReducer</value>
                </property>

                <!-- I/O FORMAT -->
                <!-- IMPORTANT: sets the default delimiter used by the text output writer.
                     Required to fix an issue with a trailing tab added between id and
                     value in multiple outputs. -->
                <property>
                    <name>mapred.textoutputformat.separator</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- Names of all named (multiple) outputs -->
                <property>
                    <name>mapreduce.multipleoutputs</name>
                    <value>${out1} ${out2} ${out3} ${out4} ${out5}</value>
                </property>

                <!-- datasource -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out1}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- result -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out2}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- project -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out3}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out3}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out3}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- person -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out4}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out4}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out4}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- organization -->
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out5}.key</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out5}.value</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapreduce.multipleoutputs.namedOutput.${out5}.format</name>
                    <value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
                </property>

                <!-- Custom config -->
                <!-- Delimiter character used to separate fields in HDFS dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.sourceMappings</name>
                    <value>${lod_sourceMappings}</value>
                </property>
                <property>
                    <name>lod.targetMappings</name>
                    <value>${lod_targetMappings}</value>
                </property>

                <!-- This directory does not correspond to a data store; it only
                     contains multiple data stores. It has to be set to the name of
                     the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_output}</value>
                </property>

                <!-- Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <action name="build">
        <map-reduce>
            <configuration>

                <!-- ZOOKEEPER -->
                <!-- NOTE(review): unlike 'preProcessing', this action hard-codes the
                     quorum/znode/port instead of using ${zookeeperQuorum} etc. —
                     consider parameterizing for consistency (not changed here to
                     avoid altering a working deployment). -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu</value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                </property>

                <!-- MR I/O FOR MULTIPLE INPUTS -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
                </property>
                <property>
                    <name>mapreduce.map.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
                </property>
                <property>
                    <name>mapred.input.dir.formats</name>
                    <value>${nameNode}${sourceBuildInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat,${nameNode}${targetBuildInput};org.apache.hadoop.mapreduce.lib.input.TextInputFormat</value>
                </property>
                <property>
                    <name>mapred.input.dir.mappers</name>
                    <value>${nameNode}${sourceBuildInput};eu.dnetlib.data.mapreduce.hbase.lodExport.build.SourceBuildMapper,${nameNode}${targetBuildInput};eu.dnetlib.data.mapreduce.hbase.lodExport.build.TargetBuildMapper</value>
                </property>

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- FIX(review): was SequenceFileOutputFormat (an OutputFormat, not a
                     Writable value class) — set to Text; confirm against BlockReducer. -->
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- Required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <!-- Compress output -->
                <property>
                    <name>mapred.output.compress</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.output.compression.type</name>
                    <value>BLOCK</value>
                </property>
                <property>
                    <name>mapred.output.compression.codec</name>
                    <value>org.apache.hadoop.io.compress.GzipCodec</value>
                </property>

                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.build.BlockReducer</value>
                </property>

                <!-- I/O FORMAT -->
                <!-- The custom text-output separator is deliberately NOT set in this
                     action (it was commented out in the original); the default tab
                     separator applies here. -->
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- Custom config -->
                <!-- Delimiter character used to separate fields in HDFS dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.sourceMappings</name>
                    <value>${lod_sourceMappings}</value>
                </property>
                <property>
                    <name>lod.redisHost</name>
                    <value>${lod_redisHost}</value>
                </property>
                <property>
                    <name>lod.redisPort</name>
                    <value>${lod_redisPort}</value>
                </property>
                <property>
                    <name>lod.targetMappings</name>
                    <value>${lod_targetMappings}</value>
                </property>

                <!-- This directory does not correspond to a data store; it only
                     contains multiple data stores. It has to be set to the name of
                     the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_block_output}</value>
                </property>

                <!-- Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>
        </map-reduce>
        <ok to="linkage"/>
        <error to="fail"/>
    </action>

    <action name="linkage">
        <map-reduce>
            <configuration>

                <!-- ZOOKEEPER (hard-coded; see note on the 'build' action) -->
                <property>
                    <name>hbase.zookeeper.quorum</name>
                    <value>namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu</value>
                </property>
                <property>
                    <name>zookeeper.znode.rootserver</name>
                    <value>root-region-server</value>
                </property>
                <property>
                    <name>hbase.zookeeper.property.clientPort</name>
                    <value>2181</value>
                </property>

                <!-- FIX(review): was 'mapreduce.mapper.class', which no other action
                     uses; renamed to 'mapreduce.map.class' for consistency with the
                     rest of this workflow — verify the LinkageMapper is picked up. -->
                <property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.linkage.LinkageMapper</value>
                </property>
                <property>
                    <name>mapreduce.reduce.class</name>
                    <value>eu.dnetlib.data.mapreduce.hbase.lodExport.linkage.LimesReducer</value>
                </property>
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat</value>
                </property>

                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>
                <!-- FIX(review): was SequenceFileOutputFormat (an OutputFormat, not a
                     Writable value class) — set to Text; confirm against LimesReducer. -->
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.hadoop.io.Text</value>
                </property>

                <!-- Required for new MapReduce API usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Job-specific options -->
                <property>
                    <name>dfs.blocksize</name>
                    <value>32M</value>
                </property>
                <property>
                    <name>mapred.reduce.tasks.speculative.execution</name>
                    <value>false</value>
                </property>
                <property>
                    <name>mapreduce.map.speculative</name>
                    <value>false</value>
                </property>

                <!-- Output compression disabled for this action; the BLOCK/BZip2
                     settings that were commented out in the original are omitted. -->
                <property>
                    <name>mapred.output.compress</name>
                    <value>false</value>
                </property>

                <!-- I/O FORMAT -->
                <!-- IMPORTANT: sets the default delimiter used by the text output writer.
                     Required to fix an issue with a trailing tab added between id and
                     value in multiple outputs. -->
                <property>
                    <name>mapred.textoutputformat.separator</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization</value>
                </property>

                <!-- Custom config -->
                <!-- Delimiter character used to separate fields in HDFS dump files -->
                <property>
                    <name>lod.delim</name>
                    <value>${lod_delim}</value>
                </property>
                <property>
                    <name>lod.sourceMappings</name>
                    <value>${lod_sourceMappings}</value>
                </property>
                <property>
                    <name>lod.redisHost</name>
                    <value>${lod_redisHost}</value>
                </property>
                <property>
                    <name>lod.redisPort</name>
                    <value>${lod_redisPort}</value>
                </property>
                <property>
                    <name>lod.targetMappings</name>
                    <value>${lod_targetMappings}</value>
                </property>

                <!-- This directory does not correspond to a data store; it only
                     contains multiple data stores. It has to be set to the name of
                     the workflow node. -->
                <property>
                    <name>mapred.output.dir</name>
                    <value>${lod_final_output}</value>
                </property>
                <property>
                    <name>mapred.input.dir</name>
                    <value>${lod_block_output}*</value>
                </property>

                <property>
                    <name>lod.configXML</name>
                    <value>${lod_configXML}</value>
                </property>
                <property>
                    <name>lod.limesDTD</name>
                    <value>${lod_limesDTD}</value>
                </property>

                <!-- Workflow node parameters -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>${numReducers}</value>
                </property>

            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <kill name="fail">
        <message>
            Unfortunately, the process failed -- error message:
            [${wf:errorMessage(wf:lastErrorNode())}]
        </message>
    </kill>
    <end name="end"/>
</workflow-app>