
Revision 28930

Added by Eri Katsari over 10 years ago

testing direct sqoop import in oozie
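
This revision switches the workflow's start node to the new sqoopDirectImport action, which drives the Sqoop export through Oozie's sqoop action (uri:oozie:sqoop-action:0.2) instead of the eu.dnetlib.iis.core.workflows.stats.SqoopWrapper java action used by sqoopImport. For comparison only, the same call could also be written with the sqoop action's single <command> element; the sketch below is an illustration built from the parameter names already defined in this workflow (the action name is hypothetical), not the form committed here:

<action name="sqoopDirectImportCommandForm">
	<sqoop xmlns="uri:oozie:sqoop-action:0.2">
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<!-- illustrative sketch only: the committed action passes each token as a separate <arg> -->
		<command>export -Dsqoop.export.records.per.statement=${Stats_sqoop_RecsPerStatement} -Dsqoop.export.statements.per.transaction=${Stats_sqoop_StatementPerTrans} --connect ${Stats_db_Url}/${Stats_db_Name} --table datasource --export-dir ${Stats_output_Path}/datasource-r-00000 --input-fields-terminated-by ${Stats_delim_Character} --username ${Stats_db_User} --password ${Stats_db_Pass} --batch -m ${Stats_sqoop_ReducersCount} --verbose</command>
	</sqoop>
	<ok to="end" />
	<error to="fail" />
</action>

Oozie splits <command> on whitespace, so the <arg> form used in the revision is safer when values such as the field delimiter or the enclosing character contain spaces.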

View differences:

modules/dnet-openaire-stats-workflow/src/main/resources/eu/dnetlib/iis/core/examples/helloTest/oozie_app/workflow.xml
Old version:

<workflow-app name="test-core_examples_javamapreduce_stats"
	xmlns="uri:oozie:workflow:0.4">
	<!-- map reduce job that exports hbase data and prepares them for import 
		to the relation database used for statistics generation -->

	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>

	<start to='get-scanner' />
	<action name='get-scanner'>
		<java>

			<main-class>eu.dnetlib.iis.core.workflows.stats.HbaseScannerGenerator
			</main-class>
			<!-- column families: -->

			<arg>
				-f datasource ,
				datasourceOrganization_provision_provides,
				project , organization
				<!-- projectOrganization_participation_isParticipant , -->
				<!-- projectOrganization_participation_hasParticipant -->
				<!-- result, -->
				<!-- resultProject_outcome_produces , -->
				<!-- resultResult_publicationDataset_isRelatedTo, -->
				<!-- personResult_authorship_hasAuthor -->
			</arg>


			<capture-output />
		</java>
		<ok to="mr_export" />
		<error to="fail" />
	</action>

	<action name="mr_export">
		<map-reduce>

			<prepare>
				<delete path="${nameNode}${Stats_output_Path}" />
				<!-- <delete path="${Stats_output_Path}" /> -->

			</prepare>
			<configuration>
				<!-- TODO CHECK THIS -->
				<!-- HDFS -->
				<!-- hdfs://nmis-hadoop-cluster< -->
				<!-- <property> -->
				<!-- <name>dfs.nameservices</name> -->
				<!-- <value>nmis-hadoop-cluster</value> -->
				<!-- </property> -->

				<!-- <property> -->
				<!-- <name>dfs.ha.namenodes.nmis-hadoop-cluster</name> -->
				<!-- <value>nn1,nn2 nn1,nn2 -->
				<!-- dfs.namenode.rpc-address.nmis-hadoop-cluster.nn1=quorum1.t.hadoop.research-infrastructures.eu:8020 -->
				<!-- dfs.namenode.http-address.nmis-hadoop-cluster.nn1=quorum1.t.hadoop.research-infrastructures.eu:50070 -->
				<!-- dfs.namenode.rpc-address.nmis-hadoop-cluster.nn2=quorum2.t.hadoop.research-infrastructures.eu:8020 -->
				<!-- dfs.namenode.http-address.nmis-hadoop-cluster.nn2=quorum2.t.hadoop.research-infrastructures.eu:50070 -->
				<!-- dfs.client.failover.proxy.provider.nmis-hadoop-cluster=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider -->
				<!-- </value> -->
				<!-- </property> -->


				<!-- HBASE -->
				<property>
					<name>hbase.mapreduce.scan</name>
					<value>${wf:actionData('get-scanner')['scan']}</value>
				</property>
				<property>
					<name>hbase.rootdir</name>
					<value>hdfs://nmis-hadoop-cluster/hbase</value>
				</property>

				<property>
					<name>hbase.security.authentication</name>
					<value> simple</value>
				</property>
				<!-- ZOOKEEPER -->

				<property>
					<name>hbase.zookeeper.quorum</name>
					<value>quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
					</value>
				</property>
				<property>
					<name>zookeeper.znode.rootserver</name>
					<value> root-region-server
					</value>
				</property>

				<property>
					<name>hbase.zookeeper.property.clientPort</name>
					<value>2182</value>
				</property>

				<!-- MR IO -->


				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
				</property>

				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
					</value>
				</property>

				<!-- ## This is required for new MapReduce API usage -->
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- # Job-specific options -->
				<property>
					<name>dfs.blocksize</name>
					<value>32M</value>
				</property>
				<property>
					<name>mapred.output.compress</name>
					<value>false</value>
				</property>
				<property>
					<name>mapred.reduce.tasks.speculative.execution</name>
					<value>false</value>
				</property>
				<property>
					<name>mapred.reduce.tasks.speculative.execution</name>
					<value>false</value>
				</property>
				<property>
					<name>mapreduce.map.speculative</name>
					<value>false</value>
				</property>

				<!-- I/O FORMAT -->

				<!-- ## Names of all output ports -->


				<property>
					<name>mapreduce.multipleoutputs</name>
					<value>
						datasource datasourceLanguage project organization datasourceOrganization
						<!-- datasourceTopic resultProject -->
						<!-- 'projectOrganization' -->
						<!-- result resultClaim resultClassification resultConcept -->
						<!-- resultLanguage resultOrganization resultProject resultResult resultTopic 
							resultLanguage resultDatasource resultResult -->
						<!-- category claim concept -->
					</value>
				</property>

				<!-- datasource -->
				<property>
					<name> mapreduce.multipleoutputs.namedOutput.datasource.key</name>
					<value>org.apache.hadoop.io.Text</value>

				</property>
				<property>
					<name> mapreduce.multipleoutputs.namedOutput.datasource.value
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name> mapreduce.multipleoutputs.namedOutput.datasource.format
					</name>
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
					</value>
				</property>
				<!-- datasourceLanguage -->
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasourceLanguage.key
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>

					<name>
						mapreduce.multipleoutputs.namedOutput.datasourceLanguage.value
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>
						mapreduce.multipleoutputs.namedOutput.datasourceLanguage.format
					</name>
					<value>
						org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

					</value>
				</property>

				<!-- datasourceOrganization -->
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasourceOrganization.key
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>

					<name>
						mapreduce.multipleoutputs.namedOutput.datasourceOrganization.value
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>
						mapreduce.multipleoutputs.namedOutput.datasourceOrganization.format
					</name>
					<value>
						org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

					</value>
				</property>

				<!-- datasourceTopic -->
				<!-- <property> -->
				<!-- <name>mapreduce.multipleoutputs.namedOutput.datasourceTopic.key -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->

				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.datasourceTopic.value -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->
				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.datasourceTopic.format -->
				<!-- </name> -->
				<!-- <value> -->
				<!-- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat -->

				<!-- </value> -->
				<!-- </property> -->

				<!-- project -->
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.project.key
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>

					<name>
						mapreduce.multipleoutputs.namedOutput.project.value
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>
						mapreduce.multipleoutputs.namedOutput.project.format
					</name>
					<value>
						org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

					</value>
				</property>

				<!-- resultProject -->
				<!-- <property> -->
				<!-- <name>mapreduce.multipleoutputs.namedOutput.resultProject.key -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->

				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.resultProject.value -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->
				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.resultProject.format -->
				<!-- </name> -->
				<!-- <value> -->
				<!-- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat -->

				<!-- </value> -->
				<!-- </property> -->
				<!-- projectOrganization -->
				<!-- <property> -->
				<!-- <name>mapreduce.multipleoutputs.namedOutput.projectOrganization.key -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->

				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.projectOrganization.value -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->
				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.projectOrganization.format -->
				<!-- </name> -->
				<!-- <value> -->
				<!-- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat -->

				<!-- </value> -->
				<!-- </property> -->


				<!-- organization -->
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.organization.key
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>

					<name>
						mapreduce.multipleoutputs.namedOutput.organization.value
					</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>
						mapreduce.multipleoutputs.namedOutput.organization.format
					</name>
					<value>
						org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

					</value>
				</property>


				<!-- ## Classes of mapper and reducer -->

				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsMapper
					</value>
				</property>
				<property>
					<name>mapreduce.reduce.class</name>
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsReducer
					</value>
				</property>
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization
					</value>
				</property>
				<!-- ## Custom config -->

				<!--delim character used to seperate fields in hdfs dump files <property> -->
				<property>
					<name>stats.delim </name>
					<value>${Stats_delim_Character}</value>
				</property>
				<!--default string for Null String Values -->
				<property>
					<name>stats.nullString</name>
					<value>${Stats_null_String_Field}</value>
				</property>
				<!--default string for Null Numeric Values -->
				<property>
					<name>stats.nullNum</name>
					<value>${Stats_null_Numeric_Field}</value>
				</property>
				<!--source hbase table -->
				<property>
					<name>hbase.mapreduce.inputtable</name>
					<value>${Stats_Hbase_Source_Table}</value>
				</property>
				<!-- This directory does not correspond to a data store. In fact, this 
					directory only contains multiple data stores. It has to be set to the name 
					of the workflow node. -->
				<property>
					<name>mapred.output.dir</name>
					<value>${Stats_output_Path}</value>
				</property>
				<!-- ## Workflow node parameters -->
				<property>
					<name>mapred.reduce.tasks</name>
					<value>1</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="end" />
		<error to="fail" />
	</action>


	<action name="prepareDatabase">
		<java>

			<prepare>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>

			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper
			</main-class>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>eu.dnetlib.iis.core.workflows.stats.DBInitWrapper</arg>

			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
			<arg>-PStats_db_User=${Stats_db_User}</arg>
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
			<arg>-PStats_db_Name=${Stats_db_Name}</arg>
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>

		</java>
		<ok to="finalizeDatabase" />
		<error to="fail" />
	</action>

	<action name="sqoopImport">
		<java>
			<prepare>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>

			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper
			</main-class>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>eu.dnetlib.iis.core.workflows.stats.SqoopWrapper</arg>

			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
			<arg>-PStats_db_User=${Stats_db_User}</arg>
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
			<arg>-PStats_db_Name=${Stats_db_Name}</arg>
			<arg>-PStats_output_Path=${Stats_output_Path}</arg>
			<arg>-PStats_sqoop_RecsPerStatement=${Stats_sqoop_RecsPerStatement}
			</arg>
			<arg>-PStats_sqoop_ReducersCount=${Stats_sqoop_ReducersCount}</arg>
			<arg>-PStats_sqoop_StatementPerTrans=${Stats_sqoop_StatementPerTrans}
			</arg>
			<!-- <java-opts>-Dlog4j.configuration=log4jConfig</java-opts> -->
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>

	<action name="sqoopDirectImport">
		<sqoop xmlns="uri:oozie:sqoop-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<prepare>


			</prepare>
			<configuration>
				<property>
					<name>mapred.compress.map.output</name>
					<value>true</value>
				</property>
			</configuration>
			<arg>export</arg>
			<arg> -Dsqoop.export.records.per.statement</arg>
			<arg> ${Stats_sqoop_RecsPerStatement}</arg>


			<arg>-Dsqoop.export.statements.per.transaction </arg>
			<arg>${Stats_sqoop_StatementPerTrans}</arg>


			<arg>--connect</arg>
			<arg>${Stats_db_Url}/${Stats_db_Name}</arg>
			<arg>--table</arg>
			<arg>datasource</arg>

			<arg>--table</arg>
			<arg>datasource</arg>

			<arg>--export-dir</arg>
			<arg>${Stats_output_Path}/datasource-r-00000</arg>
			<arg> --optionally-enclosed-by </arg>
			<arg> " "  </arg>
			<arg>--input-fields-terminated-by </arg>
			<arg> ${Stats_delim_Character } </arg>
			<arg> --verbose </arg>
			<arg>--username </arg>
			<arg>${Stats_db_User} </arg>
			<arg>--password</arg>
			<arg>${Stats_db_Pass} </arg>
			<arg>--batch</arg>
			<arg>-m</arg>
			<arg>${Stats_sqoop_ReducersCount}</arg>
		</sqoop>
		<ok to="end" />
		<error to="fail" />
	</action>
	<action name="finalizeDatabase">
		<java>

			<prepare>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>

			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper
			</main-class>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>eu.dnetlib.iis.core.workflows.stats.DBFinalizeWrapper
			</arg>
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
			<arg>-PStats_db_User=${Stats_db_User}</arg>
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
			<arg>-PStats_db_Name=${Stats_db_Name}</arg>
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>


	<kill name="fail">
		<message>
			Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]
		</message>
	</kill>
	<end name="end" />
</workflow-app>
New version:

<workflow-app name="test-core_examples_javamapreduce_stats"
	xmlns="uri:oozie:workflow:0.4">
	<!-- map reduce job that exports hbase data and prepares them for import to the relation
		database used for statistics generation -->

	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
<start to='sqoopDirectImport' />
	<action name='get-scanner'>
		<java>

			<main-class>eu.dnetlib.iis.core.workflows.stats.HbaseScannerGenerator</main-class>
			<!-- column families: -->

			<arg>
				-f datasource
				<!--, datasourceOrganization_provision_provides, project , organization-->
				<!-- projectOrganization_participation_isParticipant , -->
				<!-- projectOrganization_participation_hasParticipant -->
				<!-- result, -->
				<!-- resultProject_outcome_produces , -->
				<!-- resultResult_publicationDataset_isRelatedTo, -->
				<!-- personResult_authorship_hasAuthor -->
			</arg>


			<capture-output />
		</java>
		<ok to="mr_export" />
		<error to="fail" />
	</action>
	<action name='get-multiple_outputs'>
		<java>

			<main-class>eu.dnetlib.iis.core.workflows.stats.MultipleOutputsGenerator</main-class>
			<!--  MoS names -->

			<arg>-n datasource ,
				datasourceLanguage</arg>

			<arg>-k org.apache.hadoop.io.Text.class</arg>

			<arg>-v org.apache.hadoop.io.Text.class</arg>


			<arg>-f org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class</arg>

			<capture-output />
		</java>
		<ok to="mr_export" />
		<error to="fail" />
	</action>
	<action name="mr_export">
		<map-reduce>

			<prepare>
				<delete path="${nameNode}${Stats_output_Path}" />


			</prepare>
			<configuration>
				<!-- TODO CHECK THIS -->
				<!-- HDFS -->
				<!-- hdfs://nmis-hadoop-cluster< -->
				<!-- <property> -->
				<!-- <name>dfs.nameservices</name> -->
				<!-- <value>nmis-hadoop-cluster</value> -->
				<!-- </property> -->

				<!-- <property> -->
				<!-- <name>dfs.ha.namenodes.nmis-hadoop-cluster</name> -->
				<!-- <value>nn1,nn2 nn1,nn2 -->
				<!-- dfs.namenode.rpc-address.nmis-hadoop-cluster.nn1=quorum1.t.hadoop.research-infrastructures.eu:8020
					-->
				<!-- dfs.namenode.http-address.nmis-hadoop-cluster.nn1=quorum1.t.hadoop.research-infrastructures.eu:50070
					-->
				<!-- dfs.namenode.rpc-address.nmis-hadoop-cluster.nn2=quorum2.t.hadoop.research-infrastructures.eu:8020
					-->
				<!-- dfs.namenode.http-address.nmis-hadoop-cluster.nn2=quorum2.t.hadoop.research-infrastructures.eu:50070
					-->
				<!-- dfs.client.failover.proxy.provider.nmis-hadoop-cluster=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
					-->
				<!-- </value> -->
				<!-- </property> -->


				<!-- HBASE -->
				<property>
					<name>hbase.mapreduce.scan</name>
					<value>${wf:actionData('get-scanner')['scan']}</value>
				</property>
				<property>
					<name>hbase.rootdir</name>
					<value>hdfs://nmis-hadoop-cluster/hbase</value>
				</property>

				<property>
					<name>hbase.security.authentication</name>
					<value>simple</value>
				</property>
				<!-- ZOOKEEPER -->

				<property>
					<name>hbase.zookeeper.quorum</name>
					<value>
						quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
					</value>
				</property>
				<property>
					<name>zookeeper.znode.rootserver</name>
					<value>root-region-server</value>
				</property>

				<property>
					<name>hbase.zookeeper.property.clientPort</name>
					<value>2182</value>
				</property>


				<property>
					<name>mapreduce.framework.name</name>
					<value>classic</value>
				</property>

				<property>
					<name>mapreduce.jobtracker.address</name>
					<value>nmis-hadoop-jt</value>
				</property>

				<!-- MR IO -->


				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.hadoop.hbase.mapreduce.TableInputFormat</value>
				</property>

				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.hbase.io.ImmutableBytesWritable</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat</value>
				</property>

				<!-- ## This is required for new MapReduce API usage -->
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- # Job-specific options -->
				<property>
					<name>dfs.blocksize</name>
					<value>32M</value>
				</property>
				<property>
					<name>mapred.output.compress</name>
					<value>false</value>
				</property>
				<property>
					<name>mapred.reduce.tasks.speculative.execution</name>
					<value>false</value>
				</property>
				<property>
					<name>mapred.reduce.tasks.speculative.execution</name>
					<value>false</value>
				</property>
				<property>
					<name>mapreduce.map.speculative</name>
					<value>false</value>
				</property>

				<!-- I/O FORMAT -->

				<!-- ## Names of all output ports -->

				<property>
					<name>mapreduce.multipleoutputs</name>

					<value>
						datasource datasourceLanguage
						<!--	project organization datasourceOrganization-->
						<!-- datasourceTopic resultProject -->
						<!-- 'projectOrganization' -->
						<!-- result resultClaim resultClassification resultConcept -->
						<!-- resultLanguage resultOrganization resultProject resultResult resultTopic resultLanguage
							resultDatasource resultResult -->
						<!-- category claim concept -->
					</value>
				</property>
				<!-- datasource -->
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasource.key</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasource.value</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasource.format</name>
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
				</property>
				<!-- datasourceLanguage -->
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasourceLanguage.key</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasourceLanguage.value</name>
					<value>org.apache.hadoop.io.Text</value>
				</property>
				<property>
					<name>mapreduce.multipleoutputs.namedOutput.datasourceLanguage.format</name>
					<value>org.apache.hadoop.mapreduce.lib.output.TextOutputFormat</value>
				</property>

				<!-- datasourceOrganization -->

				<!--<property> <name>mapreduce.multipleoutputs.namedOutput.datasourceOrganization.key
					</name> <value>org.apache.hadoop.io.Text</value> </property> <property> <name>
					mapreduce.multipleoutputs.namedOutput.datasourceOrganization.value </name> <value>org.apache.hadoop.io.Text</value>
					</property> <property> <name> mapreduce.multipleoutputs.namedOutput.datasourceOrganization.format
					</name> <value> org.apache.hadoop.mapreduce.lib.output.TextOutputFormat </value>
					</property> -->
				<!-- datasourceTopic -->
				<!-- <property> -->
				<!-- <name>mapreduce.multipleoutputs.namedOutput.datasourceTopic.key -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->

				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.datasourceTopic.value -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->
				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.datasourceTopic.format -->
				<!-- </name> -->
				<!-- <value> -->
				<!-- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat -->

				<!-- </value> -->
				<!-- </property> -->

				<!-- project -->
				<!--<property> <name>mapreduce.multipleoutputs.namedOutput.project.key </name> <value>org.apache.hadoop.io.Text</value>
					</property> <property> <name> mapreduce.multipleoutputs.namedOutput.project.value
					</name> <value>org.apache.hadoop.io.Text</value> </property> <property> <name>
					mapreduce.multipleoutputs.namedOutput.project.format </name> <value> org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
					</value> </property> -->
				<!-- resultProject -->
				<!-- <property> -->
				<!-- <name>mapreduce.multipleoutputs.namedOutput.resultProject.key -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->

				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.resultProject.value -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->
				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.resultProject.format -->
				<!-- </name> -->
				<!-- <value> -->
				<!-- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat -->

				<!-- </value> -->
				<!-- </property> -->
				<!-- projectOrganization -->
				<!-- <property> -->
				<!-- <name>mapreduce.multipleoutputs.namedOutput.projectOrganization.key -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->

				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.projectOrganization.value -->
				<!-- </name> -->
				<!-- <value>org.apache.hadoop.io.Text</value> -->
				<!-- </property> -->
				<!-- <property> -->
				<!-- <name> -->
				<!-- mapreduce.multipleoutputs.namedOutput.projectOrganization.format -->
				<!-- </name> -->
				<!-- <value> -->
				<!-- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat -->

				<!-- </value> -->
				<!-- </property> -->


				<!-- organization <property> <name>mapreduce.multipleoutputs.namedOutput.organization.key
					</name> <value>org.apache.hadoop.io.Text</value> </property> <property> <name>
					mapreduce.multipleoutputs.namedOutput.organization.value </name> <value>org.apache.hadoop.io.Text</value>
					</property> <property> <name> mapreduce.multipleoutputs.namedOutput.organization.format
					</name> <value> org.apache.hadoop.mapreduce.lib.output.TextOutputFormat </value>
					</property> -->

				<!-- ## Classes of mapper and reducer -->

				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsMapper</value>
				</property>
				<property>
					<name>mapreduce.reduce.class</name>
					<value>eu.dnetlib.data.mapreduce.hbase.statsExport.StatsReducer</value>
				</property>
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization</value>
				</property>
				<!-- ## Custom config -->

				<!--delim character used to seperate fields in hdfs dump files <property> -->
				<property>
					<name>stats.delim</name>
					<value>${Stats_delim_Character}</value>
				</property>
				<!--default string for Null String Values -->
				<property>
					<name>stats.nullString</name>
					<value>${Stats_null_String_Field}</value>
				</property>
				<!--default string for Null Numeric Values -->
				<property>
					<name>stats.nullNum</name>
					<value>${Stats_null_Numeric_Field}</value>
				</property>
				<!--source hbase table -->
				<property>
					<name>hbase.mapreduce.inputtable</name>
					<value>${Stats_Hbase_Source_Table}</value>
				</property>
				<!-- This directory does not correspond to a data store. In fact, this directory only
					contains multiple data stores. It has to be set to the name of the workflow node.
					-->
				<property>
					<name>mapred.output.dir</name>
					<value>${Stats_output_Path}</value>
				</property>
				<!-- ## Workflow node parameters -->
				<property>
					<name>mapred.reduce.tasks</name>
					<value>1</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="end" />
		<error to="fail" />
	</action>


	<action name="prepareDatabase">
		<java>

			<prepare>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>

			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>eu.dnetlib.iis.core.workflows.stats.DBInitWrapper</arg>

			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
			<arg>-PStats_db_User=${Stats_db_User}</arg>
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
			<arg>-PStats_db_Name=${Stats_db_Name}</arg>
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>

		</java>
		<ok to="finalizeDatabase" />
		<error to="fail" />
	</action>

	<action name="sqoopImport">
		<java>
			<prepare>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>

			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>eu.dnetlib.iis.core.workflows.stats.SqoopWrapper</arg>

			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
			<arg>-PStats_db_User=${Stats_db_User}</arg>
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
			<arg>-PStats_db_Name=${Stats_db_Name}</arg>
			<arg>-PStats_output_Path=${Stats_output_Path}</arg>
			<arg>-PStats_sqoop_RecsPerStatement=${Stats_sqoop_RecsPerStatement}</arg>
			<arg>-PStats_sqoop_ReducersCount=${Stats_sqoop_ReducersCount}</arg>
			<arg>-PStats_sqoop_StatementPerTrans=${Stats_sqoop_StatementPerTrans}</arg>
			<!-- <java-opts>-Dlog4j.configuration=log4jConfig</java-opts> -->
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>

	<action name="sqoopDirectImport">
		<sqoop xmlns="uri:oozie:sqoop-action:0.2">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<prepare>


			</prepare>
			<configuration>
				<property>
					<name>mapred.compress.map.output</name>
					<value>false</value>
				</property>

			</configuration>
			<arg>--export</arg>
			<arg>-Dsqoop.export.records.per.statement</arg>
			<arg>${Stats_sqoop_RecsPerStatement}</arg>

			<arg>-Dsqoop.export.statements.per.transaction</arg>
			<arg>${Stats_sqoop_StatementPerTrans}</arg>

			<arg>--connect</arg>
			<arg>${Stats_db_Url}/${Stats_db_Name}</arg>
			<arg>--table</arg>
			<arg>datasource</arg>

			<arg>--table</arg>
			<arg>datasource</arg>

			<arg>--export-dir</arg>
			<arg>${Stats_output_Path}/datasource-r-00000</arg>
			<arg>--optionally-enclosed-by</arg>
			<arg>"	"</arg>
			<arg>--input-fields-terminated-by</arg>
			<arg>${Stats_delim_Character }</arg>
			<arg>--verbose</arg>
			<arg>--username</arg>
			<arg>${Stats_db_User}</arg>
			<arg>--password</arg>
			<arg>${Stats_db_Pass}</arg>
			<arg>--batch</arg>
			<arg>-m</arg>
			<arg>${Stats_sqoop_ReducersCount}</arg>
		</sqoop>
		<ok to="end" />
		<error to="fail" />
	</action>
	<action name="finalizeDatabase">
		<java>

			<prepare>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>

			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>eu.dnetlib.iis.core.workflows.stats.DBFinalizeWrapper</arg>
			<arg>-PStats_db_Url=${Stats_db_Url}</arg>
			<arg>-PStats_db_User=${Stats_db_User}</arg>
			<arg>-PStats_db_Pass=${Stats_db_Pass}</arg>
			<arg>-PStats_db_Name=${Stats_db_Name}</arg>
			<arg>-PStats_db_Driver=${Stats_db_Driver}</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>


	<kill name="fail">
		<message>
			Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]
		</message>
	</kill>
	<end name="end" />
</workflow-app>
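
Both versions of the workflow leave every ${...} parameter to be resolved at submission time. Oozie can pick these up from a config-default.xml placed next to workflow.xml or from the job properties file submitted with the job; the sketch below only lists the parameters referenced above, and every value is a placeholder assumption, not the configuration actually used on the nmis-hadoop-cluster:

<configuration>
	<!-- placeholder values for illustration; adjust to the target cluster and statistics database -->
	<property><name>jobTracker</name><value>nmis-hadoop-jt</value></property>
	<property><name>nameNode</name><value>hdfs://nmis-hadoop-cluster</value></property>
	<property><name>queueName</name><value>default</value></property>
	<property><name>workingDir</name><value>/tmp/stats_working_dir</value></property>
	<property><name>Stats_output_Path</name><value>/tmp/stats_export</value></property>
	<property><name>Stats_Hbase_Source_Table</name><value>hbase_source_table</value></property>
	<property><name>Stats_delim_Character</name><value>!</value></property>
	<property><name>Stats_null_String_Field</name><value>NULL</value></property>
	<property><name>Stats_null_Numeric_Field</name><value>-1</value></property>
	<property><name>Stats_db_Url</name><value>jdbc:postgresql://stats.example.org:5432</value></property>
	<property><name>Stats_db_Name</name><value>stats</value></property>
	<property><name>Stats_db_User</name><value>stats_user</value></property>
	<property><name>Stats_db_Pass</name><value>changeme</value></property>
	<property><name>Stats_db_Driver</name><value>org.postgresql.Driver</value></property>
	<property><name>Stats_sqoop_RecsPerStatement</name><value>1000</value></property>
	<property><name>Stats_sqoop_StatementPerTrans</name><value>1000</value></property>
	<property><name>Stats_sqoop_ReducersCount</name><value>4</value></property>
</configuration>

The sqoopDirectImport action concatenates Stats_db_Url and Stats_db_Name into the JDBC connect string, so Stats_db_Url is given here without a trailing database name.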
