Project

General

Profile

1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_preprocessing_main">
2
	<parameters>
		<!--
			Workflow-level parameters.
			Properties declared without a <value> carry no default in this file and are
			expected to come from the submitting job configuration; the remaining ones
			fall back to the default given here.
			Several defaults are the literal $UNDEFINED$ marker. How that marker is
			interpreted is up to the consuming sub-workflows, which are not part of
			this file.
		-->
		<property>
			<name>remove_sideproducts</name>
			<value>true</value>
			<description>flag indicating inference side products will be erased</description>
		</property>

		<!-- import concepts related -->
		<property>
			<name>import_islookup_service_location</name>
			<description>IS Lookup service location</description>
		</property>
		<property>
			<name>import_project_concepts_context_ids_csv</name>
			<value>fet-fp7,fet-h2020</value>
			<description>comma separated list of concepts context identifiers to be picked by ISLookup</description>
		</property>

		<!-- import project related -->
		<property>
			<name>import_database_service_location</name>
			<description>Database service (not WSDL) location URL</description>
		</property>
		<property>
			<name>import_database_dbname</name>
			<value>dnet_openaireplus_node0_t</value>
			<description>database name</description>
		</property>

		<!-- import datacite related, export datacite & wos related -->
		<property>
			<name>import_mdstore_service_location</name>
			<description>MDStore service (not WSDL) location URL</description>
		</property>
		<property>
			<name>import_dataset_mdstore_ids_csv</name>
			<description>dataset MDStore identifier</description>
		</property>
		<property>
			<name>import_wos_mdstore_id</name>
			<description>WoS MDStore identifier</description>
		</property>

		<!-- import content related -->
		<!-- currently disabled, input_document_content property is handled as input holding DocumentContent datastore -->
		<property>
			<name>import_content_object_store_location</name>
			<value>$UNDEFINED$</value>
			<description>object store service location required for content retrieval</description>
		</property>
		<property>
			<name>import_content_wos_plaintext_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of objectstore ids to be approved during WoS plaintext import.</description>
		</property>
		<property>
			<name>import_content_datacite_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of objectstore ids to be approved during datacite import.</description>
		</property>

		<!-- import content mime types -->
		<property>
			<name>import_content_mimetypes_pdf</name>
			<value>pdf,application/pdf</value>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_text</name>
			<value>text,text/plain</value>
			<description>text mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_html</name>
			<value>text/html</value>
			<description>html mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_xml_pmc</name>
			<value>xml</value>
			<description>xml pmc types</description>
		</property>
		<property>
			<name>import_content_mimetypes_wos</name>
			<value>file::WoS</value>
			<description>WoS mime types</description>
		</property>

		<!-- import timeouts related -->
		<property>
			<name>import_resultset_client_read_timeout</name>
			<value>60000</value>
			<description>resultset client read timeout</description>
		</property>
		<property>
			<name>import_content_connection_timeout</name>
			<value>60000</value>
			<description>import content connection timeout</description>
		</property>
		<property>
			<name>import_content_read_timeout</name>
			<value>60000</value>
			<description>import content read timeout</description>
		</property>

		<!-- metadata extraction related -->
		<property>
			<name>metadataextraction_excluded_checksums</name>
			<value>$UNDEFINED$</value>
			<description>list of content checksums excluded from metadataextraction processing</description>
		</property>
		<property>
			<name>metadataextraction_max_file_size_mb</name>
			<value>500</value>
			<description>maximum allowed file size in Megabytes</description>
		</property>
		<property>
			<name>metadataextraction_default_cache_location</name>
			<value>/cache/metadataextraction</value>
			<description>metadata extraction cache location, path pointing to root cache directory holding meta.json file</description>
		</property>

		<!-- export related: the two active_export_* flags drive the export decision nodes of this workflow -->
		<property>
			<name>active_export_to_hbase</name>
			<value>true</value>
			<description>flag indicating hbase export should be performed</description>
		</property>
		<property>
			<name>active_export_to_json</name>
			<value>false</value>
			<description>flag indicating json export should be performed</description>
		</property>
		<property>
			<name>export_action_hbase_table_name</name>
			<description>action manager hbase table name</description>
		</property>
		<property>
			<name>export_action_hbase_table_initialize</name>
			<description>flag indicating input table should be initialized</description>
		</property>

		<!-- action set id properties -->
		<property>
			<name>export_action_set_id</name>
			<value>$UNDEFINED$</value>
			<description>action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedProjects</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedProjects action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedDatasets</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedDatasets action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_entity_wos</name>
			<description>action-set identifier of exported data containing wos entities</description>
		</property>
		<property>
			<name>export_action_set_id_entity_dataset</name>
			<description>action-set identifier of exported data containing dataset entities</description>
		</property>
		<property>
			<name>export_action_hbase_remote_zookeeper_quorum</name>
			<value>$UNDEFINED$</value>
			<description>external hbase zookeeper quorum, set to empty value by default which means data will be exported to local hbase instance</description>
		</property>
		<property>
			<name>export_action_hbase_remote_zookeeper_clientport</name>
			<value>$UNDEFINED$</value>
			<description>external hbase zookeeper client port, required only whe zookeeper quorum property is set</description>
		</property>

		<!-- trust level threshold section -->
		<property>
			<name>export_trust_level_threshold</name>
			<value>$UNDEFINED$</value>
			<description>default trust level threshold of exported data</description>
		</property>
		<property>
			<name>export_trust_level_threshold_document_referencedProjects</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedProjects trust level threshold</description>
		</property>
		<property>
			<name>export_trust_level_threshold_document_referencedDatasets</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedDatasets trust level threshold</description>
		</property>

		<!-- working directory related: workingDir is derived from execution_environment -->
		<property>
			<name>execution_environment</name>
			<value>preprocessing</value>
			<description>execution environment used for workingDir creation</description>
		</property>
		<property>
			<name>workingDir</name>
			<value>/user/${user.name}/iis/working_dirs/${execution_environment}</value>
			<description>working directory</description>
		</property>
	</parameters>
198

    
199
	<global>
		<!-- Settings shared by the actions of this workflow: cluster endpoints and the MapReduce queue name. -->
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
209

    
210
	<start to="init-workingDir"/>

	<action name="init-workingDir">
		<!-- Recreates the working directory from scratch: existing content is deleted before the directory is created again. -->
		<fs>
			<delete path="${nameNode}${workingDir}" />
			<mkdir path="${nameNode}${workingDir}" />
		</fs>
		<ok to="copy-version"/>
		<error to="fail"/>
	</action>
220
	
221
	<action name="copy-version">
		<!-- Copies version.properties from the workflow application directory into the freshly created working directory. -->
		<distcp xmlns="uri:oozie:distcp-action:0.1">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<arg>${wf:appPath()}/version.properties</arg>
			<arg>${nameNode}${workingDir}</arg>
		</distcp>
		<ok to="import"/>
		<error to="fail"/>
	</action>
231

    
232
	<action name="import">
		<!--
			Runs the mainworkflows_preprocessing_import sub-workflow.
			The import_* parameters of this workflow are mapped onto the unprefixed
			property names passed to the sub-workflow, and every output is placed
			below ${workingDir}/mainworkflows_preprocessing_import so that the
			following actions can pick it up from there.
		-->
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_preprocessing_import</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/working_dir</value>
				</property>

				<!-- import project related -->
				<property>
					<name>database_service_location</name>
					<value>${import_database_service_location}</value>
				</property>
				<property>
					<name>database_dbname</name>
					<value>${import_database_dbname}</value>
				</property>

				<!-- project concept related -->
				<property>
					<name>islookup_service_location</name>
					<value>${import_islookup_service_location}</value>
				</property>
				<property>
					<name>project_concepts_context_ids_csv</name>
					<value>${import_project_concepts_context_ids_csv}</value>
				</property>

				<!-- import datacite related -->
				<property>
					<name>mdstore_service_location</name>
					<value>${import_mdstore_service_location}</value>
				</property>
				<property>
					<name>dataset_mdstore_ids_csv</name>
					<value>${import_dataset_mdstore_ids_csv}</value>
				</property>

				<!-- import content related -->
				<property>
					<name>objectstore_service_location</name>
					<value>${import_content_object_store_location}</value>
				</property>
				<property>
					<name>approved_objectstores_csv</name>
					<value>${import_content_datacite_objectstores_csv}</value>
				</property>
				<property>
					<name>wos_plaintext_objectstores_csv</name>
					<value>${import_content_wos_plaintext_objectstores_csv}</value>
				</property>
				<property>
					<name>mimetypes_pdf</name>
					<value>${import_content_mimetypes_pdf}</value>
				</property>
				<property>
					<name>mimetypes_text</name>
					<value>${import_content_mimetypes_text}</value>
				</property>
				<property>
					<name>mimetypes_html</name>
					<value>${import_content_mimetypes_html}</value>
				</property>
				<property>
					<name>mimetypes_xml_pmc</name>
					<value>${import_content_mimetypes_xml_pmc}</value>
				</property>
				<property>
					<name>mimetypes_wos</name>
					<value>${import_content_mimetypes_wos}</value>
				</property>

				<!-- import timeouts related -->
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${import_resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${import_content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${import_content_read_timeout}</value>
				</property>

				<!-- metadata extraction related properties are automatically propagated (see propagate-configuration above) -->

				<!-- output parameters -->
				<property>
					<name>output_extracted_document_metadata</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/extracted_document_metadata</value>
				</property>
				<property>
					<name>output_metadataimport_root</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/metadataimport</value>
				</property>
				<property>
					<name>output_dataset</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/dataset</value>
				</property>
				<property>
					<name>output_dataset_to_mdstore</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/dataset_to_mdstore</value>
				</property>
				<property>
					<name>output_document_text</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/document-text</value>
				</property>
				<property>
					<name>output_wos_text</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/wos-text</value>
				</property>
				<property>
					<name>output_project_concept</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/project-concept</value>
				</property>
				<property>
					<name>output_faults</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/faults</value>
				</property>

				<!--
					TODO output_wos is part of common/import and should replace output_wos_text.
					Currently this directory is not used.
				-->
				<property>
					<name>output_wos</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/wos</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="referenceextraction_forking"/>
		<error to="fail" />
	</action>
360
	
361
	<fork name="referenceextraction_forking">
		<!--
			Two parallel branches, both ending in referenceextraction_joining:
			dataset reference extraction, and the chain that starts with
			ingest_webcrawl_fundings (project references, concepts, research initiatives).
		-->
		<path start="referenceextraction_dataset"/>
		<path start="ingest_webcrawl_fundings"/>
	</fork>
365
	
366
	<action name="referenceextraction_dataset">
		<!-- Feeds the document text and dataset outputs of the import action into the referenceextraction_dataset sub-workflow. -->
		<sub-workflow>
			<app-path>${wf:appPath()}/referenceextraction_dataset</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/referenceextraction_dataset/working_dir</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/document-text</value>
				</property>
				<property>
					<name>input_dataset</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/dataset</value>
				</property>
				<property>
					<name>output_document_to_dataset</name>
					<value>${workingDir}/referenceextraction_dataset/document_datasets</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="referenceextraction_joining"/>
		<error to="fail" />
	</action>
392
	
393
	<action name="ingest_webcrawl_fundings">
		<!-- Runs the ingest_webcrawl_fundings sub-workflow on the wos-text output of the import action; its output is the text input of referenceextraction_project. -->
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_webcrawl_fundings</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_webcrawl_fundings/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/wos-text</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/ingest_webcrawl_fundings/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="referenceextraction_project"/>
		<error to="fail" />
	</action>
415
	
416
	<action name="referenceextraction_project">
		<!-- Runs the referenceextraction_project sub-workflow on the ingest_webcrawl_fundings output together with the imported project metadata. -->
		<sub-workflow>
			<app-path>${wf:appPath()}/referenceextraction_project</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/referenceextraction_project/working_dir</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/ingest_webcrawl_fundings/output</value>
				</property>
				<property>
					<name>input_project</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/metadataimport/project</value>
				</property>
				<property>
					<name>output_document_to_project</name>
					<value>${workingDir}/referenceextraction_project/document_projects</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="transformers_project_toconcept"/>
		<error to="fail" />
	</action>
442
    
443
    <action name="transformers_project_toconcept">
        <!-- Runs the transformers_project_toconcept sub-workflow on the document-to-project references, the imported projects and the imported project concepts. -->
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_project_toconcept</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_project_toconcept/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_project</name>
                    <value>${workingDir}/referenceextraction_project/document_projects</value>
                </property>
                <property>
                    <name>input_project</name>
                    <value>${workingDir}/mainworkflows_preprocessing_import/metadataimport/project</value>
                </property>
                <property>
                    <name>input_concept</name>
                    <value>${workingDir}/mainworkflows_preprocessing_import/project-concept</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${workingDir}/transformers_project_toconcept/out</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="transformers_concept_to_researchinitiatives"/>
        <error to="fail" />
    </action>
473
    
474
    <action name="transformers_concept_to_researchinitiatives">
        <!--
            Last step of the project branch. Note that the application path is
            transformers_export_researchinitiatives, which differs from the action name.
            The input is the output of transformers_project_toconcept.
        -->
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_export_researchinitiatives</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_concept_to_researchinitiatives/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_research_initiative</name>
                    <value>${workingDir}/transformers_project_toconcept/out</value>
                </property>
                <property>
                    <name>output_document_to_research_initiatives</name>
                    <value>${workingDir}/transformers_concept_to_researchinitiatives/out</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="referenceextraction_joining"/>
        <error to="fail" />
    </action>
496
    
497
    <join name="referenceextraction_joining" to="decision-export-to-hbase"/>

    <!-- HBase export runs only when active_export_to_hbase equals "true"; otherwise control moves straight to the JSON export decision. -->
    <decision name="decision-export-to-hbase">
        <switch>
            <case to="export-to-hbase">${active_export_to_hbase eq "true"}</case>
            <default to="decision-export-to-json"/>
        </switch>
    </decision>
505
    
506
  	<action name="export-to-hbase">
		<!--
			Runs the mainworkflows_common_export sub-workflow.
			Inputs are the outputs of the reference extraction branches and of the
			import action; the export_* parameters of this workflow are mapped onto
			the unprefixed property names passed to the sub-workflow.
		-->
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_common_export</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/mainworkflows_common_export/working_dir</value>
				</property>

				<!-- input ports -->
				<property>
					<name>input_document_to_project</name>
					<value>${workingDir}/referenceextraction_project/document_projects</value>
				</property>
				<property>
					<name>input_document_to_project_concepts</name>
					<value>${workingDir}/transformers_concept_to_researchinitiatives/out</value>
				</property>
				<property>
					<name>input_document_to_dataset</name>
					<value>${workingDir}/referenceextraction_dataset/document_datasets</value>
				</property>
				<property>
					<name>input_document_to_mdstore</name>
					<value>${workingDir}/mainworkflows_preprocessing_import/dataset_to_mdstore</value>
				</property>

				<!-- entities exporting modes -->
				<property>
					<name>active_export_referenceddataset_datasets</name>
					<value>true</value>
				</property>
				<property>
					<name>active_export_referencedproject_entities</name>
					<value>true</value>
				</property>
				<property>
					<name>mdstore_service_location</name>
					<value>${import_mdstore_service_location}</value>
				</property>
				<property>
					<name>wos_mdstore_id</name>
					<value>${import_wos_mdstore_id}</value>
				</property>

				<!-- export related -->
				<property>
					<name>action_hbase_table_name</name>
					<value>${export_action_hbase_table_name}</value>
				</property>
				<property>
					<name>action_hbase_table_initialize</name>
					<value>${export_action_hbase_table_initialize}</value>
				</property>

				<!-- action set id properties -->
				<property>
					<name>action_set_id</name>
					<value>${export_action_set_id}</value>
				</property>
				<property>
					<name>action_set_id_document_referencedProjects</name>
					<value>${export_action_set_id_document_referencedProjects}</value>
				</property>
				<property>
					<name>action_set_id_document_referencedDatasets</name>
					<value>${export_action_set_id_document_referencedDatasets}</value>
				</property>
				<property>
					<name>action_set_id_entity_wos</name>
					<value>${export_action_set_id_entity_wos}</value>
				</property>
				<property>
					<name>action_set_id_entity_dataset</name>
					<value>${export_action_set_id_entity_dataset}</value>
				</property>

				<!-- trust level thresholds -->
				<property>
					<name>trust_level_threshold</name>
					<value>${export_trust_level_threshold}</value>
				</property>
				<property>
					<name>trust_level_threshold_document_referencedProjects</name>
					<value>${export_trust_level_threshold_document_referencedProjects}</value>
				</property>
				<property>
					<name>trust_level_threshold_document_referencedDatasets</name>
					<value>${export_trust_level_threshold_document_referencedDatasets}</value>
				</property>

				<!-- remote hbase zookeeper settings -->
				<property>
					<name>action_hbase_remote_zookeeper_quorum</name>
					<value>${export_action_hbase_remote_zookeeper_quorum}</value>
				</property>
				<property>
					<name>action_hbase_remote_zookeeper_clientport</name>
					<value>${export_action_hbase_remote_zookeeper_clientport}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-export-to-json"/>
		<error to="fail" />
	</action>
604
    
605
    <!-- JSON export runs only when active_export_to_json equals "true"; otherwise the workflow ends. -->
    <decision name="decision-export-to-json">
        <switch>
            <case to="export-to-json">${active_export_to_json eq "true"}</case>
            <default to="end"/>
        </switch>
    </decision>
611
    
612
    <action name="export-to-json">
        <!--
            Runs the mainworkflows_common_export_to_json sub-workflow on the reference
            extraction outputs and writes below ${workingDir}/exported_as_json.
        -->
        <sub-workflow>
            <app-path>${wf:appPath()}/mainworkflows_common_export_to_json</app-path>
            <propagate-configuration/>
            <configuration>
                <!--
                    NOTE(review): every other sub-workflow in this file receives its own
                    working directory; without this override the parent workingDir is
                    propagated unchanged. Added for consistency. Confirm against the
                    mainworkflows_common_export_to_json workflow definition.
                -->
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/mainworkflows_common_export_to_json/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_project</name>
                    <value>${workingDir}/referenceextraction_project/document_projects</value>
                </property>
                <property>
                    <name>input_document_to_project_concepts</name>
                    <value>${workingDir}/transformers_concept_to_researchinitiatives/out</value>
                </property>
                <property>
                    <name>input_document_to_dataset</name>
                    <value>${workingDir}/referenceextraction_dataset/document_datasets</value>
                </property>
                <property>
                    <name>output_root</name>
                    <value>${workingDir}/exported_as_json</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="end"/>
        <error to="fail" />
    </action>
638
    
639
	<!-- Common failure sink: every action routes its error transition here; the message reports the error of the last failed node. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end" />
644
</workflow-app>
(2-2/2)