<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_primary_main">
2
	
3
	<!--
		Formal parameters of the workflow.
		Properties declaring a <value> carry a default. The ones without a <value>
		(import_islookup_service_location, import_hbase_input_table,
		export_action_hbase_table_name, export_action_hbase_table_initialize)
		have no default here and have to be supplied by the job configuration.
		The literal $UNDEFINED$ is used as the default of optional settings.
	-->
	<parameters>
		<property>
			<name>remove_sideproducts</name>
			<value>true</value>
			<description>flag indicating inference side products will be erased</description>
		</property>
		<property>
			<name>active_existence_filter</name>
			<value>true</value>
			<description>flag indicating contents should be filtered against metadata entries retrieved from InformationSpace.
			This way only contents having metadata representation will be processed.
			To be disabled when processing new contents which metadata is not available in hbase.</description>
		</property>
		<!-- processing modes -->
		<property>
			<name>active_metadataextraction_export</name>
			<value>false</value>
			<description>flag indicating metadata extraction export should be enabled</description>
		</property>
		<property>
			<name>active_referenceextraction_project</name>
			<value>false</value>
			<description>flag indicating project reference extraction should be enabled</description>
		</property>
		<property>
			<name>active_referenceextraction_dataset</name>
			<value>false</value>
			<description>flag indicating dataset reference extraction should be enabled</description>
		</property>
		<property>
			<!-- currently disabled by default -->
			<name>active_referenceextraction_researchinitiative</name>
			<value>false</value>
			<description>flag indicating researchinitiative reference extraction should be enabled</description>
		</property>
		<property>
			<!-- currently disabled by default -->
			<name>active_referenceextraction_pdb</name>
			<value>false</value>
			<description>flag indicating protein databank reference extraction should be enabled</description>
		</property>
		<property>
			<name>active_documentsclassification</name>
			<value>false</value>
			<description>flag indicating documents classification should be enabled</description>
		</property>
		<property>
			<name>active_documentssimilarity</name>
			<value>false</value>
			<description>flag indicating documents similarity should be enabled</description>
		</property>
		<property>
			<name>active_citationmatching</name>
			<value>false</value>
			<description>flag indicating citation matching should be enabled</description>
		</property>
		<property>
			<name>active_statistics</name>
			<value>false</value>
			<description>flag indicating statistics generation should be enabled</description>
		</property>
		<property>
			<name>active_websiteusage_analysis</name>
			<value>false</value>
			<description>flag indicating logs should be imported from HDFS log file into avro datastore
				and website usage analysis should be performed</description>
		</property>
		<property>
			<name>active_export_to_hbase</name>
			<value>true</value>
			<description>flag indicating hbase export should be performed</description>
		</property>
		<property>
			<name>active_export_to_json</name>
			<value>false</value>
			<description>flag indicating json export should be performed</description>
		</property>
		<!-- import concepts related -->
		<property>
			<name>import_islookup_service_location</name>
			<description>IS Lookup service location</description>
		</property>
		<property>
			<name>import_project_concepts_context_ids_csv</name>
			<value>fet-fp7,fet-h2020</value>
			<description>comma separated list of concepts context identifiers to be picked by ISLookup</description>
		</property>
		<!-- import metadata related -->
		<property>
			<name>import_hbase_input_table</name>
			<description>HBase input table holding InformationSpace, available on local cluster</description>
		</property>
		<property>
			<name>import_hbase_approved_datasources_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of datasource ids to be approved during import. Applied on result and person entities.</description>
		</property>
		<!-- import datacite related -->
		<property>
			<name>import_mdstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>MDStore service (not WSDL) location URL</description>
		</property>
		<property>
			<name>import_dataset_mdstore_ids_csv</name>
			<value>$UNDEFINED$</value>
			<description>MDStore identifier</description>
		</property>
		<!-- import content related -->
		<property>
			<name>import_content_object_store_location</name>
			<value>$UNDEFINED$</value>
			<description>object store service location required for content retrieval</description>
		</property>
		<property>
			<name>import_content_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of object stores identifiers to be processed</description>
		</property>
		<property>
			<name>import_content_mimetypes_pdf</name>
			<value>pdf,application/pdf</value>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_text</name>
			<value>text,text/plain</value>
			<description>text mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_html</name>
			<value>text/html</value>
			<description>html mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_xml_pmc</name>
			<value>xml</value>
			<description>xml pmc types</description>
		</property>
		<property>
			<name>import_content_mimetypes_wos</name>
			<value>file::WoS</value>
			<description>WoS types</description>
		</property>
		<!-- import timeouts related -->
		<property>
			<name>import_resultset_client_read_timeout</name>
			<value>60000</value>
			<description>resultset client read timeout</description>
		</property>
		<property>
			<name>import_content_connection_timeout</name>
			<value>60000</value>
			<description>import content connection timeout</description>
		</property>
		<property>
			<name>import_content_read_timeout</name>
			<value>60000</value>
			<description>import content read timeout</description>
		</property>
		<!-- import logs related -->
		<property>
			<name>portal_logs_location</name>
			<value>/cache/portal-piwik-logs</value>
			<description>portal log files HDFS location</description>
		</property>
		<!-- metadata extraction related -->
		<property>
			<name>metadataextraction_excluded_checksums</name>
			<value>$UNDEFINED$</value>
			<description>list of content checksums excluded from metadataextraction processing</description>
		</property>
		<property>
			<name>metadataextraction_max_file_size_mb</name>
			<value>500</value>
			<description>maximum allowed file size in Megabytes</description>
		</property>
		<property>
			<name>metadataextraction_default_cache_location</name>
			<value>/cache/metadataextraction</value>
			<description>metadata extraction HDFS cache location</description>
		</property>
		<property>
			<name>metadataextraction_processing_mode</name>
			<value>StreamingMetadataExtractorMapper</value>
			<description>metadata extraction processing mode</description>
		</property>
		<property>
			<name>metadataextraction_input_classname</name>
			<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
			<description>metadata extraction input classname</description>
		</property>
		<!-- export related -->
		<property>
			<name>export_action_hbase_table_name</name>
			<description>action manager hbase table name</description>
		</property>
		<property>
			<name>export_action_hbase_table_initialize</name>
			<description>flag indicating input table should be initialized</description>
		</property>
		<!-- action set id properties -->
		<property>
			<name>export_action_set_id</name>
			<value>$UNDEFINED$</value>
			<description>action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_dataset_similarities_websiteusage</name>
			<value>$UNDEFINED$</value>
			<description>dataset_similarities_websiteusage action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_person_similarities_websiteusage</name>
			<value>$UNDEFINED$</value>
			<description>person_similarities_websiteusage action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_person_statistics</name>
			<value>$UNDEFINED$</value>
			<description>person_statistics action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_project_statistics</name>
			<value>$UNDEFINED$</value>
			<description>project_statistics action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_similarities_standard</name>
			<value>$UNDEFINED$</value>
			<description>document_similarities_standard action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_similarities_websiteusage</name>
			<value>$UNDEFINED$</value>
			<description>document_similarities_websiteusage action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_statistics</name>
			<value>$UNDEFINED$</value>
			<description>document_statistics action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_classes</name>
			<value>$UNDEFINED$</value>
			<description>document_classes action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedProjects</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedProjects action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedDatasets</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedDatasets action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedDocuments</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedDocuments action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_research_initiative</name>
			<value>$UNDEFINED$</value>
			<description>document research initiative action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_pdb</name>
			<value>$UNDEFINED$</value>
			<description>document to protein databank action-set identifier of exported data</description>
		</property>
		<!-- entity action set and remaining export settings -->
		<property>
			<name>export_action_set_id_entity_dataset</name>
			<value>$UNDEFINED$</value>
			<description>action-set identifier of exported data containing dataset entities</description>
		</property>
		<property>
			<name>export_action_hbase_remote_zookeeper_quorum</name>
			<value>$UNDEFINED$</value>
			<description>external hbase zookeeper quorum, set to empty value by default which means data will be exported to local hbase instance</description>
		</property>
		<property>
			<name>export_action_hbase_remote_zookeeper_clientport</name>
			<value>$UNDEFINED$</value>
			<description>external hbase zookeeper client port, required only when zookeeper quorum property is set</description>
		</property>
		<property>
			<name>export_documentssimilarity_threshold</name>
			<value>$UNDEFINED$</value>
			<description>documents similarity threshold value below which similarity export is omitted</description>
		</property>
		<property>
			<name>export_referenceextraction_pdb_url_root</name>
			<value>http://www.rcsb.org/pdb/explore/explore.do?structureId=</value>
			<description>protein databank URL root part to be concatenated with pdb identifier when forming final URL</description>
		</property>
		<!-- working directory related -->
		<property>
			<name>execution_environment</name>
			<value>primary</value>
			<description>execution environment used for workingDir creation</description>
		</property>
		<property>
			<name>workingDir</name>
			<value>/user/${user.name}/iis/working_dirs/${execution_environment}</value>
			<description>working directory</description>
		</property>
	</parameters>
313
	
314
	<!-- Global section: job tracker, name node and the MapReduce queue name (mapred.job.queue.name = ${queueName}). -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
324
	
325
	<!-- Entry point: the workflow begins with the init-workingDir action. -->
	<start to="init-workingDir"/>
326

    
327
	<!-- Recreates the working directory: deletes ${nameNode}${workingDir} and creates it again. Continues with copy-version. -->
	<action name="init-workingDir">
		<fs>
			<delete path="${nameNode}${workingDir}" />
			<mkdir path="${nameNode}${workingDir}" />
		</fs>
		<ok to="copy-version"/>
		<error to="fail"/>
	</action>
335
	
336
	<!-- Copies version.properties from the application path into the working directory using distcp. Continues with import. -->
	<action name="copy-version">
		<distcp xmlns="uri:oozie:distcp-action:0.1">
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- first argument: source file, second argument: destination directory -->
			<arg>${wf:appPath()}/version.properties</arg>
			<arg>${nameNode}${workingDir}</arg>
		</distcp>
		<ok to="import"/>
		<error to="fail"/>
	</action>
346

    
347
	<!--
		Runs the mainworkflows_common_import sub-workflow found under the application path.
		The import_* and metadataextraction_* parameters of this workflow are handed over
		under the property names used below; all outputs are placed under
		${workingDir}/mainworkflows_common_import. Continues with decision-import_logs.
	-->
	<action name="import">
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_common_import</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/mainworkflows_common_import/working_dir</value>
				</property>
				<!-- importing modes -->
				<property>
					<name>active_existence_filter</name>
					<value>${active_existence_filter}</value>
				</property>
				<property>
					<name>active_import_metadata</name>
					<value>true</value>
				</property>
				<!-- dataset, concept and PMC citation imports follow the processing modes that consume them -->
				<property>
					<name>active_import_dataset</name>
					<value>${active_referenceextraction_dataset}</value>
				</property>
				<property>
					<name>active_import_concept</name>
					<value>${active_referenceextraction_project}</value>
				</property>
				<property>
					<name>active_ingest_pmc_citations</name>
					<value>${active_citationmatching}</value>
				</property>
				<!-- import metadata related -->
				<property>
					<name>hbase_input_table</name>
					<value>${import_hbase_input_table}</value>
				</property>
				<property>
					<name>hbase_approved_datasources_csv</name>
					<value>${import_hbase_approved_datasources_csv}</value>
				</property>
				<!-- import datacite related -->
				<property>
					<name>mdstore_service_location</name>
					<value>${import_mdstore_service_location}</value>
				</property>
				<property>
					<name>dataset_mdstore_ids_csv</name>
					<value>${import_dataset_mdstore_ids_csv}</value>
				</property>
				<!-- project concept related -->
				<property>
					<name>islookup_service_location</name>
					<value>${import_islookup_service_location}</value>
				</property>
				<property>
					<name>project_concepts_context_ids_csv</name>
					<value>${import_project_concepts_context_ids_csv}</value>
				</property>
				<!-- import content related -->
				<property>
					<name>objectstore_service_location</name>
					<value>${import_content_object_store_location}</value>
				</property>
				<property>
					<name>approved_objectstores_csv</name>
					<value>${import_content_objectstores_csv}</value>
				</property>
				<property>
					<name>mimetypes_pdf</name>
					<value>${import_content_mimetypes_pdf}</value>
				</property>
				<property>
					<name>mimetypes_text</name>
					<value>${import_content_mimetypes_text}</value>
				</property>
				<property>
					<name>mimetypes_html</name>
					<value>${import_content_mimetypes_html}</value>
				</property>
				<property>
					<name>mimetypes_xml_pmc</name>
					<value>${import_content_mimetypes_xml_pmc}</value>
				</property>
				<property>
					<name>mimetypes_wos</name>
					<value>${import_content_mimetypes_wos}</value>
				</property>
				<!-- import timeouts related -->
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${import_resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${import_content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${import_content_read_timeout}</value>
				</property>
				<!-- metadata extraction related -->
				<property>
					<name>metadataextraction_excluded_checksums</name>
					<value>${metadataextraction_excluded_checksums}</value>
				</property>
				<property>
					<name>metadataextraction_max_file_size_mb</name>
					<value>${metadataextraction_max_file_size_mb}</value>
				</property>
				<property>
					<name>metadataextraction_default_cache_location</name>
					<value>${metadataextraction_default_cache_location}</value>
				</property>
				<!-- metadatainput and metadataextraction output subdirectory names;
					later nodes of this workflow read these names below
					${workingDir}/mainworkflows_common_import/metadataimport -->
				<property>
					<name>metadataimport_output_name_document_meta</name>
					<value>docmeta</value>
				</property>
				<property>
					<name>metadataimport_output_name_document_project</name>
					<value>docproject</value>
				</property>
				<property>
					<name>metadataimport_output_name_project</name>
					<value>project</value>
				</property>
				<property>
					<name>metadataimport_output_name_person</name>
					<value>person</value>
				</property>
				<property>
					<name>metadataimport_output_name_dedup_mapping</name>
					<value>dedupmapping</value>
				</property>
				<!-- output parameters -->
				<property>
					<name>output_extracted_document_metadata</name>
					<value>${workingDir}/mainworkflows_common_import/extracted_document_metadata</value>
				</property>
				<property>
					<name>output_metadataimport_root</name>
					<value>${workingDir}/mainworkflows_common_import/metadataimport</value>
				</property>
				<property>
					<name>output_dataset</name>
					<value>${workingDir}/mainworkflows_common_import/dataset</value>
				</property>
				<property>
					<name>output_dataset_to_mdstore</name>
					<value>${workingDir}/mainworkflows_common_import/dataset_to_mdstore</value>
				</property>
				<property>
					<name>output_citation_pmc</name>
					<value>${workingDir}/mainworkflows_common_import/citation_pmc</value>
				</property>
				<property>
					<name>output_document_text</name>
					<value>${workingDir}/mainworkflows_common_import/document-text</value>
				</property>
				<property>
					<name>output_project_concept</name>
					<value>${workingDir}/mainworkflows_common_import/project-concept</value>
				</property>
				<property>
					<name>output_wos</name>
					<value>${workingDir}/mainworkflows_common_import/wos</value>
				</property>
				<property>
					<name>output_faults</name>
					<value>${workingDir}/mainworkflows_common_import/faults</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-import_logs"/>
		<error to="fail" />
	</action>
522

    
523
	<!-- Goes to import_logs when active_websiteusage_analysis equals "true"; otherwise straight to mainworkflows_primary_processing. -->
	<decision name="decision-import_logs">
		<switch>
			<case to="import_logs">${active_websiteusage_analysis eq "true"}</case>
			<default to="mainworkflows_primary_processing"/>
		</switch>
	</decision>
529

    
530
	<!--
		Runs the import_logs sub-workflow with ${portal_logs_location} as input and
		${workingDir}/websiteusage_analysis/import_logs/output as output.
		Continues with mainworkflows_primary_processing.
	-->
	<action name="import_logs">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_logs</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/websiteusage_analysis/import_logs/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${portal_logs_location}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/websiteusage_analysis/import_logs/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="mainworkflows_primary_processing" />
		<error to="fail" />
	</action>
552

    
553
	<!--
		Runs the mainworkflows_primary_processing sub-workflow.
		Inputs are the directories written by the import action under
		${workingDir}/mainworkflows_common_import; outputs go to ${workingDir}/exported.
		Continues with decision-websiteusage_analysis.
	-->
	<action name="mainworkflows_primary_processing">
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_primary_processing</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- inputs -->
				<property>
					<name>input_document_metadata</name>
					<value>${workingDir}/mainworkflows_common_import/metadataimport/docmeta</value>
				</property>
				<property>
					<name>input_document_to_project</name>
					<value>${workingDir}/mainworkflows_common_import/metadataimport/docproject</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/mainworkflows_common_import/document-text</value>
				</property>
				<property>
					<name>input_document_text_wos</name>
					<value>${workingDir}/mainworkflows_common_import/wos</value>
				</property>
				<property>
					<name>input_project</name>
					<value>${workingDir}/mainworkflows_common_import/metadataimport/project</value>
				</property>
				<property>
					<name>input_person</name>
					<value>${workingDir}/mainworkflows_common_import/metadataimport/person</value>
				</property>
				<property>
					<name>input_dataset</name>
					<value>${workingDir}/mainworkflows_common_import/dataset</value>
				</property>
				<property>
					<name>input_extracted_document_metadata</name>
					<value>${workingDir}/mainworkflows_common_import/extracted_document_metadata</value>
				</property>
				<property>
					<name>input_citation_pmc</name>
					<value>${workingDir}/mainworkflows_common_import/citation_pmc</value>
				</property>
				<property>
					<name>input_project_concept</name>
					<value>${workingDir}/mainworkflows_common_import/project-concept</value>
				</property>
				<!-- outputs -->
				<property>
					<name>output_document_to_project</name>
					<value>${workingDir}/exported/document_to_project</value>
				</property>
				<property>
					<name>output_document_to_project_concepts</name>
					<value>${workingDir}/exported/document_to_project_concepts</value>
				</property>
				<property>
					<name>output_document_to_dataset</name>
					<value>${workingDir}/exported/document_to_dataset</value>
				</property>
				<property>
					<name>output_document_to_research_initiatives</name>
					<value>${workingDir}/exported/document_to_research_initiatives</value>
				</property>
				<property>
					<name>output_document_to_pdb</name>
					<value>${workingDir}/exported/document_to_pdb</value>
				</property>
				<property>
					<name>output_document_to_document_classes</name>
					<value>${workingDir}/exported/document_to_document_classes</value>
				</property>
				<property>
					<name>output_citation</name>
					<value>${workingDir}/exported/citation</value>
				</property>
				<property>
					<name>output_document_similarity</name>
					<value>${workingDir}/exported/document_similarity</value>
				</property>
				<property>
					<name>output_document_statistics</name>
					<value>${workingDir}/exported/document_statistics</value>
				</property>
				<property>
					<name>output_author_statistics</name>
					<value>${workingDir}/exported/author_statistics</value>
				</property>
				<property>
					<name>output_project_statistics</name>
					<value>${workingDir}/exported/project_statistics</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-websiteusage_analysis"/>
		<error to="fail" />
	</action>
647
    
648
    <!-- website usage analysis block -->
649
    <!-- Goes to websiteusage-idextractor when active_websiteusage_analysis equals "true"; otherwise to skip-websiteusage. -->
    <decision name="decision-websiteusage_analysis">
        <switch>
            <case to="websiteusage-idextractor">${active_websiteusage_analysis eq "true"}</case>
            <default to="skip-websiteusage"/>
        </switch>
    </decision>
655
    
656
    <!--
        Runs the transformers_idextractor sub-workflow on the imported document metadata (docmeta)
        and writes identifiers to ${workingDir}/websiteusage_analysis/transformers_idextractor/output.
        Continues with mainworkflows_websiteusage_document.
    -->
    <action name="websiteusage-idextractor">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_idextractor</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/websiteusage_analysis/transformers_idextractor/working_dir</value>
                </property>
                <property>
                    <name>input_document_metadata</name>
                    <value>${workingDir}/mainworkflows_common_import/metadataimport/docmeta</value>
                </property>
                <property>
                    <name>output_identifier</name>
                    <value>${workingDir}/websiteusage_analysis/transformers_idextractor/output</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="mainworkflows_websiteusage_document"/>
        <error to="fail"/>
    </action>
678
    
679
    <!--
        Runs the mainworkflows_websiteusage_document sub-workflow.
        Inputs: the import_logs output, the dedupmapping directory written by the import action
        and the identifiers written by websiteusage-idextractor.
        Output: ${workingDir}/websiteusage_analysis/output.
        Continues with decision-transform-metadataextraction-for-export.
    -->
    <action name="mainworkflows_websiteusage_document">
        <sub-workflow>
            <app-path>${wf:appPath()}/mainworkflows_websiteusage_document</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/websiteusage_analysis/working_dir</value>
                </property>
                <property>
                    <name>input_logs</name>
                    <value>${workingDir}/websiteusage_analysis/import_logs/output</value>
                </property>
                <property>
                    <name>input_id_mapping</name>
                    <value>${workingDir}/mainworkflows_common_import/metadataimport/dedupmapping</value>
                </property>
                <property>
                    <name>input_document_id</name>
                    <value>${workingDir}/websiteusage_analysis/transformers_idextractor/output</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${workingDir}/websiteusage_analysis/output</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="decision-transform-metadataextraction-for-export"/>
        <error to="fail"/>
    </action>
709
    
710
    <!--
        Taken when website usage analysis is disabled. Recreates ${workingDir}/websiteusage_analysis
        and runs the Producer node (through ProcessWrapper) with empty.json, writing to the same
        output location mainworkflows_websiteusage_document uses.
        NOTE(review): presumably this yields an empty datastore for downstream consumers; confirm against Producer.
        Continues with decision-transform-metadataextraction-for-export.
    -->
    <action name="skip-websiteusage">
        <java>
            <prepare>
                <!-- notice: directory has to be aligned with skipped action output -->
                <delete path="${nameNode}${workingDir}/websiteusage_analysis" />
                <mkdir path="${nameNode}${workingDir}/websiteusage_analysis" />
            </prepare>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <!-- whitespace inside the multi-line argument is part of the argument value; keep it as is -->
            <arg>-C{websiteusage_analysis_document,
				eu.dnetlib.iis.websiteusage.schemas.DocumentsWithWebsiteUsageSimilarities,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-Owebsiteusage_analysis_document=${workingDir}/websiteusage_analysis/output</arg>
        </java>
        <ok to="decision-transform-metadataextraction-for-export"/>
        <error to="fail"/>
    </action>
727
    <!-- end of website usage analysis block -->
728
    
729
    <!-- Goes to transformers_export_documentmetadata when active_metadataextraction_export equals "true"; otherwise to skip-transformers_export_documentmetadata. -->
    <decision name="decision-transform-metadataextraction-for-export">
        <switch>
            <case to="transformers_export_documentmetadata">${active_metadataextraction_export eq "true"}</case>
            <default to="skip-transformers_export_documentmetadata"/>
        </switch>
    </decision>
735
    
736
    <!--
        Runs the transformers_export_documentmetadata sub-workflow on the extracted document metadata
        and writes to ${workingDir}/transformers_export_documentmetadata/output_metadata.
        This sub-workflow does not use propagate-configuration; jobTracker, nameNode and queueName
        are passed explicitly.
        On success continues with decision-export-to-hbase, the same node the
        skip-transformers_export_documentmetadata branch continues with.
    -->
    <action name="transformers_export_documentmetadata">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_export_documentmetadata</app-path>
            <configuration>
                <property>
                    <name>jobTracker</name>
                    <value>${jobTracker}</value>
                </property>
                <property>
                    <name>nameNode</name>
                    <value>${nameNode}</value>
                </property>
                <property>
                    <name>queueName</name>
                    <value>${queueName}</value>
                </property>
                <!-- Working directory of the subworkflow -->
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_export_documentmetadata/working_dir</value>
                </property>
                <property>
                    <name>input_extracted_metadata</name>
                    <value>${workingDir}/mainworkflows_common_import/extracted_document_metadata</value>
                </property>
                <property>
                    <name>output_metadata</name>
                    <value>${workingDir}/transformers_export_documentmetadata/output_metadata</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="decision-export-to-hbase"/>
        <error to="fail"/>
    </action>
    
    <!-- Replacement for transformers_export_documentmetadata when metadata export
         is disabled: recreates the transformer directory and runs the Producer
         with data/empty.json (presumably an empty DocumentMetadata datastore;
         confirm against Producer) so that the export steps still find their
         input under output_metadata. -->
    <action name="skip-transformers_export_documentmetadata">
        <java>
			<prepare>
				<!-- notice: directory has to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/transformers_export_documentmetadata" />
				<mkdir path="${nameNode}${workingDir}/transformers_export_documentmetadata" />
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <arg>-C{document_metadata,
				eu.dnetlib.iis.export.schemas.DocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- notice: directory has to be aligned with the skipped action output -->
            <arg>-Odocument_metadata=${workingDir}/transformers_export_documentmetadata/output_metadata</arg>
        </java>
        <ok to="decision-export-to-hbase"/>
        <error to="fail"/>
    </action>
    
    <!-- Runs the HBase export only when active_export_to_hbase equals "true";
         otherwise proceeds directly to the JSON export decision. -->
    <decision name="decision-export-to-hbase">
        <switch>
            <case to="export-to-hbase">${active_export_to_hbase eq "true"}</case>
            <default to="decision-export-to-json"/>
        </switch>
    </decision>
    
    <!-- Runs the mainworkflows_common_export sub-workflow.
         The parent configuration is propagated; the properties below override
         the working directory, wire the input ports to this workflow's outputs
         and map the export_* parameters onto the sub-workflow's property names. -->
    <action name="export-to-hbase">
		<sub-workflow>
            <app-path>${wf:appPath()}/mainworkflows_common_export</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/mainworkflows_common_export/working_dir</value>
                </property>
                <!-- input ports -->
                <property>
					<name>input_document_metadata</name>
					<value>${workingDir}/transformers_export_documentmetadata/output_metadata</value>
				</property>
                <property>
					<name>input_document_to_project</name>
					<value>${workingDir}/exported/document_to_project</value>
				</property>
				<property>
					<name>input_document_to_project_concepts</name>
					<value>${workingDir}/exported/document_to_project_concepts</value>
				</property>
				<property>
					<name>input_document_to_dataset</name>
					<value>${workingDir}/exported/document_to_dataset</value>
				</property>
				<!-- NOTE(review): the port is named document_to_mdstore while the path
				     ends with dataset_to_mdstore; confirm this pairing is intended. -->
				<property>
					<name>input_document_to_mdstore</name>
					<value>${workingDir}/mainworkflows_common_import/dataset_to_mdstore</value>
				</property>
				<property>
					<name>input_document_to_research_initiatives</name>
					<value>${workingDir}/exported/document_to_research_initiatives</value>
				</property>
				<property>
					<name>input_document_to_pdb</name>
					<value>${workingDir}/exported/document_to_pdb</value>
				</property>
				<property>
					<name>input_document_to_document_classes</name>
					<value>${workingDir}/exported/document_to_document_classes</value>
				</property>
				<property>
					<name>input_citations</name>
					<value>${workingDir}/exported/citation</value>
				</property>
				<property>
					<name>input_document_similarity</name>
					<value>${workingDir}/exported/document_similarity</value>
				</property>
				<property>
					<name>input_document_statistics</name>
					<value>${workingDir}/exported/document_statistics</value>
				</property>
				<property>
					<name>input_document_websiteusage_similarity</name>
					<value>${workingDir}/websiteusage_analysis/output</value>
				</property>
				<property>
					<name>input_author_statistics</name>
					<value>${workingDir}/exported/author_statistics</value>
				</property>
				<property>
					<name>input_project_statistics</name>
					<value>${workingDir}/exported/project_statistics</value>
				</property>
				<!-- entities exporting modes -->
				<property>
					<name>active_export_referenceddataset_datasets</name>
					<value>${active_referenceextraction_dataset}</value>
				</property>
				<property>
					<name>active_export_referencedproject_entities</name>
					<value>false</value>
				</property>
				<property>
					<name>mdstore_service_location</name>
					<value>${import_mdstore_service_location}</value>
				</property>
				<!-- export related -->
				<property>
					<name>action_hbase_table_name</name>
					<value>${export_action_hbase_table_name}</value>
				</property>
				<property>
					<name>action_hbase_table_initialize</name>
					<value>${export_action_hbase_table_initialize}</value>
				</property>
				<!-- action set id properties -->
				<property>
					<name>action_set_id</name>
					<value>${export_action_set_id}</value>
				</property>
				<property>
					<name>action_set_id_dataset_similarities_websiteusage</name>
					<value>${export_action_set_id_dataset_similarities_websiteusage}</value>
				</property>
				<property>
					<name>action_set_id_person_similarities_websiteusage</name>
					<value>${export_action_set_id_person_similarities_websiteusage}</value>
				</property>
				<property>
					<name>action_set_id_person_statistics</name>
					<value>${export_action_set_id_person_statistics}</value>
				</property>
				<property>
					<name>action_set_id_project_statistics</name>
					<value>${export_action_set_id_project_statistics}</value>
				</property>
				<property>
					<name>action_set_id_document_similarities_standard</name>
					<value>${export_action_set_id_document_similarities_standard}</value>
				</property>
				<property>
					<name>action_set_id_document_similarities_websiteusage</name>
					<value>${export_action_set_id_document_similarities_websiteusage}</value>
				</property>
				<property>
					<name>action_set_id_document_statistics</name>
					<value>${export_action_set_id_document_statistics}</value>
				</property>
				<property>
					<name>action_set_id_document_classes</name>
					<value>${export_action_set_id_document_classes}</value>
				</property>
				<property>
					<name>action_set_id_document_referencedProjects</name>
					<value>${export_action_set_id_document_referencedProjects}</value>
				</property>
				<property>
					<name>action_set_id_document_referencedDatasets</name>
					<value>${export_action_set_id_document_referencedDatasets}</value>
				</property>
				<property>
					<name>action_set_id_document_referencedDocuments</name>
					<value>${export_action_set_id_document_referencedDocuments}</value>
				</property>
				<property>
					<name>action_set_id_document_research_initiative</name>
					<value>${export_action_set_id_document_research_initiative}</value>
				</property>
				<property>
					<name>action_set_id_document_pdb</name>
					<value>${export_action_set_id_document_pdb}</value>
				</property>
				<property>
					<name>action_set_id_entity_dataset</name>
					<value>${export_action_set_id_entity_dataset}</value>
				</property>
				<property>
					<name>action_hbase_remote_zookeeper_quorum</name>
					<value>${export_action_hbase_remote_zookeeper_quorum}</value>
				</property>
				<property>
					<name>action_hbase_remote_zookeeper_clientport</name>
					<value>${export_action_hbase_remote_zookeeper_clientport}</value>
				</property>
				<property>
					<name>documentssimilarity_threshold</name>
					<value>${export_documentssimilarity_threshold}</value>
				</property>
				<property>
					<name>referenceextraction_pdb_url_root</name>
					<value>${export_referenceextraction_pdb_url_root}</value>
				</property>
            </configuration>
        </sub-workflow>
		<ok to="decision-export-to-json"/>
		<error to="fail" />
	</action>
    
    <!-- Runs the JSON export only when active_export_to_json equals "true";
         otherwise the workflow ends. -->
    <decision name="decision-export-to-json">
        <switch>
            <case to="export-to-json">${active_export_to_json eq "true"}</case>
            <default to="end"/>
        </switch>
    </decision>
    
    <!-- Runs the mainworkflows_common_export_to_json sub-workflow.
         The parent configuration is propagated; the properties below wire the
         input ports to this workflow's outputs and set output_root to
         ${workingDir}/exported_as_json.
         NOTE(review): unlike export-to-hbase, workingDir is not overridden here,
         so the sub-workflow receives the parent's value; confirm this is intended. -->
    <action name="export-to-json">
		<sub-workflow>
            <app-path>${wf:appPath()}/mainworkflows_common_export_to_json</app-path>
            <propagate-configuration/>
            <configuration>
                <!-- input ports -->
                <property>
					<name>input_document_metadata</name>
					<value>${workingDir}/transformers_export_documentmetadata/output_metadata</value>
				</property>
                <property>
					<name>input_document_to_project</name>
					<value>${workingDir}/exported/document_to_project</value>
				</property>
				<property>
					<name>input_document_to_project_concepts</name>
					<value>${workingDir}/exported/document_to_project_concepts</value>
				</property>
				<property>
					<name>input_document_to_dataset</name>
					<value>${workingDir}/exported/document_to_dataset</value>
				</property>
				<property>
					<name>input_document_to_mdstore</name>
					<value>${workingDir}/mainworkflows_common_import/dataset_to_mdstore</value>
				</property>
				<property>
					<name>input_document_to_research_initiatives</name>
					<value>${workingDir}/exported/document_to_research_initiatives</value>
				</property>
				<property>
					<name>input_document_to_pdb</name>
					<value>${workingDir}/exported/document_to_pdb</value>
				</property>
				<property>
					<name>input_document_to_document_classes</name>
					<value>${workingDir}/exported/document_to_document_classes</value>
				</property>
				<property>
					<name>input_citations</name>
					<value>${workingDir}/exported/citation</value>
				</property>
				<property>
					<name>input_document_similarity</name>
					<value>${workingDir}/exported/document_similarity</value>
				</property>
				<property>
					<name>input_document_statistics</name>
					<value>${workingDir}/exported/document_statistics</value>
				</property>
				<property>
					<name>input_document_websiteusage_similarity</name>
					<value>${workingDir}/websiteusage_analysis/output</value>
				</property>
				<property>
					<name>input_author_statistics</name>
					<value>${workingDir}/exported/author_statistics</value>
				</property>
				<property>
					<name>input_project_statistics</name>
					<value>${workingDir}/exported/project_statistics</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/exported_as_json</value>
				</property>
            </configuration>
        </sub-workflow>
		<ok to="end"/>
		<error to="fail" />
	</action>
    
	<!-- Terminal nodes: "fail" is the error target used by the actions above and
	     reports the error message of the last failed node; "end" is the normal exit. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end" />
</workflow-app>