<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_preprocessing">
2
	
3
	<!--
		Formal parameters of the workflow.
		A property declared WITH a <value> carries a default; a property declared
		WITHOUT a <value> has no default and has to be supplied by the job configuration.
		NOTE(review): $UNDEFINED$ looks like a project-wide "not set" placeholder that
		the sub-workflows are expected to recognise; confirm against the consuming nodes.
	-->
	<parameters>
		<!-- import project related -->
		<property>
			<name>import_database_service_location</name>
			<value>$UNDEFINED$</value>
			<description>Database service (not WSDL) location URL</description>
		</property>
		<property>
			<name>import_database_dbname</name>
			<value>dnet_openaireplus_node0_t</value>
			<description>database name</description>
		</property>

		<!-- import datacite related, export datacite & wos related -->
		<property>
			<name>import_mdstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>MDStore service (not WSDL) location URL</description>
		</property>
		<property>
			<name>import_dataset_mdstore_id</name>
			<value>$UNDEFINED$</value>
			<description>dataset MDStore identifier</description>
		</property>
		<property>
			<name>import_wos_mdstore_id</name>
			<value>$UNDEFINED$</value>
			<description>WoS MDStore identifier</description>
		</property>

		<!-- import content related -->
		<!-- currently disabled, input_document_content property is handled as input holding DocumentContent datastore -->
		<property>
			<name>import_content_object_store_location</name>
			<value>$UNDEFINED$</value>
			<description>object store service location required for content retrieval</description>
		</property>
		<property>
			<name>import_content_lookup_service_location</name>
			<value>$UNDEFINED$</value>
			<description>lookup service location required for content retrieval, finding object store id based on repository id</description>
		</property>
		<property>
			<name>import_content_wos_plaintext_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of objectstore ids to be approved during WoS plaintext import.</description>
		</property>
		<property>
			<name>import_content_datacite_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of objectstore ids to be approved during datacite import.</description>
		</property>

		<!-- import content mime types -->
		<property>
			<name>import_content_mimetypes_pdf_csv</name>
			<value>pdf,application/pdf</value>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_text_csv</name>
			<value>text,text/plain</value>
			<description>text mime types</description>
		</property>
		<property>
			<name>import_content_mimetypes_xml_pmc_csv</name>
			<value>xml</value>
			<description>xml pmc types</description>
		</property>
		<property>
			<name>import_content_mimetypes_wos_text_csv</name>
			<value>file::WoS</value>
			<description>WoS mime types</description>
		</property>

		<!-- import timeouts related -->
		<property>
			<name>import_resultset_client_read_timeout</name>
			<value>60000</value>
			<description>resultset client read timeout</description>
		</property>
		<property>
			<name>import_content_connection_timeout</name>
			<value>60000</value>
			<description>import content connection timeout</description>
		</property>
		<property>
			<name>import_content_read_timeout</name>
			<value>60000</value>
			<description>import content read timeout</description>
		</property>

		<!-- metadata extraction related -->
		<property>
			<name>metadataextraction_excluded_ids</name>
			<value>$UNDEFINED$</value>
			<description>list of content identifiers excluded from metadataextraction processing</description>
		</property>
		<!-- no default: has to be provided at submission time -->
		<property>
			<name>metadataextraction_default_cache_location</name>
			<description>metadata extraction cache location, path pointing to root cache directory holding meta.json file</description>
		</property>

		<!-- export related (both properties have no default) -->
		<property>
			<name>export_action_hbase_table_name</name>
			<description>action manager hbase table name</description>
		</property>
		<property>
			<name>export_action_hbase_table_initialize</name>
			<description>flag indicating input table should be initialized</description>
		</property>

		<!-- action set id properties -->
		<property>
			<name>export_action_set_id</name>
			<value>$UNDEFINED$</value>
			<description>action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedProjects</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedProjects action-set identifier of exported data</description>
		</property>
		<property>
			<name>export_action_set_id_document_referencedDatasets</name>
			<value>$UNDEFINED$</value>
			<description>document_referencedDatasets action-set identifier of exported data</description>
		</property>
		<!-- the two entity action-set identifiers below have no default -->
		<property>
			<name>export_action_set_id_entity_wos</name>
			<description>action-set identifier of exported data containing wos entities</description>
		</property>
		<property>
			<name>export_action_set_id_entity_dataset</name>
			<description>action-set identifier of exported data containing dataset entities</description>
		</property>
		<property>
			<name>export_action_hbase_remote_zookeeper_quorum</name>
			<value>$UNDEFINED$</value>
			<description>external hbase zookeeper quorum, set to empty value by default which means data will be exported to local hbase instance</description>
		</property>
		<property>
			<name>export_action_hbase_remote_zookeeper_clientport</name>
			<value>$UNDEFINED$</value>
			<description>external hbase zookeeper client port, required only when zookeeper quorum property is set</description>
		</property>
	</parameters>

	<!-- Entry point: execution begins at the import_forking fork. -->
	<start to="import_forking" />

	<!--
		Top-level import fork with three parallel branches: project import,
		dataset import and the nested content-import fork.
		All three branches eventually transition to the import_joining join.
	-->
	<fork name="import_forking">
		<path start="import_project"/>
		<path start="import_dataset"/>
		<path start="content_importer_forking"/>
	</fork>

	<!--
		Nested content-import fork: the WoS branch and the dataset branch run in
		parallel and both end at the content_importer_joining join.
	-->
	<fork name="content_importer_forking">
		<path start="wos_url_importer"/>
		<path start="dataset_url_importer"/>
	</fork>

	<!--
		WoS branch, step 1: runs the import_content_url sub-workflow restricted to
		the WoS plaintext object stores and the WoS text mime types.
		Output root: ${workingDir}/wos_url_import/imported (its plaintext_url
		subdirectory is the input of wos_import_plaintext).
		On failure transitions to the "fail" node, which is defined outside this section.
	-->
	<action name="wos_url_importer">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_content_url</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/wos_url_import/working_dir</value>
				</property>
				<property>
					<name>objectstore_service_location</name>
					<value>${import_content_object_store_location}</value>
				</property>
				<property>
					<name>approved_objectstores_csv</name>
					<value>${import_content_wos_plaintext_objectstores_csv}</value>
				</property>
				<property>
					<name>output_dir</name>
					<value>${workingDir}/wos_url_import/imported</value>
				</property>
				<property>
					<name>mimetypes_text_csv</name>
					<value>${import_content_mimetypes_wos_text_csv}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${import_resultset_client_read_timeout}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="wos_import_plaintext" />
		<error to="fail" />
	</action>

	<!--
		WoS branch, step 2: runs the import_plaintext sub-workflow on the
		plaintext_url output of wos_url_importer.
		Output: ${workingDir}/wos_import_plaintext/imported, later used as
		input_document_text of referenceextraction_project.
	-->
	<action name="wos_import_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/wos_import_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/wos_url_import/imported/plaintext_url</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${import_content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${import_content_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/wos_import_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="content_importer_joining" />
		<error to="fail" />
	</action>

	<!--
		Dataset branch, step 1: runs the import_content_url sub-workflow restricted
		to the datacite object stores, passing the pdf, text and xml pmc mime type lists.
		Output root: ${workingDir}/dataset_url_import/imported; its plaintext_url,
		xml_pmc_url and content_url subdirectories are consumed by the three
		branches of dataset_import_urlbased_forking.
	-->
	<action name="dataset_url_importer">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_content_url</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/dataset_url_import/working_dir</value>
				</property>
				<property>
					<name>objectstore_service_location</name>
					<value>${import_content_object_store_location}</value>
				</property>
				<property>
					<name>approved_objectstores_csv</name>
					<value>${import_content_datacite_objectstores_csv}</value>
				</property>
				<property>
					<name>mimetypes_pdf_csv</name>
					<value>${import_content_mimetypes_pdf_csv}</value>
				</property>
				<property>
					<name>mimetypes_text_csv</name>
					<value>${import_content_mimetypes_text_csv}</value>
				</property>
				<property>
					<name>mimetypes_xml_pmc_csv</name>
					<value>${import_content_mimetypes_xml_pmc_csv}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${import_resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>output_dir</name>
					<value>${workingDir}/dataset_url_import/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="dataset_import_urlbased_forking" />
		<error to="fail" />
	</action>

	<!--
		Fans the dataset URL import output out to three parallel branches:
		plaintext import, PMC XML import (followed by import_pmc) and cached
		metadata extraction. All of them end at dataset_import_urlbased_joining.
	-->
	<fork name="dataset_import_urlbased_forking">
		<path start="dataset_import_plaintext"/>
		<path start="dataset_import_plaintext_pmc"/>
		<path start="metadataextraction_cached"/>
	</fork>

	<!--
		Runs the import_plaintext sub-workflow on the plaintext_url output of
		dataset_url_importer.
		Output: ${workingDir}/dataset_import_plaintext/imported (input_a of the
		plaintext union node).
	-->
	<action name="dataset_import_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/dataset_import_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/dataset_url_import/imported/plaintext_url</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${import_content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${import_content_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/dataset_import_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="dataset_import_urlbased_joining" />
		<error to="fail" />
	</action>

	<!--
		Runs the import_plaintext sub-workflow on the xml_pmc_url output of
		dataset_url_importer.
		Output: ${workingDir}/import_plaintext_pmc/imported, which import_pmc
		reads as input_document_nlm.
	-->
	<action name="dataset_import_plaintext_pmc">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext_pmc/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/dataset_url_import/imported/xml_pmc_url</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${import_content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${import_content_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_pmc" />
		<error to="fail" />
	</action>

	<!--
		Runs the import_pmc sub-workflow on the output of
		dataset_import_plaintext_pmc (bound to input_document_nlm).
		Output: ${workingDir}/import_pmc/imported (input_c of the plaintext union node).
	-->
	<action name="import_pmc">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_pmc</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_pmc/working_dir</value>
				</property>
				<property>
					<name>input_document_nlm</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>output_document_plaintext</name>
					<value>${workingDir}/import_pmc/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="dataset_import_urlbased_joining" />
		<error to="fail" />
	</action>

	<!--
		Runs the metadataextraction_cached sub-workflow on the content_url output
		of dataset_url_importer.
		Output root: ${workingDir}/metadataextraction_cached/out; its plaintext
		subdirectory is input_b of the plaintext union node.
		mapred_max_split_size is hard-coded to 10000 here rather than exposed as
		a workflow parameter.
	-->
	<action name="metadataextraction_cached">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction_cached</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadataextraction_cached/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/dataset_url_import/imported/content_url</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_ids}</value>
				</property>
				<property>
					<name>default_cache_location</name>
					<value>${metadataextraction_default_cache_location}</value>
				</property>
				<property>
					<name>mapred_max_split_size</name>
					<value>10000</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${import_content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${import_content_read_timeout}</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/metadataextraction_cached/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="dataset_import_urlbased_joining" />
		<error to="fail" />
	</action>

	<!-- Joins the three dataset_import_urlbased_forking branches, then merges their plaintext outputs. -->
	<join name="dataset_import_urlbased_joining" to="transformers_common_union_plaintext_merge_outputs"/>

 	<!-- merging document text datastores: 
405
    	1) retrieved directly from objectstore 
406
    	2) generated by metadataextraction 
407
    	3) imported from PMC XMLs -->
408
	<action name="transformers_common_union_plaintext_merge_outputs">
409
		<sub-workflow>
410
            <app-path>${wf:appPath()}/transformers_common_union3</app-path>
411
            <propagate-configuration/>
412
            <configuration>
413
            	<property>
414
					<name>input_a</name>
415
					<value>${workingDir}/dataset_import_plaintext/imported</value>
416
				</property>
417
				<property>
418
					<name>input_b</name>
419
					<value>${workingDir}/metadataextraction_cached/out/plaintext</value>
420
				</property>
421
				<property>
422
					<name>input_c</name>
423
					<value>${workingDir}/import_pmc/imported</value>
424
				</property>
425
				<property>
426
					<name>output</name>
427
					<value>${workingDir}/dataset_plaintext</value>
428
				</property>
429
				<property>
430
					<name>schema</name>
431
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
432
				</property>
433
            </configuration>
434
        </sub-workflow>
435
		<ok to="content_importer_joining"/>
436
		<error to="fail" />
437
	</action>
438

    
439
	<!-- Joins the WoS and dataset content-import branches and hands over to the top-level import join. -->
	<join name="content_importer_joining" to="import_joining"/>

	<!--
		Runs the import_project sub-workflow against the configured database
		service and database name.
		Output: ${workingDir}/import_project/imported (input_project of
		referenceextraction_project).
	-->
	<action name="import_project">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_project</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_project/working_dir</value>
				</property>
				<property>
					<name>input_database_service_location</name>
					<value>${import_database_service_location}</value>
				</property>
				<property>
					<name>input_database_name</name>
					<value>${import_database_dbname}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${import_resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>output_dir</name>
					<value>${workingDir}/import_project/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining" />
		<error to="fail" />
	</action>

	<!--
		Runs the import_dataset sub-workflow against the configured MDStore
		service and dataset MDStore identifier.
		Output: ${workingDir}/import_dataset/imported (input_dataset of
		referenceextraction_dataset).
	-->
	<action name="import_dataset">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_dataset</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_dataset/working_dir</value>
				</property>
				<property>
					<name>input_mdstore_service_location</name>
					<value>${import_mdstore_service_location}</value>
				</property>
				<property>
					<name>input_mdstore_id</name>
					<value>${import_dataset_mdstore_id}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${import_resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>output_dir</name>
					<value>${workingDir}/import_dataset/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining" />
		<error to="fail" />
	</action>

	<!-- Joins all import branches started by import_forking before reference extraction begins. -->
	<join name="import_joining" to="referenceextraction_forking"/>

	<!-- Runs dataset and project reference extraction in parallel; both end at referenceextraction_joining. -->
	<fork name="referenceextraction_forking">
		<path start="referenceextraction_dataset"/>
		<path start="referenceextraction_project"/>
	</fork>

	<!--
		Runs the referenceextraction_dataset sub-workflow on the merged dataset
		plaintext and the imported datasets.
		Output: ${workingDir}/referenceextraction_dataset/document_datasets
		(input_document_to_dataset of transformers_export_document).
	-->
	<action name="referenceextraction_dataset">
		<sub-workflow>
			<app-path>${wf:appPath()}/referenceextraction_dataset</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/referenceextraction_dataset/working_dir</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/dataset_plaintext</value>
				</property>
				<property>
					<name>input_dataset</name>
					<value>${workingDir}/import_dataset/imported</value>
				</property>
				<property>
					<name>output_document_to_dataset</name>
					<value>${workingDir}/referenceextraction_dataset/document_datasets</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="referenceextraction_joining"/>
		<error to="fail" />
	</action>

	<!--
		Runs the referenceextraction_project sub-workflow on the WoS plaintext and
		the imported projects.
		Output: ${workingDir}/referenceextraction_project/document_projects
		(input_document_to_project of transformers_export_document).
	-->
	<action name="referenceextraction_project">
		<sub-workflow>
			<app-path>${wf:appPath()}/referenceextraction_project</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/referenceextraction_project/working_dir</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/wos_import_plaintext/imported</value>
				</property>
				<property>
					<name>input_project</name>
					<value>${workingDir}/import_project/imported</value>
				</property>
				<property>
					<name>output_document_to_project</name>
					<value>${workingDir}/referenceextraction_project/document_projects</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="referenceextraction_joining"/>
		<error to="fail" />
	</action>

    <!-- Joins both reference extraction branches before the export stage. -->
    <join name="referenceextraction_joining" to="transformers_export_document_producer"/>

    <!-- this node is required due to the PIG limitation
        disallowing empty directories as input avro storages -->
    <!--
        Java action: ProcessWrapper runs the jsonworkflownodes Producer, which is
        given one -C{port, schema class, json resource} triple per output port
        (every port uses the same empty.json resource) and one -O binding per port
        under ${workingDir}/producer.
        The whitespace inside the multi-line -C arguments is part of the argument
        text and is therefore kept exactly as in the original definition.
    -->
    <action name="transformers_export_document_producer">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- The data generated by this node is deleted in this section -->
            <prepare>
                <delete path="${nameNode}${workingDir}/producer" />
                <mkdir path="${nameNode}${workingDir}/producer" />
            </prepare>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <!-- This is simple wrapper for the Java code -->
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <!-- The business Java code that gets to be executed -->
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <!-- Specification of the output ports -->
            <arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{citation,
				eu.dnetlib.iis.citationmatching.schemas.Citation,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_project,
				eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_dataset,
				eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_research_initiative,
				eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToConceptId,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_document_clusters,
				eu.dnetlib.iis.documentsclustering.schemas.DocumentToDocumentClusters,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_document_classes,
				eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_document_statistics,
				eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_with_website_usage_similarities,
				eu.dnetlib.iis.websiteusage.schemas.DocumentsWithWebsiteUsageSimilarities,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- this one is required by dataset id generator, it is impossible to check
                existing dataset records therefore generating empty datastore -->
            <arg>-C{dataset_existing_id,
				eu.dnetlib.iis.importer.schemas.DocumentId,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- this one is required by WoS document exporter, it is impossible to check
                existing WoS publications therefore generating empty datastore -->
            <arg>-C{document_existing_meta,
				eu.dnetlib.iis.importer.schemas.DocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- All input and output ports have to be bound to paths in HDFS, working
                directory has to be specified as well -->
            <arg>-SworkingDir=${workingDir}/producer/working_dir</arg>
            <arg>-Oextracted_document_metadata=${workingDir}/producer/extracted_document_metadata</arg>
            <arg>-Ocitation=${workingDir}/producer/citation</arg>
            <arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg>
            <arg>-Odocument_to_dataset=${workingDir}/producer/document_to_dataset</arg>
            <arg>-Odocument_to_research_initiative=${workingDir}/producer/document_to_research_initiative</arg>
            <arg>-Odocument_to_document_clusters=${workingDir}/producer/document_to_document_clusters</arg>
            <arg>-Odocument_to_document_classes=${workingDir}/producer/document_to_document_classes</arg>
            <arg>-Odocument_to_document_statistics=${workingDir}/producer/document_to_document_statistics</arg>
            <arg>-Odocument_with_website_usage_similarities=${workingDir}/producer/document_with_website_usage_similarities</arg>
            <arg>-Odataset_existing_id=${workingDir}/producer/dataset_existing_id</arg>
            <arg>-Odocument_existing_meta=${workingDir}/producer/document_existing_meta</arg>
        </java>
        <ok to="transformers_export_document"/>
        <error to="fail"/>
    </action>

    <!--
        Runs the transformers_export_document sub-workflow.
        Two inputs are real reference extraction results; the remaining seven are
        the empty datastores written by transformers_export_document_producer.
        Output: ${workingDir}/exported/document_with_inferenced_data.
    -->
    <action name="transformers_export_document">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_export_document</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_export_document/working_dir</value>
                </property>
                <!-- real input datastores -->
                <property>
                    <name>input_document_to_project</name>
                    <value>${workingDir}/referenceextraction_project/document_projects</value>
                </property>
                <property>
                    <name>input_document_to_dataset</name>
                    <value>${workingDir}/referenceextraction_dataset/document_datasets</value>
                </property>
                <!-- fake input datastores -->
                <property>
                    <name>input_document_to_research_initiative</name>
                    <value>${workingDir}/producer/document_to_research_initiative</value>
                </property>
                <property>
                    <name>input_extracted_document_metadata</name>
                    <value>${workingDir}/producer/extracted_document_metadata</value>
                </property>
                <property>
                    <name>input_citation</name>
                    <value>${workingDir}/producer/citation</value>
                </property>
                <property>
                    <name>input_document_to_document_statistics</name>
                    <value>${workingDir}/producer/document_to_document_statistics</value>
                </property>
                <property>
                    <name>input_document_to_document_clusters</name>
                    <value>${workingDir}/producer/document_to_document_clusters</value>
                </property>
                <property>
                    <name>input_document_to_document_classes</name>
                    <value>${workingDir}/producer/document_to_document_classes</value>
                </property>
                <property>
                    <name>input_document_with_website_usage_similarities</name>
                    <value>${workingDir}/producer/document_with_website_usage_similarities</value>
                </property>
                <!-- output datastore -->
                <property>
                    <name>output_document_with_inferenced_data</name>
                    <value>${workingDir}/exported/document_with_inferenced_data</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="export_actionmanager"/>
        <error to="fail" />
    </action>

  	<action name="export_actionmanager">
703
		<sub-workflow>
704
            <app-path>${wf:appPath()}/export_actionmanager</app-path>
705
            <propagate-configuration/>
706
            <configuration>
707
            	<property>
708
                    <name>workingDir</name>
709
                    <value>${workingDir}/export_actionmanager/working_dir</value>
710
                </property>
711
            	<property>
712
					<name>input_document</name>
713
					<value>${workingDir}/exported/document_with_inferenced_data</value>
714
				</property>
715
				<property>
716
					<name>action_hbase_table_name</name>
717
					<value>${export_action_hbase_table_name}</value>
718
				</property>
719
				<property>
720
					<name>action_hbase_table_initialize</name>
721
					<value>${export_action_hbase_table_initialize}</value>
722
				</property>
723
				<property>
724
					<name>action_set_id</name>
725
					<value>${export_action_set_id}</value>
726
				</property>
727
				<property>
728
					<name>action_set_id_document_referencedProjects</name>
729
					<value>${export_action_set_id_document_referencedProjects}</value>
730
				</property>
731
				<property>
732
					<name>action_set_id_document_referencedDatasets</name>
733
					<value>${export_action_set_id_document_referencedDatasets}</value>
734
				</property>
735
				<property>
736
					<name>action_hbase_remote_zookeeper_quorum</name>
737
					<value>${export_action_hbase_remote_zookeeper_quorum}</value>
738
				</property>
739
				<property>
740
					<name>action_hbase_remote_zookeeper_clientport</name>
741
					<value>${export_action_hbase_remote_zookeeper_clientport}</value>
742
				</property>
743
            </configuration>
744
        </sub-workflow>
745
		<ok to="export_entities_forking"/>
746
		<error to="fail" />
747
	</action>
748
 
749
 	<fork name="export_entities_forking">
750
    	<path start="transformers_export_identifier_referenceddatasets"/>
751
    	<path start="transformers_export_identifier_document_to_project"/>
752
    </fork>
753
 
754
 	<!-- dataset entities export section -->
755
 	<!-- Delegates to the sub-workflow stored under
	     ${wf:appPath()}/transformers_export_identifier_referenceddatasets.
	     The parent configuration is propagated and the properties listed below are
	     set explicitly for the sub-workflow. On success the flow continues with
	     exporter-dataset-entities; on error it goes to the fail node. -->
	<action name="transformers_export_identifier_referenceddatasets">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_export_identifier_referenceddatasets</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- dedicated working directory for the sub-workflow -->
				<property>
					<name>workingDir</name>
					<value>${workingDir}/export_identifier_referenceddatasets/working_dir</value>
				</property>
				<property>
					<name>input_document_id</name>
					<!-- this is a generated empty datastore -->
					<value>${workingDir}/producer/dataset_existing_id</value>
				</property>
				<property>
					<name>input_document_with_inferenced_data</name>
					<value>${workingDir}/exported/document_with_inferenced_data</value>
				</property>
				<!-- read afterwards by exporter-dataset-entities as its input -->
				<property>
					<name>output_identifier</name>
					<value>${workingDir}/identifier/datasets</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="exporter-dataset-entities"/>
		<error to="fail"/>
	</action>
782
 
783
 	<!-- Java action running DatasetExporterProcess through the ProcessWrapper main class.
	     Input is ${workingDir}/identifier/datasets, the path passed as output_identifier to
	     the transformers_export_identifier_referenceddatasets sub-workflow.
	     On success the flow continues to the export_entities_joining join node; on error
	     it goes to the fail node.
	     NOTE(review): the prepare step recreates ${workingDir}/export-dataset-entities, yet
	     no argument below references that directory (the workingDir argument points at
	     ${workingDir} itself). Confirm this is intended. -->
	<action name="exporter-dataset-entities">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- start from an empty export-dataset-entities directory on every run -->
			<prepare>
				<delete path="${nameNode}${workingDir}/export-dataset-entities"/>
				<mkdir path="${nameNode}${workingDir}/export-dataset-entities"/>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.export.actionmanager.entity.dataset.DatasetExporterProcess</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>-Iinput=${workingDir}/identifier/datasets</arg>

			<!-- MDStore source, taken from the import_* workflow parameters -->
			<arg>-Pimport.datacite.mdstore.service.location=${import_mdstore_service_location}</arg>
			<arg>-Pimport.datacite.mdstore.id=${import_dataset_mdstore_id}</arg>
			<!-- action set and HBase target, taken from the export_action_* workflow parameters -->
			<arg>-Pexport.action.setid=${export_action_set_id_entity_dataset}</arg>
			<arg>-Pexport.action.hbase.table.name=${export_action_hbase_table_name}</arg>
			<arg>-Pexport.action.hbase.remote.zookeeper.quorum=${export_action_hbase_remote_zookeeper_quorum}</arg>
			<arg>-Pexport.action.hbase.remote.zookeeper.clientport=${export_action_hbase_remote_zookeeper_clientport}</arg>
			<arg>-Pexport.action.hbase.table.initialize=${export_action_hbase_table_initialize}</arg>
		</java>
		<ok to="export_entities_joining"/>
		<error to="fail"/>
	</action>
813
    <!-- end of dataset entities export section -->
814
    
815
    <!-- WoS entities export section -->
816
 	<!-- Delegates to the sub-workflow stored under
	     ${wf:appPath()}/transformers_export_identifier_document_to_project.
	     The parent configuration is propagated and the properties listed below are
	     set explicitly for the sub-workflow. On success the flow continues with
	     exporter-document-entities; on error it goes to the fail node. -->
	<action name="transformers_export_identifier_document_to_project">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_export_identifier_document_to_project</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- dedicated working directory for the sub-workflow -->
				<property>
					<name>workingDir</name>
					<value>${workingDir}/export_identifier_documents/working_dir</value>
				</property>
				<property>
					<name>input_document_to_project</name>
					<value>${workingDir}/referenceextraction_project/document_projects</value>
				</property>
				<!-- read afterwards by exporter-document-entities as its input -->
				<property>
					<name>output_identifier</name>
					<value>${workingDir}/identifier/documents</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="exporter-document-entities"/>
		<error to="fail"/>
	</action>
838
 
839
 	<!-- Java action running DocumentExporterProcess through the ProcessWrapper main class.
	     Input is ${workingDir}/identifier/documents, the path passed as output_identifier to
	     the transformers_export_identifier_document_to_project sub-workflow.
	     On success the flow continues to the export_entities_joining join node; on error
	     it goes to the fail node.
	     NOTE(review): the prepare step recreates ${workingDir}/export-document-entities, yet
	     no argument below references that directory (the workingDir argument points at
	     ${workingDir} itself). Confirm this is intended. -->
	<action name="exporter-document-entities">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- start from an empty export-document-entities directory on every run -->
			<prepare>
				<delete path="${nameNode}${workingDir}/export-document-entities"/>
				<mkdir path="${nameNode}${workingDir}/export-document-entities"/>
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.export.actionmanager.entity.document.DocumentExporterProcess</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>-Iinput=${workingDir}/identifier/documents</arg>

			<!-- MDStore source, taken from the import_* workflow parameters -->
			<arg>-Pimport.document.mdstore.service.location=${import_mdstore_service_location}</arg>
			<arg>-Pimport.document.mdstore.id=${import_wos_mdstore_id}</arg>
			<!-- action set and HBase target, taken from the export_action_* workflow parameters -->
			<arg>-Pexport.action.setid=${export_action_set_id_entity_wos}</arg>
			<arg>-Pexport.action.hbase.table.name=${export_action_hbase_table_name}</arg>
			<arg>-Pexport.action.hbase.remote.zookeeper.quorum=${export_action_hbase_remote_zookeeper_quorum}</arg>
			<arg>-Pexport.action.hbase.remote.zookeeper.clientport=${export_action_hbase_remote_zookeeper_clientport}</arg>
			<arg>-Pexport.action.hbase.table.initialize=${export_action_hbase_table_initialize}</arg>
		</java>
		<ok to="export_entities_joining"/>
		<error to="fail"/>
	</action>
869
    <!-- end of WoS entities export section -->
870
    
871
    <!-- Meeting point of the branches started by export_entities_forking; once they
         have arrived the workflow proceeds to the end node. -->
    <join name="export_entities_joining" to="end" />
872
    
873
	<!-- Failure node targeted by the error transitions of the actions above.
	     The message reports the error message of the last node that failed. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
877
	<!-- Successful completion of the workflow; reached from export_entities_joining. -->
	<end name="end"/>
878
</workflow-app>
(1-1/2)