<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_primary_processing">
2
	
3
	<!-- Formal workflow parameters. A property with a <value> declares a default;
		a property without one declares no default in this file. -->
	<parameters>
		<property>
			<name>remove_sideproducts</name>
			<value>true</value>
			<description>flag indicating inference side products will be erased</description>
		</property>
		<!-- processing modes -->
		<property>
			<name>active_referenceextraction_project</name>
			<value>true</value>
			<description>flag indicating project reference extraction should be enabled</description>
		</property>
		<property>
			<name>active_referenceextraction_dataset</name>
			<value>true</value>
			<description>flag indicating dataset reference extraction should be enabled</description>
		</property>
		<property>
			<!-- currently disabled by default -->
			<name>active_referenceextraction_researchinitiative</name>
			<value>false</value>
			<description>flag indicating researchinitiative reference extraction should be enabled</description>
		</property>
		<property>
			<!-- currently disabled by default -->
			<name>active_referenceextraction_pdb</name>
			<value>false</value>
			<description>flag indicating protein databank reference extraction should be enabled</description>
		</property>
		<property>
			<name>active_documentsclassification</name>
			<value>true</value>
			<description>flag indicating documents classification should be enabled</description>
		</property>
		<property>
			<name>active_documentssimilarity</name>
			<value>true</value>
			<description>flag indicating documents similarity should be enabled</description>
		</property>
		<property>
			<name>active_citationmatching</name>
			<!-- currently disabled by default -->
			<value>false</value>
			<description>flag indicating citation matching should be enabled</description>
		</property>
		<property>
			<name>active_statistics</name>
			<!-- currently disabled by default -->
			<value>false</value>
			<description>flag indicating statistics generation should be enabled</description>
		</property>
		<!-- input ports -->
		<property>
			<name>input_document_metadata</name>
			<description>input document metadata directory</description>
		</property>
		<property>
			<name>input_document_to_project</name>
			<description>input document to project relation directory</description>
		</property>
		<property>
			<name>input_document_text</name>
			<description>input document text directory</description>
		</property>
		<property>
			<name>input_document_text_wos</name>
			<description>input document text directory holding WOS records</description>
		</property>
		<property>
			<name>input_project</name>
			<description>input project directory</description>
		</property>
		<property>
			<name>input_person</name>
			<description>input person directory</description>
		</property>
		<property>
			<name>input_dataset</name>
			<description>input dataset directory</description>
		</property>
		<property>
			<name>input_extracted_document_metadata</name>
			<description>input extracted document metadata directory</description>
		</property>
		<property>
			<name>input_citation_pmc</name>
			<description>input directory holding citations extracted from PMC</description>
		</property>
		<property>
			<name>input_project_concept</name>
			<description>input project concept directory</description>
		</property>
		<!-- citation matching related -->
		<property>
			<name>cit_genAuthorIdxJavaOpts</name>
			<value>-Xmx8g</value>
			<description>java opts for author index creation for citation purposes</description>
		</property>
		<!-- document similarity related -->
		<property>
			<name>ds_parallel</name>
			<value>20</value>
			<description>document similarity pig parallel</description>
		</property>
		<property>
			<name>ds_mapredChildJavaOpts</name>
			<value>-Xmx20g</value>
			<description>mapred child java opts</description>
		</property>
		<property>
			<name>ds_sample</name>
			<value>1.0</value>
			<description>sample rate</description>
		</property>
		<property>
			<name>ds_removal_rate</name>
			<value>0.99</value>
			<description>document similarity removal rate</description>
		</property>
		<property>
			<name>ds_removal_least_used</name>
			<value>20</value>
			<description>document similarity least used removal</description>
		</property>
		<!-- NOTE(review): the two properties below carry no description in the original;
			presumably top-N limits used by the document similarity module. Confirm and document. -->
		<property>
			<name>ds_tfidfTopnTermPerDocument</name>
			<value>20</value>
		</property>
		<property>
			<name>ds_similarityTopnDocumentPerDocument</name>
			<value>20</value>
		</property>
		<!-- output ports -->
		<property>
			<name>output_document_to_project</name>
			<description>project reference extraction output directory</description>
		</property>
		<property>
			<name>output_document_to_project_concepts</name>
			<description>document to project concepts output directory</description>
		</property>
		<property>
			<name>output_document_to_dataset</name>
			<description>dataset reference extraction output directory</description>
		</property>
		<property>
			<name>output_document_to_research_initiatives</name>
			<description>research initiatives reference extraction output directory</description>
		</property>
		<property>
			<name>output_document_to_pdb</name>
			<description>protein databank reference extraction output directory</description>
		</property>
		<property>
			<name>output_document_to_document_classes</name>
			<description>output document classification directory</description>
		</property>
		<property>
			<name>output_citation</name>
			<description>output containing grouped citations coming from citation matching and pmc ingestion</description>
		</property>
		<property>
			<name>output_document_similarity</name>
			<description>output document similarity directory</description>
		</property>
		<property>
			<name>output_document_statistics</name>
			<description>output document statistics directory</description>
		</property>
		<property>
			<name>output_author_statistics</name>
			<description>output author statistics directory</description>
		</property>
		<property>
			<name>output_project_statistics</name>
			<description>output project statistics directory</description>
		</property>
	</parameters>
181

    
182
	<!-- Settings shared by the actions of this workflow: job tracker, name node and
		the MapReduce queue, all taken from ${jobTracker}, ${nameNode} and ${queueName}. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
192

    
193
	<!-- Entry point: control passes straight to the "forking" fork node. -->
	<start to="forking"/>
194
    
195
    <!-- Starts five parallel paths: the four reference extraction decisions
         (project, dataset, researchinitiative, pdb) and the metadata merger. -->
    <fork name="forking">
        <path start="decision-referenceextraction_project"/>
        <path start="decision-referenceextraction_dataset"/>
        <path start="decision-referenceextraction_researchinitiative"/>
        <path start="decision-referenceextraction_pdb"/>
        <path start="transformers_metadatamerger"/>
    </fork>
202
    
203
    <!-- start of project reference extraction block -->
204
    <!-- Goes to referenceextraction_project when active_referenceextraction_project
         equals "true"; any other value goes to skip-referenceextraction_project. -->
    <decision name="decision-referenceextraction_project">
        <switch>
            <case to="referenceextraction_project">${active_referenceextraction_project eq "true"}</case>
            <default to="skip-referenceextraction_project"/>
        </switch>
    </decision>
210
    
211
    <!-- Runs the referenceextraction_project sub-workflow on the document text and
         project inputs, writing to ${output_document_to_project}.
         Success continues with transformers_project_toconcept; failure goes to "fail". -->
    <action name="referenceextraction_project">
        <sub-workflow>
            <app-path>${wf:appPath()}/referenceextraction_project</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/referenceextraction_project/working_dir</value>
                </property>
                <property>
                    <name>input_document_text</name>
                    <value>${input_document_text}</value>
                </property>
                <property>
                    <name>input_project</name>
                    <value>${input_project}</value>
                </property>
                <property>
                    <name>output_document_to_project</name>
                    <!-- referenceextraction_project directory is created at subworkflow prepare phase -->
                    <value>${output_document_to_project}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="transformers_project_toconcept"/>
        <error to="fail"/>
    </action>
238
    
239
    <!-- Runs the transformers_project_toconcept sub-workflow. It reads the document to project
         output of the previous action together with the project and project concept inputs,
         and writes to ${workingDir}/transformers_project_toconcept/out.
         Success continues with transformers_concept_to_researchinitiatives; failure goes to "fail". -->
    <action name="transformers_project_toconcept">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_project_toconcept</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_project_toconcept/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_project</name>
                    <value>${output_document_to_project}</value>
                </property>
                <property>
                    <name>input_project</name>
                    <value>${input_project}</value>
                </property>
                <property>
                    <name>input_concept</name>
                    <value>${input_project_concept}</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${workingDir}/transformers_project_toconcept/out</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="transformers_concept_to_researchinitiatives"/>
        <error to="fail"/>
    </action>
269
    
270
    <!-- Runs the transformers_export_researchinitiatives sub-workflow application on the output of
         transformers_project_toconcept and writes to ${output_document_to_project_concepts}.
         The property names mention research initiatives because they are the ones this
         application path is given elsewhere in the workflow; the data here comes from the
         project branch. Success goes to "joining"; failure goes to "fail". -->
    <action name="transformers_concept_to_researchinitiatives">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_export_researchinitiatives</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_concept_to_researchinitiatives/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_research_initiative</name>
                    <value>${workingDir}/transformers_project_toconcept/out</value>
                </property>
                <property>
                    <name>output_document_to_research_initiatives</name>
                    <value>${output_document_to_project_concepts}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
292
    
293
    <!-- Taken when project reference extraction is disabled. The prepare phase clears the working
         directories of the skipped actions and recreates both output directories; the java action
         then runs ProcessWrapper with the Producer node for both outputs.
         NOTE(review): presumably Producer writes the (empty) content of empty.json to each -O
         location so downstream nodes still find valid inputs; confirm against Producer.
         The line breaks and tabs inside the -C arguments are part of the argument text and are
         kept exactly as in the original. -->
    <action name="skip-referenceextraction_project">
        <java>
            <prepare>
                <!-- notice: directories have to be aligned with the skipped actions' outputs -->
                <delete path="${nameNode}${workingDir}/referenceextraction_project"/>
                <delete path="${nameNode}${workingDir}/transformers_project_toconcept"/>
                <delete path="${nameNode}${workingDir}/transformers_concept_to_researchinitiatives"/>
                <delete path="${nameNode}${output_document_to_project}"/>
                <delete path="${nameNode}${output_document_to_project_concepts}"/>
                <mkdir path="${nameNode}${workingDir}/referenceextraction_project"/>
                <mkdir path="${nameNode}${output_document_to_project}"/>
                <mkdir path="${nameNode}${output_document_to_project_concepts}"/>
            </prepare>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <arg>-C{referenceextraction_project,
				eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <arg>-C{document_to_project_concepts,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- notice: directories have to be aligned with the skipped actions' outputs -->
            <arg>-Oreferenceextraction_project=${output_document_to_project}</arg>
            <arg>-Odocument_to_project_concepts=${output_document_to_project_concepts}</arg>
        </java>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
321
    <!-- end of project reference extraction block -->
322
    
323
    <!-- start of dataset reference extraction block -->
324
    <!-- Goes to referenceextraction_dataset when active_referenceextraction_dataset
         equals "true"; any other value goes to skip-referenceextraction_dataset. -->
    <decision name="decision-referenceextraction_dataset">
        <switch>
            <case to="referenceextraction_dataset">${active_referenceextraction_dataset eq "true"}</case>
            <default to="skip-referenceextraction_dataset"/>
        </switch>
    </decision>
330
    
331
    <!-- Runs the referenceextraction_dataset sub-workflow on the document text and dataset
         inputs, writing to ${output_document_to_dataset}.
         Success goes to "joining"; failure goes to "fail". -->
    <action name="referenceextraction_dataset">
        <sub-workflow>
            <app-path>${wf:appPath()}/referenceextraction_dataset</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/referenceextraction_dataset/working_dir</value>
                </property>
                <property>
                    <name>input_document_text</name>
                    <value>${input_document_text}</value>
                </property>
                <property>
                    <name>input_dataset</name>
                    <value>${input_dataset}</value>
                </property>
                <property>
                    <name>output_document_to_dataset</name>
                    <!-- referenceextraction_dataset directory is created at subworkflow prepare phase -->
                    <value>${output_document_to_dataset}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
358
    
359
    <!-- Taken when dataset reference extraction is disabled. The prepare phase recreates the
         working and output directories; the java action then runs ProcessWrapper with the
         Producer node targeting ${output_document_to_dataset}.
         NOTE(review): presumably Producer writes the (empty) content of empty.json there; confirm.
         The line breaks and tabs inside the -C argument are part of the argument text and are
         kept exactly as in the original. -->
    <action name="skip-referenceextraction_dataset">
        <java>
            <prepare>
                <!-- notice: directories have to be aligned with the skipped action's output -->
                <delete path="${nameNode}${workingDir}/referenceextraction_dataset"/>
                <delete path="${nameNode}${output_document_to_dataset}"/>
                <mkdir path="${nameNode}${workingDir}/referenceextraction_dataset"/>
                <mkdir path="${nameNode}${output_document_to_dataset}"/>
            </prepare>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <arg>-C{referenceextraction_dataset,
				eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- notice: directories have to be aligned with the skipped action's output -->
            <arg>-Oreferenceextraction_dataset=${output_document_to_dataset}</arg>
        </java>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
379
    <!-- end of dataset reference extraction block -->
380
    
381
    <!-- start of researchinitiative reference extraction block -->
382
    <!-- Goes to referenceextraction_researchinitiative_collapser when
         active_referenceextraction_researchinitiative equals "true"; any other value goes to
         skip-referenceextraction_researchinitiative. -->
    <decision name="decision-referenceextraction_researchinitiative">
        <switch>
            <case to="referenceextraction_researchinitiative_collapser">${active_referenceextraction_researchinitiative eq "true"}</case>
            <default to="skip-referenceextraction_researchinitiative"/>
        </switch>
    </decision>
388
    
389
    <!-- Runs the collapsers_multiple_input_collapser sub-workflow over two document text inputs
         (${input_document_text} and ${input_document_text_wos}), blocking on the "id" field and
         using DocumentTextCollapser. The result is written to
         ${workingDir}/referenceextraction_researchinitiative_collapser/output.
         Success continues with referenceextraction_researchinitiative; failure goes to "fail". -->
    <action name="referenceextraction_researchinitiative_collapser">
        <sub-workflow>
            <app-path>${wf:appPath()}/collapsers_multiple_input_collapser</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/referenceextraction_researchinitiative_collapser/working_dir</value>
                </property>
                <property>
                    <name>origin_1</name>
                    <value>document_text</value>
                </property>
                <property>
                    <name>input_1</name>
                    <value>${input_document_text}</value>
                </property>
                <property>
                    <name>origin_2</name>
                    <value>document_text_wos</value>
                </property>
                <property>
                    <name>input_2</name>
                    <value>${input_document_text_wos}</value>
                </property>
                <property>
                    <name>blocking_field</name>
                    <value>id</value>
                </property>
                <property>
                    <name>schema_input</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${workingDir}/referenceextraction_researchinitiative_collapser/output</value>
                </property>
                <property>
                    <name>schema_input_envelope</name>
                    <value>eu.dnetlib.iis.collapsers.schemas.DocumentTextEnvelope</value>
                </property>
                <property>
                    <name>record_collapser</name>
                    <value>eu.dnetlib.iis.collapsers.origins.DocumentTextCollapser</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="referenceextraction_researchinitiative"/>
        <error to="fail"/>
    </action>
439
    
440
    <!-- Runs the referenceextraction_researchinitiative sub-workflow on the collapser output and
         writes to ${workingDir}/referenceextraction_researchinitiative/output.
         Success continues with transformers_export_researchinitiatives; failure goes to "fail". -->
    <action name="referenceextraction_researchinitiative">
        <sub-workflow>
            <app-path>${wf:appPath()}/referenceextraction_researchinitiative</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/referenceextraction_researchinitiative/working_dir</value>
                </property>
                <property>
                    <name>input_document_text</name>
                    <value>${workingDir}/referenceextraction_researchinitiative_collapser/output</value>
                </property>
                <property>
                    <name>output_document_to_research_initiative</name>
                    <value>${workingDir}/referenceextraction_researchinitiative/output</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="transformers_export_researchinitiatives"/>
        <error to="fail"/>
    </action>
462
    
463
    <!-- Runs the transformers_export_researchinitiatives sub-workflow on the output of
         referenceextraction_researchinitiative and writes to
         ${output_document_to_research_initiatives}.
         Success goes to "joining"; failure goes to "fail". -->
    <action name="transformers_export_researchinitiatives">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_export_researchinitiatives</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_export_researchinitiatives/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_research_initiative</name>
                    <value>${workingDir}/referenceextraction_researchinitiative/output</value>
                </property>
                <property>
                    <name>output_document_to_research_initiatives</name>
                    <value>${output_document_to_research_initiatives}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
486
    
487
    <!-- Taken when researchinitiative reference extraction is disabled. The prepare phase
         recreates the transformers_export_researchinitiatives working directory and the output
         directory; the java action then runs ProcessWrapper with the Producer node targeting
         ${output_document_to_research_initiatives}.
         NOTE(review): presumably Producer writes the (empty) content of empty.json there; confirm.
         The line breaks and tabs inside the -C argument are part of the argument text and are
         kept exactly as in the original. -->
    <action name="skip-referenceextraction_researchinitiative">
        <java>
            <prepare>
                <!-- notice: directories have to be aligned with the skipped action's output -->
                <delete path="${nameNode}${workingDir}/transformers_export_researchinitiatives"/>
                <delete path="${nameNode}${output_document_to_research_initiatives}"/>
                <mkdir path="${nameNode}${workingDir}/transformers_export_researchinitiatives"/>
                <mkdir path="${nameNode}${output_document_to_research_initiatives}"/>
            </prepare>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <arg>-C{referenceextraction_researchinitiatives,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- notice: directories have to be aligned with the skipped action's output -->
            <arg>-Oreferenceextraction_researchinitiatives=${output_document_to_research_initiatives}</arg>
        </java>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
507
    <!-- end of researchinitiative reference extraction block -->
508
    
509
	<!-- start of pdb reference extraction block -->
510
    <!-- Goes to referenceextraction_pdb when active_referenceextraction_pdb equals "true";
         any other value goes to skip-referenceextraction_pdb. -->
    <decision name="decision-referenceextraction_pdb">
        <switch>
            <case to="referenceextraction_pdb">${active_referenceextraction_pdb eq "true"}</case>
            <default to="skip-referenceextraction_pdb"/>
        </switch>
    </decision>
516
    
517
    <!-- Runs the referenceextraction_pdb sub-workflow on ${input_document_text} and writes to
         ${workingDir}/referenceextraction_pdb/output.
         Success continues with transformers_export_pdb; failure goes to "fail". -->
    <action name="referenceextraction_pdb">
        <sub-workflow>
            <app-path>${wf:appPath()}/referenceextraction_pdb</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/referenceextraction_pdb/working_dir</value>
                </property>
                <property>
                    <name>input_document_text</name>
                    <value>${input_document_text}</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${workingDir}/referenceextraction_pdb/output</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="transformers_export_pdb"/>
        <error to="fail"/>
    </action>
539
    
540
    <!-- Runs the transformers_export_researchinitiatives sub-workflow application on the output of
         referenceextraction_pdb and writes to ${output_document_to_pdb}. The property names are
         those of the reused application, hence the research initiative wording.
         Success goes to "joining"; failure goes to "fail". -->
    <action name="transformers_export_pdb">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_export_researchinitiatives</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_export_pdb/working_dir</value>
                </property>
                <property>
                    <name>input_document_to_research_initiative</name>
                    <value>${workingDir}/referenceextraction_pdb/output</value>
                </property>
                <property>
                    <name>output_document_to_research_initiatives</name>
                    <value>${output_document_to_pdb}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
563
    
564
    <!-- Taken when pdb reference extraction is disabled. The prepare phase recreates the
         transformers_export_pdb working directory and the output directory; the java action then
         runs ProcessWrapper with the Producer node targeting ${output_document_to_pdb}.
         NOTE(review): presumably Producer writes the (empty) content of empty.json there; confirm.
         The line breaks and tabs inside the -C argument are part of the argument text and are
         kept exactly as in the original. -->
    <action name="skip-referenceextraction_pdb">
        <java>
            <prepare>
                <!-- notice: directories have to be aligned with the skipped action's output -->
                <delete path="${nameNode}${workingDir}/transformers_export_pdb"/>
                <delete path="${nameNode}${output_document_to_pdb}"/>
                <mkdir path="${nameNode}${workingDir}/transformers_export_pdb"/>
                <mkdir path="${nameNode}${output_document_to_pdb}"/>
            </prepare>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <arg>-C{referenceextraction_pdb,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- notice: directories have to be aligned with the skipped action's output -->
            <arg>-Oreferenceextraction_pdb=${output_document_to_pdb}</arg>
        </java>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
584
    <!-- end of pdb reference extraction block -->    
585
    
586
    <!-- metadatamerger branch -->
587
    <!-- Runs the transformers_metadatamerger sub-workflow on the base and extracted document
         metadata inputs and writes to
         ${workingDir}/transformers_metadatamerger/output_merged_metadata.
         Success continues with decision-documentsclassification; failure goes to "fail". -->
    <action name="transformers_metadatamerger">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_metadatamerger</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_metadatamerger/working_dir</value>
                </property>
                <property>
                    <name>input_base_metadata</name>
                    <value>${input_document_metadata}</value>
                </property>
                <property>
                    <name>input_extracted_metadata</name>
                    <value>${input_extracted_document_metadata}</value>
                </property>
                <property>
                    <name>output_merged_metadata</name>
                    <value>${workingDir}/transformers_metadatamerger/output_merged_metadata</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="decision-documentsclassification"/>
        <error to="fail"/>
    </action>
614
    
615
    <!-- start of documents classification part -->
616
    <!-- Goes to transformers_documentsclassification when active_documentsclassification
         equals "true"; any other value goes to skip-documentsclassification. -->
    <decision name="decision-documentsclassification">
        <switch>
            <case to="transformers_documentsclassification">${active_documentsclassification eq "true"}</case>
            <default to="skip-documentsclassification"/>
        </switch>
    </decision>
622
    
623
    <!-- Runs the transformers_documentsclassification sub-workflow on the merged metadata and
         writes to ${workingDir}/transformers_documentsclassification/output_document_metadata.
         Success continues with documentsclassification_main; failure goes to "fail". -->
    <action name="transformers_documentsclassification">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_documentsclassification</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_documentsclassification/working_dir</value>
                </property>
                <property>
                    <name>input_merged_metadata</name>
                    <value>${workingDir}/transformers_metadatamerger/output_merged_metadata</value>
                </property>
                <property>
                    <name>output_document_metadata</name>
                    <value>${workingDir}/transformers_documentsclassification/output_document_metadata</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="documentsclassification_main"/>
        <error to="fail"/>
    </action>
645
    
646
    <!-- Runs the documentsclassification_main sub-workflow on the output of
         transformers_documentsclassification and writes to
         ${output_document_to_document_classes}.
         Success goes to "joining"; failure goes to "fail". -->
    <action name="documentsclassification_main">
        <sub-workflow>
            <app-path>${wf:appPath()}/documentsclassification_main</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/documentsclassification_main/working_dir</value>
                </property>
                <property>
                    <name>input_document_metadata</name>
                    <value>${workingDir}/transformers_documentsclassification/output_document_metadata</value>
                </property>
                <property>
                    <name>output_document_to_document_classes</name>
                    <value>${output_document_to_document_classes}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
668
    
669
    <!-- Taken when documents classification is disabled. The prepare phase recreates the
         documentsclassification_main working directory and the output directory; the java action
         then runs ProcessWrapper with the Producer node targeting
         ${output_document_to_document_classes}.
         NOTE(review): presumably Producer writes the (empty) content of empty.json there; confirm.
         The line breaks and tabs inside the -C argument are part of the argument text and are
         kept exactly as in the original. -->
    <action name="skip-documentsclassification">
        <java>
            <prepare>
                <!-- notice: directories have to be aligned with the skipped action's output -->
                <delete path="${nameNode}${workingDir}/documentsclassification_main"/>
                <delete path="${nameNode}${output_document_to_document_classes}"/>
                <mkdir path="${nameNode}${workingDir}/documentsclassification_main"/>
                <mkdir path="${nameNode}${output_document_to_document_classes}"/>
            </prepare>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
            <arg>-C{documentsclassification,
				eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
            <!-- notice: directories have to be aligned with the skipped action's output -->
            <arg>-Odocumentsclassification=${output_document_to_document_classes}</arg>
        </java>
        <ok to="joining"/>
        <error to="fail"/>
    </action>
689
    <!-- end of documents classification part -->
690
    
691
    <!-- Join node named "joining"; hands control to decision-citationmatching. -->
    <join name="joining" to="decision-citationmatching"/>
692
    
693
    <!-- citation matching part -->
694
    <!-- Goes to transformers_citationmatching when active_citationmatching equals "true";
         any other value goes to skip-citationmatching. -->
    <decision name="decision-citationmatching">
        <switch>
            <case to="transformers_citationmatching">${active_citationmatching eq "true"}</case>
            <default to="skip-citationmatching"/>
        </switch>
    </decision>
700
    
701
    <!-- Runs the transformers_citationmatching sub-workflow on the merged metadata and the person
         input, writing to ${workingDir}/transformers_citationmatching/output_citation_metadata.
         Success continues with citationmatching_chain; failure goes to "fail". -->
    <action name="transformers_citationmatching">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_citationmatching</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_citationmatching/working_dir</value>
                </property>
                <property>
                    <name>input_metadata</name>
                    <value>${workingDir}/transformers_metadatamerger/output_merged_metadata</value>
                </property>
                <property>
                    <name>input_person</name>
                    <value>${input_person}</value>
                </property>
                <property>
                    <name>output_citation_metadata</name>
                    <value>${workingDir}/transformers_citationmatching/output_citation_metadata</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="citationmatching_chain"/>
        <error to="fail"/>
    </action>
727
    
728
    <!-- Runs the citationmatching_chain sub-workflow on the output of
         transformers_citationmatching, forwarding cit_genAuthorIdxJavaOpts, and writes to
         ${workingDir}/citationmatching_chain/output.
         Success continues with decision-documentssimilarity; failure goes to "fail". -->
    <action name="citationmatching_chain">
        <sub-workflow>
            <app-path>${wf:appPath()}/citationmatching_chain</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/citationmatching_chain/working_dir</value>
                </property>
                <property>
                    <name>input</name>
                    <value>${workingDir}/transformers_citationmatching/output_citation_metadata</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${workingDir}/citationmatching_chain/output</value>
                </property>
                <property>
                    <name>cit_genAuthorIdxJavaOpts</name>
                    <value>${cit_genAuthorIdxJavaOpts}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="decision-documentssimilarity"/>
        <error to="fail"/>
    </action>
754
        
755
    <!-- Stand-in for the citation matching actions (presumably chosen when citation
         matching is disabled; the routing decision is not part of this section).
         Recreates the citationmatching_chain directory under workingDir and runs the
         Producer through ProcessWrapper with the Citation schema and the empty.json
         resource, writing to citationmatching_chain/output. That is the same path
         transformers_statistics and transformers_citations_from_matching read from. -->
    <action name="skip-citationmatching">
756
        <java>
757
			<prepare>
758
				<!-- notice: the directory has to be aligned with the skipped action's output -->
759
				<delete path="${nameNode}${workingDir}/citationmatching_chain" />
760
				<mkdir path="${nameNode}${workingDir}/citationmatching_chain" />
761
			</prepare>
762
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
763
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
764
            <arg>-C{citation,
765
				eu.dnetlib.iis.citationmatching.schemas.Citation,
766
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
767
            <!-- notice: the directory has to be aligned with the skipped action's output -->
768
            <arg>-Ocitation=${workingDir}/citationmatching_chain/output</arg>
769
        </java>
770
        <ok to="decision-documentssimilarity"/>
771
        <error to="fail"/>
772
    </action>
773
    <!-- end of citation matching part -->
774
    
775
    <!-- start of documents similarity part -->
776
    <!-- running documentsimilarity sequentially to all the other KDM modules 
777
    	due to the lack of memory when executed in parallel -->
778
    <!-- Routes to transformers_documentssimilarity when the active_documentssimilarity
         parameter equals the string "true"; any other value falls through to
         skip-documentssimilarity. -->
    <decision name="decision-documentssimilarity">
779
        <switch>
780
            <case to="transformers_documentssimilarity">${active_documentssimilarity eq "true"}</case>
781
            <default to="skip-documentssimilarity"/>
782
        </switch>
783
    </decision>
784
    
785
    <!-- Runs the transformers_documentssimilarity sub-workflow.
         Inputs: the input_person parameter and
         transformers_metadatamerger/output_merged_metadata under workingDir.
         Output: transformers_documentssimilarity/output_document_metadata under
         workingDir, which the following documentssimilarity_chain action reads. -->
    <action name="transformers_documentssimilarity">
786
	    <sub-workflow>
787
            <app-path>${wf:appPath()}/transformers_documentssimilarity</app-path>
788
            <propagate-configuration/>
789
            <configuration>
790
            	<property>
791
                    <name>workingDir</name>
792
                    <value>${workingDir}/transformers_documentssimilarity/working_dir</value>
793
                </property>
794
            	<property>
795
					<name>input_person</name>
796
					<value>${input_person}</value>
797
				</property>
798
				<property>
799
					<name>input_metadata</name>
800
					<value>${workingDir}/transformers_metadatamerger/output_merged_metadata</value>
801
				</property>
802
				<property>
803
					<name>output_document_metadata</name>
804
					<value>${workingDir}/transformers_documentssimilarity/output_document_metadata</value>
805
				</property>
806
            </configuration>
807
        </sub-workflow>
808
		<ok to="documentssimilarity_chain"/>
809
		<error to="fail" />
810
    </action>
811
    
812
    <!-- Runs the documentssimilarity_chain sub-workflow on
         transformers_documentssimilarity/output_document_metadata (under workingDir)
         and writes to the location given by the output_document_similarity parameter.
         Note the naming difference: the property handed to the sub-workflow is
         output_documents_similarity (plural) while the parent parameter is
         output_document_similarity (singular).
         The ds_ prefixed parameters of this workflow are passed on under unprefixed
         property names (parallel, mapredChildJavaOpts, sample, removal_rate,
         removal_least_used, tfidfTopnTermPerDocument, similarityTopnDocumentPerDocument).
         Continues with decision-statistics on success. -->
    <action name="documentssimilarity_chain">
813
	    <sub-workflow>
814
            <app-path>${wf:appPath()}/documentssimilarity_chain</app-path>
815
            <propagate-configuration/>
816
            <configuration>
817
            	<property>
818
                    <name>workingDir</name>
819
                    <value>${workingDir}/documentssimilarity_chain/working_dir</value>
820
                </property>
821
            	<property>
822
					<name>input_document</name>
823
					<value>${workingDir}/transformers_documentssimilarity/output_document_metadata</value>
824
				</property>
825
				<property>
826
					<name>output_documents_similarity</name>
827
					<value>${output_document_similarity}</value>
828
				</property>
829
				<property>
830
		            <name>parallel</name>
831
		            <value>${ds_parallel}</value>
832
		        </property>
833
		        <property>
834
		            <name>mapredChildJavaOpts</name>
835
		            <value>${ds_mapredChildJavaOpts}</value>
836
		        </property>
837
		        <property>
838
		            <name>sample</name>
839
		            <value>${ds_sample}</value>
840
		        </property>
841
		        <property>
842
		            <name>removal_rate</name>
843
		            <value>${ds_removal_rate}</value>
844
		        </property>
845
		        <property>
846
		            <name>removal_least_used</name>
847
		            <value>${ds_removal_least_used}</value>
848
		        </property>
849
		        <property>
850
		            <name>tfidfTopnTermPerDocument</name>
851
		            <value>${ds_tfidfTopnTermPerDocument}</value>
852
		        </property>
853
		        <property>
854
		            <name>similarityTopnDocumentPerDocument</name>
855
		            <value>${ds_similarityTopnDocumentPerDocument}</value>
856
		        </property>
857
            </configuration>
858
        </sub-workflow>
859
		<ok to="decision-statistics"/>
860
		<error to="fail" />
861
    </action>
862
    
863
    <!-- Default branch of decision-documentssimilarity, used when documents similarity
         is not enabled. Recreates the documentssimilarity_chain directory under
         workingDir as well as the output_document_similarity location, then runs the
         Producer through ProcessWrapper with the DocumentSimilarity schema and the
         empty.json resource, writing to the output_document_similarity location.
         Continues with decision-statistics, like the non-skipped path does. -->
    <action name="skip-documentssimilarity">
864
        <java>
865
			<prepare>
866
				<!-- notice: the directories have to be aligned with the skipped action's output -->
867
				<delete path="${nameNode}${workingDir}/documentssimilarity_chain" />
868
				<delete path="${nameNode}${output_document_similarity}" />
869
				<mkdir path="${nameNode}${workingDir}/documentssimilarity_chain" />
870
				<mkdir path="${nameNode}${output_document_similarity}" />
871
			</prepare>
872
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
873
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
874
            <arg>-C{documentssimilarity,
875
				eu.dnetlib.iis.documentssimilarity.schemas.DocumentSimilarity,
876
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
877
            <!-- notice: the directory has to be aligned with the skipped action's output -->
878
            <arg>-Odocumentssimilarity=${output_document_similarity}</arg>
879
        </java>
880
        <ok to="decision-statistics"/>
881
        <error to="fail"/>
882
    </action>
883
    <!-- end of documents similarity part -->
884
        
885
    <!-- statistics are calculated at the end, because they are taking two forked paths
886
    outcome into account: transformers_metadatamerger and referenceextraction_project -->
887
    <!-- statistics part -->
888
    <!-- Routes to transformers_statistics when the active_statistics parameter equals
         the string "true"; any other value falls through to skip-statistics. -->
    <decision name="decision-statistics">
889
        <switch>
890
            <case to="transformers_statistics">${active_statistics eq "true"}</case>
891
            <default to="skip-statistics"/>
892
        </switch>
893
    </decision>
894
    
895
    <!-- Runs the transformers_statistics sub-workflow, which prepares the inputs of the
         "statistics" action.
         Inputs: transformers_metadatamerger/output_merged_metadata and
         citationmatching_chain/output (both under workingDir), plus the
         output_document_to_project, input_person and input_project parameters.
         Outputs (under workingDir/transformers_statistics):
         output_document_authors_citations, output_person_id and output_project_id. -->
    <action name="transformers_statistics">
896
	    <sub-workflow>
897
            <app-path>${wf:appPath()}/transformers_statistics</app-path>
898
            <propagate-configuration/>
899
            <configuration>
900
            	<property>
901
                    <name>workingDir</name>
902
                    <value>${workingDir}/transformers_statistics/working_dir</value>
903
                </property>
904
            	<property>
905
					<name>input_document</name>
906
					<value>${workingDir}/transformers_metadatamerger/output_merged_metadata</value>
907
				</property>
908
				<property>
909
					<name>input_citation</name>
910
					<value>${workingDir}/citationmatching_chain/output</value>
911
				</property>
912
				<property>
913
					<!-- NOTICE: reference extraction will have to be enabled to get this input -->
914
					<name>input_document_to_project</name>
915
					<value>${output_document_to_project}</value>
916
				</property>
917
				<property>
918
					<name>input_person</name>
919
					<value>${input_person}</value>
920
				</property>
921
				<property>
922
					<name>input_project</name>
923
					<value>${input_project}</value>
924
				</property>
925
				<property>
926
					<name>output_document_authors_citations</name>
927
					<value>${workingDir}/transformers_statistics/output_document_authors_citations</value>
928
				</property>
929
				<property>
930
					<name>output_person_id</name>
931
					<value>${workingDir}/transformers_statistics/output_person_id</value>
932
				</property>
933
				<property>
934
					<name>output_project_id</name>
935
					<value>${workingDir}/transformers_statistics/output_project_id</value>
936
				</property>
937
            </configuration>
938
        </sub-workflow>
939
		<ok to="statistics"/>
940
		<error to="fail" />
941
    </action>
942
    
943
    <!-- Runs the statistics sub-workflow on the three data sets written by
         transformers_statistics (output_document_authors_citations, output_person_id,
         output_project_id).
         Document, author and project statistics go to the locations given by the
         output_document_statistics, output_author_statistics and
         output_project_statistics parameters; global statistics go to
         statistics/output_global_statistics under workingDir.
         Continues with transformers_citations_from_matching on success. -->
    <action name="statistics">
944
	    <sub-workflow>
945
            <app-path>${wf:appPath()}/statistics</app-path>
946
            <propagate-configuration/>
947
            <configuration>
948
            	<property>
949
                    <name>workingDir</name>
950
                    <value>${workingDir}/statistics/working_dir</value>
951
                </property>
952
            	<property>
953
					<name>input_document_authors_citations</name>
954
					<value>${workingDir}/transformers_statistics/output_document_authors_citations</value>
955
				</property>
956
				<property>
957
					<name>input_person_id</name>
958
					<value>${workingDir}/transformers_statistics/output_person_id</value>
959
				</property>
960
				<property>
961
					<name>input_project_id</name>
962
					<value>${workingDir}/transformers_statistics/output_project_id</value>
963
				</property>
964
				<property>
965
					<name>output_document_statistics</name>
966
					<value>${output_document_statistics}</value>
967
				</property>
968
				<property>
969
					<name>output_author_statistics</name>
970
					<value>${output_author_statistics}</value>
971
				</property>
972
				<property>
973
					<name>output_project_statistics</name>
974
					<value>${output_project_statistics}</value>
975
				</property>
976
				<property>
977
					<name>output_global_statistics</name>
978
					<value>${workingDir}/statistics/output_global_statistics</value>
979
				</property>
980
            </configuration>
981
        </sub-workflow>
982
		<ok to="transformers_citations_from_matching"/>
983
		<error to="fail" />
984
    </action>
985
    
986
    <!-- Default branch of decision-statistics, used when statistics are not enabled.
         Recreates the statistics directory under workingDir and the three statistics
         output locations, then runs the Producer through ProcessWrapper once with three
         schema/port pairs (document, author and project statistics), each fed from the
         empty.json resource and written to the matching output parameter.
         Global statistics are not produced here: both the -C and the -O argument for
         them are commented out (see the FIXME notes below).
         Continues with transformers_citations_from_matching, like the "statistics"
         action does. -->
    <action name="skip-statistics">
987
        <java>
988
			<prepare>
989
				<!-- notice: the directories have to be aligned with the skipped action's output -->
990
				<delete path="${nameNode}${workingDir}/statistics" />
991
				<delete path="${nameNode}${output_document_statistics}"/>
992
				<delete path="${nameNode}${output_author_statistics}"/>
993
				<delete path="${nameNode}${output_project_statistics}"/>
994
				<mkdir path="${nameNode}${workingDir}/statistics" />
995
				<mkdir path="${nameNode}${output_document_statistics}"/>
996
				<mkdir path="${nameNode}${output_author_statistics}"/>
997
				<mkdir path="${nameNode}${output_project_statistics}"/>
998
			</prepare>
999
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
1000
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
1001
            <arg>-C{document_statistics,
1002
				eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics,
1003
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
1004
			<arg>-C{author_statistics,
1005
				eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics,
1006
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
1007
			<arg>-C{project_statistics,
1008
				eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics,
1009
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
1010
			<!-- FIXME currently global statistics are not available -->
1011
			<!-- 
1012
			<arg>-C{global_statistics,
1013
				eu.dnetlib.iis.statistics.schemas.,
1014
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
1015
			-->
1016
            <!-- notice: the directories have to be aligned with the skipped action's output -->
1017
            <arg>-Odocument_statistics=${output_document_statistics}</arg>
1018
            <arg>-Oauthor_statistics=${output_author_statistics}</arg>
1019
            <arg>-Oproject_statistics=${output_project_statistics}</arg>
1020
            <!-- FIXME currently global statistics are not available -->
1021
            <!-- 
1022
            <arg>-Oglobal_statistics=${workingDir}/statistics/output_global_statistics</arg>
1023
             -->
1024
        </java>
1025
        <ok to="transformers_citations_from_matching"/>
1026
        <error to="fail"/>
1027
    </action>
1028
    <!-- end of statistics part -->
1029
    
1030
    <!-- normalize and group citations part -->
1031
    <!-- Runs the transformers_citations_from_matching sub-workflow.
         Reads citationmatching_chain/output (written either by citationmatching_chain
         or by skip-citationmatching) and writes
         transformers_citations_from_matching/output, both under workingDir.
         That output later becomes input_2 of citations_collapser.
         Continues with transformers_citations_from_ingestpmc on success. -->
    <action name="transformers_citations_from_matching">
1032
	    <sub-workflow>
1033
            <app-path>${wf:appPath()}/transformers_citations_from_matching</app-path>
1034
            <propagate-configuration/>
1035
            <configuration>
1036
            	<property>
1037
                    <name>workingDir</name>
1038
                    <value>${workingDir}/transformers_citations_from_matching/working_dir</value>
1039
                </property>
1040
            	<property>
1041
					<name>input</name>
1042
					<value>${workingDir}/citationmatching_chain/output</value>
1043
				</property>
1044
				<property>
1045
					<name>output</name>
1046
					<value>${workingDir}/transformers_citations_from_matching/output</value>
1047
				</property>
1048
            </configuration>
1049
        </sub-workflow>
1050
		<ok to="transformers_citations_from_ingestpmc"/>
1051
		<error to="fail" />
1052
    </action>
1053
    
1054
    <!-- Runs the transformers_citations_from_ingestpmc sub-workflow.
         Reads the location given by the input_citation_pmc parameter and writes
         transformers_citations_from_ingestpmc/output under workingDir.
         That output later becomes input_1 of citations_collapser.
         Continues with citations_collapser on success. -->
    <action name="transformers_citations_from_ingestpmc">
1055
	    <sub-workflow>
1056
            <app-path>${wf:appPath()}/transformers_citations_from_ingestpmc</app-path>
1057
            <propagate-configuration/>
1058
            <configuration>
1059
            	<property>
1060
                    <name>workingDir</name>
1061
                    <value>${workingDir}/transformers_citations_from_ingestpmc/working_dir</value>
1062
                </property>
1063
            	<property>
1064
					<name>input</name>
1065
					<value>${input_citation_pmc}</value>
1066
				</property>
1067
				<property>
1068
					<name>output</name>
1069
					<value>${workingDir}/transformers_citations_from_ingestpmc/output</value>
1070
				</property>
1071
            </configuration>
1072
        </sub-workflow>
1073
		<ok to="citations_collapser"/>
1074
		<error to="fail" />
1075
    </action>
1076
    
1077
    <!-- Runs the collapsers_multiple_input_collapser sub-workflow over two citation
         inputs, each tagged with an origin label:
           input_1 / origin "ingested": transformers_citations_from_ingestpmc/output
           input_2 / origin "matched":  transformers_citations_from_matching/output
         Records are blocked on the sourceDocumentId field; the input schema is
         Citation, the envelope schema is CitationEnvelope, and the collapsing class is
         PMCCitationCollapser. The result is written to citations_collapser/output
         under workingDir.
         Continues with transformers_export_citations on success. -->
    <action name="citations_collapser">
1078
		<sub-workflow>
1079
            <app-path>${wf:appPath()}/collapsers_multiple_input_collapser</app-path>
1080
            <propagate-configuration/>
1081
            <configuration>
1082
                <property>
1083
                    <name>workingDir</name>
1084
                    <value>${workingDir}/citations_collapser/working_dir</value>
1085
                </property>
1086
                <!-- Input ports & parameters. -->
1087
                <property>
1088
                    <name>origin_1</name>
1089
                    <value>ingested</value>
1090
                </property>
1091
                <property>
1092
                    <name>input_1</name>
1093
                    <value>${workingDir}/transformers_citations_from_ingestpmc/output</value>
1094
                </property>
1095
                <property>
1096
                    <name>origin_2</name>
1097
                    <value>matched</value>
1098
                </property>
1099
                <property>
1100
                    <name>input_2</name>
1101
                    <value>${workingDir}/transformers_citations_from_matching/output</value>
1102
                </property>
1103
                <property>
1104
                    <name>blocking_field</name>
1105
                    <value>sourceDocumentId</value>
1106
                </property>
1107
                <property>
1108
                    <name>schema_input</name>
1109
                    <value>eu.dnetlib.iis.common.citations.schemas.Citation</value>
1110
                </property>
1111
                <property>
1112
                    <name>output</name>
1113
                    <value>${workingDir}/citations_collapser/output</value>
1114
                </property>
1115
                <property>
1116
                    <name>schema_input_envelope</name>
1117
                    <value>eu.dnetlib.iis.common.citations.schemas.CitationEnvelope</value>
1118
                </property>
1119
                <property>
1120
                    <name>record_collapser</name>
1121
                    <value>eu.dnetlib.iis.collapsers.origins.PMCCitationCollapser</value>
1122
        		</property>
1123
            </configuration>
1124
        </sub-workflow>
1125
		<ok to="transformers_export_citations"/>
1126
		<error to="fail" />
1127
    </action>
1128
    
1129
    <!-- Final action of the workflow: runs the transformers_export_citations
         sub-workflow on citations_collapser/output (under workingDir) and writes to
         the location given by the output_citation parameter.
         Transitions to the "end" node on success. -->
    <action name="transformers_export_citations">
1130
	    <sub-workflow>
1131
            <app-path>${wf:appPath()}/transformers_export_citations</app-path>
1132
            <propagate-configuration/>
1133
            <configuration>
1134
            	<property>
1135
                    <name>workingDir</name>
1136
                    <value>${workingDir}/transformers_export_citations/working_dir</value>
1137
                </property>
1138
            	<property>
1139
					<name>input</name>
1140
					<value>${workingDir}/citations_collapser/output</value>
1141
				</property>
1142
				<property>
1143
					<name>output</name>
1144
					<value>${output_citation}</value>
1145
				</property>
1146
            </configuration>
1147
        </sub-workflow>
1148
		<ok to="end"/>
1149
		<error to="fail" />
1150
    </action>
1151
    
1152
    <!-- end of normalize and group citations part -->
1153
    
1154
	<!-- Terminal nodes. Every action in this section sends its error transition to
	     "fail", whose message embeds the error message of the last node that failed.
	     "end" is reached from transformers_export_citations on success. -->
	<kill name="fail">
1155
		<message>Unfortunately, the process failed -- error message:
1156
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
1157
	</kill>
1158
	<end name="end" />
1159
</workflow-app>
(2-2/2)