Project

General

Profile

1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_common_import">
2
	<!--
		Formal workflow parameters.
		Properties declaring a <value> carry a default; properties without one
		(mimetypes_*, output_metadataextraction_root, output_citation_pmc, output_dataset,
		output_document_text, output_wos) have to be supplied at submission time.
		The $UNDEFINED$ literal is the "not set" marker tested by the decision nodes of this workflow.
	-->
	<parameters>
		<!-- importing modes -->
		<property>
			<name>active_import_metadata</name>
			<value>false</value>
			<description>flag indicating HBase metadata import should be enabled; when set to false, db-based project import is performed instead</description>
		</property>
		<property>
			<name>active_import_dataset</name>
			<value>false</value>
			<description>flag indicating dataset import should be enabled</description>
		</property>
		<property>
			<name>active_ingest_pmc_citations</name>
			<value>false</value>
			<description>flag indicating pmc citations ingestion should be performed</description>
		</property>
		<!-- import metadata related -->
		<property>
			<name>hbase_input_table</name>
			<value>$UNDEFINED$</value>
			<description>HBase input table holding InformationSpace, available on local cluster</description>
		</property>
		<property>
			<name>hbase_approved_datasources_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of datasource ids to be approved during import. Applied on result and person entities.</description>
		</property>
		<property>
			<name>inference_provenance_blacklist</name>
			<value>iis::.*</value>
			<description>list of blacklisted inference provenance which should not be taken into account by importer, skipped when set to $UNDEFINED$</description>
		</property>
		<property>
			<name>trust_level_threshold</name>
			<value>$UNDEFINED$</value>
			<description>trust level threshold represented as float value, ignored when set to $UNDEFINED$ value</description>
		</property>
		<!-- import project related -->
		<!-- will be used when active_import_metadata=false  -->
		<property>
			<name>database_service_location</name>
			<value>$UNDEFINED$</value>
			<description>Database service (not WSDL) location URL</description>
		</property>
		<property>
			<name>database_dbname</name>
			<value>dnet_openaireplus_node6_t</value>
			<description>database name</description>
		</property>
		<!-- import datacite related -->
		<property>
			<name>mdstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>MDStore service (not WSDL) location URL</description>
		</property>
		<property>
			<name>dataset_mdstore_ids_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of MDStore identifiers</description>
		</property>
		<!-- import content related -->
		<property>
			<name>objectstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>object store service location required for content retrieval</description>
		</property>
		<property>
			<name>approved_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of object stores identifiers to be processed</description>
		</property>
		<property>
			<name>mimetypes_pdf</name>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>mimetypes_text</name>
			<description>text mime types</description>
		</property>
		<property>
			<name>mimetypes_xml_pmc</name>
			<description>xml pmc mime types</description>
		</property>
		<property>
			<name>mimetypes_wos</name>
			<description>wos mime types</description>
		</property>
		<!-- import timeouts related -->
		<property>
			<name>resultset_client_read_timeout</name>
			<value>60000</value>
			<description>resultset client read timeout</description>
		</property>
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>import content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>import content read timeout</description>
		</property>
		<!-- metadata extraction related -->
		<property>
			<name>metadataextraction_excluded_ids</name>
			<value>$UNDEFINED$</value>
			<description>list of content identifiers excluded from metadataextraction processing</description>
		</property>
		<property>
			<name>metadataextraction_default_cache_location</name>
			<value>/cache/metadataextraction</value>
			<description>metadata extraction HDFS cache location</description>
		</property>
		<!-- metadataextraction output subdirectory names -->
		<property>
			<name>metadataextraction_output_name_meta</name>
			<value>meta</value>
			<description>metadataextraction metadata output subdirectory name</description>
		</property>
		<property>
			<name>metadataextraction_output_name_plaintext</name>
			<value>plaintext</value>
			<description>metadataextraction plaintext output subdirectory name</description>
		</property>
		<!-- metadata import output subdirectory names -->
		<property>
			<name>metadataimport_output_name_document_meta</name>
			<value>docmeta</value>
			<description>metadata import docmeta output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_document_project</name>
			<value>docproject</value>
			<description>metadata import document to project relation subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_document_relation</name>
			<value>docrelation</value>
			<description>metadata import document relation output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_project</name>
			<value>project</value>
			<description>metadata import project output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_person</name>
			<value>person</value>
			<description>metadata import person output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_dataset_id</name>
			<value>datasetid</value>
			<description>metadata import dataset identifier output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_dedup_mapping</name>
			<value>dedupmapping</value>
			<description>metadata import deduplication mapping output subdirectory name</description>
		</property>
		<!-- output parameters -->
		<property>
			<name>output_metadataextraction_root</name>
			<description>metadataextraction output root directory</description>
		</property>
		<property>
			<name>output_metadataimport_root</name>
			<value>$UNDEFINED$</value>
			<description>metadata importer output root directory, required when ${active_import_metadata}=true; when ${active_import_metadata}=false the db-based project import writes its output below this directory as well</description>
		</property>
		<property>
			<name>output_citation_pmc</name>
			<description>PMC citation output directory; receives ingested citations when both ${active_import_metadata}=true and ${active_ingest_pmc_citations}=true, otherwise the skip action writes a placeholder output there</description>
		</property>
		<property>
			<name>output_dataset</name>
			<description>dataset importer output directory, required when ${active_import_dataset}=true</description>
		</property>
		<property>
			<name>output_document_text</name>
			<description>text import output directory. merged from three different sources</description>
		</property>
		<property>
			<name>output_wos</name>
			<description>wos import output directory</description>
		</property>
	</parameters>
191
	
192
	<!-- cluster endpoints and job queue shared by the actions of this workflow -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
202
	 
203
	<!-- entry point: the workflow immediately splits into the metadata branch and the dataset branch -->
	<start to="import_forking"/>

	<fork name="import_forking">
		<path start="decision-metadata_importer"/>
		<path start="decision-import_dataset"/>
	</fork>
209
	
210
	<!-- HBase metadata import when active_import_metadata is "true", db-based project import otherwise -->
	<decision name="decision-metadata_importer">
		<switch>
			<case to="metadata_importer">${active_import_metadata eq "true"}</case>
			<default to="import_project"/>
		</switch>
	</decision>
216
	
217
	<!--
		Runs the import_mapred sub-workflow against the HBase input table.
		Workflow-level parameters are mapped onto the names expected by the sub-workflow;
		results land below output_metadataimport_root in the configured subdirectories.
	-->
	<action name="metadata_importer">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_mapred</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import/working_dir</value>
				</property>
				<!-- input and filtering -->
				<property>
					<name>hbase_input_table</name>
					<value>${hbase_input_table}</value>
				</property>
				<property>
					<name>approved_datasources_csv</name>
					<value>${hbase_approved_datasources_csv}</value>
				</property>
				<property>
					<name>inference_provenance_blacklist</name>
					<value>${inference_provenance_blacklist}</value>
				</property>
				<property>
					<name>trust_level_threshold</name>
					<value>${trust_level_threshold}</value>
				</property>
				<!-- output root -->
				<property>
					<name>output</name>
					<value>${output_metadataimport_root}</value>
				</property>
				<!-- subdirectory names -->
				<property>
					<name>output_name_document_meta</name>
					<value>${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>output_name_document_project</name>
					<value>${metadataimport_output_name_document_project}</value>
				</property>
				<property>
					<name>output_name_document_relation</name>
					<value>${metadataimport_output_name_document_relation}</value>
				</property>
				<property>
					<name>output_name_project</name>
					<value>${metadataimport_output_name_project}</value>
				</property>
				<property>
					<name>output_name_person</name>
					<value>${metadataimport_output_name_person}</value>
				</property>
				<property>
					<name>output_name_dataset_id</name>
					<value>${metadataimport_output_name_dataset_id}</value>
				</property>
				<property>
					<name>output_name_dedup_mapping</name>
					<value>${metadataimport_output_name_dedup_mapping}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="transformers-idextractor"/>
		<error to="fail"/>
	</action>
280
	
281
	<!--
		Runs the transformers_idextractor sub-workflow on the imported document metadata;
		its output_identifier directory is later offered to import_content_url as input_id.
	-->
	<action name="transformers-idextractor">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_idextractor</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_idextractor/working_dir</value>
				</property>
				<property>
					<name>input_document_metadata</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>output_identifier</name>
					<value>${workingDir}/transformers_idextractor/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-import_content_url"/>
		<error to="fail"/>
	</action>
303
	
304
	<!--
		Db-based project import (default branch of decision-metadata_importer).
		Runs the import_project sub-workflow and stores projects in the project
		subdirectory of output_metadataimport_root.
	-->
	<action name="import_project">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_project</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_project/working_dir</value>
				</property>
				<property>
					<name>database_service_location</name>
					<value>${database_service_location}</value>
				</property>
				<property>
					<name>database_name</name>
					<value>${database_dbname}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_project}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-import_content_url"/>
		<error to="fail"/>
	</action>
334
	
335
	<!-- dataset import runs only when active_import_dataset is "true"; otherwise the skip action is taken -->
	<decision name="decision-import_dataset">
		<switch>
			<case to="import_dataset">${active_import_dataset eq "true"}</case>
			<default to="skip-import_dataset"/>
		</switch>
	</decision>
341
	
342
	<!-- Runs the import_dataset sub-workflow against the MDStore service and writes to output_dataset. -->
	<action name="import_dataset">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_dataset</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_dataset/working_dir</value>
				</property>
				<property>
					<name>mdstore_service_location</name>
					<value>${mdstore_service_location}</value>
				</property>
				<property>
					<name>mdstore_ids_csv</name>
					<value>${dataset_mdstore_ids_csv}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${resultset_client_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_dataset}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
372

    
373
	<!--
		Stand-in for import_dataset when dataset import is disabled: the Producer node
		writes the bundled empty.json resource as the dataset output port into output_dataset.
	-->
	<action name="skip-import_dataset">
		<java>
			<prepare>
				<!-- notice: directory have to aligned with skipped action output -->
				<delete path="${nameNode}${workingDir}/import_dataset"/>
				<delete path="${nameNode}${output_dataset}"/>
				<mkdir path="${nameNode}${workingDir}/import_dataset/working_dir"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- class executed by the wrapper -->
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- port name, record schema, source resource -->
			<arg>-C{dataset,
				eu.dnetlib.iis.importer.schemas.DataSetReference,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-SworkingDir=${workingDir}/import_dataset/working_dir</arg>
			<!-- notice: directory have to aligned with skipped action output -->
			<arg>-Odataset=${output_dataset}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
393

    
394
	<!-- content import is skipped when no object store service location has been configured -->
	<decision name="decision-import_content_url">
		<switch>
			<case to="skip-import_content_url">${objectstore_service_location eq "$UNDEFINED$"}</case>
			<default to="input_id_mapping-path-setter"/>
		</switch>
	</decision>
400

    
401
	<!--
		Resolves the dedup mapping location handed to later actions: the dedup mapping
		subdirectory of the metadata import output when active_import_metadata is "true",
		$UNDEFINED$ otherwise. The captured output is read through
		wf:actionData('input_id_mapping-path-setter')['result'].
	-->
	<action name="input_id_mapping-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>-Pcondition=${active_import_metadata eq "true"}</arg>
			<arg>-PinCaseOfTrue=${output_metadataimport_root}/${metadataimport_output_name_dedup_mapping}</arg>
			<arg>-PelseCase=$UNDEFINED$</arg>
			<capture-output/>
		</java>
		<ok to="input_id-path-setter"/>
		<error to="fail"/>
	</action>
414

    
415
	<!--
		Resolves the identifier input location for import_content_url: the idextractor
		output when active_import_metadata is "true", $UNDEFINED$ otherwise. The captured
		output is read through wf:actionData('input_id-path-setter')['result'].
	-->
	<action name="input_id-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<arg>-Pcondition=${active_import_metadata eq "true"}</arg>
			<arg>-PinCaseOfTrue=${workingDir}/transformers_idextractor/output</arg>
			<arg>-PelseCase=$UNDEFINED$</arg>
			<capture-output/>
		</java>
		<ok to="import_content_url"/>
		<error to="fail"/>
	</action>
428

    
429
	<!--
		Runs the import_content_url sub-workflow. Content URLs are written below
		${workingDir}/import_content_url/imported into the pdf, text, xmlpmc and wos
		subdirectories, which the url-based import actions read as their inputs.
	-->
	<action name="import_content_url">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_content_url</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_content_url/working_dir</value>
				</property>
				<!-- object store access -->
				<property>
					<name>objectstore_service_location</name>
					<value>${objectstore_service_location}</value>
				</property>
				<property>
					<name>approved_objectstores_csv</name>
					<value>${approved_objectstores_csv}</value>
				</property>
				<!-- mime types per content kind -->
				<property>
					<name>mimetypes_pdf</name>
					<value>${mimetypes_pdf}</value>
				</property>
				<property>
					<name>mimetypes_text</name>
					<value>${mimetypes_text}</value>
				</property>
				<property>
					<name>mimetypes_xml_pmc</name>
					<value>${mimetypes_xml_pmc}</value>
				</property>
				<property>
					<name>mimetypes_wos</name>
					<value>${mimetypes_wos}</value>
				</property>
				<property>
					<name>resultset_client_read_timeout</name>
					<value>${resultset_client_read_timeout}</value>
				</property>
				<!-- locations resolved by the two path-setter actions -->
				<property>
					<name>input_id</name>
					<value>${wf:actionData('input_id-path-setter')['result']}</value>
				</property>
				<property>
					<name>input_id_mapping</name>
					<value>${wf:actionData('input_id_mapping-path-setter')['result']}</value>
				</property>
				<!-- output layout -->
				<property>
					<name>output_root</name>
					<value>${workingDir}/import_content_url/imported</value>
				</property>
				<property>
					<name>output_name_pdf</name>
					<value>pdf</value>
				</property>
				<property>
					<name>output_name_text</name>
					<value>text</value>
				</property>
				<property>
					<name>output_name_xml_pmc</name>
					<value>xmlpmc</value>
				</property>
				<property>
					<name>output_name_wos</name>
					<value>wos</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_forking"/>
		<error to="fail"/>
	</action>
499

    
500
	<!--
		Taken when objectstore_service_location is $UNDEFINED$: the Producer node writes
		the bundled empty.json resource as the document text output and as the extracted
		document metadata output, then goes straight to import_joining.
	-->
	<action name="skip-import_content_url">
		<java>
			<prepare>
				<!-- notice: directory have to aligned with skipped action output -->
				<delete path="${nameNode}${workingDir}/import_content_url"/>
				<delete path="${nameNode}${output_document_text}"/>
				<delete path="${nameNode}${output_metadataextraction_root}/${metadataextraction_output_name_meta}"/>
				<mkdir path="${nameNode}${workingDir}/import_content_url/working_dir"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- class executed by the wrapper -->
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- one specification per output port: port name, record schema, source resource -->
			<arg>-C{document_text,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-SworkingDir=${workingDir}/import_content_url/working_dir</arg>
			<!-- notice: directory have to aligned with skipped action output -->
			<arg>-Odocument_text=${output_document_text}</arg>
			<arg>-Oextracted_document_metadata=${output_metadataextraction_root}/${metadataextraction_output_name_meta}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
525

    
526
	<!-- parallel processing of the imported content URLs: plaintext, wos, pmc xml and pdf metadata extraction -->
	<fork name="import_urlbased_forking">
		<path start="import_plaintext"/>
		<path start="import_wos"/>
		<path start="import_plaintext_pmc"/>
		<path start="decision-metadata_extractor_use_cache"/>
	</fork>
532

    
533
	<!-- Runs the import_plaintext sub-workflow on the "text" content URLs. -->
	<action name="import_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/text</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail"/>
	</action>
563
	
564
	<!-- Reuses the import_plaintext sub-workflow application for the "wos" content URLs; writes directly to output_wos. -->
	<action name="import_wos">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_wos/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/wos</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_wos}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail"/>
	</action>
594
	
595
	<!--
		Reuses the import_plaintext sub-workflow application for the "xmlpmc" content URLs.
		Its output feeds both ingest_pmc_plaintext and ingest_pmc_citations.
	-->
	<action name="import_plaintext_pmc">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext_pmc/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/xmlpmc</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_forking"/>
		<error to="fail"/>
	</action>
625
	
626
	<!-- pmc documents are processed in parallel for plaintext and, conditionally, for citations -->
	<fork name="ingest_pmc_forking">
		<path start="ingest_pmc_plaintext"/>
		<path start="decision-ingest_pmc_metadata"/>
	</fork>
630
	
631
	<!-- Runs the ingest_pmc_plaintext sub-workflow on the imported pmc documents. -->
	<action name="ingest_pmc_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_pmc_plaintext/working_dir</value>
				</property>
				<property>
					<name>input_document_nlm</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>output_document_plaintext</name>
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
653

    
654
	<!-- citation ingestion needs both the metadata import and the pmc citations flag to be "true" -->
	<decision name="decision-ingest_pmc_metadata">
		<switch>
			<!-- define ingest_pmc_metadata_joining here when introducing pmc metadata ingestion -->
			<case to="transformers-doitooaid">${active_import_metadata eq "true" and active_ingest_pmc_citations eq "true"}</case>
			<default to="skip-ingest_pmc_citations"/>
		</switch>
	</decision>
661

    
662
	<!--
		Runs the transformers_externalidtooaid sub-workflow with external_id_type=doi on the
		imported document metadata; the result is the input_doi_to_oaid of ingest_pmc_citations.
	-->
	<action name="transformers-doitooaid">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_externalidtooaid</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_doitooaid/working_dir</value>
				</property>
				<property>
					<name>input_document_metadata</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>external_id_type</name>
					<value>doi</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/transformers_doitooaid/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="generate_pmid_to_oaid_mocked_input"/>
		<error to="fail"/>
	</action>
688

    
689
	<!--
		Produces the pmid_to_oaid input of ingest_pmc_citations from the bundled empty.json
		resource (mocked input, as the action name states).
		job-tracker, name-node and mapred.job.queue.name are not repeated here: they are
		supplied by the <global> section, as for the other java actions of this workflow.
	-->
	<action name="generate_pmid_to_oaid_mocked_input">
		<java>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/pmid_to_oaid_producer"/>
				<mkdir path="${nameNode}${workingDir}/pmid_to_oaid_producer"/>
			</prepare>
			<!-- This is simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- The business Java code that gets to be executed -->
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- Specification of the output ports -->
			<arg>-C{pmid_to_oaid,
            eu.dnetlib.iis.common.schemas.IdentifierMapping,
            eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-SworkingDir=${workingDir}/pmid_to_oaid_producer/working_dir</arg>
			<arg>-Opmid_to_oaid=${workingDir}/pmid_to_oaid_producer/pmid_to_oaid</arg>
		</java>
		<ok to="ingest_pmc_citations"/>
		<error to="fail"/>
	</action>
718
    
719
	<!--
		Runs the ingest_pmc_citations sub-workflow on the imported pmc documents, using the
		dedup mapping and the doi/pmid identifier mappings; citations are written to output_citation_pmc.
	-->
	<action name="ingest_pmc_citations">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_citations</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_pmc_citations/working_dir</value>
				</property>
				<!-- inputs -->
				<property>
					<name>input_document_nlm</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>input_dedup_map</name>
					<value>${wf:actionData('input_id_mapping-path-setter')['result']}</value>
				</property>
				<property>
					<name>input_doi_to_oaid</name>
					<value>${workingDir}/transformers_doitooaid/out</value>
				</property>
				<property>
					<name>input_pmid_to_oaid</name>
					<value>${workingDir}/pmid_to_oaid_producer/pmid_to_oaid</value>
				</property>
				<!-- output -->
				<property>
					<name>output_citation</name>
					<value>${output_citation_pmc}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
753

    
754
	<!--
		Stand-in for ingest_pmc_citations when citation ingestion is not enabled: the Producer
		node writes the bundled empty.json resource as the citation_pmc output into output_citation_pmc.
	-->
	<action name="skip-ingest_pmc_citations">
		<java>
			<prepare>
				<!-- notice: directory have to aligned with skipped action output -->
				<delete path="${nameNode}${workingDir}/ingest_pmc_citations/"/>
				<delete path="${nameNode}${output_citation_pmc}"/>
				<mkdir path="${nameNode}${workingDir}/ingest_pmc_citations/working_dir"/>
				<!-- NOTE(review): unlike skip-import_dataset and skip-import_content_url, the output directory is created up front here; confirm this is intended -->
				<mkdir path="${nameNode}${output_citation_pmc}"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<!-- class executed by the wrapper -->
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- port name, record schema, source resource -->
			<arg>-C{citation_pmc,
				eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-SworkingDir=${workingDir}/ingest_pmc_citations/working_dir</arg>
			<!-- notice: directory have to aligned with skipped action output -->
			<arg>-Ocitation_pmc=${output_citation_pmc}</arg>
		</java>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
775

    
776
	<!-- both pmc ingestion paths meet here before rejoining the url-based import branch -->
	<join name="ingest_pmc_joining" to="import_urlbased_joining"/>

	<!-- metadata extraction section -->
	<!-- the plain extractor is used only when no cache location is configured; otherwise the cached variant runs -->
	<decision name="decision-metadata_extractor_use_cache">
		<switch>
			<case to="metadata_extractor">${metadataextraction_default_cache_location eq "$UNDEFINED$"}</case>
			<default to="metadata_extractor_cached"/>
		</switch>
	</decision>
785

    
786
	<!--
		Runs the metadataextraction_cached sub-workflow on the "pdf" content URLs, passing the
		configured cache location; results go below output_metadataextraction_root in the
		meta and plaintext subdirectories.
	-->
	<action name="metadata_extractor_cached">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction_cached</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadata_extractor/working_dir</value>
				</property>
				<!-- input and filtering -->
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/pdf</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_ids}</value>
				</property>
				<!-- content retrieval timeouts -->
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<!-- cache -->
				<property>
					<name>default_cache_location</name>
					<value>${metadataextraction_default_cache_location}</value>
				</property>
				<!-- output layout -->
				<property>
					<name>output_name_meta</name>
					<value>${metadataextraction_output_name_meta}</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>${metadataextraction_output_name_plaintext}</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${output_metadataextraction_root}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail"/>
	</action>
832
	
833
	<!-- Non-cached metadata extraction, selected by decision-metadata_extractor_use_cache
		when metadataextraction_default_cache_location is $UNDEFINED$.
		Runs the metadataextraction sub-workflow located under the application path in
		streaming mode, with DocumentContentUrl as the input port class.
		Input is read from ${workingDir}/import_content_url/imported/pdf and results are
		written below ${output_metadataextraction_root}.
		On success control goes to import_urlbased_joining, on error to the fail node. -->
	<action name="metadata_extractor">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction</app-path>
			<!-- parent job configuration is handed over to the sub-workflow -->
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadata_extractor/working_dir</value>
				</property>
				<!-- enabling streaming mode -->
				<property>
					<name>processing_mode</name>
					<value>StreamingMetadataExtractorMapper</value>
				</property>
				<property>
					<name>inputport_classname</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/pdf</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_ids}</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<property>
					<name>output_name_meta</name>
					<value>${metadataextraction_output_name_meta}</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>${metadataextraction_output_name_plaintext}</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${output_metadataextraction_root}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail"/>
	</action>
884
    <!-- end of metadata extraction section -->
885

    
886
	<!-- Join node targeted by both metadata extraction actions (cached and non-cached)
		and by ingest_pmc_joining; afterwards the document text union is executed. -->
	<join name="import_urlbased_joining" to="transformers_common_union_document_text" />
887
    
888
    <!-- merging document text datastores:
    	1) retrieved directly from objectstore
    	2) generated by metadataextraction
    	3) imported from PMC XMLs -->
892
	<!-- Unions three document text datastores into ${output_document_text} by running the
		transformers_common_union3 sub-workflow located under the application path:
		  input_a: ${workingDir}/import_plaintext/imported
		  input_b: plaintext output of metadata extraction
		  input_c: ${workingDir}/ingest_pmc_plaintext/imported
		All inputs and the output use the DocumentText schema given in the schema property.
		On success control goes to import_joining, on error to the fail node. -->
	<action name="transformers_common_union_document_text">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_common_union3</app-path>
			<!-- parent job configuration is handed over to the sub-workflow -->
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_common_union_document_text/working_dir</value>
				</property>
				<property>
					<name>input_a</name>
					<value>${workingDir}/import_plaintext/imported</value>
				</property>
				<property>
					<name>input_b</name>
					<value>${output_metadataextraction_root}/${metadataextraction_output_name_plaintext}</value>
				</property>
				<property>
					<name>input_c</name>
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_document_text}</value>
				</property>
				<property>
					<name>schema</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
926

    
927
	<!-- Final join of the import workflow: entered after the document text union succeeds
		and leading straight to the end node.
		NOTE(review): other incoming branches may exist outside this part of the file. -->
	<join name="import_joining" to="end" />
928
    
929
	<!-- Failure terminal node: the error transitions of the actions in this workflow
		point here. The kill message embeds the error message of the last node that failed. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
933
	<!-- Successful completion node of the workflow, reached from import_joining. -->
	<end name="end"/>
934
</workflow-app>
(2-2/2)