Project

General

Profile

1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_common_import">
2
	<!--
		Workflow parameters.
		Properties declared without a value element define no default in this file.
		The literal $UNDEFINED$ is used throughout as the "not set" marker.
	-->
	<parameters>
		<!-- importing modes -->
		<property>
			<name>active_import_metadata</name>
			<value>false</value>
			<description>flag indicating HBase metadata import should be enabled, when set to false db-based project import will be performed</description>
		</property>
		<property>
			<name>active_import_dataset</name>
			<value>false</value>
			<description>flag indicating dataset import should be enabled</description>
		</property>
		<property>
			<name>active_ingest_pmc</name>
			<value>false</value>
			<description>flag indicating pmc metadata and citations ingestions should be performed</description>
		</property>
		<property>
			<name>active_import_concept</name>
			<value>false</value>
			<description>flag indicating concept import should be executed</description>
		</property>
		<property>
			<name>match_content_with_metadata</name>
			<value>true</value>
			<description>flag indicating contents should be filtered and their identifiers should be deduplicated against metadata entries retrieved from InformationSpace.
			This way only contents having metadata representation will be processed.
			To be disabled when processing new contents whose metadata is not available in hbase or when original identifiers should be preserved (contents will not be filtered as well).</description>
		</property>
		<!-- import metadata related -->
		<property>
			<name>hbase_input_table</name>
			<value>$UNDEFINED$</value>
			<description>HBase input table holding InformationSpace, available on local cluster</description>
		</property>
		<property>
			<name>hbase_approved_datasources_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of datasource ids to be approved during import. Applied on result and person entities.</description>
		</property>
		<property>
			<name>inference_provenance_blacklist</name>
			<value>iis::.*</value>
			<description>list of blacklisted inference provenance which should not be taken into account by importer, skipped when set to $UNDEFINED$</description>
		</property>
		<property>
			<name>trust_level_threshold</name>
			<value>$UNDEFINED$</value>
			<description>trust level threshold represented as float value, ignored when set to $UNDEFINED$ value</description>
		</property>
		<property>
			<name>merge_body_with_updates</name>
			<value>false</value>
			<description>flag indicating Oaf objects stored in body qualifier should be merged with Oaf objects stored in update qualifier</description>
		</property>
		<!-- import concepts related -->
		<property>
			<name>islookup_service_location</name>
			<value>$UNDEFINED$</value>
			<description>IS Lookup service location, required only when active_import_concept is set to true</description>
		</property>
		<property>
			<name>project_concepts_context_ids_csv</name>
			<value>$UNDEFINED$</value>
			<description>comma separated list of concepts context identifiers to be picked by ISLookup, required only when active_import_concept is set to true</description>
		</property>
		<!-- import project related -->
		<!-- will be used when active_import_metadata=false  -->
		<property>
			<name>database_service_location</name>
			<value>$UNDEFINED$</value>
			<description>Database service (not WSDL) location URL</description>
		</property>
		<property>
			<name>database_dbname</name>
			<value>dnet_openaireplus_node6_t</value>
			<description>database name</description>
		</property>
		<!-- import datacite related -->
		<property>
			<name>mdstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>MDStore service (not WSDL) location URL</description>
		</property>
		<property>
			<name>dataset_mdstore_ids_csv</name>
			<value>$UNDEFINED$</value>
			<description>MDStore identifier</description>
		</property>
		<!-- import content related -->
		<property>
			<name>objectstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>object store service location required for content retrieval</description>
		</property>
		<property>
			<name>approved_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of object stores identifiers to be processed</description>
		</property>
		<property>
			<name>mimetypes_pdf</name>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>mimetypes_text</name>
			<description>text mime types</description>
		</property>
		<property>
			<name>mimetypes_html</name>
			<description>html mime types</description>
		</property>
		<property>
			<name>mimetypes_xml_pmc</name>
			<description>xml pmc types</description>
		</property>
		<property>
			<name>mimetypes_wos</name>
			<description>wos types</description>
		</property>
		<!-- import timeouts related -->
		<property>
			<name>resultset_client_read_timeout</name>
			<value>60000</value>
			<description>resultset client read timeout</description>
		</property>
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>import content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>import content read timeout</description>
		</property>
		<!-- metadata extraction related -->
		<property>
			<name>metadataextraction_excluded_checksums</name>
			<value>$UNDEFINED$</value>
			<description>list of content checksums excluded from metadataextraction processing</description>
		</property>
		<property>
			<name>metadataextraction_max_file_size_mb</name>
			<value>$UNDEFINED$</value>
			<description>maximum allowed file size in Megabytes</description>
		</property>
		<property>
			<name>metadataextraction_default_cache_location</name>
			<value>/cache/metadataextraction</value>
			<description>metadata extraction HDFS cache location</description>
		</property>
		<!-- metadata import output subdirectory names -->
		<property>
			<name>metadataimport_output_name_document_meta</name>
			<value>docmeta</value>
			<description>metadata import docmeta output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_document_project</name>
			<value>docproject</value>
			<description>metadata import document to project relation subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_project</name>
			<value>project</value>
			<description>metadata import project output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_person</name>
			<value>person</value>
			<description>metadata import person output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_dedup_mapping</name>
			<value>dedupmapping</value>
			<description>metadata import deduplication mapping output subdirectory name</description>
		</property>
		<!-- output parameters -->
		<property>
			<name>output_extracted_document_metadata</name>
			<description>extracted document metadata output directory</description>
		</property>
		<property>
			<name>output_metadataimport_root</name>
			<value>$UNDEFINED$</value>
			<description>metadata importer output root directory, required when ${active_import_metadata}=true</description>
		</property>
		<property>
			<name>output_dataset</name>
			<description>dataset importer output directory holding dataset metadata, required when ${active_import_dataset}=true</description>
		</property>
		<property>
			<name>output_dataset_to_mdstore</name>
			<description>dataset importer output directory holding dataset to mdstore mappings, required when ${active_import_dataset}=true</description>
		</property>
		<property>
			<name>output_document_text</name>
			<description>text import output directory. merged from three different sources</description>
		</property>
		<property>
			<name>output_wos</name>
			<description>wos import output directory</description>
		</property>
		<property>
			<name>output_project_concept</name>
			<description>project concepts output directory</description>
		</property>
		<property>
			<name>output_faults</name>
			<description>processing faults output directory</description>
		</property>
		<property>
			<name>remove_sideproducts</name>
			<value>true</value>
			<description>flag indicating whole workingDir will be erased.
			Notice: do not provide any output directory location pointing to workingDir subdirectory!</description>
		</property>
	</parameters>
221
	
222
	<!-- Settings shared by the actions of this workflow: job tracker, name node and the MapReduce queue name. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
232
	 
233
	<!-- Entry point: execution begins at the import_forking fork. -->
	<start to="import_forking" />
234
	
235
	<!-- Starts three parallel branches, each beginning with a decision node: concept, metadata and dataset import. -->
	<fork name="import_forking">
		<path start="decision-import_concept"/>
		<path start="decision-metadata_importer"/>
		<path start="decision-import_dataset"/>
	</fork>
240
	
241
	<!-- Goes to import_concept when active_import_concept equals "true", otherwise to skip-import_concept. -->
	<decision name="decision-import_concept">
		<switch>
			<case to="import_concept">${active_import_concept eq "true"}</case>
			<default to="skip-import_concept"/>
		</switch>
	</decision>
247
	
248
	<!--
		Runs the import_concept sub-workflow found under the application path.
		Passes the IS Lookup location, the context ids and the output directory;
		continues at import_joining on success and at fail on error.
	-->
	<action name="import_concept">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_concept</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_concept/working_dir</value>
				</property>
				<property>
					<name>islookup_service_location</name>
					<value>${islookup_service_location}</value>
				</property>
				<property>
					<name>context_ids_csv</name>
					<value>${project_concepts_context_ids_csv}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_project_concept}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining" />
		<error to="fail" />
	</action>
274
	
275
	<!--
		Replacement for import_concept when concept import is disabled.
		Clears the import_concept working directory and the concept output directory,
		then runs the Producer with the empty.json resource so that output_project_concept is still written.
	-->
	<action name="skip-import_concept">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/import_concept" />
				<delete path="${nameNode}${output_project_concept}" />
				<mkdir path="${nameNode}${workingDir}/import_concept" />
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<arg>-C{concept,
				eu.dnetlib.iis.importer.schemas.Concept,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Oconcept=${output_project_concept}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
293
	
294
	<!-- Goes to metadata_importer when active_import_metadata equals "true", otherwise to import_project. -->
	<decision name="decision-metadata_importer">
		<switch>
			<case to="metadata_importer">${active_import_metadata eq "true"}</case>
			<default to="import_project"/>
		</switch>
	</decision>
300
	
301
	<!--
		Runs the import_mapred sub-workflow found under the application path.
		Output goes to output_metadataimport_root, with subdirectory names taken from the
		metadataimport_output_name_* parameters; continues at transformers-idextractor on success.
	-->
	<action name="metadata_importer">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_mapred</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import/working_dir</value>
				</property>
				<property>
					<name>approved_datasources_csv</name>
					<value>${hbase_approved_datasources_csv}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_metadataimport_root}</value>
				</property>
				<!-- subdirectory names -->
				<property>
					<name>output_name_document_meta</name>
					<value>${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>output_name_document_project</name>
					<value>${metadataimport_output_name_document_project}</value>
				</property>
				<property>
					<name>output_name_project</name>
					<value>${metadataimport_output_name_project}</value>
				</property>
				<property>
					<name>output_name_person</name>
					<value>${metadataimport_output_name_person}</value>
				</property>
				<property>
					<name>output_name_dedup_mapping</name>
					<value>${metadataimport_output_name_dedup_mapping}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="transformers-idextractor" />
		<error to="fail" />
	</action>
345
	
346
	<!--
		Runs the transformers_idextractor sub-workflow on the document metadata subdirectory
		of the metadata import output. Identifiers are written to
		workingDir/transformers_idextractor/output; continues at decision-import_content_url.
	-->
	<action name="transformers-idextractor">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_idextractor</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_idextractor/working_dir</value>
				</property>
				<property>
					<name>input_document_metadata</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>output_identifier</name>
					<value>${workingDir}/transformers_idextractor/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-import_content_url"/>
		<error to="fail"/>
	</action>
368
	
369
	<!--
		Default branch of decision-metadata_importer.
		Runs the import_project sub-workflow, writing into the project subdirectory of
		output_metadataimport_root; continues at decision-import_content_url.
	-->
	<action name="import_project">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_project</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_project/working_dir</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_project}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="decision-import_content_url" />
		<error to="fail" />
	</action>
388
	
389
	<!-- Goes to import_dataset when active_import_dataset equals "true", otherwise to skip-import_dataset. -->
	<decision name="decision-import_dataset">
		<switch>
			<case to="import_dataset">${active_import_dataset eq "true"}</case>
			<default to="skip-import_dataset"/>
		</switch>
	</decision>
395
	
396
	<!--
		Runs the import_dataset sub-workflow found under the application path,
		passing dataset_mdstore_ids_csv as mdstore_ids_csv; continues at import_joining on success.
	-->
	<action name="import_dataset">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_dataset</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_dataset/working_dir</value>
				</property>
				<property>
					<name>mdstore_ids_csv</name>
					<value>${dataset_mdstore_ids_csv}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_joining" />
		<error to="fail" />
	</action>
415

    
416
	<!--
		Replacement for import_dataset when dataset import is disabled.
		Clears the import_dataset working directory and both dataset output directories,
		then runs the Producer with the empty.json resource for output_dataset and output_dataset_to_mdstore.
	-->
	<action name="skip-import_dataset">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/import_dataset" />
				<delete path="${nameNode}${output_dataset}" />
				<delete path="${nameNode}${output_dataset_to_mdstore}" />
				<mkdir path="${nameNode}${workingDir}/import_dataset" />
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<arg>-C{dataset,
				eu.dnetlib.iis.importer.schemas.DataSetReference,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{dataset_to_mdstore,
				eu.dnetlib.iis.importer.schemas.DocumentToMDStore,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<!-- notice: directories have to be aligned with the skipped action output -->
			<arg>-Odataset=${output_dataset}</arg>
			<arg>-Odataset_to_mdstore=${output_dataset_to_mdstore}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
440

    
441
	<!-- Skips content import when objectstore_service_location is left at $UNDEFINED$; otherwise proceeds to the path setters. -->
	<decision name="decision-import_content_url">
		<switch>
			<case to="skip-import_content_url">${objectstore_service_location eq "$UNDEFINED$"}</case>
			<default to="input_id_mapping-path-setter"/>
		</switch>
	</decision>
447

    
448
	<!--
		Runs ConditionalPropertySetter with captured output.
		The in-case-of-true value is the dedup mapping subdirectory of the metadata import output and
		the else value is $UNDEFINED$; the condition requires both active_import_metadata and
		match_content_with_metadata to equal "true".
		import_content_url reads the 'result' entry of this action's data as input_id_mapping.
	-->
	<action name="input_id_mapping-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-Pcondition=${active_import_metadata eq "true" and match_content_with_metadata eq "true"}</arg>
			<arg>-PinCaseOfTrue=${output_metadataimport_root}/${metadataimport_output_name_dedup_mapping}</arg>
			<arg>-PelseCase=$UNDEFINED$</arg>
			<capture-output />
		</java>
		<ok to="input_id-path-setter" />
		<error to="fail" />
	</action>
460

    
461
	<!--
		Runs ConditionalPropertySetter with captured output.
		The in-case-of-true value is the transformers_idextractor output directory and the else value
		is $UNDEFINED$; the condition is the same as in input_id_mapping-path-setter.
		import_content_url reads the 'result' entry of this action's data as input_id.
	-->
	<action name="input_id-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-Pcondition=${active_import_metadata eq "true" and match_content_with_metadata eq "true"}</arg>
			<arg>-PinCaseOfTrue=${workingDir}/transformers_idextractor/output</arg>
			<arg>-PelseCase=$UNDEFINED$</arg>
			<capture-output />
		</java>
		<ok to="import_content_url" />
		<error to="fail" />
	</action>
473

    
474
	<!--
		Runs the import_content_url sub-workflow found under the application path.
		input_id and input_id_mapping come from the two preceding path-setter actions.
		Output is written below workingDir/import_content_url/imported in the
		pdf, text, html, xmlpmc and wos subdirectories named here; continues at import_urlbased_forking.
	-->
	<action name="import_content_url">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_content_url</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_content_url/working_dir</value>
				</property>
				<property>
					<name>input_id</name>
					<value>${wf:actionData('input_id-path-setter')['result']}</value>
				</property>
				<property>
					<name>input_id_mapping</name>
					<value>${wf:actionData('input_id_mapping-path-setter')['result']}</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/import_content_url/imported</value>
				</property>
				<property>
					<name>output_name_pdf</name>
					<value>pdf</value>
				</property>
				<property>
					<name>output_name_text</name>
					<value>text</value>
				</property>
				<property>
					<name>output_name_html</name>
					<value>html</value>
				</property>
				<property>
					<name>output_name_xml_pmc</name>
					<value>xmlpmc</value>
				</property>
				<property>
					<name>output_name_wos</name>
					<value>wos</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_forking" />
		<error to="fail" />
	</action>
521

    
522
	<!--
		Taken when objectstore_service_location is $UNDEFINED$.
		Clears the import_content_url working directory and the two output directories,
		then runs the Producer with the empty.json resource for output_document_text and
		output_extracted_document_metadata; continues directly at import_joining.
	-->
	<action name="skip-import_content_url">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/import_content_url" />
				<delete path="${nameNode}${output_document_text}" />
				<delete path="${nameNode}${output_extracted_document_metadata}" />
				<mkdir path="${nameNode}${workingDir}/import_content_url" />
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<arg>-C{document_text,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<!-- notice: directories have to be aligned with the skipped action output -->
			<arg>-Odocument_text=${output_document_text}</arg>
			<arg>-Oextracted_document_metadata=${output_extracted_document_metadata}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
546

    
547
	<!-- Starts five parallel branches after content import: plaintext, wos, pmc plaintext, html and metadata extraction. -->
	<fork name="import_urlbased_forking">
		<path start="import_plaintext"/>
		<path start="import_wos"/>
		<path start="import_plaintext_pmc"/>
		<path start="import_html"/>
		<path start="decision-metadata_extractor_use_cache"/>
	</fork>
554

    
555
	<!--
		Runs the import_plaintext sub-workflow on the text subdirectory written by import_content_url.
		Output goes to workingDir/import_plaintext/imported; continues at import_urlbased_joining.
	-->
	<action name="import_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/text</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext/imported</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining" />
		<error to="fail" />
	</action>
578
	
579
	<!--
		Runs the import_plaintext sub-workflow on the wos subdirectory written by import_content_url.
		Output goes to output_wos; continues at import_urlbased_joining.
	-->
	<action name="import_wos">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_wos/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/wos</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_wos}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining" />
		<error to="fail" />
	</action>
602
	
603
	<!--
		Runs the import_plaintext sub-workflow on the xmlpmc subdirectory written by import_content_url.
		Output goes to workingDir/import_plaintext_pmc/imported; continues at ingest_pmc_forking.
	-->
	<action name="import_plaintext_pmc">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext_pmc/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/xmlpmc</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_forking" />
		<error to="fail" />
	</action>
626
	
627
	<!-- Starts two parallel branches on the imported pmc content: plaintext ingestion and the metadata ingestion decision. -->
	<fork name="ingest_pmc_forking">
		<path start="ingest_pmc_plaintext"/>
		<path start="decision-ingest_pmc_metadata"/>
	</fork>
631
	
632
	<!--
		Runs the ingest_pmc_plaintext sub-workflow on the output of import_plaintext_pmc.
		Output goes to workingDir/ingest_pmc_plaintext/imported; continues at ingest_pmc_joining.
	-->
	<action name="ingest_pmc_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<!-- scoped below this action's own directory, as for every other sub-workflow in this file -->
					<value>${workingDir}/ingest_pmc_plaintext/working_dir</value>
				</property>
				<property>
					<name>input_document_nlm</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>output_document_plaintext</name>
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_joining" />
		<error to="fail" />
	</action>
654

    
655
	<!-- Goes to ingest_pmc_metadata only when both active_import_metadata and active_ingest_pmc equal "true"; otherwise to skip-ingest_pmc_metadata. -->
	<decision name="decision-ingest_pmc_metadata">
		<switch>
			<!-- define ingest_pmc_metadata_joining here when introducing pmc metadata ingestion -->
			<case to="ingest_pmc_metadata">${active_import_metadata eq "true" and active_ingest_pmc eq "true"}</case>
			<default to="skip-ingest_pmc_metadata"/>
		</switch>
	</decision>
662

    
663
	<!--
		Runs the ingest_pmc_metadata sub-workflow on the output of import_plaintext_pmc.
		Output goes to workingDir/ingest_pmc_metadata/out; continues at transformers_ingest_pmc_metadata.
	-->
	<action name="ingest_pmc_metadata">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_metadata</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_pmc_metadata/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/ingest_pmc_metadata/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="transformers_ingest_pmc_metadata"/>
		<error to="fail"/>
	</action>
685

    
686
	<!--
		Runs the transformers_ingest_pmc_metadata sub-workflow on the output of ingest_pmc_metadata.
		Output goes to workingDir/transformers_ingest_pmc_metadata/out; continues at ingest_pmc_joining.
	-->
	<action name="transformers_ingest_pmc_metadata">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_ingest_pmc_metadata</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_ingest_pmc_metadata/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/ingest_pmc_metadata/out</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/transformers_ingest_pmc_metadata/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
708

    
709
	<!--
		Replacement for the pmc metadata ingestion chain when it is disabled.
		Recreates the transformers_ingest_pmc_metadata directory, then runs the Producer with the
		empty.json resource into the same out directory that transformers_ingest_pmc_metadata writes to.
	-->
	<action name="skip-ingest_pmc_metadata">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata"/>
				<mkdir path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata" />
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<arg>-C{metadata_pmc,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<!-- notice: directories have to be aligned with the skipped action output -->
			<arg>-Ometadata_pmc=${workingDir}/transformers_ingest_pmc_metadata/out</arg>
		</java>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
727

    
728
	<!-- Joins the branches started by ingest_pmc_forking and hands over to import_urlbased_joining. -->
	<join name="ingest_pmc_joining" to="import_urlbased_joining"/>
729

    
730
	<!-- html import and plaintext ingestion section -->
731
	<!--
		Runs the import_plaintext sub-workflow on the html subdirectory written by import_content_url.
		Output goes to workingDir/import_html/imported; continues at ingest_html_plaintext.
	-->
	<action name="import_html">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_html/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/html</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_html/imported</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="ingest_html_plaintext" />
		<error to="fail" />
	</action>
754

    
755
	
756
	<!--
		Runs the ingest_html_plaintext sub-workflow on the output of import_html.
		Output goes to workingDir/ingest_html_plaintext/imported; continues at import_urlbased_joining.
	-->
	<action name="ingest_html_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_html_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_html_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_html/imported</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/ingest_html_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining" />
		<error to="fail" />
	</action>
778

    
779
	<!-- metadata extraction section -->
780
	<!-- Goes to metadata_extractor when metadataextraction_default_cache_location is $UNDEFINED$, otherwise to metadata_extractor_cached. -->
	<decision name="decision-metadata_extractor_use_cache">
		<switch>
			<case to="metadata_extractor">${metadataextraction_default_cache_location eq "$UNDEFINED$"}</case>
			<default to="metadata_extractor_cached"/>
		</switch>
	</decision>
786

    
787
	<!--
		Default branch of decision-metadata_extractor_use_cache.
		Runs the metadataextraction_cached sub-workflow on the pdf subdirectory written by import_content_url,
		passing the excluded checksums, the file size limit and the cache location.
		Output goes below workingDir/metadata_extractor/out in the meta, plaintext and fault
		subdirectories named here; continues at import_urlbased_joining.
	-->
	<action name="metadata_extractor_cached">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction_cached</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadata_extractor/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/pdf</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_checksums}</value>
				</property>
				<property>
					<name>max_file_size_mb</name>
					<value>${metadataextraction_max_file_size_mb}</value>
				</property>
				<property>
					<name>default_cache_location</name>
					<value>${metadataextraction_default_cache_location}</value>
				</property>
				<property>
					<name>output_name_meta</name>
					<value>meta</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>plaintext</value>
				</property>
				<property>
					<name>output_name_fault</name>
					<value>fault</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/metadata_extractor/out</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail" />
	</action>
834
	
835
	<!-- Metadata extraction variant selected by decision-metadata_extractor_use_cache when no cache
		location is configured. Invokes the metadataextraction sub-workflow on
		${workingDir}/import_content_url/imported/pdf with the streaming mapper and the
		DocumentContentUrl input port class.
		It uses the same workingDir and output_root as metadata_extractor_cached, so downstream
		nodes read ${workingDir}/metadata_extractor/out/{meta,plaintext,fault} whichever variant ran. -->
	<action name="metadata_extractor">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadata_extractor/working_dir</value>
				</property>
				<!-- enabling streaming mode -->
				<property>
					<name>processing_mode</name>
					<value>StreamingMetadataExtractorMapper</value>
				</property>
				<property>
					<name>inputport_classname</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/pdf</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_checksums}</value>
				</property>
				<property>
					<name>max_file_size_mb</name>
					<value>${metadataextraction_max_file_size_mb}</value>
				</property>
				<property>
					<name>output_name_meta</name>
					<value>meta</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>plaintext</value>
				</property>
				<property>
					<name>output_name_fault</name>
					<value>fault</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/metadata_extractor/out</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail" />
	</action>
887
    <!-- end of metadata extraction section -->
888

    
889
	<!-- Join reached from ingest_html_plaintext and from whichever metadata extraction variant ran
		(metadata_extractor or metadata_extractor_cached); continues with the document text union.
		NOTE(review): the fork opening these branches is defined earlier in the file; confirm that
		the set of incoming branches matches it. -->
	<join name="import_urlbased_joining" to="transformers_common_union_document_text"/>
890
    
891
    <!-- merging document text datastores:
    	1) retrieved directly from objectstore
    	2) generated by metadataextraction
    	3) ingested from PMC XMLs
    	4) ingested from HTML
    -->
897
	<!-- Invokes the transformers_common_union4 sub-workflow on four DocumentText datastores and
		writes the result to ${output_document_text}:
			input_a: ${workingDir}/import_plaintext/imported
			input_b: ${workingDir}/metadata_extractor/out/plaintext
			input_c: ${workingDir}/ingest_pmc_plaintext/imported
			input_d: ${workingDir}/ingest_html_plaintext/imported
		On success control moves to extracted_document_metadata_collapser. -->
	<action name="transformers_common_union_document_text">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_common_union4</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_common_union_document_text/working_dir</value>
				</property>
				<property>
					<name>input_a</name>
					<value>${workingDir}/import_plaintext/imported</value>
				</property>
				<property>
					<name>input_b</name>
					<value>${workingDir}/metadata_extractor/out/plaintext</value>
				</property>
				<property>
					<name>input_c</name>
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
				</property>
				<property>
					<name>input_d</name>
					<value>${workingDir}/ingest_html_plaintext/imported</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_document_text}</value>
				</property>
				<!-- record schema shared by all four inputs and the output -->
				<property>
					<name>schema</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="extracted_document_metadata_collapser"/>
		<error to="fail" />
	</action>
935

    
936
	<!-- merging extracted document metadata datastores:
		1) extracted from PDF documents (input_2, origin: cermine)
		2) ingested from PMC documents (input_1, origin: pmc_ingestion)
	-->
940
	<!-- Invokes the multiple_input_collapser sub-workflow on two ExtractedDocumentMetadata
		datastores and writes the result to ${output_extracted_document_metadata}:
			input_1 (origin pmc_ingestion): ${workingDir}/transformers_ingest_pmc_metadata/out
			input_2 (origin cermine):       ${workingDir}/metadata_extractor/out/meta
		Records are grouped on the "id" field (blocking_field).
		On success control moves to the import_joining join. -->
	<action name="extracted_document_metadata_collapser">
		<sub-workflow>
			<app-path>${wf:appPath()}/multiple_input_collapser</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/extracted_document_metadata_collapser/working_dir</value>
				</property>
				<property>
					<name>origin_1</name>
					<value>pmc_ingestion</value>
				</property>
				<property>
					<name>input_1</name>
					<value>${workingDir}/transformers_ingest_pmc_metadata/out</value>
				</property>
				<property>
					<name>origin_2</name>
					<value>cermine</value>
				</property>
				<property>
					<name>input_2</name>
					<value>${workingDir}/metadata_extractor/out/meta</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_extracted_document_metadata}</value>
				</property>
				<property>
					<name>blocking_field</name>
					<value>id</value>
				</property>
				<property>
					<name>schema_input</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
				</property>
				<property>
					<name>schema_input_envelope</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadataEnvelope</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
986

    
987
	<!-- Join reached from extracted_document_metadata_collapser; continues with init-faults-dir.
		NOTE(review): the matching fork and any other incoming branches are defined earlier in
		the file and are not visible from this section; confirm against it. -->
	<join name="import_joining" to="init-faults-dir"/>
988
    
989
    <!-- Resets the faults output location: deletes ${nameNode}${output_faults} and then
         recreates it empty, so preserve-faults copies into a directory holding nothing
         from an earlier run. -->
    <action name="init-faults-dir">
        <fs>
            <delete path="${nameNode}${output_faults}" />
            <mkdir path="${nameNode}${output_faults}" />
        </fs>
        <ok to="preserve-faults"/>
        <error to="fail"/>
    </action>
997
    
998
    <!-- Copies the metadata extraction fault datastore out of the working directory with distcp:
           source: ${nameNode}${workingDir}/metadata_extractor/out/fault
           target: ${nameNode}${output_faults}/metadataextraction
         Runs before finalize, which may delete ${workingDir}. -->
    <action name="preserve-faults">
        <distcp xmlns="uri:oozie:distcp-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}${workingDir}/metadata_extractor/out/fault</arg>
            <arg>${nameNode}${output_faults}/metadataextraction</arg>
        </distcp>
        <ok to="finalize"/>
        <error to="fail"/>
    </action>
1008

    
1009
    <!-- Final routing: when remove_sideproducts equals "true" the working directory is cleaned
         up by the remove_sideproducts action; otherwise the workflow ends directly. -->
    <decision name="finalize">
        <switch>
            <case to="remove_sideproducts">${remove_sideproducts eq "true"}</case>
            <default to="end" />
        </switch>
    </decision>
1015
	
1016
	<!-- Deletes the whole working directory ${nameNode}${workingDir}, i.e. every intermediate
		datastore written under it by the nodes of this workflow. Reached only from the finalize
		decision when remove_sideproducts is "true". -->
	<action name="remove_sideproducts">
		<fs>
			<delete path="${nameNode}${workingDir}" />
		</fs>
		<ok to="end" />
		<error to="fail" />
	</action>
1023
    
1024
	<!-- Kill node targeted by the error transition of every action in this section; the message
		embeds the error message of the last node that failed. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
1028
	<!-- End node of the workflow; reached from the finalize decision (default branch) or from
		remove_sideproducts on success. -->
	<end name="end" />
1029
</workflow-app>