Project

General

Profile

1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_common_import">
2
	<!--
		Formal parameters of the workflow.
		Properties with a <value> are optional (the value is the default);
		properties without a <value> have no default and must be supplied by the caller.
		The literal $UNDEFINED$ is used throughout as the "not set" marker.
	-->
	<parameters>
		<!-- importing modes -->
		<property>
			<name>active_import_metadata</name>
			<value>false</value>
			<description>flag indicating HBase metadata import should be enabled, when set to false db-based project import will be performed</description>
		</property>
		<property>
			<name>active_import_dataset</name>
			<value>false</value>
			<description>flag indicating dataset import should be enabled</description>
		</property>
		<property>
			<name>active_ingest_pmc</name>
			<value>false</value>
			<description>flag indicating pmc metadata and citations ingestions should be performed</description>
		</property>
		<property>
			<name>active_import_concept</name>
			<value>false</value>
			<description>flag indicating concept import should be executed</description>
		</property>
		<property>
			<name>match_content_with_metadata</name>
			<value>true</value>
			<description>flag indicating contents should be filtered and their identifiers should be deduplicated against metadata entries retrieved from InformationSpace.
			This way only contents having metadata representation will be processed.
			To be disabled when processing new contents whose metadata is not available in hbase or when original identifiers should be preserved (contents will not be filtered as well).</description>
		</property>
		<!-- import metadata related -->
		<property>
			<name>hbase_input_table</name>
			<value>$UNDEFINED$</value>
			<description>HBase input table holding InformationSpace, available on local cluster</description>
		</property>
		<property>
			<name>hbase_approved_datasources_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of datasource ids to be approved during import. Applied on result and person entities.</description>
		</property>
		<property>
			<name>inference_provenance_blacklist</name>
			<value>iis::.*</value>
			<description>list of blacklisted inference provenance which should not be taken into account by importer, skipped when set to $UNDEFINED$</description>
		</property>
		<property>
			<name>trust_level_threshold</name>
			<value>$UNDEFINED$</value>
			<description>trust level threshold represented as float value, ignored when set to $UNDEFINED$ value</description>
		</property>
		<property>
			<name>merge_body_with_updates</name>
			<value>false</value>
			<description>flag indicating Oaf objects stored in body qualifier should be merged with Oaf objects stored in update qualifier</description>
		</property>
		<!-- import concepts related -->
		<property>
			<name>islookup_service_location</name>
			<value>$UNDEFINED$</value>
			<description>IS Lookup service location, required only when active_import_concept is set to true</description>
		</property>
		<property>
			<name>project_concepts_context_ids_csv</name>
			<value>$UNDEFINED$</value>
			<description>comma separated list of concepts context identifiers to be picked by ISLookup, required only when active_import_concept is set to true</description>
		</property>
		<!-- import project related -->
		<!-- will be used when active_import_metadata=false -->
		<property>
			<name>database_service_location</name>
			<value>$UNDEFINED$</value>
			<description>Database service (not WSDL) location URL</description>
		</property>
		<property>
			<name>database_dbname</name>
			<value>dnet_openaireplus_node6_t</value>
			<description>database name</description>
		</property>
		<!-- import datacite related -->
		<property>
			<name>mdstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>MDStore service (not WSDL) location URL</description>
		</property>
		<property>
			<name>dataset_mdstore_ids_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of MDStore identifiers</description>
		</property>
		<!-- import content related -->
		<property>
			<name>objectstore_service_location</name>
			<value>$UNDEFINED$</value>
			<description>object store service location required for content retrieval, content import is skipped when set to $UNDEFINED$</description>
		</property>
		<property>
			<name>approved_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV list of object stores identifiers to be processed</description>
		</property>
		<property>
			<name>mimetypes_pdf</name>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>mimetypes_text</name>
			<description>text mime types</description>
		</property>
		<property>
			<name>mimetypes_html</name>
			<description>html mime types</description>
		</property>
		<property>
			<name>mimetypes_xml_pmc</name>
			<description>xml pmc types</description>
		</property>
		<property>
			<name>mimetypes_wos</name>
			<description>wos types</description>
		</property>
		<!-- import timeouts related -->
		<property>
			<name>resultset_client_read_timeout</name>
			<value>60000</value>
			<description>resultset client read timeout</description>
		</property>
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>import content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>import content read timeout</description>
		</property>
		<!-- metadata extraction related -->
		<property>
			<name>metadataextraction_excluded_checksums</name>
			<value>$UNDEFINED$</value>
			<description>list of content checksums excluded from metadataextraction processing</description>
		</property>
		<property>
			<name>metadataextraction_max_file_size_mb</name>
			<value>$UNDEFINED$</value>
			<description>maximum allowed file size in Megabytes</description>
		</property>
		<property>
			<name>metadataextraction_default_cache_location</name>
			<value>/cache/metadataextraction</value>
			<description>metadata extraction HDFS cache location</description>
		</property>
		<!-- metadata import output subdirectory names -->
		<property>
			<name>metadataimport_output_name_document_meta</name>
			<value>docmeta</value>
			<description>metadata import docmeta output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_document_project</name>
			<value>docproject</value>
			<description>metadata import document to project relation subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_project</name>
			<value>project</value>
			<description>metadata import project output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_person</name>
			<value>person</value>
			<description>metadata import person output subdirectory name</description>
		</property>
		<property>
			<name>metadataimport_output_name_dedup_mapping</name>
			<value>dedupmapping</value>
			<description>metadata import deduplication mapping output subdirectory name</description>
		</property>
		<!-- output parameters -->
		<property>
			<name>output_extracted_document_metadata</name>
			<description>extracted document metadata output directory</description>
		</property>
		<property>
			<name>output_metadataimport_root</name>
			<value>$UNDEFINED$</value>
			<description>metadata importer output root directory. When ${active_import_metadata}=true all metadata import subdirectories are written under it,
			otherwise db-based project import writes its output to the project subdirectory of this root, so a real location is needed in both modes</description>
		</property>
		<property>
			<name>output_citation_pmc</name>
			<description>PMC citation output directory, written by PMC citation ingestion which runs only when both ${active_import_metadata}=true and ${active_ingest_pmc}=true</description>
		</property>
		<property>
			<name>output_dataset</name>
			<description>dataset importer output directory holding dataset metadata, written by dataset import when ${active_import_dataset}=true and by the skip-import_dataset placeholder producer otherwise</description>
		</property>
		<property>
			<name>output_dataset_to_mdstore</name>
			<description>dataset importer output directory holding dataset to mdstore mappings, written by dataset import when ${active_import_dataset}=true and by the skip-import_dataset placeholder producer otherwise</description>
		</property>
		<property>
			<name>output_document_text</name>
			<description>text import output directory. merged from three different sources</description>
		</property>
		<property>
			<name>output_wos</name>
			<description>wos import output directory</description>
		</property>
		<property>
			<name>output_project_concept</name>
			<description>project concepts output directory</description>
		</property>
		<property>
			<name>output_faults</name>
			<description>processing faults output directory</description>
		</property>
		<property>
			<name>remove_sideproducts</name>
			<value>true</value>
			<description>flag indicating whole workingDir will be erased.
			Notice: do not provide any output directory location pointing to workingDir subdirectory!</description>
		</property>
	</parameters>
225
	
226
	<!-- Settings shared by every action of this workflow: job tracker, name node and the MapReduce queue. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
236
	 
237
	<!-- entry point: execution begins at the import_forking fork -->
	<start to="import_forking"/>
238
	
239
	<!-- Starts three parallel branches: concept import, metadata/project import and dataset import. -->
	<fork name="import_forking">
		<path start="decision-import_concept"/>
		<path start="decision-metadata_importer"/>
		<path start="decision-import_dataset"/>
	</fork>
244
	
245
	<!-- Concept import runs only when active_import_concept is "true"; otherwise the skip action is taken. -->
	<decision name="decision-import_concept">
		<switch>
			<case to="import_concept">${active_import_concept eq "true"}</case>
			<default to="skip-import_concept"/>
		</switch>
	</decision>
251
	
252
	<!--
		Runs the import_concept sub-workflow located under the application path.
		The parent configuration is propagated; the properties below override or rename
		selected entries (project_concepts_context_ids_csv is handed over as context_ids_csv).
	-->
	<action name="import_concept">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_concept</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_concept/working_dir</value>
				</property>
				<property>
					<name>islookup_service_location</name>
					<value>${islookup_service_location}</value>
				</property>
				<property>
					<name>context_ids_csv</name>
					<value>${project_concepts_context_ids_csv}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_project_concept}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
278
	
279
	<!--
		Taken instead of import_concept: cleans the concept working and output directories,
		then runs the Producer java node fed with empty.json so that output_project_concept still gets written.
	-->
	<action name="skip-import_concept">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/import_concept"/>
				<delete path="${nameNode}${output_project_concept}"/>
				<mkdir path="${nameNode}${workingDir}/import_concept"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- port specification: name, schema class, classpath resource with the content -->
			<arg>-C{concept,
				eu.dnetlib.iis.importer.schemas.Concept,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Oconcept=${output_project_concept}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
297
	
298
	<!-- Chooses between HBase metadata import (active_import_metadata is "true") and db-based project import. -->
	<decision name="decision-metadata_importer">
		<switch>
			<case to="metadata_importer">${active_import_metadata eq "true"}</case>
			<default to="import_project"/>
		</switch>
	</decision>
304
	
305
	<!--
		Runs the import_mapred sub-workflow. Output goes to output_metadataimport_root;
		the names of its subdirectories are taken from the metadataimport_output_name_* parameters.
		On success the identifier extraction step follows.
	-->
	<action name="metadata_importer">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_mapred</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import/working_dir</value>
				</property>
				<property>
					<name>approved_datasources_csv</name>
					<value>${hbase_approved_datasources_csv}</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_metadataimport_root}</value>
				</property>
				<!-- subdirectory names -->
				<property>
					<name>output_name_document_meta</name>
					<value>${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>output_name_document_project</name>
					<value>${metadataimport_output_name_document_project}</value>
				</property>
				<property>
					<name>output_name_project</name>
					<value>${metadataimport_output_name_project}</value>
				</property>
				<property>
					<name>output_name_person</name>
					<value>${metadataimport_output_name_person}</value>
				</property>
				<property>
					<name>output_name_dedup_mapping</name>
					<value>${metadataimport_output_name_dedup_mapping}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="transformers-idextractor"/>
		<error to="fail"/>
	</action>
349
	
350
	<!--
		Runs the transformers_idextractor sub-workflow on the document metadata subdirectory of the
		metadata import output and writes identifiers to workingDir/transformers_idextractor/output.
	-->
	<action name="transformers-idextractor">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_idextractor</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_idextractor/working_dir</value>
				</property>
				<property>
					<name>input_document_metadata</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>output_identifier</name>
					<value>${workingDir}/transformers_idextractor/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="decision-import_content_url"/>
		<error to="fail"/>
	</action>
372
	
373
	<!--
		Default target of decision-metadata_importer: runs the import_project sub-workflow and
		writes to the project subdirectory of output_metadataimport_root.
	-->
	<action name="import_project">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_project</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_project/working_dir</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_project}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="decision-import_content_url"/>
		<error to="fail"/>
	</action>
392
	
393
	<!-- Dataset import runs only when active_import_dataset is "true"; otherwise the skip action is taken. -->
	<decision name="decision-import_dataset">
		<switch>
			<case to="import_dataset">${active_import_dataset eq "true"}</case>
			<default to="skip-import_dataset"/>
		</switch>
	</decision>
399
	
400
	<!--
		Runs the import_dataset sub-workflow; dataset_mdstore_ids_csv is handed over as mdstore_ids_csv,
		everything else reaches the sub-workflow through configuration propagation.
	-->
	<action name="import_dataset">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_dataset</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_dataset/working_dir</value>
				</property>
				<property>
					<name>mdstore_ids_csv</name>
					<value>${dataset_mdstore_ids_csv}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
419

    
420
	<!--
		Taken instead of import_dataset: cleans the dataset working and output directories, then runs
		the Producer java node fed with empty.json for both dataset output ports.
	-->
	<action name="skip-import_dataset">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/import_dataset"/>
				<delete path="${nameNode}${output_dataset}"/>
				<delete path="${nameNode}${output_dataset_to_mdstore}"/>
				<mkdir path="${nameNode}${workingDir}/import_dataset"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- port specifications: name, schema class, classpath resource with the content -->
			<arg>-C{dataset,
				eu.dnetlib.iis.importer.schemas.DataSetReference,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{dataset_to_mdstore,
				eu.dnetlib.iis.importer.schemas.DocumentToMDStore,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<!-- notice: output locations have to be aligned with the skipped action output -->
			<arg>-Odataset=${output_dataset}</arg>
			<arg>-Odataset_to_mdstore=${output_dataset_to_mdstore}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
444

    
445
	<!-- Content import is skipped when no object store service location was provided (value left at $UNDEFINED$). -->
	<decision name="decision-import_content_url">
		<switch>
			<case to="skip-import_content_url">${objectstore_service_location eq "$UNDEFINED$"}</case>
			<default to="input_id_mapping-path-setter"/>
		</switch>
	</decision>
451

    
452
	<!--
		Runs ConditionalPropertySetter with captured output; import_content_url later reads the
		'result' entry of this action's data as its input_id_mapping.
		Condition true: the dedup mapping subdirectory of the metadata import output; otherwise $UNDEFINED$.
	-->
	<action name="input_id_mapping-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-Pcondition=${active_import_metadata eq "true" and match_content_with_metadata eq "true"}</arg>
			<arg>-PinCaseOfTrue=${output_metadataimport_root}/${metadataimport_output_name_dedup_mapping}</arg>
			<arg>-PelseCase=$UNDEFINED$</arg>
			<capture-output/>
		</java>
		<ok to="input_id-path-setter"/>
		<error to="fail"/>
	</action>
464

    
465
	<!--
		Runs ConditionalPropertySetter with captured output; import_content_url later reads the
		'result' entry of this action's data as its input_id.
		Condition true: the identifier extractor output directory; otherwise $UNDEFINED$.
	-->
	<action name="input_id-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-Pcondition=${active_import_metadata eq "true" and match_content_with_metadata eq "true"}</arg>
			<arg>-PinCaseOfTrue=${workingDir}/transformers_idextractor/output</arg>
			<arg>-PelseCase=$UNDEFINED$</arg>
			<capture-output/>
		</java>
		<ok to="import_content_url"/>
		<error to="fail"/>
	</action>
477

    
478
	<!--
		Runs the import_content_url sub-workflow. input_id and input_id_mapping come from the captured
		output of the two path-setter actions. Results land under workingDir/import_content_url/imported,
		one subdirectory per content kind (pdf, text, html, xmlpmc, wos).
	-->
	<action name="import_content_url">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_content_url</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_content_url/working_dir</value>
				</property>
				<!-- inputs resolved by the preceding path-setter actions -->
				<property>
					<name>input_id</name>
					<value>${wf:actionData('input_id-path-setter')['result']}</value>
				</property>
				<property>
					<name>input_id_mapping</name>
					<value>${wf:actionData('input_id_mapping-path-setter')['result']}</value>
				</property>
				<!-- output root and per-kind subdirectory names -->
				<property>
					<name>output_root</name>
					<value>${workingDir}/import_content_url/imported</value>
				</property>
				<property>
					<name>output_name_pdf</name>
					<value>pdf</value>
				</property>
				<property>
					<name>output_name_text</name>
					<value>text</value>
				</property>
				<property>
					<name>output_name_html</name>
					<value>html</value>
				</property>
				<property>
					<name>output_name_xml_pmc</name>
					<value>xmlpmc</value>
				</property>
				<property>
					<name>output_name_wos</name>
					<value>wos</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_forking"/>
		<error to="fail"/>
	</action>
525

    
526
	<!--
		Taken instead of the whole content import chain: cleans the content import working directory and
		the document text / extracted metadata outputs, then runs the Producer java node fed with
		empty.json for both of those outputs.
	-->
	<action name="skip-import_content_url">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action output -->
				<delete path="${nameNode}${workingDir}/import_content_url"/>
				<delete path="${nameNode}${output_document_text}"/>
				<delete path="${nameNode}${output_extracted_document_metadata}"/>
				<mkdir path="${nameNode}${workingDir}/import_content_url"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- port specifications: name, schema class, classpath resource with the content -->
			<arg>-C{document_text,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<!-- notice: output locations have to be aligned with the skipped action output -->
			<arg>-Odocument_text=${output_document_text}</arg>
			<arg>-Oextracted_document_metadata=${output_extracted_document_metadata}</arg>
		</java>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
550

    
551
	<!-- Starts five parallel branches that consume the imported content. -->
	<fork name="import_urlbased_forking">
		<path start="import_plaintext"/>
		<path start="import_wos"/>
		<path start="import_plaintext_pmc"/>
		<path start="import_html"/>
		<path start="decision-metadata_extractor_use_cache"/>
	</fork>
558

    
559
	<!-- Runs the import_plaintext sub-workflow on the "text" subdirectory of the imported content. -->
	<action name="import_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/text</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext/imported</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail"/>
	</action>
582
	
583
	<!--
		Runs the same import_plaintext sub-workflow application, this time on the "wos" subdirectory
		of the imported content; the result is written directly to output_wos.
	-->
	<action name="import_wos">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_wos/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/wos</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_wos}</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail"/>
	</action>
606
	
607
	<!--
		Runs the import_plaintext sub-workflow application on the "xmlpmc" subdirectory of the imported
		content; on success the PMC ingestion fork follows.
	-->
	<action name="import_plaintext_pmc">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_plaintext_pmc/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/xmlpmc</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_forking"/>
		<error to="fail"/>
	</action>
630
	
631
	<!-- Starts two parallel branches: PMC plaintext ingestion and the (conditional) PMC metadata/citation ingestion. -->
	<fork name="ingest_pmc_forking">
		<path start="ingest_pmc_plaintext"/>
		<path start="decision-ingest_pmc_metadata"/>
	</fork>
635
	
636
	<!--
		Runs the ingest_pmc_plaintext sub-workflow on the records produced by import_plaintext_pmc
		and writes the plaintext to workingDir/ingest_pmc_plaintext/imported.
	-->
	<action name="ingest_pmc_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<!-- must stay below this workflow's workingDir, in the action's own subdirectory, like for every other sub-workflow -->
					<value>${workingDir}/ingest_pmc_plaintext/working_dir</value>
				</property>
				<property>
					<name>input_document_nlm</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>output_document_plaintext</name>
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
658

    
659
	<!-- The PMC metadata/citation chain runs only when both active_import_metadata and active_ingest_pmc are "true". -->
	<decision name="decision-ingest_pmc_metadata">
		<switch>
			<!-- define ingest_pmc_metadata_joining here when introducing pmc metadata ingestion -->
			<case to="transformers-doitooaid">${active_import_metadata eq "true" and active_ingest_pmc eq "true"}</case>
			<default to="skip-ingest_pmc_citations"/>
		</switch>
	</decision>
666

    
667
	<!--
		Runs the transformers_externalidtooaid sub-workflow with external_id_type set to doi on the
		document metadata subdirectory of the metadata import output.
	-->
	<action name="transformers-doitooaid">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_externalidtooaid</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_doitooaid/working_dir</value>
				</property>
				<property>
					<name>input_document_metadata</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value>
				</property>
				<property>
					<name>external_id_type</name>
					<value>doi</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/transformers_doitooaid/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_metadata"/>
		<error to="fail"/>
	</action>
693

    
694
	<!-- Runs the ingest_pmc_metadata sub-workflow on the records produced by import_plaintext_pmc. -->
	<action name="ingest_pmc_metadata">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_metadata</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_pmc_metadata/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_plaintext_pmc/imported</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/ingest_pmc_metadata/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="collapse_pmc_metadata"/>
		<error to="fail"/>
	</action>
716

    
717
	<!--
		Runs the basic_collapser sub-workflow on the ingest_pmc_metadata output, configured with the
		ExtractedDocumentMetadata schema, "id" as blocking field and journal, references, pages as
		significant fields.
	-->
	<action name="collapse_pmc_metadata">
		<sub-workflow>
			<app-path>${wf:appPath()}/basic_collapser</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/collapse_pmc_metadata/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/ingest_pmc_metadata/out</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/collapse_pmc_metadata/out</value>
				</property>
				<!-- collapser settings -->
				<property>
					<name>schema</name>
					<value>eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata</value>
				</property>
				<property>
					<name>blocking_field</name>
					<value>id</value>
				</property>
				<property>
					<name>significant_fields</name>
					<value>journal,references,pages</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="transformers_ingest_pmc_metadata"/>
		<error to="fail"/>
	</action>
751

    
752
	<!-- Runs the transformers_ingest_pmc_metadata sub-workflow on the collapsed PMC metadata. -->
	<action name="transformers_ingest_pmc_metadata">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_ingest_pmc_metadata</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_ingest_pmc_metadata/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/collapse_pmc_metadata/out</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/transformers_ingest_pmc_metadata/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_idmapping_pmidtooaid"/>
		<error to="fail"/>
	</action>
774

    
775
	<!-- Runs the ingest_pmc_idmapping_pmidtooaid sub-workflow on the collapsed PMC metadata. -->
	<action name="ingest_pmc_idmapping_pmidtooaid">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_idmapping_pmidtooaid</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/collapse_pmc_metadata/out</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/out</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_citations"/>
		<error to="fail"/>
	</action>
797
    
798
	<!-- Runs the ingest_pmc_citations sub-workflow. Inputs: collapsed PMC metadata, the dedup
		mapping from the metadata import output root, and the DOI and PMID to OA id mappings.
		The resulting citations are written to ${output_citation_pmc}. -->
	<action name="ingest_pmc_citations">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_pmc_citations</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_pmc_citations/working_dir</value>
				</property>
				<property>
					<name>input_extracted_document_metadata</name>
					<value>${workingDir}/collapse_pmc_metadata/out</value>
				</property>
				<property>
					<name>input_dedup_map</name>
					<value>${output_metadataimport_root}/${metadataimport_output_name_dedup_mapping}</value>
				</property>
				<property>
					<name>input_doi_to_oaid</name>
					<value>${workingDir}/transformers_doitooaid/out</value>
				</property>
				<property>
					<name>input_pmid_to_oaid</name>
					<value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/out</value>
				</property>
				<property>
					<name>output_citation</name>
					<value>${output_citation_pmc}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="ingest_pmc_joining" />
		<error to="fail" />
	</action>
832

    
833
	<!-- Replacement for the PMC ingestion chain when it is skipped: recreates the directories the
		skipped actions would have used and runs the Producer with empty.json for both the
		citation and the PMC metadata ports, so that downstream nodes find their inputs. -->
	<action name="skip-ingest_pmc_citations">
		<java>
			<prepare>
				<!-- notice: directories have to be aligned with the skipped action's output -->
				<delete path="${nameNode}${workingDir}/ingest_pmc_citations" />
				<delete path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata"/>
				<delete path="${nameNode}${output_citation_pmc}"/>
				<mkdir path="${nameNode}${workingDir}/ingest_pmc_citations" />
				<mkdir path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata" />
				<mkdir path="${nameNode}${output_citation_pmc}"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<arg>-C{citation_pmc,
				eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{metadata_pmc,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<!-- notice: output paths have to be aligned with the skipped action's output -->
			<arg>-Ocitation_pmc=${output_citation_pmc}</arg>
			<arg>-Ometadata_pmc=${workingDir}/transformers_ingest_pmc_metadata/out</arg>
		</java>
		<ok to="ingest_pmc_joining"/>
		<error to="fail"/>
	</action>
859

    
860
	<!-- Target of both ingest_pmc_citations and skip-ingest_pmc_citations; hands over to
		import_urlbased_joining. -->
	<join name="ingest_pmc_joining" to="import_urlbased_joining"/>
861

    
862
	<!-- html import and plaintext ingestion section -->
863
	<!-- Imports HTML contents by running the import_plaintext sub-workflow on the html part of
		the import_content_url output; the result feeds ingest_html_plaintext. -->
	<action name="import_html">
		<sub-workflow>
			<app-path>${wf:appPath()}/import_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/import_html/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/html</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/import_html/imported</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="ingest_html_plaintext" />
		<error to="fail" />
	</action>
886

    
887
	
888
	<!-- Runs the ingest_html_plaintext sub-workflow on the datastore produced by import_html.
		Its output is read as input_d of transformers_common_union_document_text. -->
	<action name="ingest_html_plaintext">
		<sub-workflow>
			<app-path>${wf:appPath()}/ingest_html_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/ingest_html_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_html/imported</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/ingest_html_plaintext/imported</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining" />
		<error to="fail" />
	</action>
910

    
911
	<!-- metadata extraction section -->
912
	<!-- Selects the metadata extraction variant: the plain extractor when no cache location is
		configured (parameter left at $UNDEFINED$), the cached extractor otherwise. -->
	<decision name="decision-metadata_extractor_use_cache">
		<switch>
			<case to="metadata_extractor">${metadataextraction_default_cache_location eq "$UNDEFINED$"}</case>
			<default to="metadata_extractor_cached"/>
		</switch>
	</decision>
918

    
919
	<!-- Cache-backed metadata extraction over the imported PDF contents. It uses the same
		workingDir and output_root as the metadata_extractor action, so downstream nodes read
		meta, plaintext and fault from ${workingDir}/metadata_extractor/out in either case. -->
	<action name="metadata_extractor_cached">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction_cached</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadata_extractor/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/pdf</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_checksums}</value>
				</property>
				<property>
					<name>max_file_size_mb</name>
					<value>${metadataextraction_max_file_size_mb}</value>
				</property>
				<property>
					<name>default_cache_location</name>
					<value>${metadataextraction_default_cache_location}</value>
				</property>
				<property>
					<name>output_name_meta</name>
					<value>meta</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>plaintext</value>
				</property>
				<property>
					<name>output_name_fault</name>
					<value>fault</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/metadata_extractor/out</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail" />
	</action>
966
	
967
	<!-- Non-cached metadata extraction over the imported PDF contents, run in streaming mode on
		DocumentContentUrl records. Writes meta, plaintext and fault datastores under
		${workingDir}/metadata_extractor/out. -->
	<action name="metadata_extractor">
		<sub-workflow>
			<app-path>${wf:appPath()}/metadataextraction</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/metadata_extractor/working_dir</value>
				</property>
				<!-- enabling streaming mode -->
				<property>
					<name>processing_mode</name>
					<value>StreamingMetadataExtractorMapper</value>
				</property>
				<property>
					<name>inputport_classname</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/import_content_url/imported/pdf</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${metadataextraction_excluded_checksums}</value>
				</property>
				<property>
					<name>max_file_size_mb</name>
					<value>${metadataextraction_max_file_size_mb}</value>
				</property>
				<property>
					<name>output_name_meta</name>
					<value>meta</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>plaintext</value>
				</property>
				<property>
					<name>output_name_fault</name>
					<value>fault</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/metadata_extractor/out</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="import_urlbased_joining"/>
		<error to="fail" />
	</action>
1019
    <!-- end of metadata extraction section -->
1020

    
1021
	<!-- Meeting point of the url-based import branches visible in this file: ingest_pmc_joining,
		ingest_html_plaintext and both metadata extractor variants transition here. -->
	<join name="import_urlbased_joining" to="transformers_common_union_document_text"/>
1022
    
1023
    <!-- merging document text datastores:
        1) retrieved directly from objectstore (input_a)
        2) generated by metadataextraction (input_b)
        3) ingested from PMC XMLs (input_c)
        4) ingested from HTML (input_d)
    -->
    <action name="transformers_common_union_document_text">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_common_union4</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/transformers_common_union_document_text/working_dir</value>
                </property>
                <property>
                    <name>input_a</name>
                    <value>${workingDir}/import_plaintext/imported</value>
                </property>
                <property>
                    <name>input_b</name>
                    <value>${workingDir}/metadata_extractor/out/plaintext</value>
                </property>
                <property>
                    <name>input_c</name>
                    <value>${workingDir}/ingest_pmc_plaintext/imported</value>
                </property>
                <property>
                    <name>input_d</name>
                    <value>${workingDir}/ingest_html_plaintext/imported</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${output_document_text}</value>
                </property>
                <property>
                    <name>schema</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="extracted_document_metadata_collapser"/>
        <error to="fail" />
    </action>
1067

    
1068
	<!-- merging extracted document metadata datastores:
		1) ingested from PMC documents (input_1, origin pmc_ingestion)
		2) extracted from PDF documents (input_2, origin cermine)
		Records are grouped on the id field; the result is written to
		${output_extracted_document_metadata}.
	-->
	<action name="extracted_document_metadata_collapser">
		<sub-workflow>
			<app-path>${wf:appPath()}/multiple_input_collapser</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/extracted_document_metadata_collapser/working_dir</value>
				</property>
				<property>
					<name>origin_1</name>
					<value>pmc_ingestion</value>
				</property>
				<property>
					<name>input_1</name>
					<value>${workingDir}/transformers_ingest_pmc_metadata/out</value>
				</property>
				<property>
					<name>origin_2</name>
					<value>cermine</value>
				</property>
				<property>
					<name>input_2</name>
					<value>${workingDir}/metadata_extractor/out/meta</value>
				</property>
				<property>
					<name>output</name>
					<value>${output_extracted_document_metadata}</value>
				</property>
				<property>
					<name>blocking_field</name>
					<value>id</value>
				</property>
				<property>
					<name>schema_input</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
				</property>
				<property>
					<name>schema_input_envelope</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadataEnvelope</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="import_joining"/>
		<error to="fail"/>
	</action>
1118

    
1119
	<!-- extracted_document_metadata_collapser transitions here; once joined, the workflow
		proceeds to fault preservation starting at init-faults-dir. -->
	<join name="import_joining" to="init-faults-dir"/>
1120
    
1121
    <!-- Recreates ${output_faults} as an empty directory before faults are copied into it. -->
    <action name="init-faults-dir">
        <fs>
            <delete path="${nameNode}${output_faults}" />
            <mkdir path="${nameNode}${output_faults}" />
        </fs>
        <ok to="preserve-faults"/>
        <error to="fail"/>
    </action>
1129
    
1130
    <!-- Copies the fault datastore written by metadata extraction out of the working directory
        into ${output_faults}/metadataextraction, so it survives remove_sideproducts. -->
    <action name="preserve-faults">
        <distcp xmlns="uri:oozie:distcp-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}${workingDir}/metadata_extractor/out/fault</arg>
            <arg>${nameNode}${output_faults}/metadataextraction</arg>
        </distcp>
        <ok to="finalize"/>
        <error to="fail"/>
    </action>
1140

    
1141
    <!-- Final routing: delete the working directory only when remove_sideproducts is "true",
        otherwise finish immediately. -->
    <decision name="finalize">
        <switch>
            <case to="remove_sideproducts">${remove_sideproducts eq "true"}</case>
            <default to="end" />
        </switch>
    </decision>
1147
	
1148
	<!-- Deletes the whole ${workingDir}, i.e. every intermediate datastore written by the
		actions of this workflow. -->
	<action name="remove_sideproducts">
		<fs>
			<delete path="${nameNode}${workingDir}" />
		</fs>
		<ok to="end" />
		<error to="fail" />
	</action>
1155
    
1156
	<!-- Common error target of the actions in this workflow; reports the error message of the
		last node that failed. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
1160
	<!-- Successful termination node, reached from finalize or remove_sideproducts. -->
	<end name="end" />
1161
</workflow-app>
(2-2/2)