Project

General

Profile

« Previous | Next » 

Revision 39164

Added by Marek Horst over 8 years ago

merging trunk changes with IIS-CDH-5.3.0 branch

View differences:

workflow.xml
188 188
			<description>metadata importer output root directory, required when ${active_import_metadata}=true</description>
189 189
		</property>
190 190
		<property>
191
			<name>output_citation_pmc</name>
192
			<description>PMC citation output directory, required when ${active_import_metadata}=true</description>
193
		</property>
194
		<property>
195 191
			<name>output_dataset</name>
196 192
			<description>dataset importer output directory holding dataset metadata, required when ${active_import_dataset}=true</description>
197 193
		</property>
......
312 308
                    <value>${workingDir}/import/working_dir</value>
313 309
                </property>
314 310
				<property>
315
					<name>hbase_input_table</name>
316
					<value>${hbase_input_table}</value>
317
				</property>
318
				<property>
319 311
					<name>approved_datasources_csv</name>
320 312
					<value>${hbase_approved_datasources_csv}</value>
321 313
				</property>
322 314
				<property>
323
					<name>inference_provenance_blacklist</name>
324
					<value>${inference_provenance_blacklist}</value>
325
				</property>
326
				<property>
327
					<name>trust_level_threshold</name>
328
					<value>${trust_level_threshold}</value>
329
				</property>
330
				<property>
331
					<name>merge_body_with_updates</name>
332
					<value>${merge_body_with_updates}</value>
333
				</property>
334
				<property>
335 315
					<name>output</name>
336 316
					<value>${output_metadataimport_root}</value>
337 317
				</property>
......
356 336
		            <name>output_name_dedup_mapping</name>
357 337
		            <value>${metadataimport_output_name_dedup_mapping}</value>
358 338
		        </property>
339
		        <!-- all the other properties are automatically propagated -->
359 340
			</configuration>
360 341
        </sub-workflow>
361 342
		<ok to="transformers-idextractor" />
......
394 375
                    <name>workingDir</name>
395 376
                    <value>${workingDir}/import_project/working_dir</value>
396 377
                </property>
397
                <property>
398
					<name>database_service_location</name>
399
					<value>${database_service_location}</value>
400
				</property>
401
				<property>
402
					<name>database_name</name>
403
					<value>${database_dbname}</value>
404
				</property>
405
				<property>
406
					<name>resultset_client_read_timeout</name>
407
					<value>${resultset_client_read_timeout}</value>
408
				</property>
409 378
            	<property>
410 379
					<name>output</name>
411 380
					<value>${output_metadataimport_root}/${metadataimport_output_name_project}</value>
412 381
				</property>
382
				<!-- all the other properties are automatically propagated -->
413 383
			</configuration>
414 384
        </sub-workflow>
415 385
		<ok to="decision-import_content_url" />
......
432 402
                    <name>workingDir</name>
433 403
                    <value>${workingDir}/import_dataset/working_dir</value>
434 404
                </property>
435
                <property>
436
					<name>mdstore_service_location</name>
437
					<value>${mdstore_service_location}</value>
438
				</property>
439 405
				<property>
440 406
					<name>mdstore_ids_csv</name>
441 407
					<value>${dataset_mdstore_ids_csv}</value>
442 408
				</property>
443
				<property>
444
					<name>resultset_client_read_timeout</name>
445
					<value>${resultset_client_read_timeout}</value>
446
				</property>
447
            	<property>
448
					<name>output_dataset</name>
449
					<value>${output_dataset}</value>
450
				</property>
451
				<property>
452
					<name>output_dataset_to_mdstore</name>
453
					<value>${output_dataset_to_mdstore}</value>
454
				</property>
409
				<!-- all the other properties are automatically propagated -->
455 410
			</configuration>
456 411
        </sub-workflow>
457 412
		<ok to="import_joining" />
......
525 480
                    <name>workingDir</name>
526 481
                    <value>${workingDir}/import_content_url/working_dir</value>
527 482
                </property>
528
                <property>
529
					<name>objectstore_service_location</name>
530
					<value>${objectstore_service_location}</value>
531
				</property>
532 483
				<property>
533
					<name>approved_objectstores_csv</name>
534
					<value>${approved_objectstores_csv}</value>
535
				</property>
536
				<property>
537
					<name>mimetypes_pdf</name>
538
					<value>${mimetypes_pdf}</value>
539
				</property>
540
				<property>
541
					<name>mimetypes_text</name>
542
					<value>${mimetypes_text}</value>
543
				</property>
544
				<property>
545
					<name>mimetypes_html</name>
546
					<value>${mimetypes_html}</value>
547
				</property>
548
				<property>
549
					<name>mimetypes_xml_pmc</name>
550
					<value>${mimetypes_xml_pmc}</value>
551
				</property>
552
				<property>
553
					<name>mimetypes_wos</name>
554
					<value>${mimetypes_wos}</value>
555
				</property>
556
				<property>
557
					<name>resultset_client_read_timeout</name>
558
					<value>${resultset_client_read_timeout}</value>
559
				</property>
560
				<property>
561 484
					<name>input_id</name>
562 485
					<value>${wf:actionData('input_id-path-setter')['result']}</value>
563 486
				</property>
......
589 512
					<name>output_name_wos</name>
590 513
					<value>wos</value>
591 514
				</property>
515
				<!-- all the other properties are automatically propagated -->
592 516
			</configuration>
593 517
        </sub-workflow>
594 518
		<ok to="import_urlbased_forking" />
......
641 565
					<name>input</name>
642 566
					<value>${workingDir}/import_content_url/imported/text</value>
643 567
				</property>
644
				<property>
645
				    <name>content_connection_timeout</name>
646
				   <value>${content_connection_timeout}</value>
647
				</property>
648
				<property>
649
				    <name>content_read_timeout</name>
650
				   <value>${content_read_timeout}</value>
651
				</property>
652 568
            	<property>
653 569
					<name>output</name>
654 570
					<value>${workingDir}/import_plaintext/imported</value>
655 571
				</property>
572
				<!-- all the other properties are automatically propagated -->
656 573
			</configuration>
657 574
        </sub-workflow>
658 575
		<ok to="import_urlbased_joining" />
......
672 589
					<name>input</name>
673 590
					<value>${workingDir}/import_content_url/imported/wos</value>
674 591
				</property>
675
				<property>
676
				    <name>content_connection_timeout</name>
677
				   <value>${content_connection_timeout}</value>
678
				</property>
679
				<property>
680
				    <name>content_read_timeout</name>
681
				   <value>${content_read_timeout}</value>
682
				</property>
683 592
            	<property>
684 593
					<name>output</name>
685 594
					<value>${output_wos}</value>
686 595
				</property>
596
				<!-- all the other properties are automatically propagated -->
687 597
			</configuration>
688 598
        </sub-workflow>
689 599
		<ok to="import_urlbased_joining" />
......
703 613
					<name>input</name>
704 614
					<value>${workingDir}/import_content_url/imported/xmlpmc</value>
705 615
				</property>
706
				<property>
707
				    <name>content_connection_timeout</name>
708
				   <value>${content_connection_timeout}</value>
709
				</property>
710
				<property>
711
				    <name>content_read_timeout</name>
712
				   <value>${content_read_timeout}</value>
713
				</property>
714 616
            	<property>
715 617
					<name>output</name>
716 618
					<value>${workingDir}/import_plaintext_pmc/imported</value>
717 619
				</property>
620
				<!-- all the other properties are automatically propagated -->
718 621
			</configuration>
719 622
        </sub-workflow>
720 623
		<ok to="ingest_pmc_forking" />
......
752 655
	<decision name="decision-ingest_pmc_metadata">
753 656
        <switch>
754 657
        	<!-- define ingest_pmc_metadata_joining here when introducing pmc metadata ingestion -->
755
            <case to="transformers-doitooaid">${active_import_metadata eq "true" and active_ingest_pmc eq "true"}</case>
756
            <default to="skip-ingest_pmc_citations"/>
658
            <case to="ingest_pmc_metadata">${active_import_metadata eq "true" and active_ingest_pmc eq "true"}</case>
659
            <default to="skip-ingest_pmc_metadata"/>
757 660
        </switch>
758 661
    </decision>
759 662

  
760
	<action name="transformers-doitooaid">
761
        <sub-workflow>
762
            <app-path>${wf:appPath()}/transformers_externalidtooaid</app-path>
763
            <propagate-configuration/>
764
            <configuration>
765
                <property>
766
                    <name>workingDir</name>
767
                    <value>${workingDir}/transformers_doitooaid/working_dir</value>
768
                </property>
769
                <property>
770
                    <name>input_document_metadata</name>
771
                    <value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value>
772
                </property>
773
                <property>
774
                    <name>external_id_type</name>
775
                    <value>doi</value>
776
                </property>
777
                <property>
778
                    <name>output</name>
779
                    <value>${workingDir}/transformers_doitooaid/out</value>
780
                </property>
781
            </configuration>
782
        </sub-workflow>
783
        <ok to="ingest_pmc_metadata"/>
784
        <error to="fail"/>
785
    </action>
786

  
787 663
	<action name="ingest_pmc_metadata">
788 664
        <sub-workflow>
789 665
            <app-path>${wf:appPath()}/ingest_pmc_metadata</app-path>
......
803 679
                </property>
804 680
            </configuration>
805 681
        </sub-workflow>
806
		<ok to="collapse_pmc_metadata"/>
807
        <error to="fail"/>
808
    </action>
809

  
810
	<action name="collapse_pmc_metadata">
811
        <sub-workflow>
812
            <app-path>${wf:appPath()}/basic_collapser</app-path>
813
            <propagate-configuration/>
814
            <configuration>
815
                <property>
816
                    <name>workingDir</name>
817
                    <value>${workingDir}/collapse_pmc_metadata/working_dir</value>
818
                </property>
819
                <property>
820
                    <name>input</name>
821
                    <value>${workingDir}/ingest_pmc_metadata/out</value>
822
                </property>
823
                <property>
824
                    <name>output</name>
825
                    <value>${workingDir}/collapse_pmc_metadata/out</value>
826
                </property>
827
                <property>
828
                    <name>schema</name>
829
                    <value>eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata</value>
830
                </property>
831
                <property>
832
                    <name>blocking_field</name>
833
                    <value>id</value>
834
                </property>
835
                <property>
836
                    <name>significant_fields</name>
837
                    <value>journal,references,pages</value>
838
                </property>
839
            </configuration>
840
        </sub-workflow>
841 682
		<ok to="transformers_ingest_pmc_metadata"/>
842 683
        <error to="fail"/>
843 684
    </action>
......
853 694
                </property>
854 695
                <property>
855 696
                    <name>input</name>
856
                    <value>${workingDir}/collapse_pmc_metadata/out</value>
697
                    <value>${workingDir}/ingest_pmc_metadata/out</value>
857 698
                </property>
858 699
                <property>
859 700
                    <name>output</name>
......
861 702
                </property>
862 703
            </configuration>
863 704
        </sub-workflow>
864
		<ok to="ingest_pmc_idmapping_pmidtooaid"/>
705
		<ok to="ingest_pmc_joining"/>
865 706
        <error to="fail"/>
866 707
    </action>
867 708

  
868
	<action name="ingest_pmc_idmapping_pmidtooaid">
869
        <sub-workflow>
870
            <app-path>${wf:appPath()}/ingest_pmc_idmapping_pmidtooaid</app-path>
871
            <propagate-configuration/>
872
            <configuration>
873
                <property>
874
                    <name>workingDir</name>
875
                    <value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/working_dir</value>
876
                </property>
877
                <property>
878
                    <name>input</name>
879
                    <value>${workingDir}/collapse_pmc_metadata/out</value>
880
                </property>
881
                <property>
882
                    <name>output</name>
883
                    <value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/out</value>
884
                </property>
885
            </configuration>
886
        </sub-workflow>
887
		<ok to="ingest_pmc_citations"/>
888
        <error to="fail"/>
889
    </action>
890
    
891
	<action name="ingest_pmc_citations">
892
		<sub-workflow>
893
            <app-path>${wf:appPath()}/ingest_pmc_citations</app-path>
894
            <propagate-configuration/>
895
            <configuration>
896
            	<property>
897
                    <name>workingDir</name>
898
                    <value>${workingDir}/ingest_pmc_citations/working_dir</value>
899
                </property>
900
                <property>
901
					<name>input_extracted_document_metadata</name>
902
					<value>${workingDir}/collapse_pmc_metadata/out</value>
903
				</property>
904
				<property>
905
					<name>input_dedup_map</name>
906
					<value>${output_metadataimport_root}/${metadataimport_output_name_dedup_mapping}</value>
907
				</property>
908
				<property>
909
                    <name>input_doi_to_oaid</name>
910
                    <value>${workingDir}/transformers_doitooaid/out</value>
911
                </property>
912
                <property>
913
                    <name>input_pmid_to_oaid</name>
914
                    <value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/out</value>
915
                </property>
916
            	<property>
917
					<name>output_citation</name>
918
					<value>${output_citation_pmc}</value>
919
				</property>
920
			</configuration>
921
        </sub-workflow>
922
		<ok to="ingest_pmc_joining" />
923
		<error to="fail" />
924
	</action>
925

  
926
	<action name="skip-ingest_pmc_citations">
709
	<action name="skip-ingest_pmc_metadata">
927 710
        <java>
928 711
			<prepare>
929 712
				<!-- notice: directory has to be aligned with skipped action output -->
930
				<delete path="${nameNode}${workingDir}/ingest_pmc_citations" />
931 713
				<delete path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata"/>
932
				<delete path="${nameNode}${output_citation_pmc}"/>
933
				<mkdir path="${nameNode}${workingDir}/ingest_pmc_citations" />
934 714
				<mkdir path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata" />
935
				<mkdir path="${nameNode}${output_citation_pmc}"/>
936 715
			</prepare>
937 716
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
938 717
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
939
            <arg>-C{citation_pmc,
940
				eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation,
941
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
942 718
				<arg>-C{metadata_pmc,
943 719
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
944 720
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
945 721
            <!-- notice: directory has to be aligned with skipped action output -->
946
            <arg>-Ocitation_pmc=${output_citation_pmc}</arg>
947 722
            <arg>-Ometadata_pmc=${workingDir}/transformers_ingest_pmc_metadata/out</arg>
948 723
        </java>
949 724
        <ok to="ingest_pmc_joining"/>
......
966 741
					<name>input</name>
967 742
					<value>${workingDir}/import_content_url/imported/html</value>
968 743
				</property>
969
				<property>
970
				    <name>content_connection_timeout</name>
971
				   <value>${content_connection_timeout}</value>
972
				</property>
973
				<property>
974
				    <name>content_read_timeout</name>
975
				   <value>${content_read_timeout}</value>
976
				</property>
977 744
            	<property>
978 745
					<name>output</name>
979 746
					<value>${workingDir}/import_html/imported</value>
980 747
				</property>
748
				<!-- all the other properties are automatically propagated -->
981 749
			</configuration>
982 750
        </sub-workflow>
983 751
		<ok to="ingest_html_plaintext" />
......
1038 806
					<value>${metadataextraction_max_file_size_mb}</value>
1039 807
				</property>
1040 808
				<property>
1041
				    <name>content_connection_timeout</name>
1042
				   <value>${content_connection_timeout}</value>
1043
				</property>
1044
				<property>
1045
				    <name>content_read_timeout</name>
1046
				   <value>${content_read_timeout}</value>
1047
				</property>
1048
				<property>
1049 809
					<name>default_cache_location</name>
1050 810
					<value>${metadataextraction_default_cache_location}</value>
1051 811
				</property>
......
1065 825
					<name>output_root</name>
1066 826
					<value>${workingDir}/metadata_extractor/out</value>
1067 827
				</property>
828
				<!-- all the other properties are automatically propagated -->
1068 829
            </configuration>
1069 830
        </sub-workflow>
1070 831
		<ok to="import_urlbased_joining"/>
......
1102 863
					<value>${metadataextraction_max_file_size_mb}</value>
1103 864
				</property>
1104 865
				<property>
1105
				    <name>content_connection_timeout</name>
1106
				   <value>${content_connection_timeout}</value>
1107
				</property>
1108
				<property>
1109
				    <name>content_read_timeout</name>
1110
				   <value>${content_read_timeout}</value>
1111
				</property>
1112
				<property>
1113 866
					<name>output_name_meta</name>
1114 867
					<value>meta</value>
1115 868
				</property>
......
1125 878
					<name>output_root</name>
1126 879
					<value>${workingDir}/metadata_extractor/out</value>
1127 880
				</property>
881
				<!-- all the other properties are automatically propagated -->
1128 882
            </configuration>
1129 883
        </sub-workflow>
1130 884
		<ok to="import_urlbased_joining"/>

Also available in: Unified diff