Revision 39164
Added by Marek Horst over 8 years ago
workflow.xml | ||
---|---|---|
188 | 188 |
<description>metadata importer output root directory, required when ${active_import_metadata}=true</description> |
189 | 189 |
</property> |
190 | 190 |
<property> |
191 |
<name>output_citation_pmc</name> |
|
192 |
<description>PMC citation output directory, required when ${active_import_metadata}=true</description> |
|
193 |
</property> |
|
194 |
<property> |
|
195 | 191 |
<name>output_dataset</name> |
196 | 192 |
<description>dataset importer output directory holding dataset metadata, required when ${active_import_dataset}=true</description> |
197 | 193 |
</property> |
... | ... | |
312 | 308 |
<value>${workingDir}/import/working_dir</value> |
313 | 309 |
</property> |
314 | 310 |
<property> |
315 |
<name>hbase_input_table</name> |
|
316 |
<value>${hbase_input_table}</value> |
|
317 |
</property> |
|
318 |
<property> |
|
319 | 311 |
<name>approved_datasources_csv</name> |
320 | 312 |
<value>${hbase_approved_datasources_csv}</value> |
321 | 313 |
</property> |
322 | 314 |
<property> |
323 |
<name>inference_provenance_blacklist</name> |
|
324 |
<value>${inference_provenance_blacklist}</value> |
|
325 |
</property> |
|
326 |
<property> |
|
327 |
<name>trust_level_threshold</name> |
|
328 |
<value>${trust_level_threshold}</value> |
|
329 |
</property> |
|
330 |
<property> |
|
331 |
<name>merge_body_with_updates</name> |
|
332 |
<value>${merge_body_with_updates}</value> |
|
333 |
</property> |
|
334 |
<property> |
|
335 | 315 |
<name>output</name> |
336 | 316 |
<value>${output_metadataimport_root}</value> |
337 | 317 |
</property> |
... | ... | |
356 | 336 |
<name>output_name_dedup_mapping</name> |
357 | 337 |
<value>${metadataimport_output_name_dedup_mapping}</value> |
358 | 338 |
</property> |
339 |
<!-- all the other properties are automatically propagated--> |
|
359 | 340 |
</configuration> |
360 | 341 |
</sub-workflow> |
361 | 342 |
<ok to="transformers-idextractor" /> |
... | ... | |
394 | 375 |
<name>workingDir</name> |
395 | 376 |
<value>${workingDir}/import_project/working_dir</value> |
396 | 377 |
</property> |
397 |
<property> |
|
398 |
<name>database_service_location</name> |
|
399 |
<value>${database_service_location}</value> |
|
400 |
</property> |
|
401 |
<property> |
|
402 |
<name>database_name</name> |
|
403 |
<value>${database_dbname}</value> |
|
404 |
</property> |
|
405 |
<property> |
|
406 |
<name>resultset_client_read_timeout</name> |
|
407 |
<value>${resultset_client_read_timeout}</value> |
|
408 |
</property> |
|
409 | 378 |
<property> |
410 | 379 |
<name>output</name> |
411 | 380 |
<value>${output_metadataimport_root}/${metadataimport_output_name_project}</value> |
412 | 381 |
</property> |
382 |
<!-- all the other properties are automatically propagated--> |
|
413 | 383 |
</configuration> |
414 | 384 |
</sub-workflow> |
415 | 385 |
<ok to="decision-import_content_url" /> |
... | ... | |
432 | 402 |
<name>workingDir</name> |
433 | 403 |
<value>${workingDir}/import_dataset/working_dir</value> |
434 | 404 |
</property> |
435 |
<property> |
|
436 |
<name>mdstore_service_location</name> |
|
437 |
<value>${mdstore_service_location}</value> |
|
438 |
</property> |
|
439 | 405 |
<property> |
440 | 406 |
<name>mdstore_ids_csv</name> |
441 | 407 |
<value>${dataset_mdstore_ids_csv}</value> |
442 | 408 |
</property> |
443 |
<property> |
|
444 |
<name>resultset_client_read_timeout</name> |
|
445 |
<value>${resultset_client_read_timeout}</value> |
|
446 |
</property> |
|
447 |
<property> |
|
448 |
<name>output_dataset</name> |
|
449 |
<value>${output_dataset}</value> |
|
450 |
</property> |
|
451 |
<property> |
|
452 |
<name>output_dataset_to_mdstore</name> |
|
453 |
<value>${output_dataset_to_mdstore}</value> |
|
454 |
</property> |
|
409 |
<!-- all the other properties are automatically propagated--> |
|
455 | 410 |
</configuration> |
456 | 411 |
</sub-workflow> |
457 | 412 |
<ok to="import_joining" /> |
... | ... | |
525 | 480 |
<name>workingDir</name> |
526 | 481 |
<value>${workingDir}/import_content_url/working_dir</value> |
527 | 482 |
</property> |
528 |
<property> |
|
529 |
<name>objectstore_service_location</name> |
|
530 |
<value>${objectstore_service_location}</value> |
|
531 |
</property> |
|
532 | 483 |
<property> |
533 |
<name>approved_objectstores_csv</name> |
|
534 |
<value>${approved_objectstores_csv}</value> |
|
535 |
</property> |
|
536 |
<property> |
|
537 |
<name>mimetypes_pdf</name> |
|
538 |
<value>${mimetypes_pdf}</value> |
|
539 |
</property> |
|
540 |
<property> |
|
541 |
<name>mimetypes_text</name> |
|
542 |
<value>${mimetypes_text}</value> |
|
543 |
</property> |
|
544 |
<property> |
|
545 |
<name>mimetypes_html</name> |
|
546 |
<value>${mimetypes_html}</value> |
|
547 |
</property> |
|
548 |
<property> |
|
549 |
<name>mimetypes_xml_pmc</name> |
|
550 |
<value>${mimetypes_xml_pmc}</value> |
|
551 |
</property> |
|
552 |
<property> |
|
553 |
<name>mimetypes_wos</name> |
|
554 |
<value>${mimetypes_wos}</value> |
|
555 |
</property> |
|
556 |
<property> |
|
557 |
<name>resultset_client_read_timeout</name> |
|
558 |
<value>${resultset_client_read_timeout}</value> |
|
559 |
</property> |
|
560 |
<property> |
|
561 | 484 |
<name>input_id</name> |
562 | 485 |
<value>${wf:actionData('input_id-path-setter')['result']}</value> |
563 | 486 |
</property> |
... | ... | |
589 | 512 |
<name>output_name_wos</name> |
590 | 513 |
<value>wos</value> |
591 | 514 |
</property> |
515 |
<!-- all the other properties are automatically propagated--> |
|
592 | 516 |
</configuration> |
593 | 517 |
</sub-workflow> |
594 | 518 |
<ok to="import_urlbased_forking" /> |
... | ... | |
641 | 565 |
<name>input</name> |
642 | 566 |
<value>${workingDir}/import_content_url/imported/text</value> |
643 | 567 |
</property> |
644 |
<property> |
|
645 |
<name>content_connection_timeout</name> |
|
646 |
<value>${content_connection_timeout}</value> |
|
647 |
</property> |
|
648 |
<property> |
|
649 |
<name>content_read_timeout</name> |
|
650 |
<value>${content_read_timeout}</value> |
|
651 |
</property> |
|
652 | 568 |
<property> |
653 | 569 |
<name>output</name> |
654 | 570 |
<value>${workingDir}/import_plaintext/imported</value> |
655 | 571 |
</property> |
572 |
<!-- all the other properties are automatically propagated--> |
|
656 | 573 |
</configuration> |
657 | 574 |
</sub-workflow> |
658 | 575 |
<ok to="import_urlbased_joining" /> |
... | ... | |
672 | 589 |
<name>input</name> |
673 | 590 |
<value>${workingDir}/import_content_url/imported/wos</value> |
674 | 591 |
</property> |
675 |
<property> |
|
676 |
<name>content_connection_timeout</name> |
|
677 |
<value>${content_connection_timeout}</value> |
|
678 |
</property> |
|
679 |
<property> |
|
680 |
<name>content_read_timeout</name> |
|
681 |
<value>${content_read_timeout}</value> |
|
682 |
</property> |
|
683 | 592 |
<property> |
684 | 593 |
<name>output</name> |
685 | 594 |
<value>${output_wos}</value> |
686 | 595 |
</property> |
596 |
<!-- all the other properties are automatically propagated--> |
|
687 | 597 |
</configuration> |
688 | 598 |
</sub-workflow> |
689 | 599 |
<ok to="import_urlbased_joining" /> |
... | ... | |
703 | 613 |
<name>input</name> |
704 | 614 |
<value>${workingDir}/import_content_url/imported/xmlpmc</value> |
705 | 615 |
</property> |
706 |
<property> |
|
707 |
<name>content_connection_timeout</name> |
|
708 |
<value>${content_connection_timeout}</value> |
|
709 |
</property> |
|
710 |
<property> |
|
711 |
<name>content_read_timeout</name> |
|
712 |
<value>${content_read_timeout}</value> |
|
713 |
</property> |
|
714 | 616 |
<property> |
715 | 617 |
<name>output</name> |
716 | 618 |
<value>${workingDir}/import_plaintext_pmc/imported</value> |
717 | 619 |
</property> |
620 |
<!-- all the other properties are automatically propagated--> |
|
718 | 621 |
</configuration> |
719 | 622 |
</sub-workflow> |
720 | 623 |
<ok to="ingest_pmc_forking" /> |
... | ... | |
752 | 655 |
<decision name="decision-ingest_pmc_metadata"> |
753 | 656 |
<switch> |
754 | 657 |
<!-- define ingest_pmc_metadata_joining here when introducing pmc metadata ingestion --> |
755 |
<case to="transformers-doitooaid">${active_import_metadata eq "true" and active_ingest_pmc eq "true"}</case>
|
|
756 |
<default to="skip-ingest_pmc_citations"/>
|
|
658 |
<case to="ingest_pmc_metadata">${active_import_metadata eq "true" and active_ingest_pmc eq "true"}</case>
|
|
659 |
<default to="skip-ingest_pmc_metadata"/>
|
|
757 | 660 |
</switch> |
758 | 661 |
</decision> |
759 | 662 |
|
760 |
<action name="transformers-doitooaid"> |
|
761 |
<sub-workflow> |
|
762 |
<app-path>${wf:appPath()}/transformers_externalidtooaid</app-path> |
|
763 |
<propagate-configuration/> |
|
764 |
<configuration> |
|
765 |
<property> |
|
766 |
<name>workingDir</name> |
|
767 |
<value>${workingDir}/transformers_doitooaid/working_dir</value> |
|
768 |
</property> |
|
769 |
<property> |
|
770 |
<name>input_document_metadata</name> |
|
771 |
<value>${output_metadataimport_root}/${metadataimport_output_name_document_meta}</value> |
|
772 |
</property> |
|
773 |
<property> |
|
774 |
<name>external_id_type</name> |
|
775 |
<value>doi</value> |
|
776 |
</property> |
|
777 |
<property> |
|
778 |
<name>output</name> |
|
779 |
<value>${workingDir}/transformers_doitooaid/out</value> |
|
780 |
</property> |
|
781 |
</configuration> |
|
782 |
</sub-workflow> |
|
783 |
<ok to="ingest_pmc_metadata"/> |
|
784 |
<error to="fail"/> |
|
785 |
</action> |
|
786 |
|
|
787 | 663 |
<action name="ingest_pmc_metadata"> |
788 | 664 |
<sub-workflow> |
789 | 665 |
<app-path>${wf:appPath()}/ingest_pmc_metadata</app-path> |
... | ... | |
803 | 679 |
</property> |
804 | 680 |
</configuration> |
805 | 681 |
</sub-workflow> |
806 |
<ok to="collapse_pmc_metadata"/> |
|
807 |
<error to="fail"/> |
|
808 |
</action> |
|
809 |
|
|
810 |
<action name="collapse_pmc_metadata"> |
|
811 |
<sub-workflow> |
|
812 |
<app-path>${wf:appPath()}/basic_collapser</app-path> |
|
813 |
<propagate-configuration/> |
|
814 |
<configuration> |
|
815 |
<property> |
|
816 |
<name>workingDir</name> |
|
817 |
<value>${workingDir}/collapse_pmc_metadata/working_dir</value> |
|
818 |
</property> |
|
819 |
<property> |
|
820 |
<name>input</name> |
|
821 |
<value>${workingDir}/ingest_pmc_metadata/out</value> |
|
822 |
</property> |
|
823 |
<property> |
|
824 |
<name>output</name> |
|
825 |
<value>${workingDir}/collapse_pmc_metadata/out</value> |
|
826 |
</property> |
|
827 |
<property> |
|
828 |
<name>schema</name> |
|
829 |
<value>eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata</value> |
|
830 |
</property> |
|
831 |
<property> |
|
832 |
<name>blocking_field</name> |
|
833 |
<value>id</value> |
|
834 |
</property> |
|
835 |
<property> |
|
836 |
<name>significant_fields</name> |
|
837 |
<value>journal,references,pages</value> |
|
838 |
</property> |
|
839 |
</configuration> |
|
840 |
</sub-workflow> |
|
841 | 682 |
<ok to="transformers_ingest_pmc_metadata"/> |
842 | 683 |
<error to="fail"/> |
843 | 684 |
</action> |
... | ... | |
853 | 694 |
</property> |
854 | 695 |
<property> |
855 | 696 |
<name>input</name> |
856 |
<value>${workingDir}/collapse_pmc_metadata/out</value>
|
|
697 |
<value>${workingDir}/ingest_pmc_metadata/out</value>
|
|
857 | 698 |
</property> |
858 | 699 |
<property> |
859 | 700 |
<name>output</name> |
... | ... | |
861 | 702 |
</property> |
862 | 703 |
</configuration> |
863 | 704 |
</sub-workflow> |
864 |
<ok to="ingest_pmc_idmapping_pmidtooaid"/>
|
|
705 |
<ok to="ingest_pmc_joining"/>
|
|
865 | 706 |
<error to="fail"/> |
866 | 707 |
</action> |
867 | 708 |
|
868 |
<action name="ingest_pmc_idmapping_pmidtooaid"> |
|
869 |
<sub-workflow> |
|
870 |
<app-path>${wf:appPath()}/ingest_pmc_idmapping_pmidtooaid</app-path> |
|
871 |
<propagate-configuration/> |
|
872 |
<configuration> |
|
873 |
<property> |
|
874 |
<name>workingDir</name> |
|
875 |
<value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/working_dir</value> |
|
876 |
</property> |
|
877 |
<property> |
|
878 |
<name>input</name> |
|
879 |
<value>${workingDir}/collapse_pmc_metadata/out</value> |
|
880 |
</property> |
|
881 |
<property> |
|
882 |
<name>output</name> |
|
883 |
<value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/out</value> |
|
884 |
</property> |
|
885 |
</configuration> |
|
886 |
</sub-workflow> |
|
887 |
<ok to="ingest_pmc_citations"/> |
|
888 |
<error to="fail"/> |
|
889 |
</action> |
|
890 |
|
|
891 |
<action name="ingest_pmc_citations"> |
|
892 |
<sub-workflow> |
|
893 |
<app-path>${wf:appPath()}/ingest_pmc_citations</app-path> |
|
894 |
<propagate-configuration/> |
|
895 |
<configuration> |
|
896 |
<property> |
|
897 |
<name>workingDir</name> |
|
898 |
<value>${workingDir}/ingest_pmc_citations/working_dir</value> |
|
899 |
</property> |
|
900 |
<property> |
|
901 |
<name>input_extracted_document_metadata</name> |
|
902 |
<value>${workingDir}/collapse_pmc_metadata/out</value> |
|
903 |
</property> |
|
904 |
<property> |
|
905 |
<name>input_dedup_map</name> |
|
906 |
<value>${output_metadataimport_root}/${metadataimport_output_name_dedup_mapping}</value> |
|
907 |
</property> |
|
908 |
<property> |
|
909 |
<name>input_doi_to_oaid</name> |
|
910 |
<value>${workingDir}/transformers_doitooaid/out</value> |
|
911 |
</property> |
|
912 |
<property> |
|
913 |
<name>input_pmid_to_oaid</name> |
|
914 |
<value>${workingDir}/ingest_pmc_idmapping_pmidtooaid/out</value> |
|
915 |
</property> |
|
916 |
<property> |
|
917 |
<name>output_citation</name> |
|
918 |
<value>${output_citation_pmc}</value> |
|
919 |
</property> |
|
920 |
</configuration> |
|
921 |
</sub-workflow> |
|
922 |
<ok to="ingest_pmc_joining" /> |
|
923 |
<error to="fail" /> |
|
924 |
</action> |
|
925 |
|
|
926 |
<action name="skip-ingest_pmc_citations"> |
|
709 |
<action name="skip-ingest_pmc_metadata"> |
|
927 | 710 |
<java> |
928 | 711 |
<prepare> |
929 | 712 |
<!-- notice: directory has to be aligned with skipped action output --> |
930 |
<delete path="${nameNode}${workingDir}/ingest_pmc_citations" /> |
|
931 | 713 |
<delete path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata"/> |
932 |
<delete path="${nameNode}${output_citation_pmc}"/> |
|
933 |
<mkdir path="${nameNode}${workingDir}/ingest_pmc_citations" /> |
|
934 | 714 |
<mkdir path="${nameNode}${workingDir}/transformers_ingest_pmc_metadata" /> |
935 |
<mkdir path="${nameNode}${output_citation_pmc}"/> |
|
936 | 715 |
</prepare> |
937 | 716 |
<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class> |
938 | 717 |
<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg> |
939 |
<arg>-C{citation_pmc, |
|
940 |
eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation, |
|
941 |
eu/dnetlib/iis/mainworkflows/data/empty.json}</arg> |
|
942 | 718 |
<arg>-C{metadata_pmc, |
943 | 719 |
eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata, |
944 | 720 |
eu/dnetlib/iis/mainworkflows/data/empty.json}</arg> |
945 | 721 |
<!-- notice: directory has to be aligned with skipped action output --> |
946 |
<arg>-Ocitation_pmc=${output_citation_pmc}</arg> |
|
947 | 722 |
<arg>-Ometadata_pmc=${workingDir}/transformers_ingest_pmc_metadata/out</arg> |
948 | 723 |
</java> |
949 | 724 |
<ok to="ingest_pmc_joining"/> |
... | ... | |
966 | 741 |
<name>input</name> |
967 | 742 |
<value>${workingDir}/import_content_url/imported/html</value> |
968 | 743 |
</property> |
969 |
<property> |
|
970 |
<name>content_connection_timeout</name> |
|
971 |
<value>${content_connection_timeout}</value> |
|
972 |
</property> |
|
973 |
<property> |
|
974 |
<name>content_read_timeout</name> |
|
975 |
<value>${content_read_timeout}</value> |
|
976 |
</property> |
|
977 | 744 |
<property> |
978 | 745 |
<name>output</name> |
979 | 746 |
<value>${workingDir}/import_html/imported</value> |
980 | 747 |
</property> |
748 |
<!-- all the other properties are automatically propagated--> |
|
981 | 749 |
</configuration> |
982 | 750 |
</sub-workflow> |
983 | 751 |
<ok to="ingest_html_plaintext" /> |
... | ... | |
1038 | 806 |
<value>${metadataextraction_max_file_size_mb}</value> |
1039 | 807 |
</property> |
1040 | 808 |
<property> |
1041 |
<name>content_connection_timeout</name> |
|
1042 |
<value>${content_connection_timeout}</value> |
|
1043 |
</property> |
|
1044 |
<property> |
|
1045 |
<name>content_read_timeout</name> |
|
1046 |
<value>${content_read_timeout}</value> |
|
1047 |
</property> |
|
1048 |
<property> |
|
1049 | 809 |
<name>default_cache_location</name> |
1050 | 810 |
<value>${metadataextraction_default_cache_location}</value> |
1051 | 811 |
</property> |
... | ... | |
1065 | 825 |
<name>output_root</name> |
1066 | 826 |
<value>${workingDir}/metadata_extractor/out</value> |
1067 | 827 |
</property> |
828 |
<!-- all the other properties are automatically propagated--> |
|
1068 | 829 |
</configuration> |
1069 | 830 |
</sub-workflow> |
1070 | 831 |
<ok to="import_urlbased_joining"/> |
... | ... | |
1102 | 863 |
<value>${metadataextraction_max_file_size_mb}</value> |
1103 | 864 |
</property> |
1104 | 865 |
<property> |
1105 |
<name>content_connection_timeout</name> |
|
1106 |
<value>${content_connection_timeout}</value> |
|
1107 |
</property> |
|
1108 |
<property> |
|
1109 |
<name>content_read_timeout</name> |
|
1110 |
<value>${content_read_timeout}</value> |
|
1111 |
</property> |
|
1112 |
<property> |
|
1113 | 866 |
<name>output_name_meta</name> |
1114 | 867 |
<value>meta</value> |
1115 | 868 |
</property> |
... | ... | |
1125 | 878 |
<name>output_root</name> |
1126 | 879 |
<value>${workingDir}/metadata_extractor/out</value> |
1127 | 880 |
</property> |
881 |
<!-- all the other properties are automatically propagated--> |
|
1128 | 882 |
</configuration> |
1129 | 883 |
</sub-workflow> |
1130 | 884 |
<ok to="import_urlbased_joining"/> |
Also available in: Unified diff
merging trunk changes with IIS-CDH-5.3.0 branch