Revision 34914
Added by Marek Horst about 9 years ago
workflow.xml | ||
---|---|---|
108 | 108 |
<description>text mime types</description> |
109 | 109 |
</property> |
110 | 110 |
<property> |
111 |
<name>mimetypes_html</name> |
|
112 |
<description>html mime types</description> |
|
113 |
</property> |
|
114 |
<property> |
|
111 | 115 |
<name>mimetypes_xml_pmc</name> |
112 | 116 |
<description>xml pmc types</description> |
113 | 117 |
</property> |
... | ... | |
550 | 554 |
<value>${mimetypes_text}</value> |
551 | 555 |
</property> |
552 | 556 |
<property> |
557 |
<name>mimetypes_html</name> |
|
558 |
<value>${mimetypes_html}</value> |
|
559 |
</property> |
|
560 |
<property> |
|
553 | 561 |
<name>mimetypes_xml_pmc</name> |
554 | 562 |
<value>${mimetypes_xml_pmc}</value> |
555 | 563 |
</property> |
... | ... | |
582 | 590 |
<value>text</value> |
583 | 591 |
</property> |
584 | 592 |
<property> |
593 |
<name>output_name_html</name> |
|
594 |
<value>html</value> |
|
595 |
</property> |
|
596 |
<property> |
|
585 | 597 |
<name>output_name_xml_pmc</name> |
586 | 598 |
<value>xmlpmc</value> |
587 | 599 |
</property> |
... | ... | |
625 | 637 |
<path start="import_plaintext"/> |
626 | 638 |
<path start="import_wos"/> |
627 | 639 |
<path start="import_plaintext_pmc"/> |
640 |
<path start="import_html"/> |
|
628 | 641 |
<path start="decision-metadata_extractor_use_cache"/> |
629 | 642 |
</fork> |
630 | 643 |
|
... | ... | |
953 | 966 |
|
954 | 967 |
<join name="ingest_pmc_joining" to="import_urlbased_joining"/> |
955 | 968 |
|
969 |
<!-- HTML import and HTML plaintext ingestion section --> |
|
970 |
<action name="import_html"> |
|
971 |
<sub-workflow> |
|
972 |
<app-path>${wf:appPath()}/import_plaintext</app-path> |
|
973 |
<propagate-configuration/> |
|
974 |
<configuration> |
|
975 |
<property> |
|
976 |
<name>workingDir</name> |
|
977 |
<value>${workingDir}/import_html/working_dir</value> |
|
978 |
</property> |
|
979 |
<property> |
|
980 |
<name>input</name> |
|
981 |
<value>${workingDir}/import_content_url/imported/html</value> |
|
982 |
</property> |
|
983 |
<property> |
|
984 |
<name>content_connection_timeout</name> |
|
985 |
<value>${content_connection_timeout}</value> |
|
986 |
</property> |
|
987 |
<property> |
|
988 |
<name>content_read_timeout</name> |
|
989 |
<value>${content_read_timeout}</value> |
|
990 |
</property> |
|
991 |
<property> |
|
992 |
<name>output</name> |
|
993 |
<value>${workingDir}/import_html/imported</value> |
|
994 |
</property> |
|
995 |
</configuration> |
|
996 |
</sub-workflow> |
|
997 |
<ok to="ingest_html_plaintext" /> |
|
998 |
<error to="fail" /> |
|
999 |
</action> |
|
1000 |
|
|
1001 |
|
|
1002 |
<action name="ingest_html_plaintext"> |
|
1003 |
<sub-workflow> |
|
1004 |
<app-path>${wf:appPath()}/ingest_html_plaintext</app-path> |
|
1005 |
<propagate-configuration/> |
|
1006 |
<configuration> |
|
1007 |
<property> |
|
1008 |
<name>workingDir</name> |
|
1009 |
<value>${workingDir}/ingest_html_plaintext/working_dir</value> |
|
1010 |
</property> |
|
1011 |
<property> |
|
1012 |
<name>input</name> |
|
1013 |
<value>${workingDir}/import_html/imported</value> |
|
1014 |
</property> |
|
1015 |
<property> |
|
1016 |
<name>output</name> |
|
1017 |
<value>${workingDir}/ingest_html_plaintext/imported</value> |
|
1018 |
</property> |
|
1019 |
</configuration> |
|
1020 |
</sub-workflow> |
|
1021 |
<ok to="import_urlbased_joining" /> |
|
1022 |
<error to="fail" /> |
|
1023 |
</action> |
|
1024 |
|
|
956 | 1025 |
<!-- metadata extraction section --> |
957 | 1026 |
<decision name="decision-metadata_extractor_use_cache"> |
958 | 1027 |
<switch> |
... | ... | |
1074 | 1143 |
<!-- merging document text datastores: |
1075 | 1144 |
1) retrieved directly from objectstore |
1076 | 1145 |
2) generated by metadataextraction |
1077 |
3) imported from PMC XMLs |
|
1146 |
3) ingested from PMC XMLs |
|
1147 |
4) ingested from HTML |
|
1078 | 1148 |
--> |
1079 | 1149 |
<action name="transformers_common_union_document_text"> |
1080 | 1150 |
<sub-workflow> |
1081 |
<app-path>${wf:appPath()}/transformers_common_union3</app-path>
|
|
1151 |
<app-path>${wf:appPath()}/transformers_common_union4</app-path>
|
|
1082 | 1152 |
<propagate-configuration/> |
1083 | 1153 |
<configuration> |
1084 | 1154 |
<property> |
... | ... | |
1098 | 1168 |
<value>${workingDir}/ingest_pmc_plaintext/imported</value> |
1099 | 1169 |
</property> |
1100 | 1170 |
<property> |
1171 |
<name>input_d</name> |
|
1172 |
<value>${workingDir}/ingest_html_plaintext/imported</value> |
|
1173 |
</property> |
|
1174 |
<property> |
|
1101 | 1175 |
<name>output</name> |
1102 | 1176 |
<value>${output_document_text}</value> |
1103 | 1177 |
</property> |
Also available in: Unified diff
#1147 introducing HTML import and HTML plaintext ingestion in main workflows: primary and preprocessing