Project

General

Profile

« Previous | Next » 

Revision 34914

Added by Marek Horst about 9 years ago

#1147 introducing HTML import and HTML plaintext ingestion in main workflows: primary and preprocessing

View differences:

workflow.xml
108 108
			<description>text mime types</description>
109 109
		</property>
110 110
		<property>
111
			<name>mimetypes_html</name>
112
			<description>html mime types</description>
113
		</property>
114
		<property>
111 115
			<name>mimetypes_xml_pmc</name>
112 116
			<description>xml pmc types</description>
113 117
		</property>
......
550 554
					<value>${mimetypes_text}</value>
551 555
				</property>
552 556
				<property>
557
					<name>mimetypes_html</name>
558
					<value>${mimetypes_html}</value>
559
				</property>
560
				<property>
553 561
					<name>mimetypes_xml_pmc</name>
554 562
					<value>${mimetypes_xml_pmc}</value>
555 563
				</property>
......
582 590
					<value>text</value>
583 591
				</property>
584 592
				<property>
593
					<name>output_name_html</name>
594
					<value>html</value>
595
				</property>
596
				<property>
585 597
					<name>output_name_xml_pmc</name>
586 598
					<value>xmlpmc</value>
587 599
				</property>
......
625 637
    	<path start="import_plaintext"/>
626 638
    	<path start="import_wos"/>
627 639
    	<path start="import_plaintext_pmc"/>
640
    	<path start="import_html"/>
628 641
		<path start="decision-metadata_extractor_use_cache"/>
629 642
    </fork>
630 643

  
......
953 966

  
954 967
	<join name="ingest_pmc_joining" to="import_urlbased_joining"/>
955 968

  
969
	<!-- html import and plaintext ingestion section -->
970
	<action name="import_html">
971
		<sub-workflow>
972
            <app-path>${wf:appPath()}/import_plaintext</app-path>
973
            <propagate-configuration/>
974
            <configuration>
975
            	<property>
976
                    <name>workingDir</name>
977
                    <value>${workingDir}/import_html/working_dir</value>
978
                </property>
979
                <property>
980
					<name>input</name>
981
					<value>${workingDir}/import_content_url/imported/html</value>
982
				</property>
983
				<property>
984
				    <name>content_connection_timeout</name>
985
				   <value>${content_connection_timeout}</value>
986
				</property>
987
				<property>
988
				    <name>content_read_timeout</name>
989
				   <value>${content_read_timeout}</value>
990
				</property>
991
            	<property>
992
					<name>output</name>
993
					<value>${workingDir}/import_html/imported</value>
994
				</property>
995
			</configuration>
996
        </sub-workflow>
997
		<ok to="ingest_html_plaintext" />
998
		<error to="fail" />
999
	</action>
1000

  
1001
	
1002
	<action name="ingest_html_plaintext">
1003
		<sub-workflow>
1004
            <app-path>${wf:appPath()}/ingest_html_plaintext</app-path>
1005
            <propagate-configuration/>
1006
            <configuration>
1007
            	<property>
1008
                    <name>workingDir</name>
1009
                    <value>${workingDir}/ingest_html_plaintext/working_dir</value>
1010
                </property>
1011
                <property>
1012
					<name>input</name>
1013
					<value>${workingDir}/import_html/imported</value>
1014
				</property>
1015
            	<property>
1016
					<name>output</name>
1017
					<value>${workingDir}/ingest_html_plaintext/imported</value>
1018
				</property>
1019
			</configuration>
1020
        </sub-workflow>
1021
		<ok to="import_urlbased_joining" />
1022
		<error to="fail" />
1023
	</action>
1024

  
956 1025
	<!-- metadata extraction section -->
957 1026
	<decision name="decision-metadata_extractor_use_cache">
958 1027
        <switch>
......
1074 1143
    <!-- merging document text datastores: 
1075 1144
    	1) retrieved directly from objectstore 
1076 1145
    	2) generated by metadataextraction 
1077
    	3) imported from PMC XMLs
1146
    	3) ingested from PMC XMLs
1147
    	4) ingested from HTML
1078 1148
    -->
1079 1149
	<action name="transformers_common_union_document_text">
1080 1150
	    <sub-workflow>
1081
            <app-path>${wf:appPath()}/transformers_common_union3</app-path>
1151
            <app-path>${wf:appPath()}/transformers_common_union4</app-path>
1082 1152
            <propagate-configuration/>
1083 1153
            <configuration>
1084 1154
            	<property>
......
1098 1168
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
1099 1169
				</property>
1100 1170
				<property>
1171
					<name>input_d</name>
1172
					<value>${workingDir}/ingest_html_plaintext/imported</value>
1173
				</property>
1174
				<property>
1101 1175
					<name>output</name>
1102 1176
					<value>${output_document_text}</value>
1103 1177
				</property>

Also available in: Unified diff