Project

General

Profile

« Previous | Next » 

Revision 34914

Added by Marek Horst over 9 years ago

#1147 introducing HTML import and HTML plaintext ingestion in main workflows: primary and preprocessing

View differences:

modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/importer/content_url/job.properties
20 20

  
21 21
mimetypes_pdf=pdf,application/pdf
22 22
mimetypes_text=text,text/plain
23
mimetypes_html=text/html
23 24
mimetypes_xml_pmc=xml
24 25
mimetypes_wos=file::WoS
25 26

  
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/importer/content_url/oozie_app/workflow.xml
35 35
			<description>text mime types</description>
36 36
		</property>
37 37
		<property>
38
			<name>mimetypes_html</name>
39
			<description>html mime types</description>
40
		</property>
41
		<property>
38 42
			<name>mimetypes_xml_pmc</name>
39 43
			<description>EuropePMC xml mime types</description>
40 44
		</property>
......
74 78
			<description>text output subdirectory name</description>
75 79
		</property>
76 80
		<property>
81
			<name>output_name_html</name>
82
			<value>html</value>
83
			<description>html output subdirectory name</description>
84
		</property>
85
		<property>
77 86
			<name>output_name_xml_pmc</name>
78 87
			<value>xmlpmc</value>
79 88
			<description>XML PMC output subdirectory name</description>
......
287 296
				<!-- required for multiple outputs only -->
288 297
				<property>
289 298
					<name>avro.mapreduce.multipleoutputs</name>
290
					<value>${output_name_pdf} ${output_name_text} ${output_name_xml_pmc} ${output_name_wos}</value>
299
					<value>${output_name_pdf} ${output_name_text} ${output_name_html} ${output_name_xml_pmc} ${output_name_wos}</value>
291 300
				</property>
292 301
				<property>
293 302
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_pdf}.format
......
300 309
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
301 310
				</property>
302 311
				<property>
312
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_html}.format
313
					</name>
314
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
315
				</property>
316
				<property>
303 317
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_xml_pmc}.format
304 318
					</name>
305 319
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
......
318 332
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
319 333
				</property>
320 334
				<property>
335
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_html}</name>
336
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
337
				</property>
338
				<property>
321 339
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_xml_pmc}</name>
322 340
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
323 341
				</property>
......
335 353
					<value>${mimetypes_text}</value>
336 354
				</property>
337 355
				<property>
356
					<name>mimetypes.csv.${output_name_html}</name>
357
					<value>${mimetypes_html}</value>
358
				</property>
359
				<property>
338 360
					<name>mimetypes.csv.${output_name_xml_pmc}</name>
339 361
					<value>${mimetypes_xml_pmc}</value>
340 362
				</property>
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/preprocessing/main/oozie_app/workflow.xml
62 62
			<description>text mime types</description>
63 63
		</property>
64 64
		<property>
65
			<name>import_content_mimetypes_html</name>
66
			<value>text/html</value>
67
			<description>html mime types</description>
68
		</property>
69
		<property>
65 70
			<name>import_content_mimetypes_xml_pmc</name>
66 71
			<value>xml</value>
67 72
			<description>xml pmc types</description>
......
220 225
					<value>${import_content_mimetypes_text}</value>
221 226
				</property>
222 227
				<property>
228
					<name>mimetypes_html</name>
229
					<value>${import_content_mimetypes_html}</value>
230
				</property>
231
				<property>
223 232
					<name>mimetypes_xml_pmc</name>
224 233
					<value>${import_content_mimetypes_xml_pmc}</value>
225 234
				</property>
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/preprocessing/import/oozie_app/workflow.xml
56 56
			<description>text mime types</description>
57 57
		</property>
58 58
		<property>
59
			<name>mimetypes_html</name>
60
			<description>html mime types</description>
61
		</property>
62
		<property>
59 63
			<name>mimetypes_xml_pmc</name>
60 64
			<description>xml pmc types</description>
61 65
		</property>
......
213 217
					<value>${mimetypes_text}</value>
214 218
				</property>
215 219
				<property>
220
					<name>mimetypes_html</name>
221
					<value>${mimetypes_html}</value>
222
				</property>
223
				<property>
216 224
					<name>mimetypes_xml_pmc</name>
217 225
					<value>${mimetypes_xml_pmc}</value>
218 226
				</property>
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/primary/main/oozie_app/workflow.xml
109 109
			<description>text mime types</description>
110 110
		</property>
111 111
		<property>
112
			<name>import_content_mimetypes_html</name>
113
			<value>text/html</value>
114
			<description>html mime types</description>
115
		</property>
116
		<property>
112 117
			<name>import_content_mimetypes_xml_pmc</name>
113 118
			<value>xml</value>
114 119
			<description>xml pmc types</description>
......
352 357
					<value>${import_content_mimetypes_text}</value>
353 358
				</property>
354 359
				<property>
360
					<name>mimetypes_html</name>
361
					<value>${import_content_mimetypes_html}</value>
362
				</property>
363
				<property>
355 364
					<name>mimetypes_xml_pmc</name>
356 365
					<value>${import_content_mimetypes_xml_pmc}</value>
357 366
				</property>
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/common/import/job.properties
1
active_import_metadata=true
1
active_import_metadata=false
2 2
active_import_dataset=false
3
active_ingest_pmc_citations=true
4
active_import_concept=true
3
active_ingest_pmc_citations=false
4
active_import_concept=false
5 5

  
6
hbase_input_table=information_space-2014-11-05
6
hbase_input_table=db_openaireplus_services_beta
7 7

  
8 8
#import concepts related
9 9
islookup_service_location=http://beta.services.openaire.eu:8280/is/services/isLookUp
......
13 13
database_dbname=dnet_openaireplus
14 14

  
15 15
objectstore_service_location=http://beta.services.openaire.eu:8280/is/services/objectStore
16
#approved_objectstores_csv=$UNDEFINED$
16
approved_objectstores_csv=$UNDEFINED$
17 17
#puma
18 18
#approved_objectstores_csv=794e8173-8be3-4f51-a12e-b43d12ab3b7d_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl
19 19
#arxiv content
20 20
#approved_objectstores_csv=258755af-0b48-41ee-9652-939c5bd2fca3_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl
21 21
#pmc content
22
approved_objectstores_csv=b2b6fca5-ce18-498c-a375-b02df97998f0_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl
22
#approved_objectstores_csv=b2b6fca5-ce18-498c-a375-b02df97998f0_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl
23 23
#hal content
24 24
#approved_objectstores_csv=2ad5f567-386d-4812-8edb-c0922eacd107_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl
25 25
#wos
......
32 32

  
33 33
mimetypes_pdf=pdf,application/pdf
34 34
mimetypes_text=text
35
mimetypes_html=text/html
35 36
mimetypes_xml_pmc=xml
36 37
mimetypes_wos=file::WoS
37 38

  
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/common/import/oozie_app/import.txt
5 5
import_concept classpath eu/dnetlib/iis/importer/concept/oozie_app
6 6
import_content_url classpath eu/dnetlib/iis/mainworkflows/importer/content_url/oozie_app
7 7
import_plaintext classpath eu/dnetlib/iis/importer/plaintext/oozie_app
8
ingest_html_plaintext classpath eu/dnetlib/iis/ingest/html/plaintext/oozie_app
8 9
ingest_pmc_plaintext classpath eu/dnetlib/iis/ingest/pmc/plaintext/oozie_app
9 10
ingest_pmc_metadata classpath eu/dnetlib/iis/ingest/pmc/metadata/oozie_app
10 11
ingest_pmc_citations classpath eu/dnetlib/iis/ingest/pmc/citations/oozie_app
......
12 13
multiple_input_collapser classpath eu/dnetlib/iis/collapsers/multiple_input_collapser/oozie_app
13 14
metadataextraction classpath eu/dnetlib/iis/metadataextraction/oozie_app
14 15
metadataextraction_cached classpath eu/dnetlib/iis/mainworkflows/metadataextraction/cached_by_checksum/oozie_app
15
transformers_common_union3 classpath eu/dnetlib/iis/transformers/common/union3/oozie_app
16
transformers_common_union4 classpath eu/dnetlib/iis/transformers/common/union4/oozie_app
16 17
transformers_idextractor classpath eu/dnetlib/iis/transformers/importer/documentmetadata/idextractor/oozie_app
17 18
transformers_externalidtooaid classpath eu/dnetlib/iis/transformers/importer/documentmetadata/externalidtooaid/oozie_app
18 19
transformers_ingest_pmc_metadata classpath eu/dnetlib/iis/transformers/ingest/pmc/metadata/oozie_app
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/common/import/oozie_app/workflow.xml
108 108
			<description>text mime types</description>
109 109
		</property>
110 110
		<property>
111
			<name>mimetypes_html</name>
112
			<description>html mime types</description>
113
		</property>
114
		<property>
111 115
			<name>mimetypes_xml_pmc</name>
112 116
			<description>xml pmc types</description>
113 117
		</property>
......
550 554
					<value>${mimetypes_text}</value>
551 555
				</property>
552 556
				<property>
557
					<name>mimetypes_html</name>
558
					<value>${mimetypes_html}</value>
559
				</property>
560
				<property>
553 561
					<name>mimetypes_xml_pmc</name>
554 562
					<value>${mimetypes_xml_pmc}</value>
555 563
				</property>
......
582 590
					<value>text</value>
583 591
				</property>
584 592
				<property>
593
					<name>output_name_html</name>
594
					<value>html</value>
595
				</property>
596
				<property>
585 597
					<name>output_name_xml_pmc</name>
586 598
					<value>xmlpmc</value>
587 599
				</property>
......
625 637
    	<path start="import_plaintext"/>
626 638
    	<path start="import_wos"/>
627 639
    	<path start="import_plaintext_pmc"/>
640
    	<path start="import_html"/>
628 641
		<path start="decision-metadata_extractor_use_cache"/>
629 642
    </fork>
630 643

  
......
953 966

  
954 967
	<join name="ingest_pmc_joining" to="import_urlbased_joining"/>
955 968

  
969
	<!-- html import and plaintext ingestion section -->
970
	<action name="import_html">
971
		<sub-workflow>
972
            <app-path>${wf:appPath()}/import_plaintext</app-path>
973
            <propagate-configuration/>
974
            <configuration>
975
            	<property>
976
                    <name>workingDir</name>
977
                    <value>${workingDir}/import_html/working_dir</value>
978
                </property>
979
                <property>
980
					<name>input</name>
981
					<value>${workingDir}/import_content_url/imported/html</value>
982
				</property>
983
				<property>
984
				    <name>content_connection_timeout</name>
985
				   <value>${content_connection_timeout}</value>
986
				</property>
987
				<property>
988
				    <name>content_read_timeout</name>
989
				   <value>${content_read_timeout}</value>
990
				</property>
991
            	<property>
992
					<name>output</name>
993
					<value>${workingDir}/import_html/imported</value>
994
				</property>
995
			</configuration>
996
        </sub-workflow>
997
		<ok to="ingest_html_plaintext" />
998
		<error to="fail" />
999
	</action>
1000

  
1001
	
1002
	<action name="ingest_html_plaintext">
1003
		<sub-workflow>
1004
            <app-path>${wf:appPath()}/ingest_html_plaintext</app-path>
1005
            <propagate-configuration/>
1006
            <configuration>
1007
            	<property>
1008
                    <name>workingDir</name>
1009
                    <value>${workingDir}/ingest_html_plaintext/working_dir</value>
1010
                </property>
1011
                <property>
1012
					<name>input</name>
1013
					<value>${workingDir}/import_html/imported</value>
1014
				</property>
1015
            	<property>
1016
					<name>output</name>
1017
					<value>${workingDir}/ingest_html_plaintext/imported</value>
1018
				</property>
1019
			</configuration>
1020
        </sub-workflow>
1021
		<ok to="import_urlbased_joining" />
1022
		<error to="fail" />
1023
	</action>
1024

  
956 1025
	<!-- metadata extraction section -->
957 1026
	<decision name="decision-metadata_extractor_use_cache">
958 1027
        <switch>
......
1074 1143
    <!-- merging document text datastores: 
1075 1144
    	1) retrieved directly from objectstore 
1076 1145
    	2) generated by metadataextraction 
1077
    	3) imported from PMC XMLs
1146
    	3) ingested from PMC XMLs
1147
    	4) ingested from HTML
1078 1148
    -->
1079 1149
	<action name="transformers_common_union_document_text">
1080 1150
	    <sub-workflow>
1081
            <app-path>${wf:appPath()}/transformers_common_union3</app-path>
1151
            <app-path>${wf:appPath()}/transformers_common_union4</app-path>
1082 1152
            <propagate-configuration/>
1083 1153
            <configuration>
1084 1154
            	<property>
......
1098 1168
					<value>${workingDir}/ingest_pmc_plaintext/imported</value>
1099 1169
				</property>
1100 1170
				<property>
1171
					<name>input_d</name>
1172
					<value>${workingDir}/ingest_html_plaintext/imported</value>
1173
				</property>
1174
				<property>
1101 1175
					<name>output</name>
1102 1176
					<value>${output_document_text}</value>
1103 1177
				</property>
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/statistics/oozie_app/workflow.xml
38 38
			<description>text mime types</description>
39 39
		</property>
40 40
		<property>
41
			<name>import_content_mimetypes_html</name>
42
			<value>text/html</value>
43
			<description>html mime types</description>
44
		</property>
45
		<property>
41 46
			<name>import_content_mimetypes_xml_pmc</name>
42 47
			<value>xml</value>
43 48
			<description>xml pmc types</description>
......
210 215
					<value>${import_content_mimetypes_text}</value>
211 216
				</property>
212 217
				<property>
218
					<name>mimetypes_html</name>
219
					<value>${import_content_mimetypes_html}</value>
220
				</property>
221
				<property>
213 222
					<name>mimetypes_xml_pmc</name>
214 223
					<value>${import_content_mimetypes_xml_pmc}</value>
215 224
				</property>

Also available in: Unified diff