Revision 34914
Added by Marek Horst over 9 years ago
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/importer/content_url/job.properties | ||
---|---|---|
20 | 20 |
|
21 | 21 |
mimetypes_pdf=pdf,application/pdf |
22 | 22 |
mimetypes_text=text,text/plain |
23 |
mimetypes_html=text/html |
|
23 | 24 |
mimetypes_xml_pmc=xml |
24 | 25 |
mimetypes_wos=file::WoS |
25 | 26 |
|
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/importer/content_url/oozie_app/workflow.xml | ||
---|---|---|
35 | 35 |
<description>text mime types</description> |
36 | 36 |
</property> |
37 | 37 |
<property> |
38 |
<name>mimetypes_html</name> |
|
39 |
<description>html mime types</description> |
|
40 |
</property> |
|
41 |
<property> |
|
38 | 42 |
<name>mimetypes_xml_pmc</name> |
39 | 43 |
<description>EuropePMC xml mime types</description> |
40 | 44 |
</property> |
... | ... | |
74 | 78 |
<description>text output subdirectory name</description> |
75 | 79 |
</property> |
76 | 80 |
<property> |
81 |
<name>output_name_html</name> |
|
82 |
<value>html</value> |
|
83 |
<description>html output subdirectory name</description> |
|
84 |
</property> |
|
85 |
<property> |
|
77 | 86 |
<name>output_name_xml_pmc</name> |
78 | 87 |
<value>xmlpmc</value> |
79 | 88 |
<description>XML PMC output subdirectory name</description> |
... | ... | |
287 | 296 |
<!-- required for multiple outputs only --> |
288 | 297 |
<property> |
289 | 298 |
<name>avro.mapreduce.multipleoutputs</name> |
290 |
<value>${output_name_pdf} ${output_name_text} ${output_name_xml_pmc} ${output_name_wos}</value> |
|
299 |
<value>${output_name_pdf} ${output_name_text} ${output_name_html} ${output_name_xml_pmc} ${output_name_wos}</value>
|
|
291 | 300 |
</property> |
292 | 301 |
<property> |
293 | 302 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_pdf}.format |
... | ... | |
300 | 309 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value> |
301 | 310 |
</property> |
302 | 311 |
<property> |
312 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_html}.format |
|
313 |
</name> |
|
314 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value> |
|
315 |
</property> |
|
316 |
<property> |
|
303 | 317 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_xml_pmc}.format |
304 | 318 |
</name> |
305 | 319 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value> |
... | ... | |
318 | 332 |
<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value> |
319 | 333 |
</property> |
320 | 334 |
<property> |
335 |
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_html}</name> |
|
336 |
<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value> |
|
337 |
</property> |
|
338 |
<property> |
|
321 | 339 |
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_xml_pmc}</name> |
322 | 340 |
<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value> |
323 | 341 |
</property> |
... | ... | |
335 | 353 |
<value>${mimetypes_text}</value> |
336 | 354 |
</property> |
337 | 355 |
<property> |
356 |
<name>mimetypes.csv.${output_name_html}</name> |
|
357 |
<value>${mimetypes_html}</value> |
|
358 |
</property> |
|
359 |
<property> |
|
338 | 360 |
<name>mimetypes.csv.${output_name_xml_pmc}</name> |
339 | 361 |
<value>${mimetypes_xml_pmc}</value> |
340 | 362 |
</property> |
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/preprocessing/main/oozie_app/workflow.xml | ||
---|---|---|
62 | 62 |
<description>text mime types</description> |
63 | 63 |
</property> |
64 | 64 |
<property> |
65 |
<name>import_content_mimetypes_html</name> |
|
66 |
<value>text/html</value> |
|
67 |
<description>html mime types</description> |
|
68 |
</property> |
|
69 |
<property> |
|
65 | 70 |
<name>import_content_mimetypes_xml_pmc</name> |
66 | 71 |
<value>xml</value> |
67 | 72 |
<description>xml pmc types</description> |
... | ... | |
220 | 225 |
<value>${import_content_mimetypes_text}</value> |
221 | 226 |
</property> |
222 | 227 |
<property> |
228 |
<name>mimetypes_html</name> |
|
229 |
<value>${import_content_mimetypes_html}</value> |
|
230 |
</property> |
|
231 |
<property> |
|
223 | 232 |
<name>mimetypes_xml_pmc</name> |
224 | 233 |
<value>${import_content_mimetypes_xml_pmc}</value> |
225 | 234 |
</property> |
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/preprocessing/import/oozie_app/workflow.xml | ||
---|---|---|
56 | 56 |
<description>text mime types</description> |
57 | 57 |
</property> |
58 | 58 |
<property> |
59 |
<name>mimetypes_html</name> |
|
60 |
<description>html mime types</description> |
|
61 |
</property> |
|
62 |
<property> |
|
59 | 63 |
<name>mimetypes_xml_pmc</name> |
60 | 64 |
<description>xml pmc types</description> |
61 | 65 |
</property> |
... | ... | |
213 | 217 |
<value>${mimetypes_text}</value> |
214 | 218 |
</property> |
215 | 219 |
<property> |
220 |
<name>mimetypes_html</name> |
|
221 |
<value>${mimetypes_html}</value> |
|
222 |
</property> |
|
223 |
<property> |
|
216 | 224 |
<name>mimetypes_xml_pmc</name> |
217 | 225 |
<value>${mimetypes_xml_pmc}</value> |
218 | 226 |
</property> |
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/primary/main/oozie_app/workflow.xml | ||
---|---|---|
109 | 109 |
<description>text mime types</description> |
110 | 110 |
</property> |
111 | 111 |
<property> |
112 |
<name>import_content_mimetypes_html</name> |
|
113 |
<value>text/html</value> |
|
114 |
<description>html mime types</description> |
|
115 |
</property> |
|
116 |
<property> |
|
112 | 117 |
<name>import_content_mimetypes_xml_pmc</name> |
113 | 118 |
<value>xml</value> |
114 | 119 |
<description>xml pmc types</description> |
... | ... | |
352 | 357 |
<value>${import_content_mimetypes_text}</value> |
353 | 358 |
</property> |
354 | 359 |
<property> |
360 |
<name>mimetypes_html</name> |
|
361 |
<value>${import_content_mimetypes_html}</value> |
|
362 |
</property> |
|
363 |
<property> |
|
355 | 364 |
<name>mimetypes_xml_pmc</name> |
356 | 365 |
<value>${import_content_mimetypes_xml_pmc}</value> |
357 | 366 |
</property> |
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/common/import/job.properties | ||
---|---|---|
1 |
active_import_metadata=true
|
|
1 |
active_import_metadata=false
|
|
2 | 2 |
active_import_dataset=false |
3 |
active_ingest_pmc_citations=true
|
|
4 |
active_import_concept=true
|
|
3 |
active_ingest_pmc_citations=false
|
|
4 |
active_import_concept=false
|
|
5 | 5 |
|
6 |
hbase_input_table=information_space-2014-11-05
|
|
6 |
hbase_input_table=db_openaireplus_services_beta
|
|
7 | 7 |
|
8 | 8 |
#import concepts related |
9 | 9 |
islookup_service_location=http://beta.services.openaire.eu:8280/is/services/isLookUp |
... | ... | |
13 | 13 |
database_dbname=dnet_openaireplus |
14 | 14 |
|
15 | 15 |
objectstore_service_location=http://beta.services.openaire.eu:8280/is/services/objectStore |
16 |
#approved_objectstores_csv=$UNDEFINED$
|
|
16 |
approved_objectstores_csv=$UNDEFINED$ |
|
17 | 17 |
#puma |
18 | 18 |
#approved_objectstores_csv=794e8173-8be3-4f51-a12e-b43d12ab3b7d_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl |
19 | 19 |
#arxiv content |
20 | 20 |
#approved_objectstores_csv=258755af-0b48-41ee-9652-939c5bd2fca3_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl |
21 | 21 |
#pmc content |
22 |
approved_objectstores_csv=b2b6fca5-ce18-498c-a375-b02df97998f0_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl |
|
22 |
#approved_objectstores_csv=b2b6fca5-ce18-498c-a375-b02df97998f0_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl
|
|
23 | 23 |
#hal content |
24 | 24 |
#approved_objectstores_csv=2ad5f567-386d-4812-8edb-c0922eacd107_T2JqZWN0U3RvcmVEU1Jlc291cmNlcy9PYmplY3RTdG9yZURTUmVzb3VyY2VUeXBl |
25 | 25 |
#wos |
... | ... | |
32 | 32 |
|
33 | 33 |
mimetypes_pdf=pdf,application/pdf |
34 | 34 |
mimetypes_text=text |
35 |
mimetypes_html=text/html |
|
35 | 36 |
mimetypes_xml_pmc=xml |
36 | 37 |
mimetypes_wos=file::WoS |
37 | 38 |
|
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/common/import/oozie_app/import.txt | ||
---|---|---|
5 | 5 |
import_concept classpath eu/dnetlib/iis/importer/concept/oozie_app |
6 | 6 |
import_content_url classpath eu/dnetlib/iis/mainworkflows/importer/content_url/oozie_app |
7 | 7 |
import_plaintext classpath eu/dnetlib/iis/importer/plaintext/oozie_app |
8 |
ingest_html_plaintext classpath eu/dnetlib/iis/ingest/html/plaintext/oozie_app |
|
8 | 9 |
ingest_pmc_plaintext classpath eu/dnetlib/iis/ingest/pmc/plaintext/oozie_app |
9 | 10 |
ingest_pmc_metadata classpath eu/dnetlib/iis/ingest/pmc/metadata/oozie_app |
10 | 11 |
ingest_pmc_citations classpath eu/dnetlib/iis/ingest/pmc/citations/oozie_app |
... | ... | |
12 | 13 |
multiple_input_collapser classpath eu/dnetlib/iis/collapsers/multiple_input_collapser/oozie_app |
13 | 14 |
metadataextraction classpath eu/dnetlib/iis/metadataextraction/oozie_app |
14 | 15 |
metadataextraction_cached classpath eu/dnetlib/iis/mainworkflows/metadataextraction/cached_by_checksum/oozie_app |
15 |
transformers_common_union3 classpath eu/dnetlib/iis/transformers/common/union3/oozie_app
|
|
16 |
transformers_common_union4 classpath eu/dnetlib/iis/transformers/common/union4/oozie_app
|
|
16 | 17 |
transformers_idextractor classpath eu/dnetlib/iis/transformers/importer/documentmetadata/idextractor/oozie_app |
17 | 18 |
transformers_externalidtooaid classpath eu/dnetlib/iis/transformers/importer/documentmetadata/externalidtooaid/oozie_app |
18 | 19 |
transformers_ingest_pmc_metadata classpath eu/dnetlib/iis/transformers/ingest/pmc/metadata/oozie_app |
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/common/import/oozie_app/workflow.xml | ||
---|---|---|
108 | 108 |
<description>text mime types</description> |
109 | 109 |
</property> |
110 | 110 |
<property> |
111 |
<name>mimetypes_html</name> |
|
112 |
<description>html mime types</description> |
|
113 |
</property> |
|
114 |
<property> |
|
111 | 115 |
<name>mimetypes_xml_pmc</name> |
112 | 116 |
<description>xml pmc types</description> |
113 | 117 |
</property> |
... | ... | |
550 | 554 |
<value>${mimetypes_text}</value> |
551 | 555 |
</property> |
552 | 556 |
<property> |
557 |
<name>mimetypes_html</name> |
|
558 |
<value>${mimetypes_html}</value> |
|
559 |
</property> |
|
560 |
<property> |
|
553 | 561 |
<name>mimetypes_xml_pmc</name> |
554 | 562 |
<value>${mimetypes_xml_pmc}</value> |
555 | 563 |
</property> |
... | ... | |
582 | 590 |
<value>text</value> |
583 | 591 |
</property> |
584 | 592 |
<property> |
593 |
<name>output_name_html</name> |
|
594 |
<value>html</value> |
|
595 |
</property> |
|
596 |
<property> |
|
585 | 597 |
<name>output_name_xml_pmc</name> |
586 | 598 |
<value>xmlpmc</value> |
587 | 599 |
</property> |
... | ... | |
625 | 637 |
<path start="import_plaintext"/> |
626 | 638 |
<path start="import_wos"/> |
627 | 639 |
<path start="import_plaintext_pmc"/> |
640 |
<path start="import_html"/> |
|
628 | 641 |
<path start="decision-metadata_extractor_use_cache"/> |
629 | 642 |
</fork> |
630 | 643 |
|
... | ... | |
953 | 966 |
|
954 | 967 |
<join name="ingest_pmc_joining" to="import_urlbased_joining"/> |
955 | 968 |
|
969 |
<!-- html import and plaintext ingestion section --> |
|
970 |
<action name="import_html"> |
|
971 |
<sub-workflow> |
|
972 |
<app-path>${wf:appPath()}/import_plaintext</app-path> |
|
973 |
<propagate-configuration/> |
|
974 |
<configuration> |
|
975 |
<property> |
|
976 |
<name>workingDir</name> |
|
977 |
<value>${workingDir}/import_html/working_dir</value> |
|
978 |
</property> |
|
979 |
<property> |
|
980 |
<name>input</name> |
|
981 |
<value>${workingDir}/import_content_url/imported/html</value> |
|
982 |
</property> |
|
983 |
<property> |
|
984 |
<name>content_connection_timeout</name> |
|
985 |
<value>${content_connection_timeout}</value> |
|
986 |
</property> |
|
987 |
<property> |
|
988 |
<name>content_read_timeout</name> |
|
989 |
<value>${content_read_timeout}</value> |
|
990 |
</property> |
|
991 |
<property> |
|
992 |
<name>output</name> |
|
993 |
<value>${workingDir}/import_html/imported</value> |
|
994 |
</property> |
|
995 |
</configuration> |
|
996 |
</sub-workflow> |
|
997 |
<ok to="ingest_html_plaintext" /> |
|
998 |
<error to="fail" /> |
|
999 |
</action> |
|
1000 |
|
|
1001 |
|
|
1002 |
<action name="ingest_html_plaintext"> |
|
1003 |
<sub-workflow> |
|
1004 |
<app-path>${wf:appPath()}/ingest_html_plaintext</app-path> |
|
1005 |
<propagate-configuration/> |
|
1006 |
<configuration> |
|
1007 |
<property> |
|
1008 |
<name>workingDir</name> |
|
1009 |
<value>${workingDir}/ingest_html_plaintext/working_dir</value> |
|
1010 |
</property> |
|
1011 |
<property> |
|
1012 |
<name>input</name> |
|
1013 |
<value>${workingDir}/import_html/imported</value> |
|
1014 |
</property> |
|
1015 |
<property> |
|
1016 |
<name>output</name> |
|
1017 |
<value>${workingDir}/ingest_html_plaintext/imported</value> |
|
1018 |
</property> |
|
1019 |
</configuration> |
|
1020 |
</sub-workflow> |
|
1021 |
<ok to="import_urlbased_joining" /> |
|
1022 |
<error to="fail" /> |
|
1023 |
</action> |
|
1024 |
|
|
956 | 1025 |
<!-- metadata extraction section --> |
957 | 1026 |
<decision name="decision-metadata_extractor_use_cache"> |
958 | 1027 |
<switch> |
... | ... | |
1074 | 1143 |
<!-- merging document text datastores: |
1075 | 1144 |
1) retrieved directly from objectstore |
1076 | 1145 |
2) generated by metadataextraction |
1077 |
3) imported from PMC XMLs |
|
1146 |
3) ingested from PMC XMLs |
|
1147 |
4) ingested from HTML |
|
1078 | 1148 |
--> |
1079 | 1149 |
<action name="transformers_common_union_document_text"> |
1080 | 1150 |
<sub-workflow> |
1081 |
<app-path>${wf:appPath()}/transformers_common_union3</app-path>
|
|
1151 |
<app-path>${wf:appPath()}/transformers_common_union4</app-path>
|
|
1082 | 1152 |
<propagate-configuration/> |
1083 | 1153 |
<configuration> |
1084 | 1154 |
<property> |
... | ... | |
1098 | 1168 |
<value>${workingDir}/ingest_pmc_plaintext/imported</value> |
1099 | 1169 |
</property> |
1100 | 1170 |
<property> |
1171 |
<name>input_d</name> |
|
1172 |
<value>${workingDir}/ingest_html_plaintext/imported</value> |
|
1173 |
</property> |
|
1174 |
<property> |
|
1101 | 1175 |
<name>output</name> |
1102 | 1176 |
<value>${output_document_text}</value> |
1103 | 1177 |
</property> |
modules/icm-iis-mainworkflows/trunk/src/main/resources/eu/dnetlib/iis/mainworkflows/statistics/oozie_app/workflow.xml | ||
---|---|---|
38 | 38 |
<description>text mime types</description> |
39 | 39 |
</property> |
40 | 40 |
<property> |
41 |
<name>import_content_mimetypes_html</name> |
|
42 |
<value>text/html</value> |
|
43 |
<description>html mime types</description> |
|
44 |
</property> |
|
45 |
<property> |
|
41 | 46 |
<name>import_content_mimetypes_xml_pmc</name> |
42 | 47 |
<value>xml</value> |
43 | 48 |
<description>xml pmc types</description> |
... | ... | |
210 | 215 |
<value>${import_content_mimetypes_text}</value> |
211 | 216 |
</property> |
212 | 217 |
<property> |
218 |
<name>mimetypes_html</name> |
|
219 |
<value>${import_content_mimetypes_html}</value> |
|
220 |
</property> |
|
221 |
<property> |
|
213 | 222 |
<name>mimetypes_xml_pmc</name> |
214 | 223 |
<value>${import_content_mimetypes_xml_pmc}</value> |
215 | 224 |
</property> |
Also available in: Unified diff
#1147 introducing HTML import and HTML plaintext ingestion in main workflows: primary and preprocessing