Revision 37109
Added by Marek Horst over 9 years ago
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/researchinitiatives/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_to_research_initiative |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_document_to_research_initiative');
|
|
3 |
'schema', '$schema_input_document_to_research_initiative');
|
|
4 | 4 |
|
5 | 5 |
define avro_store_document_to_research_initiatives |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 | 7 |
'index', '0', |
8 |
'output_schema_class', '$schema_output_document_to_research_initiatives');
|
|
8 |
'schema', '$schema_output_document_to_research_initiatives');
|
|
9 | 9 |
|
10 | 10 |
documentToResearchInitiative = load '$input_document_to_research_initiative' using avro_load_document_to_research_initiative; |
11 | 11 |
|
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/referenceextraction/project/toconcept/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
30 | 30 |
<arg>-C{concept, |
31 | 31 |
eu.dnetlib.iis.importer.schemas.Concept, |
32 | 32 |
eu/dnetlib/iis/transformers/referenceextraction/project/toconcept/sampledataproducer/data/concept.json}</arg> |
33 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
34 |
directory has to be specified as well --> |
|
35 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
33 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
36 | 34 |
<arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg> |
37 | 35 |
<arg>-Oproject=${workingDir}/producer/project</arg> |
38 | 36 |
<arg>-Oconcept=${workingDir}/producer/concept</arg> |
... | ... | |
106 | 104 |
<arg>-C{output, |
107 | 105 |
eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToConceptId, |
108 | 106 |
eu/dnetlib/iis/transformers/referenceextraction/project/toconcept/sampledataproducer/data/output.json}</arg> |
109 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
110 |
directory has to be specified as well --> |
|
111 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
107 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
112 | 108 |
<arg>-Ioutput=${workingDir}/transformer_project_toconcept/output</arg> |
113 | 109 |
</java> |
114 | 110 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/preprocessing/oozie_app/workflow.xml | ||
---|---|---|
11 | 11 |
</property> |
12 | 12 |
</parameters> |
13 | 13 |
|
14 |
<start to="transformer"/> |
|
14 |
<start to="generate-schema"/> |
|
15 |
|
|
16 |
<action name="generate-schema"> |
|
17 |
<java> |
|
18 |
<job-tracker>${jobTracker}</job-tracker> |
|
19 |
<name-node>${nameNode}</name-node> |
|
20 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
21 |
<arg>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</arg> |
|
22 |
<capture-output /> |
|
23 |
</java> |
|
24 |
<ok to="transformer" /> |
|
25 |
<error to="fail" /> |
|
26 |
</action> |
|
27 |
|
|
15 | 28 |
<action name="transformer"> |
16 | 29 |
<pig> |
17 | 30 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
28 | 41 |
</configuration> |
29 | 42 |
<!-- Path to PIG script the workflow executes. --> |
30 | 43 |
<script>lib/scripts/transformer.pig</script> |
31 |
<param>schema_document_content_url=eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</param>
|
|
44 |
<param>schema_document_content_url=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl']}</param>
|
|
32 | 45 |
<param>input=${input}</param> |
33 | 46 |
<param>output=${output}</param> |
34 | 47 |
</pig> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/export/documentmetadata/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
23 | 23 |
<arg>-C{extracted_metadata, |
24 | 24 |
eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata, |
25 | 25 |
eu/dnetlib/iis/transformers/export/documentmetadata/sampledataproducer/data/extr_metadata.json}</arg> |
26 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
27 |
directory has to be specified as well --> |
|
28 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
26 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
29 | 27 |
<arg>-Oextracted_metadata=${workingDir}/producer/extr_metadata</arg> |
30 | 28 |
</java> |
31 | 29 |
<ok to="transformer_export_documentmetadata"/> |
... | ... | |
85 | 83 |
<arg>-C{output_metadata, |
86 | 84 |
eu.dnetlib.iis.export.schemas.DocumentMetadata, |
87 | 85 |
eu/dnetlib/iis/transformers/export/documentmetadata/sampledataproducer/data/output_metadata.json}</arg> |
88 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
89 |
directory has to be specified as well --> |
|
90 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
86 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
91 | 87 |
<arg>-Ioutput_metadata=${workingDir}/transformer_export_documentmetadata/output_metadata</arg> |
92 | 88 |
</java> |
93 | 89 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/export/researchinitiatives/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
22 | 22 |
<arg>-C{document_to_concept_id, |
23 | 23 |
eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToConceptId, |
24 | 24 |
eu/dnetlib/iis/transformers/export/researchinitiatives/sampledataproducer/data/document_to_concept_id.json}</arg> |
25 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
26 | 25 |
<arg>-Odocument_to_concept_id=${workingDir}/producer/output</arg> |
27 | 26 |
</java> |
28 | 27 |
<ok to="transformer_export_researchinitiatives"/> |
... | ... | |
78 | 77 |
<arg>-C{output, |
79 | 78 |
eu.dnetlib.iis.export.schemas.DocumentToConceptIds, |
80 | 79 |
eu/dnetlib/iis/transformers/export/researchinitiatives/sampledataproducer/data/document_to_concept_ids.json}</arg> |
81 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
82 | 80 |
<arg>-Ioutput=${workingDir}/transformer_export_researchinitiatives/output</arg> |
83 | 81 |
</java> |
84 | 82 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/export/documenttodataset_without_imported_data/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
27 | 27 |
eu.dnetlib.iis.importer.schemas.DocumentRelation, |
28 | 28 |
eu/dnetlib/iis/transformers/export/documenttodataset_without_imported_data/sampledataproducer/data/document_relation.json}</arg> |
29 | 29 |
|
30 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
31 |
directory has to be specified as well --> |
|
32 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
30 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
33 | 31 |
<arg>-Odocument_to_dataset=${workingDir}/producer/document_to_dataset</arg> |
34 | 32 |
<arg>-Odocument_relation=${workingDir}/producer/document_relation</arg> |
35 | 33 |
</java> |
... | ... | |
94 | 92 |
<arg>-C{document_to_dataset, |
95 | 93 |
eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet, |
96 | 94 |
eu/dnetlib/iis/transformers/export/documenttodataset_without_imported_data/sampledataproducer/data/output_document_to_dataset.json}</arg> |
97 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
98 |
directory has to be specified as well --> |
|
99 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
95 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
100 | 96 |
<arg>-Idocument_to_dataset=${workingDir}/transformer_export_documenttodataset_without_imported_data/document_to_dataset</arg> |
101 | 97 |
</java> |
102 | 98 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/export/documenttoproject_without_imported_data/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
27 | 27 |
eu.dnetlib.iis.importer.schemas.DocumentToProject, |
28 | 28 |
eu/dnetlib/iis/transformers/export/documenttoproject_without_imported_data/sampledataproducer/data/imported_document_to_project.json}</arg> |
29 | 29 |
|
30 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
31 |
directory has to be specified as well --> |
|
32 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
30 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
33 | 31 |
<arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg> |
34 | 32 |
<arg>-Oimported_document_to_project=${workingDir}/producer/imported_document_to_project</arg> |
35 | 33 |
</java> |
... | ... | |
94 | 92 |
<arg>-C{document_to_project, |
95 | 93 |
eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject, |
96 | 94 |
eu/dnetlib/iis/transformers/export/documenttoproject_without_imported_data/sampledataproducer/data/output_document_to_project.json}</arg> |
97 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
98 |
directory has to be specified as well --> |
|
99 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
95 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
100 | 96 |
<arg>-Idocument_to_project=${workingDir}/transformer_export_documenttoproject_without_imported_data/document_to_project</arg> |
101 | 97 |
</java> |
102 | 98 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/importer/documentmetadata/idextractor/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
25 | 25 |
<arg>-C{document_metadata, |
26 | 26 |
eu.dnetlib.iis.importer.schemas.DocumentMetadata, |
27 | 27 |
eu/dnetlib/iis/transformers/importer/documentmetadata/idextractor/sampledataproducer/data/input_document_metadata.json}</arg> |
28 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
29 | 28 |
<arg>-Odocument_metadata=${workingDir}/producer/document_metadata</arg> |
30 | 29 |
</java> |
31 | 30 |
<ok to="id-extractor"/> |
... | ... | |
82 | 81 |
<arg>-C{output_identifier, |
83 | 82 |
eu.dnetlib.iis.common.schemas.DocumentId, |
84 | 83 |
eu/dnetlib/iis/transformers/importer/documentmetadata/idextractor/sampledataproducer/data/output_document_id.json}</arg> |
85 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
86 | 84 |
<arg>-Ioutput_identifier=${workingDir}/transformers_idextractor/output</arg> |
87 | 85 |
</java> |
88 | 86 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/text/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{document_text, |
27 | 27 |
eu.dnetlib.iis.metadataextraction.schemas.DocumentText, |
28 | 28 |
eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/text/sampledataproducer/data/input_document_text.json}</arg> |
29 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
30 | 29 |
<arg>-Odocument_content_url=${workingDir}/producer/document_content_url</arg> |
31 | 30 |
<arg>-Odocument_text=${workingDir}/producer/document_text</arg> |
32 | 31 |
</java> |
... | ... | |
86 | 85 |
<arg>-C{output_metadata, |
87 | 86 |
eu.dnetlib.iis.metadataextraction.schemas.DocumentText, |
88 | 87 |
eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/text/sampledataproducer/data/output_document_text.json}</arg> |
89 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
90 | 88 |
<arg>-Ioutput_metadata=${workingDir}/transformer_metadataextraction_checksum_postprocessing_text/output</arg> |
91 | 89 |
</java> |
92 | 90 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/documenttodataset_without_imported_data/oozie_app/workflow.xml | ||
---|---|---|
12 | 12 |
</property> |
13 | 13 |
</parameters> |
14 | 14 |
|
15 |
<start to="transformer"/> |
|
15 |
<start to="generate-schema"/> |
|
16 |
|
|
17 |
<action name="generate-schema"> |
|
18 |
<java> |
|
19 |
<job-tracker>${jobTracker}</job-tracker> |
|
20 |
<name-node>${nameNode}</name-node> |
|
21 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
22 |
<arg>eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet</arg> |
|
23 |
<arg>eu.dnetlib.iis.importer.schemas.DocumentRelation</arg> |
|
24 |
<capture-output /> |
|
25 |
</java> |
|
26 |
<ok to="transformer" /> |
|
27 |
<error to="fail" /> |
|
28 |
</action> |
|
29 |
|
|
16 | 30 |
<action name="transformer"> |
17 | 31 |
<pig> |
18 | 32 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
31 | 45 |
<script>lib/scripts/transformer.pig</script> |
32 | 46 |
|
33 | 47 |
<param>input_document_to_dataset=${input_document_to_dataset}</param> |
34 |
<param>schema_input_document_to_dataset=eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet</param>
|
|
48 |
<param>schema_input_document_to_dataset=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet']}</param>
|
|
35 | 49 |
|
36 | 50 |
<param>input_document_relation=${input_document_relation}</param> |
37 |
<param>schema_input_document_relation=eu.dnetlib.iis.importer.schemas.DocumentRelation</param>
|
|
51 |
<param>schema_input_document_relation=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.schemas.DocumentRelation']}</param>
|
|
38 | 52 |
|
39 | 53 |
<param>output_document_to_dataset=${output_document_to_dataset}</param> |
40 |
<param>schema_output_document_to_dataset=eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet</param>
|
|
54 |
<param>schema_output_document_to_dataset=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet']}</param>
|
|
41 | 55 |
</pig> |
42 | 56 |
<ok to="end"/> |
43 | 57 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/citations/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_input_citations |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input');
|
|
3 |
'schema', '$schema_input');
|
|
4 | 4 |
|
5 | 5 |
define avro_store_output_citations |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 | 7 |
'index', '0', |
8 |
'output_schema_class', '$schema_output');
|
|
8 |
'schema', '$schema_output');
|
|
9 | 9 |
|
10 | 10 |
input_citations = load '$input' using avro_load_input_citations; |
11 | 11 |
|
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/skip_extracted/oozie_app/workflow.xml | ||
---|---|---|
27 | 27 |
</property> |
28 | 28 |
</parameters> |
29 | 29 |
|
30 |
<start to="transformer"/> |
|
30 |
<start to="generate-schema"/> |
|
31 |
|
|
32 |
<action name="generate-schema"> |
|
33 |
<java> |
|
34 |
<job-tracker>${jobTracker}</job-tracker> |
|
35 |
<name-node>${nameNode}</name-node> |
|
36 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
37 |
<arg>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</arg> |
|
38 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg> |
|
39 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg> |
|
40 |
<capture-output /> |
|
41 |
</java> |
|
42 |
<ok to="transformer" /> |
|
43 |
<error to="fail" /> |
|
44 |
</action> |
|
45 |
|
|
31 | 46 |
<action name="transformer"> |
32 | 47 |
<pig> |
33 | 48 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
55 | 70 |
<!-- Path to PIG script the workflow executes. --> |
56 | 71 |
<script>lib/scripts/transformer.pig</script> |
57 | 72 |
|
58 |
<param>schema_document_content=eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</param>
|
|
59 |
<param>schema_document_text=eu.dnetlib.iis.metadataextraction.schemas.DocumentText</param>
|
|
60 |
<param>schema_document_meta=eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</param>
|
|
73 |
<param>schema_document_content=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl']}</param>
|
|
74 |
<param>schema_document_text=${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</param>
|
|
75 |
<param>schema_document_meta=${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</param>
|
|
61 | 76 |
|
62 | 77 |
<param>input_document_content=${input_document_content}</param> |
63 | 78 |
<param>input_document_text=${input_document_text}</param> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/documentmetadata/oozie_app/workflow.xml | ||
---|---|---|
11 | 11 |
</property> |
12 | 12 |
</parameters> |
13 | 13 |
|
14 |
<start to="transformer"/>
|
|
14 |
<start to="generate-schema"/>
|
|
15 | 15 |
|
16 |
<action name="generate-schema"> |
|
17 |
<java> |
|
18 |
<job-tracker>${jobTracker}</job-tracker> |
|
19 |
<name-node>${nameNode}</name-node> |
|
20 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
21 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg> |
|
22 |
<arg>eu.dnetlib.iis.export.schemas.DocumentMetadata</arg> |
|
23 |
<capture-output /> |
|
24 |
</java> |
|
25 |
<ok to="transformer" /> |
|
26 |
<error to="fail" /> |
|
27 |
</action> |
|
28 |
|
|
16 | 29 |
<action name="transformer"> |
17 | 30 |
<pig> |
18 | 31 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
31 | 44 |
<script>lib/scripts/transformer/transformer.pig</script> |
32 | 45 |
|
33 | 46 |
<param>input_extracted_metadata=${input_extracted_metadata}</param> |
34 |
<param>schema_input_extracted_metadata=eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</param>
|
|
47 |
<param>schema_input_extracted_metadata=${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</param>
|
|
35 | 48 |
|
36 | 49 |
<param>output_metadata=${output_metadata}</param> |
37 |
<param>schema_output_metadata=eu.dnetlib.iis.export.schemas.DocumentMetadata</param>
|
|
50 |
<param>schema_output_metadata=${wf:actionData('generate-schema')['eu.dnetlib.iis.export.schemas.DocumentMetadata']}</param>
|
|
38 | 51 |
</pig> |
39 | 52 |
<ok to="end"/> |
40 | 53 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/oozie_app/workflow.xml | ||
---|---|---|
12 | 12 |
</property> |
13 | 13 |
</parameters> |
14 | 14 |
|
15 |
<start to="transformer"/> |
|
15 |
<start to="generate-schema"/> |
|
16 |
|
|
17 |
<action name="generate-schema"> |
|
18 |
<java> |
|
19 |
<job-tracker>${jobTracker}</job-tracker> |
|
20 |
<name-node>${nameNode}</name-node> |
|
21 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
22 |
<arg>eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet</arg> |
|
23 |
<arg>eu.dnetlib.iis.importer.schemas.DocumentToMDStore</arg> |
|
24 |
<capture-output /> |
|
25 |
</java> |
|
26 |
<ok to="transformer" /> |
|
27 |
<error to="fail" /> |
|
28 |
</action> |
|
29 |
|
|
16 | 30 |
<action name="transformer"> |
17 | 31 |
<pig> |
18 | 32 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
31 | 45 |
<script>lib/scripts/transformer.pig</script> |
32 | 46 |
|
33 | 47 |
<param>input_document_to_dataset=${input_document_to_dataset}</param> |
34 |
<param>schema_input_document_to_dataset=eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet</param>
|
|
48 |
<param>schema_input_document_to_dataset=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet']}</param>
|
|
35 | 49 |
|
36 | 50 |
<param>input_document_to_mdstore=${input_document_to_mdstore}</param> |
37 |
<param>schema_input_document_to_mdstore=eu.dnetlib.iis.importer.schemas.DocumentToMDStore</param>
|
|
51 |
<param>schema_input_document_to_mdstore=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.schemas.DocumentToMDStore']}</param>
|
|
38 | 52 |
|
39 | 53 |
<param>output_document_to_mdstore=${output_document_to_mdstore}</param> |
40 |
<param>schema_output_document_to_mdstore=eu.dnetlib.iis.importer.schemas.DocumentToMDStore</param>
|
|
54 |
<param>schema_output_document_to_mdstore=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.schemas.DocumentToMDStore']}</param>
|
|
41 | 55 |
</pig> |
42 | 56 |
<ok to="end"/> |
43 | 57 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttoproject/oozie_app/workflow.xml | ||
---|---|---|
9 | 9 |
</property> |
10 | 10 |
</parameters> |
11 | 11 |
|
12 |
<start to="transformer"/> |
|
12 |
<start to="generate-schema"/> |
|
13 |
|
|
14 |
<action name="generate-schema"> |
|
15 |
<java> |
|
16 |
<job-tracker>${jobTracker}</job-tracker> |
|
17 |
<name-node>${nameNode}</name-node> |
|
18 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
19 |
<arg>eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject</arg> |
|
20 |
<arg>eu.dnetlib.iis.export.auxiliary.schemas.Identifier</arg> |
|
21 |
<capture-output /> |
|
22 |
</java> |
|
23 |
<ok to="transformer" /> |
|
24 |
<error to="fail" /> |
|
25 |
</action> |
|
26 |
|
|
13 | 27 |
<action name="transformer"> |
14 | 28 |
<pig> |
15 | 29 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
28 | 42 |
<script>lib/scripts/transformer.pig</script> |
29 | 43 |
|
30 | 44 |
<param>input_document_to_project=${input_document_to_project}</param> |
31 |
<param>schema_input_document_to_project=eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject</param>
|
|
45 |
<param>schema_input_document_to_project=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject']}</param>
|
|
32 | 46 |
|
33 | 47 |
<param>output_identifier=${output_identifier}</param> |
34 |
<param>schema_output_identifier=eu.dnetlib.iis.export.auxiliary.schemas.Identifier</param>
|
|
48 |
<param>schema_output_identifier=${wf:actionData('generate-schema')['eu.dnetlib.iis.export.auxiliary.schemas.Identifier']}</param>
|
|
35 | 49 |
</pig> |
36 | 50 |
<ok to="end"/> |
37 | 51 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/documenttodataset_without_imported_data/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_to_dataset |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_document_to_dataset');
|
|
3 |
'schema', '$schema_input_document_to_dataset');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_document_relation |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_input_document_relation');
|
|
7 |
'schema', '$schema_input_document_relation');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_document_to_dataset |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema_output_document_to_dataset');
|
|
12 |
'schema', '$schema_output_document_to_dataset');
|
|
13 | 13 |
|
14 | 14 |
|
15 | 15 |
documentToDataset = load '$input_document_to_dataset' using avro_load_document_to_dataset; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/documenttoproject_without_imported_data/oozie_app/workflow.xml | ||
---|---|---|
12 | 12 |
</property> |
13 | 13 |
</parameters> |
14 | 14 |
|
15 |
<start to="transformer"/> |
|
15 |
<start to="generate-schema"/> |
|
16 |
|
|
17 |
<action name="generate-schema"> |
|
18 |
<java> |
|
19 |
<job-tracker>${jobTracker}</job-tracker> |
|
20 |
<name-node>${nameNode}</name-node> |
|
21 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
22 |
<arg>eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject</arg> |
|
23 |
<arg>eu.dnetlib.iis.importer.schemas.DocumentToProject</arg> |
|
24 |
<capture-output /> |
|
25 |
</java> |
|
26 |
<ok to="transformer" /> |
|
27 |
<error to="fail" /> |
|
28 |
</action> |
|
29 |
|
|
16 | 30 |
<action name="transformer"> |
17 | 31 |
<pig> |
18 | 32 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
31 | 45 |
<script>lib/scripts/transformer.pig</script> |
32 | 46 |
|
33 | 47 |
<param>input_document_to_project=${input_document_to_project}</param> |
34 |
<param>schema_input_document_to_project=eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject</param>
|
|
48 |
<param>schema_input_document_to_project=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject']}</param>
|
|
35 | 49 |
|
36 | 50 |
<param>input_imported_document_to_project=${input_imported_document_to_project}</param> |
37 |
<param>schema_input_imported_document_to_project=eu.dnetlib.iis.importer.schemas.DocumentToProject</param>
|
|
51 |
<param>schema_input_imported_document_to_project=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.schemas.DocumentToProject']}</param>
|
|
38 | 52 |
|
39 | 53 |
<param>output_document_to_project=${output_document_to_project}</param> |
40 |
<param>schema_output_document_to_project=eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject</param>
|
|
54 |
<param>schema_output_document_to_project=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject']}</param>
|
|
41 | 55 |
</pig> |
42 | 56 |
<ok to="end"/> |
43 | 57 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadatamerger/oozie_app/workflow.xml | ||
---|---|---|
15 | 15 |
</property> |
16 | 16 |
</parameters> |
17 | 17 |
|
18 |
<start to="merger"/>
|
|
18 |
<start to="generate-schema"/>
|
|
19 | 19 |
|
20 |
<action name="generate-schema"> |
|
21 |
<java> |
|
22 |
<job-tracker>${jobTracker}</job-tracker> |
|
23 |
<name-node>${nameNode}</name-node> |
|
24 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
25 |
<arg>eu.dnetlib.iis.importer.schemas.DocumentMetadata</arg> |
|
26 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg> |
|
27 |
<arg>eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal</arg> |
|
28 |
<capture-output /> |
|
29 |
</java> |
|
30 |
<ok to="merger" /> |
|
31 |
<error to="fail" /> |
|
32 |
</action> |
|
33 |
|
|
20 | 34 |
<action name="merger"> |
21 | 35 |
<pig> |
22 | 36 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
35 | 49 |
<script>lib/scripts/merger/merger.pig</script> |
36 | 50 |
|
37 | 51 |
<param>input_base_metadata=${input_base_metadata}</param> |
38 |
<param>schema_input_base_metadata=eu.dnetlib.iis.importer.schemas.DocumentMetadata</param>
|
|
52 |
<param>schema_input_base_metadata=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.schemas.DocumentMetadata']}</param>
|
|
39 | 53 |
|
40 | 54 |
<param>input_extracted_metadata=${input_extracted_metadata}</param> |
41 |
<param>schema_input_extracted_metadata=eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</param>
|
|
55 |
<param>schema_input_extracted_metadata=${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</param>
|
|
42 | 56 |
|
43 | 57 |
<param>output_merged_metadata=${output_merged_metadata}</param> |
44 |
<param>schema_output_merged_metadata=eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal</param>
|
|
58 |
<param>schema_output_merged_metadata=${wf:actionData('generate-schema')['eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal']}</param>
|
|
45 | 59 |
</pig> |
46 | 60 |
<ok to="end"/> |
47 | 61 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadatamerger/oozie_app/lib/scripts/merger/merger.pig | ||
---|---|---|
1 | 1 |
define avro_load_base_metadata |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_base_metadata');
|
|
3 |
'schema', '$schema_input_base_metadata');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_extracted_metadata |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_input_extracted_metadata');
|
|
7 |
'schema', '$schema_input_extracted_metadata');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_merged_metadata |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema_output_merged_metadata');
|
|
12 |
'schema', '$schema_output_merged_metadata');
|
|
13 | 13 |
|
14 | 14 |
define FIRST_NOT_NULL_STR eu.dnetlib.iis.transformers.udfs.StringFirstNotEmpty; |
15 | 15 |
define FIRST_NOT_NULL_INT eu.dnetlib.iis.transformers.udfs.IntegerFirstNotEmpty; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/meta/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{extracted_document_metadata, |
27 | 27 |
eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata, |
28 | 28 |
eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/meta/sampledataproducer/data/input_extracted_document_metadata.json}</arg> |
29 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
30 | 29 |
<arg>-Odocument_content_url=${workingDir}/producer/document_content_url</arg> |
31 | 30 |
<arg>-Oextracted_document_metadata=${workingDir}/producer/extracted_document_metadata</arg> |
32 | 31 |
</java> |
... | ... | |
87 | 86 |
<arg>-C{output_metadata, |
88 | 87 |
eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata, |
89 | 88 |
eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/meta/sampledataproducer/data/output_extracted_document_metadata.json}</arg> |
90 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
91 | 89 |
<arg>-Ioutput_metadata=${workingDir}/transformer_metadataextraction_checksum_postprocessing_meta/output</arg> |
92 | 90 |
</java> |
93 | 91 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/preprocessing/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
23 | 23 |
<arg>-C{document_content_url, |
24 | 24 |
eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl, |
25 | 25 |
eu/dnetlib/iis/transformers/metadataextraction/checksum/preprocessing/sampledataproducer/data/input_document_content_url.json}</arg> |
26 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
27 | 26 |
<arg>-Odocument_content_url=${workingDir}/producer/document_content_url</arg> |
28 | 27 |
</java> |
29 | 28 |
<ok to="checksum_preprocessing"/> |
... | ... | |
78 | 77 |
<arg>-C{output_metadata, |
79 | 78 |
eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl, |
80 | 79 |
eu/dnetlib/iis/transformers/metadataextraction/checksum/preprocessing/sampledataproducer/data/output_document_content_url.json}</arg> |
81 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
82 | 80 |
<arg>-Ioutput_metadata=${workingDir}/transformer_metadataextraction_checksum_preprocessing/output</arg> |
83 | 81 |
</java> |
84 | 82 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/metadatamerger/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{extracted_metadata, |
27 | 27 |
eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata, |
28 | 28 |
eu/dnetlib/iis/transformers/metadatamerger/sampledataproducer/data/extr_metadata.json}</arg> |
29 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
30 |
directory has to be specified as well --> |
|
31 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
29 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
32 | 30 |
<arg>-Obase_metadata=${workingDir}/producer/base_metadata</arg> |
33 | 31 |
<arg>-Oextracted_metadata=${workingDir}/producer/extr_metadata</arg> |
34 | 32 |
</java> |
... | ... | |
93 | 91 |
<arg>-C{merged_metadata, |
94 | 92 |
eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal, |
95 | 93 |
eu/dnetlib/iis/transformers/metadatamerger/sampledataproducer/data/merged_metadata.json}</arg> |
96 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
97 |
directory has to be specified as well --> |
|
98 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
94 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
99 | 95 |
<arg>-Imerged_metadata=${workingDir}/transformer_metadatamerger/merged_metadata</arg> |
100 | 96 |
</java> |
101 | 97 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/citationmatching/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{metadata, |
27 | 27 |
eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal, |
28 | 28 |
eu/dnetlib/iis/transformers/citationmatching/sampledataproducer/data/metadata.json}</arg> |
29 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
30 |
directory has to be specified as well --> |
|
31 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
29 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
32 | 30 |
<arg>-Operson=${workingDir}/producer/person</arg> |
33 | 31 |
<arg>-Ometadata=${workingDir}/producer/metadata</arg> |
34 | 32 |
</java> |
... | ... | |
93 | 91 |
<arg>-C{citation_metadata, |
94 | 92 |
eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata, |
95 | 93 |
eu/dnetlib/iis/transformers/citationmatching/sampledataproducer/data/citation_metadata.json}</arg> |
96 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
97 |
directory has to be specified as well --> |
|
98 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
94 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
99 | 95 |
<arg>-Icitation_metadata=${workingDir}/transformer_citationmatching/citation_metadata</arg> |
100 | 96 |
</java> |
101 | 97 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/common/existencefilter/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{data, |
27 | 27 |
eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl, |
28 | 28 |
eu/dnetlib/iis/transformers/common/existencefilter/sampledataproducer/data/data.json}</arg> |
29 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
30 |
directory has to be specified as well --> |
|
31 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
29 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
32 | 30 |
<arg>-Oexistent_id=${workingDir}/producer/existent_id</arg> |
33 | 31 |
<arg>-Odata=${workingDir}/producer/data</arg> |
34 | 32 |
</java> |
... | ... | |
88 | 86 |
<arg>-C{filtered, |
89 | 87 |
eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl, |
90 | 88 |
eu/dnetlib/iis/transformers/common/existencefilter/sampledataproducer/data/filtered.json}</arg> |
91 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
92 |
directory has to be specified as well --> |
|
93 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
89 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
94 | 90 |
<arg>-Ifiltered=${workingDir}/existencefilter/filtered</arg> |
95 | 91 |
</java> |
96 | 92 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/documentssimilarity/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
27 | 27 |
eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal, |
28 | 28 |
eu/dnetlib/iis/transformers/documentssimilarity/sampledataproducer/data/input_metadata.json}</arg> |
29 | 29 |
|
30 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
31 |
directory has to be specified as well --> |
|
32 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
30 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
33 | 31 |
<arg>-Oinput_person=${workingDir}/producer/input_person</arg> |
34 | 32 |
<arg>-Oinput_metadata=${workingDir}/producer/input_metadata</arg> |
35 | 33 |
</java> |
... | ... | |
94 | 92 |
<arg>-C{output_document_metadata, |
95 | 93 |
eu.dnetlib.iis.documentssimilarity.schemas.DocumentMetadata, |
96 | 94 |
eu/dnetlib/iis/transformers/documentssimilarity/sampledataproducer/data/output_document_metadata.json}</arg> |
97 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
98 |
directory has to be specified as well --> |
|
99 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
95 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
100 | 96 |
<arg>-Ioutput_document_metadata=${workingDir}/transformer_documentssimilarity/output_document_metadata</arg> |
101 | 97 |
</java> |
102 | 98 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/ingest/pmc/metadata/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
24 | 24 |
eu.dnetlib.iis.ingest.pmc.metadata.schemas.ExtractedDocumentMetadata, |
25 | 25 |
eu/dnetlib/iis/transformers/ingest/pmc/metadata/sampledataproducer/data/input.json}</arg> |
26 | 26 |
|
27 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
28 |
directory has to be specified as well --> |
|
29 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
27 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
30 | 28 |
<arg>-Oinput=${workingDir}/producer/input</arg> |
31 | 29 |
</java> |
32 | 30 |
<ok to="transformer"/> |
... | ... | |
86 | 84 |
<arg>-C{output, |
87 | 85 |
eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata, |
88 | 86 |
eu/dnetlib/iis/transformers/ingest/pmc/metadata/sampledataproducer/data/output.json}</arg> |
89 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
90 |
directory has to be specified as well --> |
|
91 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
87 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
92 | 88 |
<arg>-Ioutput=${workingDir}/transformer/output</arg> |
93 | 89 |
</java> |
94 | 90 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/documentssimilarity_with_fulltext/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
30 | 30 |
eu.dnetlib.iis.metadataextraction.schemas.DocumentText, |
31 | 31 |
eu/dnetlib/iis/transformers/documentssimilarity_with_fulltext/sampledataproducer/data/input_document_text.json}</arg> |
32 | 32 |
|
33 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
34 |
directory has to be specified as well --> |
|
35 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
33 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
36 | 34 |
<arg>-Oinput_person=${workingDir}/producer/input_person</arg> |
37 | 35 |
<arg>-Oinput_metadata=${workingDir}/producer/input_metadata</arg> |
38 | 36 |
<arg>-Oinput_document_text=${workingDir}/producer/input_document_text</arg> |
... | ... | |
102 | 100 |
<arg>-C{output_document_metadata, |
103 | 101 |
eu.dnetlib.iis.documentssimilarity_with_fulltext.schemas.DocumentMetadata, |
104 | 102 |
eu/dnetlib/iis/transformers/documentssimilarity_with_fulltext/sampledataproducer/data/output_document_metadata.json}</arg> |
105 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
106 |
directory has to be specified as well --> |
|
107 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
103 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
108 | 104 |
<arg>-Ioutput_document_metadata=${workingDir}/transformer_documentssimilarity_with_fulltext/output_document_metadata</arg> |
109 | 105 |
</java> |
110 | 106 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/idreplacer/replacer_1_field/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{id_mapping, |
27 | 27 |
eu.dnetlib.iis.common.schemas.IdentifierMapping, |
28 | 28 |
eu/dnetlib/iis/transformers/idreplacer/replacer_1_field/data/id_mapping.json}</arg> |
29 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
30 |
directory has to be specified as well --> |
|
31 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
29 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
32 | 30 |
<arg>-Omain=${workingDir}/producer/main</arg> |
33 | 31 |
<arg>-Oid_mapping=${workingDir}/producer/id_mapping</arg> |
34 | 32 |
</java> |
... | ... | |
101 | 99 |
<arg>-C{output, |
102 | 100 |
eu.dnetlib.iis.importer.schemas.DocumentMetadata, |
103 | 101 |
eu/dnetlib/iis/transformers/idreplacer/replacer_1_field/data/output.json}</arg> |
104 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
105 |
directory has to be specified as well --> |
|
106 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
102 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
107 | 103 |
<arg>-Ioutput=${workingDir}/collapser_idreplacer/output</arg> |
108 | 104 |
</java> |
109 | 105 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/idreplacer/replacer_2_fields/oozie_app/workflow.xml | ||
---|---|---|
26 | 26 |
<arg>-C{id_mapping, |
27 | 27 |
eu.dnetlib.iis.common.schemas.IdentifierMapping, |
28 | 28 |
eu/dnetlib/iis/transformers/idreplacer/replacer_2_fields/data/id_mapping.json}</arg> |
29 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
30 |
directory has to be specified as well --> |
|
31 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
29 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
32 | 30 |
<arg>-Omain=${workingDir}/producer/main</arg> |
33 | 31 |
<arg>-Oid_mapping=${workingDir}/producer/id_mapping</arg> |
34 | 32 |
</java> |
... | ... | |
105 | 103 |
<arg>-C{output, |
106 | 104 |
eu.dnetlib.iis.citationmatching.schemas.Citation, |
107 | 105 |
eu/dnetlib/iis/transformers/idreplacer/replacer_2_fields/data/output.json}</arg> |
108 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
109 |
directory has to be specified as well --> |
|
110 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
106 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
111 | 107 |
<arg>-Ioutput=${workingDir}/collapser_idreplacer/output</arg> |
112 | 108 |
</java> |
113 | 109 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/metricsprimary/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
29 | 29 |
<arg>-C{person, |
30 | 30 |
eu.dnetlib.iis.importer.schemas.Person, |
31 | 31 |
eu/dnetlib/iis/transformers/metricsprimary/sampledataproducer/data/person.json}</arg> |
32 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
33 |
directory has to be specified as well --> |
|
34 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
32 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
35 | 33 |
<arg>-Odocument=${workingDir}/producer/document</arg> |
36 | 34 |
<arg>-Ocitation=${workingDir}/producer/citation</arg> |
37 | 35 |
<arg>-Operson=${workingDir}/producer/person</arg> |
... | ... | |
108 | 106 |
<arg>-C{person_id, |
109 | 107 |
eu.dnetlib.iis.metrics.primary.schemas.PersonId, |
110 | 108 |
eu/dnetlib/iis/transformers/metricsprimary/sampledataproducer/data/person_id.json}</arg> |
111 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
112 |
directory has to be specified as well --> |
|
113 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
109 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
114 | 110 |
<arg>-Idocument_authors_citations=${workingDir}/transformer_metricsprimary/document_authors_citations</arg> |
115 | 111 |
<arg>-Iperson_id=${workingDir}/transformer_metricsprimary/person_id</arg> |
116 | 112 |
</java> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/statistics/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
35 | 35 |
<arg>-C{document_to_project, |
36 | 36 |
eu.dnetlib.iis.importer.schemas.DocumentToProject, |
37 | 37 |
eu/dnetlib/iis/transformers/statistics/sampledataproducer/data/document_to_project.json}</arg> |
38 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
39 |
directory has to be specified as well --> |
|
40 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
38 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
41 | 39 |
<arg>-Odocument=${workingDir}/producer/document</arg> |
42 | 40 |
<arg>-Ocitation=${workingDir}/producer/citation</arg> |
43 | 41 |
<arg>-Operson=${workingDir}/producer/person</arg> |
... | ... | |
131 | 129 |
<arg>-C{project_id, |
132 | 130 |
eu.dnetlib.iis.statistics.schemas.ProjectId, |
133 | 131 |
eu/dnetlib/iis/transformers/statistics/sampledataproducer/data/project_id.json}</arg> |
134 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
135 |
directory has to be specified as well --> |
|
136 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
132 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
137 | 133 |
<arg>-Idocument_authors_citations=${workingDir}/transformer_statistics/document_authors_citations</arg> |
138 | 134 |
<arg>-Iperson_id=${workingDir}/transformer_statistics/person_id</arg> |
139 | 135 |
<arg>-Iproject_id=${workingDir}/transformer_statistics/project_id</arg> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/test/resources/eu/dnetlib/iis/transformers/documentsclassification/sampledataproducer/oozie_app/workflow.xml | ||
---|---|---|
23 | 23 |
<arg>-C{merged_metadata, |
24 | 24 |
eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal, |
25 | 25 |
eu/dnetlib/iis/transformers/documentsclassification/sampledataproducer/data/merged_metadata.json}</arg> |
26 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
27 |
directory has to be specified as well --> |
|
28 |
<arg>-SworkingDir=${workingDir}/producer/working_dir</arg> |
|
26 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
29 | 27 |
<arg>-Omerged_metadata=${workingDir}/producer/merged_metadata</arg> |
30 | 28 |
</java> |
31 | 29 |
<ok to="transformer_documentsclassification"/> |
... | ... | |
85 | 83 |
<arg>-C{document_metadata, |
86 | 84 |
eu.dnetlib.iis.documentsclassification.schemas.DocumentMetadata, |
87 | 85 |
eu/dnetlib/iis/transformers/documentsclassification/sampledataproducer/data/abstract_metadata.json}</arg> |
88 |
<!-- All input and output ports have to be bound to paths in HDFS, working |
|
89 |
directory has to be specified as well --> |
|
90 |
<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg> |
|
86 |
<!-- All input and output ports have to be bound to paths in HDFS --> |
|
91 | 87 |
<arg>-Idocument_metadata=${workingDir}/transformer_documentsclassification/document_metadata</arg> |
92 | 88 |
</java> |
93 | 89 |
<ok to="end" /> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/text/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_text |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_document_text');
|
|
3 |
'schema', '$schema_document_text');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_document_content_url |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_document_content_url');
|
|
7 |
'schema', '$schema_document_content_url');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_document_text |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema_document_text');
|
|
12 |
'schema', '$schema_document_text');
|
|
13 | 13 |
|
14 | 14 |
sourceDocumentText = load '$input_document_text' using avro_load_document_text; |
15 | 15 |
sourceDocumentContentUrl = load '$input_document_content_url' using avro_load_document_content_url; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/text/oozie_app/workflow.xml | ||
---|---|---|
15 | 15 |
</property> |
16 | 16 |
</parameters> |
17 | 17 |
|
18 |
<start to="transformer"/> |
|
18 |
<start to="generate-schema"/> |
|
19 |
|
|
20 |
<action name="generate-schema"> |
|
21 |
<java> |
|
22 |
<job-tracker>${jobTracker}</job-tracker> |
|
23 |
<name-node>${nameNode}</name-node> |
|
24 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
25 |
<arg>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</arg> |
|
26 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg> |
|
27 |
<capture-output /> |
|
28 |
</java> |
|
29 |
<ok to="transformer" /> |
|
30 |
<error to="fail" /> |
|
31 |
</action> |
|
32 |
|
|
19 | 33 |
<action name="transformer"> |
20 | 34 |
<pig> |
21 | 35 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
32 | 46 |
</configuration> |
33 | 47 |
<!-- Path to PIG script the workflow executes. --> |
34 | 48 |
<script>lib/scripts/transformer.pig</script> |
35 |
<param>schema_document_content_url=eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</param>
|
|
36 |
<param>schema_document_text=eu.dnetlib.iis.metadataextraction.schemas.DocumentText</param>
|
|
49 |
<param>schema_document_content_url=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl']}</param>
|
|
50 |
<param>schema_document_text=${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</param>
|
|
37 | 51 |
<param>input_document_content_url=${input_document_content_url}</param> |
38 | 52 |
<param>input_document_text=${input_document_text}</param> |
39 | 53 |
<param>output=${output}</param> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/meta/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_extracted_document_metadata |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_extracted_document_metadata');
|
|
3 |
'schema', '$schema_extracted_document_metadata');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_document_content_url |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_document_content_url');
|
|
7 |
'schema', '$schema_document_content_url');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_extracted_document_metadata |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema_extracted_document_metadata');
|
|
12 |
'schema', '$schema_extracted_document_metadata');
|
|
13 | 13 |
|
14 | 14 |
sourceDocumentMeta = load '$input_extracted_document_metadata' using avro_load_extracted_document_metadata; |
15 | 15 |
sourceDocumentContentUrl = load '$input_document_content_url' using avro_load_document_content_url; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/postprocessing/meta/oozie_app/workflow.xml | ||
---|---|---|
15 | 15 |
</property> |
16 | 16 |
</parameters> |
17 | 17 |
|
18 |
<start to="transformer"/> |
|
18 |
<start to="generate-schema"/> |
|
19 |
|
|
20 |
<action name="generate-schema"> |
|
21 |
<java> |
|
22 |
<job-tracker>${jobTracker}</job-tracker> |
|
23 |
<name-node>${nameNode}</name-node> |
|
24 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
25 |
<arg>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</arg> |
|
26 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg> |
|
27 |
<capture-output /> |
|
28 |
</java> |
|
29 |
<ok to="transformer" /> |
|
30 |
<error to="fail" /> |
|
31 |
</action> |
|
32 |
|
|
19 | 33 |
<action name="transformer"> |
20 | 34 |
<pig> |
21 | 35 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
32 | 46 |
</configuration> |
33 | 47 |
<!-- Path to PIG script the workflow executes. --> |
34 | 48 |
<script>lib/scripts/transformer.pig</script> |
35 |
<param>schema_document_content_url=eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</param>
|
|
36 |
<param>schema_extracted_document_metadata=eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</param>
|
|
49 |
<param>schema_document_content_url=${wf:actionData('generate-schema')['eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl']}</param>
|
|
50 |
<param>schema_extracted_document_metadata=${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</param>
|
|
37 | 51 |
<param>input_document_content_url=${input_document_content_url}</param> |
38 | 52 |
<param>input_extracted_document_metadata=${input_extracted_document_metadata}</param> |
39 | 53 |
<param>output=${output}</param> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/checksum/preprocessing/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_content_url |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_document_content_url');
|
|
3 |
'schema', '$schema_document_content_url');
|
|
4 | 4 |
|
5 | 5 |
define avro_store_document_content_url |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 | 7 |
'index', '0', |
8 |
'output_schema_class', '$schema_document_content_url');
|
|
8 |
'schema', '$schema_document_content_url');
|
|
9 | 9 |
|
10 | 10 |
sourceDocumentContentUrl = load '$input' using avro_load_document_content_url; |
11 | 11 |
|
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/metadataextraction/skip_extracted/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_content |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_document_content');
|
|
3 |
'schema', '$schema_document_content');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_document_text |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_document_text');
|
|
7 |
'schema', '$schema_document_text');
|
|
8 | 8 |
|
9 | 9 |
define avro_load_document_meta |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 |
'input_schema_class', '$schema_document_meta');
|
|
11 |
'schema', '$schema_document_meta');
|
|
12 | 12 |
|
13 | 13 |
define avro_store_document_content |
14 | 14 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
15 | 15 |
'index', '0', |
16 |
'output_schema_class', '$schema_document_content');
|
|
16 |
'schema', '$schema_document_content');
|
|
17 | 17 |
|
18 | 18 |
define avro_store_document_text |
19 | 19 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
20 | 20 |
'index', '1', |
21 |
'output_schema_class', '$schema_document_text');
|
|
21 |
'schema', '$schema_document_text');
|
|
22 | 22 |
|
23 | 23 |
define avro_store_document_meta |
24 | 24 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
25 | 25 |
'index', '2', |
26 |
'output_schema_class', '$schema_document_meta');
|
|
26 |
'schema', '$schema_document_meta');
|
|
27 | 27 |
|
28 | 28 |
documentContent = load '$input_document_content' using avro_load_document_content; |
29 | 29 |
documentText = load '$input_document_text' using avro_load_document_text; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/documentmetadata/oozie_app/lib/scripts/transformer/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_extracted_metadata |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_extracted_metadata');
|
|
3 |
'schema', '$schema_input_extracted_metadata');
|
|
4 | 4 |
|
5 | 5 |
define avro_store_metadata |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 | 7 |
'index', '0', |
8 |
'output_schema_class', '$schema_output_metadata');
|
|
8 |
'schema', '$schema_output_metadata');
|
|
9 | 9 |
|
10 | 10 |
extr_meta = load '$input_extracted_metadata' using avro_load_extracted_metadata; |
11 | 11 |
|
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_to_dataset |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_document_to_dataset');
|
|
3 |
'schema', '$schema_input_document_to_dataset');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_document_to_mdstore |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_input_document_to_mdstore');
|
|
7 |
'schema', '$schema_input_document_to_mdstore');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_document_to_mdstore |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema_output_document_to_mdstore');
|
|
12 |
'schema', '$schema_output_document_to_mdstore');
|
|
13 | 13 |
|
14 | 14 |
documentToDataset = load '$input_document_to_dataset' using avro_load_document_to_dataset; |
15 | 15 |
datasetIds = foreach documentToDataset generate datasetId as id; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttoproject/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_to_project |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_document_to_project');
|
|
3 |
'schema', '$schema_input_document_to_project');
|
|
4 | 4 |
|
5 | 5 |
define avro_store_identifier |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 | 7 |
'index', '0', |
8 |
'output_schema_class', '$schema_output_identifier');
|
|
8 |
'schema', '$schema_output_identifier');
|
|
9 | 9 |
|
10 | 10 |
documentToProject = load '$input_document_to_project' using avro_load_document_to_project; |
11 | 11 |
documentToProjectId = foreach documentToProject generate documentId; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/researchinitiatives/oozie_app/workflow.xml | ||
---|---|---|
9 | 9 |
</property> |
10 | 10 |
</parameters> |
11 | 11 |
|
12 |
<start to="transformer"/> |
|
12 |
<start to="generate-schema"/> |
|
13 |
|
|
14 |
<action name="generate-schema"> |
|
15 |
<java> |
|
16 |
<job-tracker>${jobTracker}</job-tracker> |
|
17 |
<name-node>${nameNode}</name-node> |
|
18 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
19 |
<arg>eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToConceptId</arg> |
|
20 |
<arg>eu.dnetlib.iis.export.schemas.DocumentToConceptIds</arg> |
|
21 |
<capture-output /> |
|
22 |
</java> |
|
23 |
<ok to="transformer" /> |
|
24 |
<error to="fail" /> |
|
25 |
</action> |
|
26 |
|
|
13 | 27 |
<action name="transformer"> |
14 | 28 |
<pig> |
15 | 29 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
28 | 42 |
<script>lib/scripts/transformer.pig</script> |
29 | 43 |
|
30 | 44 |
<param>input_document_to_research_initiative=${input_document_to_research_initiative}</param> |
31 |
<param>schema_input_document_to_research_initiative=eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToConceptId</param>
|
|
45 |
<param>schema_input_document_to_research_initiative=${wf:actionData('generate-schema')['eu.dnetlib.iis.referenceextraction.researchinitiative.schemas.DocumentToConceptId']}</param>
|
|
32 | 46 |
|
33 | 47 |
<param>output_document_to_research_initiatives=${output_document_to_research_initiatives}</param> |
34 |
<param>schema_output_document_to_research_initiatives=eu.dnetlib.iis.export.schemas.DocumentToConceptIds</param>
|
|
48 |
<param>schema_output_document_to_research_initiatives=${wf:actionData('generate-schema')['eu.dnetlib.iis.export.schemas.DocumentToConceptIds']}</param>
|
|
35 | 49 |
</pig> |
36 | 50 |
<ok to="end"/> |
37 | 51 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/documenttoproject_without_imported_data/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_document_to_project |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_document_to_project');
|
|
3 |
'schema', '$schema_input_document_to_project');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_imported_document_to_project |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_input_imported_document_to_project');
|
|
7 |
'schema', '$schema_input_imported_document_to_project');
|
|
8 | 8 |
|
9 | 9 |
|
10 | 10 |
define avro_store_document_to_project |
11 | 11 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
12 | 12 |
'index', '0', |
13 |
'output_schema_class', '$schema_output_document_to_project');
|
|
13 |
'schema', '$schema_output_document_to_project');
|
|
14 | 14 |
|
15 | 15 |
|
16 | 16 |
documentToProject = load '$input_document_to_project' using avro_load_document_to_project; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/export/citations/oozie_app/workflow.xml | ||
---|---|---|
11 | 11 |
</property> |
12 | 12 |
</parameters> |
13 | 13 |
|
14 |
<start to="transformer"/>
|
|
14 |
<start to="generate-schema"/>
|
|
15 | 15 |
|
16 |
<action name="generate-schema"> |
|
17 |
<java> |
|
18 |
<job-tracker>${jobTracker}</job-tracker> |
|
19 |
<name-node>${nameNode}</name-node> |
|
20 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
21 |
<arg>eu.dnetlib.iis.common.citations.schemas.Citation</arg> |
|
22 |
<arg>eu.dnetlib.iis.export.schemas.Citations</arg> |
|
23 |
<capture-output /> |
|
24 |
</java> |
|
25 |
<ok to="transformer" /> |
|
26 |
<error to="fail" /> |
|
27 |
</action> |
|
28 |
|
|
16 | 29 |
<action name="transformer"> |
17 | 30 |
<pig> |
18 | 31 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
31 | 44 |
<script>lib/scripts/transformer.pig</script> |
32 | 45 |
|
33 | 46 |
<param>input=${input}</param> |
34 |
<param>schema_input=eu.dnetlib.iis.common.citations.schemas.Citation</param>
|
|
47 |
<param>schema_input=${wf:actionData('generate-schema')['eu.dnetlib.iis.common.citations.schemas.Citation']}</param>
|
|
35 | 48 |
|
36 | 49 |
<param>output=${output}</param> |
37 |
<param>schema_output=eu.dnetlib.iis.export.schemas.Citations</param>
|
|
50 |
<param>schema_output=${wf:actionData('generate-schema')['eu.dnetlib.iis.export.schemas.Citations']}</param>
|
|
38 | 51 |
</pig> |
39 | 52 |
<ok to="end"/> |
40 | 53 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/avro2json/oozie_app/workflow.xml | ||
---|---|---|
1 |
<workflow-app xmlns="uri:oozie:workflow:0.4" name="transformers_avro2json"> |
|
2 |
|
|
3 |
<parameters> |
|
4 |
<property> |
|
5 |
<name>input</name> |
|
6 |
<description>input avro datastore</description> |
|
7 |
</property> |
|
8 |
<property> |
|
9 |
<name>output</name> |
|
10 |
<description>output json datastore</description> |
|
11 |
</property> |
|
12 |
</parameters> |
|
13 |
|
|
14 |
<start to="decision-transformer"/> |
|
15 |
|
|
16 |
<decision name="decision-transformer"> |
|
17 |
<switch> |
|
18 |
<case to="end">${input eq "$UNDEFINED$"}</case> |
|
19 |
<default to="transformer"/> |
|
20 |
</switch> |
|
21 |
</decision> |
|
22 |
|
|
23 |
<action name="transformer"> |
|
24 |
<pig> |
|
25 |
<job-tracker>${jobTracker}</job-tracker> |
|
26 |
<name-node>${nameNode}</name-node> |
|
27 |
<!-- The data generated by this node is deleted in this section --> |
|
28 |
<prepare> |
|
29 |
<delete path="${nameNode}${output}" /> |
|
30 |
</prepare> |
|
31 |
<configuration> |
|
32 |
<property> |
|
33 |
<name>mapred.job.queue.name</name> |
|
34 |
<value>${queueName}</value> |
|
35 |
</property> |
|
36 |
</configuration> |
|
37 |
<!-- Path to PIG script the workflow executes. --> |
|
38 |
<script>lib/scripts/transformer/transformer.pig</script> |
|
39 |
<param>input=${input}</param> |
|
40 |
<param>output=${output}</param> |
|
41 |
</pig> |
|
42 |
<ok to="end"/> |
|
43 |
<error to="fail"/> |
|
44 |
</action> |
|
45 |
|
|
46 |
<kill name="fail"> |
|
47 |
<message>Unfortunately, the workflow failed -- error message: |
|
48 |
[${wf:errorMessage(wf:lastErrorNode())}]</message> |
|
49 |
</kill> |
|
50 |
|
|
51 |
<end name="end"/> |
|
52 |
</workflow-app> |
|
0 | 53 |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/citationmatching/oozie_app/lib/scripts/transformer/transformer.pig | ||
---|---|---|
1 | 1 |
define avro_load_person |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema_input_person');
|
|
3 |
'schema', '$schema_input_person');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_metadata |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema_input_metadata');
|
|
7 |
'schema', '$schema_input_metadata');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_citation_metadata |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema_output_citation_metadata');
|
|
12 |
'schema', '$schema_output_citation_metadata');
|
|
13 | 13 |
|
14 | 14 |
define CREATE_ARRAY eu.dnetlib.iis.transformers.udfs.NullToEmptyBag; |
15 | 15 |
define LIST_TO_INDEXED_LIST eu.dnetlib.iis.transformers.udfs.StringListToListWithIndexes; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/common/thresholdfilter/oozie_app/lib/scripts/filter.pig | ||
---|---|---|
1 |
define avro_load_input |
|
2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
3 |
'schema', '$schema'); |
|
4 |
|
|
5 |
define avro_store_output |
|
6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
7 |
'index', '0', |
|
8 |
'schema', '$schema'); |
|
9 |
|
|
10 |
input_records = load '$input' using avro_load_input; |
|
11 |
|
|
12 |
output_records = filter input_records by ($threshold_field is not null) AND ($threshold_field >= $threshold_value); |
|
13 |
|
|
14 |
store output_records into '$output' using avro_store_output; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/common/union/oozie_app/workflow.xml | ||
---|---|---|
15 | 15 |
</property> |
16 | 16 |
</parameters> |
17 | 17 |
|
18 |
<start to="transformer"/> |
|
18 |
<start to="generate-schema"/> |
|
19 |
|
|
20 |
<action name="generate-schema"> |
|
21 |
<java> |
|
22 |
<job-tracker>${jobTracker}</job-tracker> |
|
23 |
<name-node>${nameNode}</name-node> |
|
24 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
25 |
<arg>${schema}</arg> |
|
26 |
<capture-output /> |
|
27 |
</java> |
|
28 |
<ok to="transformer" /> |
|
29 |
<error to="fail" /> |
|
30 |
</action> |
|
31 |
|
|
19 | 32 |
<action name="transformer"> |
20 | 33 |
<pig> |
21 | 34 |
<job-tracker>${jobTracker}</job-tracker> |
... | ... | |
36 | 49 |
<param>input_a=${input_a}</param> |
37 | 50 |
<param>input_b=${input_b}</param> |
38 | 51 |
<param>output=${output}</param> |
39 |
<param>schema=${schema}</param>
|
|
52 |
<param>schema=${wf:actionData('generate-schema')[wf:conf('schema')]}</param>
|
|
40 | 53 |
</pig> |
41 | 54 |
<ok to="end"/> |
42 | 55 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/avro2json/job.properties | ||
---|---|---|
1 |
input=/share/import/doc_meta/2015-02-18_beta |
|
2 |
output=${workingDir}/out |
|
0 | 3 |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/websiteusage/document/community_filter/oozie_app/workflow.xml | ||
---|---|---|
14 | 14 |
</property> |
15 | 15 |
</parameters> |
16 | 16 |
|
17 |
<start to="filter"/> |
|
17 |
<start to="generate-schema"/> |
|
18 |
|
|
19 |
<action name="generate-schema"> |
|
20 |
<java> |
|
21 |
<job-tracker>${jobTracker}</job-tracker> |
|
22 |
<name-node>${nameNode}</name-node> |
|
23 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class> |
|
24 |
<arg>eu.dnetlib.iis.websiteusage.schemas.DocumentToCommunity</arg> |
|
25 |
<arg>eu.dnetlib.iis.common.schemas.DocumentId</arg> |
|
26 |
<capture-output /> |
|
27 |
</java> |
|
28 |
<ok to="filter" /> |
|
29 |
<error to="fail" /> |
|
30 |
</action> |
|
18 | 31 |
|
19 | 32 |
<action name="filter"> |
20 | 33 |
<pig> |
... | ... | |
34 | 47 |
<param>input_community=${input_community}</param> |
35 | 48 |
<param>input_document_id=${input_document_id}</param> |
36 | 49 |
<param>output=${output}</param> |
37 |
<param>schema_community=eu.dnetlib.iis.websiteusage.schemas.DocumentToCommunity</param>
|
|
38 |
<param>schema_document_id=eu.dnetlib.iis.common.schemas.DocumentId</param>
|
|
50 |
<param>schema_community=${wf:actionData('generate-schema')['eu.dnetlib.iis.websiteusage.schemas.DocumentToCommunity']}</param>
|
|
51 |
<param>schema_document_id=${wf:actionData('generate-schema')['eu.dnetlib.iis.common.schemas.DocumentId']}</param>
|
|
39 | 52 |
</pig> |
40 | 53 |
<ok to="end"/> |
41 | 54 |
<error to="fail"/> |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/common/union/oozie_app/lib/scripts/union.pig | ||
---|---|---|
1 | 1 |
define avro_load_input_a |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema');
|
|
3 |
'schema', '$schema');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_input_b |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema');
|
|
7 |
'schema', '$schema');
|
|
8 | 8 |
|
9 | 9 |
define avro_store_output |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 | 11 |
'index', '0', |
12 |
'output_schema_class', '$schema');
|
|
12 |
'schema', '$schema');
|
|
13 | 13 |
|
14 | 14 |
input_a = load '$input_a' using avro_load_input_a; |
15 | 15 |
input_b = load '$input_b' using avro_load_input_b; |
modules/icm-iis-transformers/branches/IIS-CDH-5.3.0/src/main/resources/eu/dnetlib/iis/transformers/common/union4/oozie_app/lib/scripts/union.pig | ||
---|---|---|
1 | 1 |
define avro_load_input_a |
2 | 2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
3 |
'input_schema_class', '$schema');
|
|
3 |
'schema', '$schema');
|
|
4 | 4 |
|
5 | 5 |
define avro_load_input_b |
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 |
'input_schema_class', '$schema');
|
|
7 |
'schema', '$schema');
|
|
8 | 8 |
|
9 | 9 |
define avro_load_input_c |
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 |
'input_schema_class', '$schema');
|
|
11 |
'schema', '$schema');
|
|
12 | 12 |
|
13 | 13 |
define avro_load_input_d |
14 | 14 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
15 |
'input_schema_class', '$schema');
|
|
15 |
'schema', '$schema');
|
|
16 | 16 |
|
17 | 17 |
define avro_store_output |
18 | 18 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
19 | 19 |
'index', '0', |
20 |
'output_schema_class', '$schema');
|
|
20 |
'schema', '$schema');
|
|
21 | 21 |
|
22 | 22 |
input_a = load '$input_a' using avro_load_input_a; |
23 | 23 |
input_b = load '$input_b' using avro_load_input_b; |
Also available in: Unified diff
merging trunk changes with IIS-CDH-5.3.0 branch