Revision 32827
Added by Marek Horst over 9 years ago
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
6 | 6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
7 | 7 |
'input_schema_class', '$schema_input_document_id'); |
8 | 8 |
|
9 |
define avro_store_identifier
|
|
9 |
define avro_load_document_to_mdstore
|
|
10 | 10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
11 |
'input_schema_class', '$schema_input_document_to_mdstore'); |
|
12 |
|
|
13 |
define avro_store_document_to_mdstore |
|
14 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
11 | 15 |
'index', '0', |
12 |
'output_schema_class', '$schema_output_identifier');
|
|
16 |
'output_schema_class', '$schema_output_document_to_mdstore');
|
|
13 | 17 |
|
14 | 18 |
documentId = load '$input_document_id' using avro_load_document_id; |
15 | 19 |
documentId = foreach documentId generate $0 as id; |
... | ... | |
18 | 22 |
datasetIds = foreach documentToDataset generate datasetId as id; |
19 | 23 |
datasetIdsDistinct = distinct datasetIds; |
20 | 24 |
|
25 |
documentToMDStore = load '$input_document_to_mdstore' using avro_load_document_to_mdstore; |
|
26 |
|
|
21 | 27 |
joined = join datasetIdsDistinct by id left, documentId by id; |
22 |
joinedFiltered = filter joined by documentId::id is null; |
|
23 |
identifiers = foreach joinedFiltered generate datasetIdsDistinct::id as id; |
|
28 |
joinedWithMDStore = join joined by datasetIdsDistinct::id, documentToMDStore by documentId; |
|
24 | 29 |
|
25 |
store identifiers into '$output_identifier' using avro_store_identifier; |
|
30 |
joinedFiltered = filter joinedWithMDStore by documentId::id is null; |
|
31 |
outputDocumentToMdstore = foreach joinedFiltered generate datasetIdsDistinct::id as documentId, documentToMDStore::mdStoreId as mdStoreId; |
|
32 |
|
|
33 |
store outputDocumentToMdstore into '$output_document_to_mdstore' using avro_store_document_to_mdstore; |
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/oozie_app/workflow.xml | ||
---|---|---|
8 | 8 |
<name>input_document_id</name> |
9 | 9 |
</property> |
10 | 10 |
<property> |
11 |
<name>output_identifier</name>
|
|
11 |
<name>input_document_to_mdstore</name>
|
|
12 | 12 |
</property> |
13 |
<property> |
|
14 |
<name>output_document_to_mdstore</name> |
|
15 |
</property> |
|
13 | 16 |
</parameters> |
14 | 17 |
|
15 | 18 |
<start to="transformer"/> |
... | ... | |
20 | 23 |
<!-- The data generated by this node is deleted in this section --> |
21 | 24 |
<prepare> |
22 | 25 |
<delete path="${nameNode}${workingDir}/transformer" /> |
23 |
<delete path="${nameNode}${output_identifier}" />
|
|
26 |
<delete path="${nameNode}${output_document_to_mdstore}" />
|
|
24 | 27 |
<mkdir path="${nameNode}${workingDir}/transformer" /> |
25 | 28 |
</prepare> |
26 | 29 |
<configuration> |
... | ... | |
39 | 42 |
|
40 | 43 |
<param>input_document_id=${input_document_id}</param> |
41 | 44 |
<param>schema_input_document_id=eu.dnetlib.iis.common.schemas.DocumentId</param> |
45 |
|
|
46 |
<param>input_document_to_mdstore=${input_document_to_mdstore}</param> |
|
47 |
<param>schema_input_document_to_mdstore=eu.dnetlib.iis.importer.schemas.DocumentToMDStore</param> |
|
42 | 48 |
|
43 |
<param>output_identifier=${output_identifier}</param>
|
|
44 |
<param>schema_output_identifier=eu.dnetlib.iis.export.auxiliary.schemas.Identifier</param>
|
|
49 |
<param>output_document_to_mdstore=${output_document_to_mdstore}</param>
|
|
50 |
<param>schema_output_document_to_mdstore=eu.dnetlib.iis.importer.schemas.DocumentToMDStore</param>
|
|
45 | 51 |
</pig> |
46 | 52 |
<ok to="end"/> |
47 | 53 |
<error to="fail"/> |
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/job.properties | ||
---|---|---|
1 |
input_document_to_dataset=/user/marek.horst/mainworkflows/preprocessing/main/working_dir/referenceextraction_dataset/document_datasets |
|
2 |
input_document_id=/user/marek.horst/mainworkflows/preprocessing/main/working_dir/producer/dataset_existing_id |
|
3 |
input_document_to_mdstore=/user/marek.horst/mainworkflows/preprocessing/main/working_dir/mainworkflows_preprocessing_import/dataset_to_mdstore |
|
4 |
output_document_to_mdstore=${workingDir}/output |
|
0 | 5 |
Also available in: Unified diff
#963 propagating dataset -> mdstore from import to exporting phase: importer produces DocumentToMDStore datastore utilized by exporter module. Updating transformer definition to handle DocumentToMDStore instead of Identifier schema.