Project

General

Profile

« Previous | Next » 

Revision 32827

Added by Marek Horst over 9 years ago

#963 propagating the dataset -> mdstore mapping from the import phase to the export phase: the importer produces a DocumentToMDStore datastore utilized by the exporter module. Updating the transformer definition to handle the DocumentToMDStore schema instead of the Identifier schema

View differences:

modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/oozie_app/lib/scripts/transformer.pig
6 6
org.apache.pig.piggybank.storage.avro.AvroStorage(
7 7
'input_schema_class', '$schema_input_document_id');
8 8

  
9
define avro_store_identifier
9
define avro_load_document_to_mdstore
10 10
org.apache.pig.piggybank.storage.avro.AvroStorage(
11
'input_schema_class', '$schema_input_document_to_mdstore');
12

  
13
define avro_store_document_to_mdstore
14
org.apache.pig.piggybank.storage.avro.AvroStorage(
11 15
'index', '0',
12
'output_schema_class', '$schema_output_identifier');
16
'output_schema_class', '$schema_output_document_to_mdstore');
13 17

  
14 18
documentId = load '$input_document_id' using avro_load_document_id;
15 19
documentId = foreach documentId generate $0 as id;
......
18 22
datasetIds = foreach documentToDataset generate datasetId as id;
19 23
datasetIdsDistinct = distinct datasetIds;
20 24

  
25
documentToMDStore = load '$input_document_to_mdstore' using avro_load_document_to_mdstore;
26

  
21 27
joined = join datasetIdsDistinct by id left, documentId by id;
22
joinedFiltered = filter joined by documentId::id is null;
23
identifiers = foreach joinedFiltered generate datasetIdsDistinct::id as id;
28
joinedWithMDStore = join joined by datasetIdsDistinct::id, documentToMDStore by documentId;
24 29

  
25
store identifiers into '$output_identifier' using avro_store_identifier;
30
joinedFiltered = filter joinedWithMDStore by documentId::id is null;
31
outputDocumentToMdstore = foreach joinedFiltered generate datasetIdsDistinct::id as documentId, documentToMDStore::mdStoreId as mdStoreId;
32

  
33
store outputDocumentToMdstore into '$output_document_to_mdstore' using avro_store_document_to_mdstore;
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/oozie_app/workflow.xml
8 8
			<name>input_document_id</name>
9 9
		</property>
10 10
		<property>
11
			<name>output_identifier</name>
11
			<name>input_document_to_mdstore</name>
12 12
		</property>
13
		<property>
14
			<name>output_document_to_mdstore</name>
15
		</property>
13 16
	</parameters>
14 17
    
15 18
    <start to="transformer"/>
......
20 23
			<!-- The data generated by this node is deleted in this section -->
21 24
			<prepare>
22 25
				<delete path="${nameNode}${workingDir}/transformer" />
23
				<delete path="${nameNode}${output_identifier}" />
26
				<delete path="${nameNode}${output_document_to_mdstore}" />
24 27
				<mkdir path="${nameNode}${workingDir}/transformer" />
25 28
			</prepare>
26 29
            <configuration>
......
39 42
            
40 43
            <param>input_document_id=${input_document_id}</param>
41 44
            <param>schema_input_document_id=eu.dnetlib.iis.common.schemas.DocumentId</param>
45

  
46
            <param>input_document_to_mdstore=${input_document_to_mdstore}</param>
47
            <param>schema_input_document_to_mdstore=eu.dnetlib.iis.importer.schemas.DocumentToMDStore</param>
42 48
            
43
            <param>output_identifier=${output_identifier}</param>
44
            <param>schema_output_identifier=eu.dnetlib.iis.export.auxiliary.schemas.Identifier</param>
49
            <param>output_document_to_mdstore=${output_document_to_mdstore}</param>
50
            <param>schema_output_document_to_mdstore=eu.dnetlib.iis.importer.schemas.DocumentToMDStore</param>
45 51
        </pig>
46 52
        <ok to="end"/>
47 53
        <error to="fail"/>
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/export/identifier/documenttodataset/job.properties
1
input_document_to_dataset=/user/marek.horst/mainworkflows/preprocessing/main/working_dir/referenceextraction_dataset/document_datasets
2
input_document_id=/user/marek.horst/mainworkflows/preprocessing/main/working_dir/producer/dataset_existing_id
3
input_document_to_mdstore=/user/marek.horst/mainworkflows/preprocessing/main/working_dir/mainworkflows_preprocessing_import/dataset_to_mdstore
4
output_document_to_mdstore=${workingDir}/output
0 5

  

Also available in: Unified diff