Project

General

Profile

« Previous | Next » 

Revision 36289

Added by Marek Horst over 9 years ago

#1257: dropping schema-generation-related hacks in all map-reduce modules and switching to literal schema parameters

View differences:

modules/icm-iis-metadataextraction/trunk/src/test/resources/eu/dnetlib/iis/metadataextraction/avrobased/oozie_app/workflow.xml
1 1
<workflow-app xmlns="uri:oozie:workflow:0.3" name="test-metadataextraction_avrobased">
2
    <start to="metadata_extractor"/>
3 2
    
3
    
4
    <start to="generate-schema" />
5

  
6
	<action name="generate-schema">
7
	    <java>
8
    		<job-tracker>${jobTracker}</job-tracker>
9
        	<name-node>${nameNode}</name-node>
10
	        <main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class>
11
	        <arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg>
12
	        <capture-output />
13
	    </java>
14
	    <ok to="metadata_extractor" />
15
	    <error to="fail" />
16
	</action>
17
    
4 18
    <action name="metadata_extractor">
5 19
		<sub-workflow>
6 20
            <app-path>${wf:appPath()}/metadata_extractor</app-path>
......
31 45
	                <configuration>
32 46
	                        <property>
33 47
	                                <name>mapreduce.inputformat.class</name>
34
	                                <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
48
	                                <value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
35 49
	                        </property>
36 50
	                        <property>
37 51
	                                <name>mapreduce.outputformat.class</name>
......
64 78
	                                <value>eu.dnetlib.iis.core.javamapreduce.json.GenericAvroDatastoreJsonLogger</value>
65 79
	                        </property>
66 80
	                        <property>
67
	                                <name>eu.dnetlib.iis.avro.input.class</name>
68
	                                <value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
69
	                        </property>
81
								<name>avro.schema.input.key</name>
82
								<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</value>
83
							</property>
70 84
	                        <property>
71 85
	                                <name>mapred.input.dir</name>
72 86
	                                <value>${workingDir}/metadata_extractor/meta</value>
modules/icm-iis-metadataextraction/trunk/src/main/resources/eu/dnetlib/iis/metadataextraction/oozie_app/workflow.xml
73 73
		</property>
74 74
	</parameters>
75 75
    
76
    <start to="metadata_extractor"/>
76
    <start to="generate-schema" />
77

  
78
	<action name="generate-schema">
79
	    <java>
80
    		<job-tracker>${jobTracker}</job-tracker>
81
        	<name-node>${nameNode}</name-node>
82
	        <main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class>
83
	        <arg>${inputport_classname}</arg>
84
	        <arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg>
85
	        <arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg>
86
	        <arg>eu.dnetlib.iis.audit.schemas.Fault</arg>
87
	        <capture-output />
88
	    </java>
89
	    <ok to="metadata_extractor" />
90
	    <error to="fail" />
91
	</action>
77 92
    
78 93
    <action name="metadata_extractor">
79 94
        <map-reduce>
......
90 105
				</property>
91 106
            	<property>
92 107
					<name>mapreduce.inputformat.class</name>
93
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
108
					<value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
94 109
				</property>
95 110
            	<property>
96 111
					<name>mapred.mapoutput.key.class</name>
......
110 125
				</property>
111 126
				<property>
112 127
					<name>mapred.output.key.comparator.class</name>
113
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
128
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
114 129
				</property>
115 130
				<property>
116 131
					<name>io.serializations</name>
......
119 134
				</property>
120 135
				<property>
121 136
					<name>mapred.output.value.groupfn.class</name>
122
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
137
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
123 138
				</property>
124
				
125 139
				<property>
126 140
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB
127 141
					</name>
128 142
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
129 143
				</property>
130
            
131 144
                <!-- This is required for new api usage -->
132 145
                <property>
133 146
                    <name>mapred.mapper.new-api</name>
......
153 166
                    <name>mapred.max.split.size</name>
154 167
                    <value>${mapred_max_split_size}</value>
155 168
                </property>
169
			    <property>
170
					<name>avro.schema.input.key</name>
171
					<value>${wf:actionData('generate-schema')[wf:conf('inputport_classname')]}</value>
172
				</property>
156 173
                
157 174
                <property>
158 175
					<name>avro.mapreduce.multipleoutputs</name>
159 176
					<value>${output_name_meta} ${output_name_plaintext} ${output_name_fault}</value>
160 177
				</property>
161
                
162 178
                <!-- ## Output classes for all output ports -->
163 179
				<property>
164 180
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.format
......
175 191
					</name>
176 192
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
177 193
				</property>
178
                <!-- ### Schema of the data ingested by the mapper. To be more precise, 
179
					it's the schema of Avro data passed as template parameter of the AvroKey 
180
					object passed to mapper. -->
181
                <property>
182
					<name>eu.dnetlib.iis.avro.input.class</name>
183
					<value>${inputport_classname}</value>
184
				</property>
185
                <!-- As a convention, we're setting "null" values 
186
				since mapper does not produce any standard data in this example 
187
				(probably any other valid Avro schema would be OK as well).-->
188
				<property>
189
					<name>eu.dnetlib.iis.avro.map.output.key.class</name>
190
                	<value>org.apache.avro.Schema.Type.NULL</value>
191
                </property>
192
				<!-- As a convention, we're setting "null" values 
193
				since mapper does not produce any standard data in this example 
194
				(probably any other valid Avro schema would be OK as well).-->
195
				
196
				<property>
197
					<name>eu.dnetlib.iis.avro.map.output.value.class</name>
198
					<value>org.apache.avro.Schema.Type.NULL</value>
199
				</property>
200
                
201
                
202 194
                <!-- ### Schema of multiple output ports. -->
203 195
				<property>
204
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_meta}</name>
205
					<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
196
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.keyschema</name>
197
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</value>
206 198
				</property>
207 199
				<property>
208
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_plaintext}</name>
209
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
200
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_plaintext}.keyschema</name>
201
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
210 202
				</property>
211 203
				<property>
212
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_fault}</name>
213
					<value>eu.dnetlib.iis.audit.schemas.Fault</value>
204
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_fault}.keyschema</name>
205
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.audit.schemas.Fault']}</value>
214 206
				</property>
215 207

  
216 208
				<property>

Also available in: Unified diff