73 |
73 |
</property>
|
74 |
74 |
</parameters>
|
75 |
75 |
|
76 |
|
<start to="metadata_extractor"/>
|
|
76 |
<start to="generate-schema" />
|
|
77 |
|
|
78 |
<!-- Java action "generate-schema": runs AvroSchemaGenerator with four class-name
     arguments (the inputport_classname parameter plus the ExtractedDocumentMetadata,
     DocumentText and Fault schema classes). The capture-output element makes the
     action's output available to later actions, which read it in this workflow
     through wf:actionData('generate-schema') indexed by class name.
     Transitions to "metadata_extractor" on success and to "fail" on error.
     NOTE(review): presumably the generator emits one Avro schema per class name,
     keyed by that name; confirm against AvroSchemaGenerator. -->
<action name="generate-schema">
|
|
79 |
<java>
|
|
80 |
<job-tracker>${jobTracker}</job-tracker>
|
|
81 |
<name-node>${nameNode}</name-node>
|
|
82 |
<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class>
|
|
83 |
<arg>${inputport_classname}</arg>
|
|
84 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</arg>
|
|
85 |
<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg>
|
|
86 |
<arg>eu.dnetlib.iis.audit.schemas.Fault</arg>
|
|
87 |
<capture-output />
|
|
88 |
</java>
|
|
89 |
<ok to="metadata_extractor" />
|
|
90 |
<error to="fail" />
|
|
91 |
</action>
|
77 |
92 |
|
78 |
93 |
<action name="metadata_extractor">
|
79 |
94 |
<map-reduce>
|
... | ... | |
90 |
105 |
</property>
|
91 |
106 |
<property>
|
92 |
107 |
<name>mapreduce.inputformat.class</name>
|
93 |
|
<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
|
|
108 |
<value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
|
94 |
109 |
</property>
|
95 |
110 |
<property>
|
96 |
111 |
<name>mapred.mapoutput.key.class</name>
|
... | ... | |
110 |
125 |
</property>
|
111 |
126 |
<property>
|
112 |
127 |
<name>mapred.output.key.comparator.class</name>
|
113 |
|
<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
|
|
128 |
<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
|
114 |
129 |
</property>
|
115 |
130 |
<property>
|
116 |
131 |
<name>io.serializations</name>
|
... | ... | |
119 |
134 |
</property>
|
120 |
135 |
<property>
|
121 |
136 |
<name>mapred.output.value.groupfn.class</name>
|
122 |
|
<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
|
|
137 |
<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
|
123 |
138 |
</property>
|
124 |
|
|
125 |
139 |
<property>
|
126 |
140 |
<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB
|
127 |
141 |
</name>
|
128 |
142 |
<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
|
129 |
143 |
</property>
|
130 |
|
|
131 |
144 |
<!-- Required in order to use the new MapReduce API. -->
|
132 |
145 |
<property>
|
133 |
146 |
<name>mapred.mapper.new-api</name>
|
... | ... | |
153 |
166 |
<name>mapred.max.split.size</name>
|
154 |
167 |
<value>${mapred_max_split_size}</value>
|
155 |
168 |
</property>
|
|
169 |
<!-- Avro input key schema of the job: taken from the captured output of the
     "generate-schema" action, looked up under the class name supplied by the
     inputport_classname workflow parameter. -->
<property>
|
|
170 |
<name>avro.schema.input.key</name>
|
|
171 |
<value>${wf:actionData('generate-schema')[wf:conf('inputport_classname')]}</value>
|
|
172 |
</property>
|
156 |
173 |
|
157 |
174 |
<property>
|
158 |
175 |
<name>avro.mapreduce.multipleoutputs</name>
|
159 |
176 |
<value>${output_name_meta} ${output_name_plaintext} ${output_name_fault}</value>
|
160 |
177 |
</property>
|
161 |
|
|
162 |
178 |
<!-- ## Output classes for all output ports -->
|
163 |
179 |
<property>
|
164 |
180 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.format
|
... | ... | |
175 |
191 |
</name>
|
176 |
192 |
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
|
177 |
193 |
</property>
|
178 |
|
<!-- ### Schema of the data ingested by the mapper. To be more precise,
|
179 |
|
it's the schema of Avro data passed as template parameter of the AvroKey
|
180 |
|
object passed to mapper. -->
|
181 |
|
<property>
|
182 |
|
<name>eu.dnetlib.iis.avro.input.class</name>
|
183 |
|
<value>${inputport_classname}</value>
|
184 |
|
</property>
|
185 |
|
<!-- As a convention, we're setting "null" values
|
186 |
|
since mapper does not produce any standard data in this example
|
187 |
|
(probably any other valid Avro schema would be OK as well).-->
|
188 |
|
<property>
|
189 |
|
<name>eu.dnetlib.iis.avro.map.output.key.class</name>
|
190 |
|
<value>org.apache.avro.Schema.Type.NULL</value>
|
191 |
|
</property>
|
192 |
|
<!-- As a convention, we're setting "null" values
|
193 |
|
since mapper does not produce any standard data in this example
|
194 |
|
(probably any other valid Avro schema would be OK as well).-->
|
195 |
|
|
196 |
|
<property>
|
197 |
|
<name>eu.dnetlib.iis.avro.map.output.value.class</name>
|
198 |
|
<value>org.apache.avro.Schema.Type.NULL</value>
|
199 |
|
</property>
|
200 |
|
|
201 |
|
|
202 |
194 |
<!-- ### Schema of multiple output ports. -->
|
203 |
195 |
<property>
|
204 |
|
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_meta}</name>
|
205 |
|
<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
|
|
196 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.keyschema</name>
|
|
197 |
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata']}</value>
|
206 |
198 |
</property>
|
207 |
199 |
<property>
|
208 |
|
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_plaintext}</name>
|
209 |
|
<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
|
|
200 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_plaintext}.keyschema</name>
|
|
201 |
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
|
210 |
202 |
</property>
|
211 |
203 |
<property>
|
212 |
|
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_fault}</name>
|
213 |
|
<value>eu.dnetlib.iis.audit.schemas.Fault</value>
|
|
204 |
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_fault}.keyschema</name>
|
|
205 |
<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.audit.schemas.Fault']}</value>
|
214 |
206 |
</property>
|
215 |
207 |
|
216 |
208 |
<property>
|
#1257 dropping schema generation related hacks in all map-reduce modules, switching to literal schema parameters