<workflow-app xmlns="uri:oozie:workflow:0.4" name="metadataextraction">

	<parameters>
		<property>
			<name>input</name>
			<description>metadata extraction input directory</description>
		</property>
		<property>
			<name>output_root</name>
			<description>metadata extraction output directory</description>
		</property>
		<property>
			<name>output_name_meta</name>
			<value>meta</value>
			<description>metadata output subdirectory name</description>
		</property>
		<property>
			<name>output_name_plaintext</name>
			<value>plaintext</value>
			<description>plaintext output subdirectory name</description>
		</property>
		<property>
			<name>output_name_fault</name>
			<value>fault</value>
			<description>fault output subdirectory name</description>
		</property>
		<property>
			<name>excluded_ids</name>
			<value>$UNDEFINED$</value>
			<description>excluded identifiers list</description>
		</property>
		<property>
			<name>max_file_size_mb</name>
			<value>$UNDEFINED$</value>
			<description>maximum allowed file size in megabytes</description>
		</property>
		<property>
			<name>log_fault_processing_time_threshold_secs</name>
			<value>300</value>
			<description>processing time threshold expressed in seconds;
			when exceeded, an appropriate record is written to the fault datastore</description>
		</property>
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>streaming content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>streaming content read timeout</description>
		</property>
		<property>
			<name>mapred_child_java_opts</name>
			<value>-Xmx4096m</value>
			<description>Java options for map-reduce child tasks, e.g. maximum heap size</description>
		</property>
		<property>
			<name>mapred_max_split_size</name>
			<value>50000</value>
			<description>maximum input data split size, required by the streaming variant reading DocumentContentUrl records to split the input data into more chunks</description>
		</property>

		<property>
			<name>processing_mode</name>
			<value>MetadataExtractorMapper</value>
			<description>metadata extraction processing mode, supported values: MetadataExtractorMapper, StreamingMetadataExtractorMapper</description>
		</property>
		<property>
			<name>inputport_classname</name>
			<value>eu.dnetlib.iis.importer.schemas.DocumentContent</value>
			<description>input classname, should be adjusted according to the processing_mode value. Supported values: eu.dnetlib.iis.importer.schemas.DocumentContent, eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</description>
		</property>
	</parameters>
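
	<!-- A minimal sketch of the submission properties one might supply when running this
		workflow; the host names and paths below are placeholders and assumptions, not
		values taken from this project:

			jobTracker=<jobtracker-host>:<port>
			nameNode=hdfs://<namenode-host>:<port>
			queueName=default
			oozie.wf.application.path=HDFS directory holding this workflow.xml
			input=HDFS directory with the imported DocumentContent (or DocumentContentUrl) datastore
			output_root=HDFS directory to be (re)created for the meta/plaintext/fault outputs

		Parameters declared above with the $UNDEFINED$ value (excluded_ids, max_file_size_mb)
		appear to act as optional ones and may be left at their defaults. -->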

	<start to="metadata_extractor"/>

	<action name="metadata_extractor">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${output_root}" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.task.timeout</name>
					<value>1800000</value>
				</property>
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
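				<!-- The properties above wire Avro serialization into the job: map and job
					outputs are wrapped in AvroKey/AvroValue, the Avro serializers are registered
					via io.serializations, and the KeyInputFormat/KeyComparator helper classes
					from iis-core handle the Avro-typed keys (presumably so that the schemas
					declared further below can be picked up from the job configuration). -->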

				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>

				<!-- This is required for new API usage -->
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>

				<!-- Standard stuff for our framework -->
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>

				<property>
					<name>mapred.child.java.opts</name>
					<value>${mapred_child_java_opts}</value>
				</property>

				<property>
					<name>mapred.max.split.size</name>
					<value>${mapred_max_split_size}</value>
				</property>

				<property>
					<name>avro.mapreduce.multipleoutputs</name>
					<value>${output_name_meta} ${output_name_plaintext} ${output_name_fault}</value>
				</property>

				<!-- ## Output classes for all output ports -->
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_plaintext}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_fault}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
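				<!-- The three named outputs declared above (${output_name_meta},
					${output_name_plaintext}, ${output_name_fault}) are all written as Avro
					files via AvroKeyOutputFormat; the record schema of each port is declared
					further below under eu.dnetlib.iis.avro.multipleoutputs.class.* . -->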
				<!-- ### Schema of the data ingested by the mapper. To be more precise,
					it's the schema of the Avro data passed as the template parameter of the
					AvroKey object handed to the mapper. -->
				<property>
					<name>eu.dnetlib.iis.avro.input.class</name>
					<value>${inputport_classname}</value>
				</property>
				<!-- As a convention, we're setting "null" values for the map output key and
					value schemas, since the mapper does not produce any standard map output
					in this workflow (probably any other valid Avro schema would be OK as well). -->
				<property>
					<name>eu.dnetlib.iis.avro.map.output.key.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.map.output.value.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>

				<!-- ### Schema of multiple output ports. -->
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_meta}</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_plaintext}</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_fault}</name>
					<value>eu.dnetlib.iis.audit.schemas.Fault</value>
				</property>
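				<!-- The ${output_name_fault} port acts as the audit channel of this job:
					per the log_fault_processing_time_threshold_secs description above, a
					Fault record is written when processing a document takes longer than the
					threshold (and presumably also when metadata extraction of a document
					fails outright). -->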

				<property>
					<name>mapred.input.dir</name>
					<value>${input}</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${output_root}</value>
				</property>
				<property>
					<name>output.meta</name>
					<value>${output_name_meta}</value>
				</property>
				<property>
					<name>output.plaintext</name>
					<value>${output_name_plaintext}</value>
				</property>
				<property>
					<name>output.fault</name>
					<value>${output_name_fault}</value>
				</property>
				<property>
					<name>excluded.ids</name>
					<value>${excluded_ids}</value>
				</property>
				<property>
					<name>import.content.max.file.size.mb</name>
					<value>${max_file_size_mb}</value>
				</property>
				<property>
					<name>log.fault.processing.time.threshold.secs</name>
					<value>${log_fault_processing_time_threshold_secs}</value>
				</property>
				<property>
					<name>import.content.connection.timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>import.content.read.timeout</name>
					<value>${content_read_timeout}</value>
				</property>
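				<!-- The entries above simply forward the workflow parameters into the Hadoop
					job configuration under the property names that the metadata extraction
					mappers read at runtime (the exact consumer-side keys are an assumption
					based on the parameter names, not something visible in this file). -->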
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.metadataextraction.${processing_mode}</value>
				</property>
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>
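				<!-- This is a map-only job: mapred.reduce.tasks is set to 0 and the concrete
					mapper class is selected via the processing_mode parameter, i.e.
					eu.dnetlib.iis.metadataextraction.MetadataExtractorMapper (reading
					DocumentContent records) or StreamingMetadataExtractorMapper (the streaming
					variant reading DocumentContentUrl records; see the mapred_max_split_size
					and inputport_classname parameters above). -->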
			</configuration>
		</map-reduce>
		<ok to="end"/>
		<error to="fail"/>
	</action>

	<kill name="fail">
		<message>Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end"/>
</workflow-app>