<?xml version="1.0" encoding="UTF-8"?>
<!-- Oozie workflow: runs the IIS metadata-extraction map-reduce job.
     A single map-only job (mapred.reduce.tasks=0) reads document content
     records and writes two named Avro outputs: extracted metadata and
     plaintext. -->
<workflow-app xmlns="uri:oozie:workflow:0.4" name="metadataextraction">

    <parameters>
        <property>
            <name>input</name>
            <description>metadata extraction input directory</description>
        </property>
        <property>
            <name>output_root</name>
            <description>metadata extraction output directory</description>
        </property>
        <property>
            <name>output_name_meta</name>
            <value>meta</value>
            <description>metadata output subdirectory name</description>
        </property>
        <property>
            <name>output_name_plaintext</name>
            <value>plaintext</value>
            <description>plaintext output subdirectory name</description>
        </property>
        <property>
            <name>excluded_ids</name>
            <value>$UNDEFINED$</value>
            <description>excluded identifiers list</description>
        </property>
        <property>
            <name>max_file_size_mb</name>
            <value>$UNDEFINED$</value>
            <description>maximum allowed file size in Megabytes</description>
        </property>
        <property>
            <name>content_connection_timeout</name>
            <value>60000</value>
            <description>streaming content connection timeout</description>
        </property>
        <property>
            <name>content_read_timeout</name>
            <value>60000</value>
            <description>streaming content read timeout</description>
        </property>
        <property>
            <name>mapred_child_java_opts</name>
            <value>-Xmx2048m</value>
            <description>java-opts, e.g. maximum heap size for oozie</description>
        </property>
        <property>
            <name>mapred_max_split_size</name>
            <value>50000</value>
            <description>maximum input data split size, required by streaming version reading DocumentContentUrl to split input data into more chunks</description>
        </property>
        <property>
            <name>processing_mode</name>
            <value>MetadataExtractorMapper</value>
            <description>metadata extraction processing mode, supported values: MetadataExtractorMapper, StreamingMetadataExtractorMapper</description>
        </property>
        <property>
            <name>inputport_classname</name>
            <value>eu.dnetlib.iis.importer.schemas.DocumentContent</value>
            <description>input classname, should be adjusted according to the processing_mode value. Supported values: eu.dnetlib.iis.importer.schemas.DocumentContent, eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</description>
        </property>
    </parameters>

    <start to="metadata_extractor"/>

    <action name="metadata_extractor">
        <map-reduce>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- The data generated by this node is deleted in this section -->
            <prepare>
                <delete path="${nameNode}${output_root}"/>
            </prepare>
            <configuration>
                <!-- Avro-over-MapReduce wiring (IIS framework hack classes). -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
                </property>
                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.avro.mapred.AvroKey</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.avro.mapred.AvroValue</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.avro.mapred.AvroKey</value>
                </property>
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.avro.mapred.AvroValue</value>
                </property>
                <property>
                    <name>mapred.output.key.comparator.class</name>
                    <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
                </property>
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
                </property>
                <property>
                    <name>mapred.output.value.groupfn.class</name>
                    <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
                </property>

                <property>
                    <name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
                    <value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
                </property>

                <!-- This is required for new api usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Standard stuff for our framework -->
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>

                <property>
                    <name>mapred.child.java.opts</name>
                    <value>${mapred_child_java_opts}</value>
                </property>

                <property>
                    <name>mapred.max.split.size</name>
                    <value>${mapred_max_split_size}</value>
                </property>

                <!-- Space-separated list of named Avro outputs. -->
                <property>
                    <name>avro.mapreduce.multipleoutputs</name>
                    <value>${output_name_meta} ${output_name_plaintext}</value>
                </property>

                <!-- ## Output classes for all output ports -->
                <property>
                    <name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.format</name>
                    <value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
                </property>
                <property>
                    <name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_plaintext}.format</name>
                    <value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
                </property>

                <!-- ### Schema of the data ingested by the mapper. To be more precise,
                     it's the schema of Avro data passed as template parameter of the AvroKey
                     object passed to mapper. -->
                <property>
                    <name>eu.dnetlib.iis.avro.input.class</name>
                    <value>${inputport_classname}</value>
                </property>
                <!-- As a convention, we're setting "null" values
                     since mapper does not produce any standard data in this example
                     (probably any other valid Avro schema would be OK as well). -->
                <property>
                    <name>eu.dnetlib.iis.avro.map.output.key.class</name>
                    <value>org.apache.avro.Schema.Type.NULL</value>
                </property>
                <property>
                    <name>eu.dnetlib.iis.avro.map.output.value.class</name>
                    <value>org.apache.avro.Schema.Type.NULL</value>
                </property>

                <!-- ### Schema of multiple output ports. -->
                <property>
                    <name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_meta}</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
                </property>
                <property>
                    <name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_plaintext}</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>

                <property>
                    <name>mapred.input.dir</name>
                    <value>${input}</value>
                </property>
                <property>
                    <name>mapred.output.dir</name>
                    <value>${output_root}</value>
                </property>
                <property>
                    <name>output.meta</name>
                    <value>${output_name_meta}</value>
                </property>
                <property>
                    <name>output.plaintext</name>
                    <value>${output_name_plaintext}</value>
                </property>
                <property>
                    <name>excluded.ids</name>
                    <value>${excluded_ids}</value>
                </property>
                <property>
                    <name>import.content.max.file.size.mb</name>
                    <value>${max_file_size_mb}</value>
                </property>
                <property>
                    <name>import.content.connection.timeout</name>
                    <value>${content_connection_timeout}</value>
                </property>
                <property>
                    <name>import.content.read.timeout</name>
                    <value>${content_read_timeout}</value>
                </property>
                <!-- Mapper implementation is selected by the processing_mode parameter. -->
                <property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.iis.metadataextraction.${processing_mode}</value>
                </property>
                <!-- Map-only job: no reduce phase. -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>0</value>
                </property>
            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <kill name="fail">
        <message>Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="end"/>
</workflow-app>