<?xml version="1.0" encoding="UTF-8"?>
<!--
    Oozie workflow "metadataextraction".

    Runs a single map-only MapReduce action (mapred.reduce.tasks is 0) whose
    mapper class is eu.dnetlib.iis.metadataextraction.${processing_mode}.
    Input is read from ${input}; the action writes under ${output_root} and
    declares two named Avro outputs: ${output_name_meta} and
    ${output_name_plaintext}.

    ${jobTracker}, ${nameNode} and ${queueName} are referenced below but are
    not declared in the parameters section, so they have to be provided from
    outside this file (e.g. by the job properties of the submitting job).
-->
<workflow-app xmlns="uri:oozie:workflow:0.4" name="metadataextraction">

    <!-- Workflow parameters. "input" and "output_root" declare no default value. -->
    <parameters>
        <property>
            <name>input</name>
            <description>metadata extraction input directory</description>
        </property>
        <property>
            <name>output_root</name>
            <description>metadata extraction output directory</description>
        </property>
        <property>
            <name>output_name_meta</name>
            <value>meta</value>
            <description>metadata output subdirectory name</description>
        </property>
        <property>
            <name>output_name_plaintext</name>
            <value>plaintext</value>
            <description>plaintext output subdirectory name</description>
        </property>
        <!-- NOTE(review): $UNDEFINED$ looks like a project-wide "not set" placeholder;
             confirm how the mapper interprets it. -->
        <property>
            <name>excluded_ids</name>
            <value>$UNDEFINED$</value>
            <description>excluded identifiers list</description>
        </property>
        <property>
            <name>max_file_size_mb</name>
            <value>$UNDEFINED$</value>
            <description>maximum allowed file size in Megabytes</description>
        </property>
        <property>
            <name>content_connection_timeout</name>
            <value>60000</value>
            <description>streaming content connection timeout</description>
        </property>
        <property>
            <name>content_read_timeout</name>
            <value>60000</value>
            <description>streaming content read timeout</description>
        </property>
        <property>
            <name>mapred_child_java_opts</name>
            <value>-Xmx2048m</value>
            <description>java-opts, e.g. maximum heap size for oozie</description>
        </property>
        <property>
            <name>mapred_max_split_size</name>
            <value>50000</value>
            <description>maximum input data split size, required by streaming version reading DocumentContentUrl to split input data into more chunks</description>
        </property>
        <!-- processing_mode and inputport_classname have to be changed together:
             the mapper selected by the former consumes the class named by the latter. -->
        <property>
            <name>processing_mode</name>
            <value>MetadataExtractorMapper</value>
            <description>metadata extraction processing mode, supported values: MetadataExtractorMapper, StreamingMetadataExtractorMapper</description>
        </property>
        <property>
            <name>inputport_classname</name>
            <value>eu.dnetlib.iis.importer.schemas.DocumentContent</value>
            <description>input classname, should be adjusted according to the processing_mode value. Supported values: eu.dnetlib.iis.importer.schemas.DocumentContent, eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</description>
        </property>
    </parameters>

    <start to="metadata_extractor"/>

    <action name="metadata_extractor">
        <map-reduce>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- Remove the output of a previous run of this node before the job starts. -->
            <prepare>
                <delete path="${nameNode}${output_root}"/>
            </prepare>
            <configuration>
                <!-- Key/value classes, input format and serialization set-up. -->
                <property>
                    <name>mapreduce.inputformat.class</name>
                    <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
                </property>
                <property>
                    <name>mapred.mapoutput.key.class</name>
                    <value>org.apache.avro.mapred.AvroKey</value>
                </property>
                <property>
                    <name>mapred.mapoutput.value.class</name>
                    <value>org.apache.avro.mapred.AvroValue</value>
                </property>
                <property>
                    <name>mapred.output.key.class</name>
                    <value>org.apache.avro.mapred.AvroKey</value>
                </property>
                <property>
                    <name>mapred.output.value.class</name>
                    <value>org.apache.avro.mapred.AvroValue</value>
                </property>
                <property>
                    <name>mapred.output.key.comparator.class</name>
                    <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
                </property>
                <!-- Kept on one line so the last class name carries no trailing whitespace. -->
                <property>
                    <name>io.serializations</name>
                    <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
                </property>
                <property>
                    <name>mapred.output.value.groupfn.class</name>
                    <value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
                </property>

                <property>
                    <name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
                    <value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
                </property>

                <!-- This is required for new api usage -->
                <property>
                    <name>mapred.mapper.new-api</name>
                    <value>true</value>
                </property>
                <property>
                    <name>mapred.reducer.new-api</name>
                    <value>true</value>
                </property>

                <!-- Standard stuff for our framework -->
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>

                <property>
                    <name>mapred.child.java.opts</name>
                    <value>${mapred_child_java_opts}</value>
                </property>

                <property>
                    <name>mapred.max.split.size</name>
                    <value>${mapred_max_split_size}</value>
                </property>

                <!-- Space-separated list of the named outputs of this job. -->
                <property>
                    <name>avro.mapreduce.multipleoutputs</name>
                    <value>${output_name_meta} ${output_name_plaintext}</value>
                </property>

                <!-- ## Output format classes for all output ports -->
                <property>
                    <name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.format</name>
                    <value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
                </property>
                <property>
                    <name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_plaintext}.format</name>
                    <value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
                </property>

                <!-- ### Schema of the data ingested by the mapper. To be more precise,
                     it is the schema of the Avro data passed as the template parameter
                     of the AvroKey object handed to the mapper. -->
                <property>
                    <name>eu.dnetlib.iis.avro.input.class</name>
                    <value>${inputport_classname}</value>
                </property>

                <!-- As a convention, the map output key and value schemas are both set
                     to "null" since the mapper does not produce any standard data here
                     (probably any other valid Avro schema would be OK as well). -->
                <property>
                    <name>eu.dnetlib.iis.avro.map.output.key.class</name>
                    <value>org.apache.avro.Schema.Type.NULL</value>
                </property>
                <property>
                    <name>eu.dnetlib.iis.avro.map.output.value.class</name>
                    <value>org.apache.avro.Schema.Type.NULL</value>
                </property>

                <!-- ### Schema of multiple output ports. -->
                <property>
                    <name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_meta}</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
                </property>
                <property>
                    <name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_plaintext}</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>

                <!-- Job input and output locations. -->
                <property>
                    <name>mapred.input.dir</name>
                    <value>${input}</value>
                </property>
                <property>
                    <name>mapred.output.dir</name>
                    <value>${output_root}</value>
                </property>

                <!-- Workflow parameters forwarded to the job configuration. -->
                <property>
                    <name>output.meta</name>
                    <value>${output_name_meta}</value>
                </property>
                <property>
                    <name>output.plaintext</name>
                    <value>${output_name_plaintext}</value>
                </property>
                <property>
                    <name>excluded.ids</name>
                    <value>${excluded_ids}</value>
                </property>
                <property>
                    <name>import.content.max.file.size.mb</name>
                    <value>${max_file_size_mb}</value>
                </property>
                <property>
                    <name>import.content.connection.timeout</name>
                    <value>${content_connection_timeout}</value>
                </property>
                <property>
                    <name>import.content.read.timeout</name>
                    <value>${content_read_timeout}</value>
                </property>

                <!-- Mapper implementation is selected by the processing_mode parameter. -->
                <property>
                    <name>mapreduce.map.class</name>
                    <value>eu.dnetlib.iis.metadataextraction.${processing_mode}</value>
                </property>
                <!-- Map-only job: no reduce phase. -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>0</value>
                </property>
            </configuration>
        </map-reduce>
        <ok to="end"/>
        <error to="fail"/>
    </action>

    <kill name="fail">
        <message>Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <end name="end"/>

</workflow-app>