1 |
20566
|
marek.hors
|
<workflow-app xmlns="uri:oozie:workflow:0.4" name="metadataextraction">
|
2 |
|
|
|
3 |
|
|
<parameters>
|
4 |
|
|
<property>
|
5 |
|
|
<name>input</name>
|
6 |
|
|
<description>metadata extraction input directory</description>
|
7 |
|
|
</property>
|
8 |
|
|
<property>
|
9 |
|
|
<name>output_root</name>
|
10 |
|
|
<description>metadata extraction output directory</description>
|
11 |
|
|
</property>
|
12 |
|
|
<property>
|
13 |
|
|
<name>output_name_meta</name>
|
14 |
|
|
<value>meta</value>
|
15 |
|
|
<description>metadata output subdirectory name</description>
|
16 |
|
|
</property>
|
17 |
|
|
<property>
|
18 |
|
|
<name>output_name_plaintext</name>
|
19 |
|
|
<value>plaintext</value>
|
20 |
|
|
<description>plaintext output subdirectory name</description>
|
21 |
|
|
</property>
|
22 |
|
|
<property>
|
23 |
20663
|
marek.hors
|
<name>excluded_ids</name>
|
24 |
20690
|
marek.hors
|
<value>$UNDEFINED$</value>
|
25 |
20663
|
marek.hors
|
<description>excluded identifiers list</description>
|
26 |
|
|
</property>
|
27 |
|
|
<property>
|
28 |
31757
|
marek.hors
|
<name>max_file_size_mb</name>
|
29 |
|
|
<value>$UNDEFINED$</value>
|
30 |
|
|
<description>maximum allowed file size in Megabytes</description>
|
31 |
|
|
</property>
|
32 |
|
|
<property>
|
33 |
25716
|
marek.hors
|
<name>content_connection_timeout</name>
|
34 |
|
|
<value>60000</value>
|
35 |
|
|
<description>streaming content connection timeout</description>
|
36 |
|
|
</property>
|
37 |
|
|
<property>
|
38 |
|
|
<name>content_read_timeout</name>
|
39 |
|
|
<value>60000</value>
|
40 |
|
|
<description>streaming content read timeout</description>
|
41 |
|
|
</property>
|
42 |
|
|
<property>
|
43 |
20566
|
marek.hors
|
<name>mapred_child_java_opts</name>
|
44 |
|
|
<value>-Xmx2048m</value>
|
45 |
|
|
<description>java opts passed to the map-reduce child task JVMs (mapred.child.java.opts), e.g. maximum heap size</description>
|
46 |
|
|
</property>
|
47 |
24313
|
marek.hors
|
<property>
|
48 |
|
|
<name>mapred_max_split_size</name>
|
49 |
25077
|
marek.hors
|
<value>50000</value>
|
50 |
24313
|
marek.hors
|
<description>maximum input data split size, required by streaming version reading DocumentContentUrl to split input data into more chunks</description>
|
51 |
|
|
</property>
|
52 |
24106
|
marek.hors
|
|
53 |
|
|
<property>
|
54 |
|
|
<name>processing_mode</name>
|
55 |
|
|
<value>MetadataExtractorMapper</value>
|
56 |
|
|
<description>metadata extraction processing mode, supported values: MetadataExtractorMapper, StreamingMetadataExtractorMapper</description>
|
57 |
|
|
</property>
|
58 |
|
|
<property>
|
59 |
26559
|
marek.hors
|
<name>inputport_classname</name>
|
60 |
24106
|
marek.hors
|
<value>eu.dnetlib.iis.importer.schemas.DocumentContent</value>
|
61 |
|
|
<description>input classname, should be adjusted according to the processing_mode value. Supported values: eu.dnetlib.iis.importer.schemas.DocumentContent, eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</description>
|
62 |
|
|
</property>
|
63 |
20566
|
marek.hors
|
</parameters>
|
64 |
19118
|
marek.hors
|
|
65 |
19066
|
marek.hors
|
<start to="metadata_extractor"/>
|
66 |
19118
|
marek.hors
|
|
67 |
19066
|
marek.hors
|
<action name="metadata_extractor">
|
68 |
|
|
<map-reduce>
|
69 |
|
|
<job-tracker>${jobTracker}</job-tracker>
|
70 |
|
|
<name-node>${nameNode}</name-node>
|
71 |
|
|
<!-- The data generated by this node is deleted in this section -->
|
72 |
|
|
<prepare>
|
73 |
19450
|
marek.hors
|
<delete path="${nameNode}${output_root}" />
|
74 |
19066
|
marek.hors
|
</prepare>
|
75 |
|
|
<configuration>
|
76 |
|
|
<property>
|
77 |
|
|
<name>mapreduce.inputformat.class</name>
|
78 |
|
|
<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
|
79 |
|
|
</property>
|
80 |
|
|
<property>
|
81 |
|
|
<name>mapred.mapoutput.key.class</name>
|
82 |
|
|
<value>org.apache.avro.mapred.AvroKey</value>
|
83 |
|
|
</property>
|
84 |
|
|
<property>
|
85 |
|
|
<name>mapred.mapoutput.value.class</name>
|
86 |
|
|
<value>org.apache.avro.mapred.AvroValue</value>
|
87 |
|
|
</property>
|
88 |
|
|
<property>
|
89 |
|
|
<name>mapred.output.key.class</name>
|
90 |
|
|
<value>org.apache.avro.mapred.AvroKey</value>
|
91 |
|
|
</property>
|
92 |
|
|
<property>
|
93 |
|
|
<name>mapred.output.value.class</name>
|
94 |
|
|
<value>org.apache.avro.mapred.AvroValue</value>
|
95 |
|
|
</property>
|
96 |
|
|
<property>
|
97 |
|
|
<name>mapred.output.key.comparator.class</name>
|
98 |
|
|
<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
|
99 |
|
|
</property>
|
100 |
|
|
<property>
|
101 |
|
|
<name>io.serializations</name>
|
102 |
|
|
<!-- Keep the whole list on one line with no whitespace before the closing tag: a line break here becomes part of the value text and may end up appended to the last class name if the consumer does not trim it. -->
<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
|
103 |
|
|
|
104 |
|
|
</property>
|
105 |
|
|
<property>
|
106 |
|
|
<name>mapred.output.value.groupfn.class</name>
|
107 |
|
|
<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
|
108 |
|
|
</property>
|
109 |
|
|
|
110 |
|
|
<property>
|
111 |
|
|
<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB
|
112 |
|
|
</name>
|
113 |
|
|
<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
|
114 |
|
|
</property>
|
115 |
|
|
|
116 |
|
|
<!-- This is required for new api usage -->
|
117 |
|
|
<property>
|
118 |
|
|
<name>mapred.mapper.new-api</name>
|
119 |
|
|
<value>true</value>
|
120 |
|
|
</property>
|
121 |
|
|
<property>
|
122 |
|
|
<name>mapred.reducer.new-api</name>
|
123 |
|
|
<value>true</value>
|
124 |
|
|
</property>
|
125 |
|
|
|
126 |
|
|
<!-- Standard stuff for our framework -->
|
127 |
|
|
<property>
|
128 |
|
|
<name>mapred.job.queue.name</name>
|
129 |
|
|
<value>${queueName}</value>
|
130 |
|
|
</property>
|
131 |
|
|
|
132 |
|
|
<property>
|
133 |
19174
|
dominika.t
|
<name>mapred.child.java.opts</name>
|
134 |
19450
|
marek.hors
|
<value>${mapred_child_java_opts}</value>
|
135 |
19174
|
dominika.t
|
</property>
|
136 |
|
|
|
137 |
|
|
<property>
|
138 |
24313
|
marek.hors
|
<name>mapred.max.split.size</name>
|
139 |
|
|
<value>${mapred_max_split_size}</value>
|
140 |
|
|
</property>
|
141 |
|
|
|
142 |
|
|
<property>
|
143 |
19066
|
marek.hors
|
<name>avro.mapreduce.multipleoutputs</name>
|
144 |
19450
|
marek.hors
|
<value>${output_name_meta} ${output_name_plaintext}</value>
|
145 |
19066
|
marek.hors
|
</property>
|
146 |
|
|
|
147 |
|
|
<!-- ## Output classes for all output ports -->
|
148 |
|
|
<property>
|
149 |
19450
|
marek.hors
|
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_meta}.format
|
150 |
19066
|
marek.hors
|
</name>
|
151 |
|
|
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
|
152 |
|
|
</property>
|
153 |
|
|
<property>
|
154 |
19450
|
marek.hors
|
<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_plaintext}.format
|
155 |
19066
|
marek.hors
|
</name>
|
156 |
|
|
<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
|
157 |
|
|
</property>
|
158 |
|
|
|
159 |
|
|
<!-- ### Schema of the data ingested by the mapper. To be more precise,
|
160 |
|
|
it's the schema of Avro data passed as template parameter of the AvroKey
|
161 |
|
|
object passed to mapper. -->
|
162 |
|
|
<property>
|
163 |
|
|
<name>eu.dnetlib.iis.avro.input.class</name>
|
164 |
26559
|
marek.hors
|
<value>${inputport_classname}</value>
|
165 |
19066
|
marek.hors
|
</property>
|
166 |
|
|
<!-- Map output key schema: as a convention, we're setting "null" values
|
167 |
|
|
since mapper does not produce any standard data in this example
|
168 |
|
|
(probably any other valid Avro schema would be OK as well).-->
|
169 |
|
|
<property>
|
170 |
|
|
<name>eu.dnetlib.iis.avro.map.output.key.class</name>
|
171 |
|
|
<value>org.apache.avro.Schema.Type.NULL</value>
|
172 |
|
|
</property>
|
173 |
|
|
<!-- Map output value schema: as a convention, we're setting "null" values
|
174 |
|
|
since mapper does not produce any standard data in this example
|
175 |
|
|
(probably any other valid Avro schema would be OK as well).-->
|
176 |
|
|
|
177 |
|
|
<property>
|
178 |
|
|
<name>eu.dnetlib.iis.avro.map.output.value.class</name>
|
179 |
|
|
<value>org.apache.avro.Schema.Type.NULL</value>
|
180 |
|
|
</property>
|
181 |
|
|
|
182 |
|
|
|
183 |
|
|
<!-- ### Schema of multiple output ports. -->
|
184 |
|
|
<property>
|
185 |
19450
|
marek.hors
|
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_meta}</name>
|
186 |
19066
|
marek.hors
|
<value>eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata</value>
|
187 |
|
|
</property>
|
188 |
|
|
<property>
|
189 |
19450
|
marek.hors
|
<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_plaintext}</name>
|
190 |
19066
|
marek.hors
|
<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
|
191 |
|
|
</property>
|
192 |
|
|
|
193 |
|
|
<property>
|
194 |
|
|
<name>mapred.input.dir</name>
|
195 |
19450
|
marek.hors
|
<value>${input}</value>
|
196 |
19066
|
marek.hors
|
</property>
|
197 |
|
|
<property>
|
198 |
|
|
<name>mapred.output.dir</name>
|
199 |
19450
|
marek.hors
|
<value>${output_root}</value>
|
200 |
19066
|
marek.hors
|
</property>
|
201 |
|
|
<property>
|
202 |
20669
|
marek.hors
|
<name>output.meta</name>
|
203 |
19450
|
marek.hors
|
<value>${output_name_meta}</value>
|
204 |
19066
|
marek.hors
|
</property>
|
205 |
|
|
<property>
|
206 |
20669
|
marek.hors
|
<name>output.plaintext</name>
|
207 |
19450
|
marek.hors
|
<value>${output_name_plaintext}</value>
|
208 |
19066
|
marek.hors
|
</property>
|
209 |
20668
|
marek.hors
|
<property>
|
210 |
20677
|
marek.hors
|
<name>excluded.ids</name>
|
211 |
20668
|
marek.hors
|
<value>${excluded_ids}</value>
|
212 |
|
|
</property>
|
213 |
25716
|
marek.hors
|
<property>
|
214 |
31757
|
marek.hors
|
<name>import.content.max.file.size.mb</name>
|
215 |
|
|
<value>${max_file_size_mb}</value>
|
216 |
|
|
</property>
|
217 |
|
|
<property>
|
218 |
25716
|
marek.hors
|
<name>import.content.connection.timeout</name>
|
219 |
|
|
<value>${content_connection_timeout}</value>
|
220 |
|
|
</property>
|
221 |
|
|
<property>
|
222 |
|
|
<name>import.content.read.timeout</name>
|
223 |
|
|
<value>${content_read_timeout}</value>
|
224 |
|
|
</property>
|
225 |
19066
|
marek.hors
|
<property>
|
226 |
|
|
<name>mapreduce.map.class</name>
|
227 |
24106
|
marek.hors
|
<value>eu.dnetlib.iis.metadataextraction.${processing_mode}</value>
|
228 |
19066
|
marek.hors
|
</property>
|
229 |
21303
|
marek.hors
|
<property>
|
230 |
|
|
<name>mapred.reduce.tasks</name>
|
231 |
|
|
<value>0</value>
|
232 |
|
|
</property>
|
233 |
19066
|
marek.hors
|
</configuration>
|
234 |
|
|
</map-reduce>
|
235 |
|
|
<ok to="end"/>
|
236 |
|
|
<error to="fail"/>
|
237 |
|
|
</action>
|
238 |
|
|
|
239 |
|
|
<!-- Kill node targeted by the metadata_extractor action's <error to="fail"/> transition; its message embeds wf:errorMessage for the node returned by wf:lastErrorNode(). -->
<kill name="fail">
|
240 |
|
|
<message>Unfortunately, the process failed -- error message: [${wf:errorMessage(wf:lastErrorNode())}]</message>
|
241 |
|
|
</kill>
|
242 |
|
|
<end name="end"/>
|
243 |
|
|
</workflow-app>
|