Project

General

Profile

1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_importer_content_url">
2

    
3
	<!--
		Formal workflow parameters.
		Properties declared without a <value> have no default and must be supplied
		at submission time; the others default to the value given here.
		The literal $UNDEFINED$ is used as a "not provided" marker: the decision
		nodes and path-setter actions of this workflow compare input_id_mapping and
		input_id against it.
	-->
	<parameters>
		<!-- input: service locations and object store / datasource selection -->
		<property>
			<name>objectstore_service_location</name>
			<description>object store service location to retrieve PDF/text contents from</description>
		</property>
		<property>
			<name>approved_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV of approved object stores</description>
		</property>
		<property>
			<name>blacklisted_objectstores_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV of blacklisted object stores</description>
		</property>
		<property>
			<name>approved_datasources_csv</name>
			<value>$UNDEFINED$</value>
			<description>CSV of approved datasources, used interchangeably with approved_objectstores_csv</description>
		</property>
		<property>
			<name>lookup_service_location</name>
			<value>$UNDEFINED$</value>
			<description>ISLookup service location, required when processing approved_datasources_csv in order to find datasource and objectstore relation</description>
		</property>
		<!-- input: mime types routed to each named output of the dispatcher (no defaults) -->
		<property>
			<name>mimetypes_pdf</name>
			<description>pdf mime types</description>
		</property>
		<property>
			<name>mimetypes_text</name>
			<description>text mime types</description>
		</property>
		<property>
			<name>mimetypes_html</name>
			<description>html mime types</description>
		</property>
		<property>
			<name>mimetypes_xml_pmc</name>
			<description>EuropePMC xml mime types</description>
		</property>
		<property>
			<name>mimetypes_wos</name>
			<description>WoS mime types</description>
		</property>
		<property>
			<name>resultset_client_read_timeout</name>
			<value>60000</value>
			<description>result set client reading timeout</description>
		</property>
		<!-- input datastores: both optional, each one enables an extra processing step when set -->
		<property>
			<name>input_id_mapping</name>
			<value>$UNDEFINED$</value>
			<description>input containing id mappings used by id-replacer module to translate identifiers of deduplicated objects</description>
		</property>
		<property>
			<name>input_id</name>
			<value>$UNDEFINED$</value>
			<description>input containing list of identifiers used by existence filter module in order to omit contents which do not have metadata representatives</description>
		</property>
		<!-- output: root directory plus one named output per content type -->
		<property>
			<name>output_root</name>
			<description>output root directory</description>
		</property>
		<property>
			<name>output_name_pdf</name>
			<value>pdf</value>
			<description>pdf output subdirectory name</description>
		</property>
		<property>
			<name>output_name_text</name>
			<value>text</value>
			<description>text output subdirectory name</description>
		</property>
		<property>
			<name>output_name_html</name>
			<value>html</value>
			<description>html output subdirectory name</description>
		</property>
		<property>
			<name>output_name_xml_pmc</name>
			<value>xmlpmc</value>
			<description>XML PMC output subdirectory name</description>
		</property>
		<property>
			<name>output_name_wos</name>
			<value>wos</value>
			<description>WoS output subdirectory name</description>
		</property>
	</parameters>
96

    
97
	<!-- Cluster endpoints and the job queue shared by the action nodes of this workflow. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
107

    
108
	<!-- Entry point: the workflow begins by importing content URLs from the object store. -->
	<start to="objectstore-content-url-importer"/>
109
	
110
	<!--
		Step 1: runs ObjectStoreDocumentContentUrlImporterProcess through the
		ProcessWrapper main class and writes its content_url output to
		${workingDir}/imported-urls.
	-->
	<action name="objectstore-content-url-importer">
		<java>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/imported-urls"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.importer.content.ObjectStoreDocumentContentUrlImporterProcess</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<!-- process parameters taken from the workflow parameters -->
			<arg>-Pimport.content.object.store.location=${objectstore_service_location}</arg>
			<arg>-Pimport.content.approved.objectstores.csv=${approved_objectstores_csv}</arg>
			<arg>-Pimport.content.blacklisted.objectstores.csv=${blacklisted_objectstores_csv}</arg>
			<arg>-Pimport.approved.datasources.csv=${approved_datasources_csv}</arg>
			<arg>-Pimport.content.lookup.service.location=${lookup_service_location}</arg>
			<arg>-Pimport.resultset.client.read.timeout=${resultset_client_read_timeout}</arg>
			<!-- output port; same path as the one removed in <prepare> -->
			<arg>-Ocontent_url=${workingDir}/imported-urls</arg>
		</java>
		<ok to="input_id_mapping-path-setter"/>
		<error to="fail"/>
	</action>
130
	
131
	<!--
		Step 2: captures, as action output, the path later steps should read after
		the optional id-replacer stage. The captured 'result' entry is consumed via
		wf:actionData('input_id_mapping-path-setter')['result'] further down.
		NOTE(review): presumably ConditionalPropertySetter emits inCaseOfTrue when
		the condition is true and elseCase otherwise; confirm against its source.
	-->
	<action name="input_id_mapping-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<!-- true when no id mapping input was supplied -->
			<arg>-Pcondition=${input_id_mapping eq "$UNDEFINED$"}</arg>
			<!-- raw importer output -->
			<arg>-PinCaseOfTrue=${workingDir}/imported-urls</arg>
			<!-- output of the transformers_idreplacer sub-workflow -->
			<arg>-PelseCase=${workingDir}/transformers_idreplacer/output</arg>
			<capture-output/>
		</java>
		<ok to="decision-id-replacer"/>
		<error to="fail"/>
	</action>
144
	
145
	<!-- Skips the id-replacer sub-workflow when input_id_mapping was left at $UNDEFINED$. -->
	<decision name="decision-id-replacer">
		<switch>
			<case to="input_id-path-setter">${input_id_mapping eq "$UNDEFINED$"}</case>
			<default to="transformers_idreplacer"/>
		</switch>
	</decision>
151
	
152
	<!--
		Optional step: runs the transformers_idreplacer sub-workflow on the imported
		URLs, using input_id_mapping, and writes to
		${workingDir}/transformers_idreplacer/output.
	-->
	<action name="transformers_idreplacer">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_idreplacer</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- the sub-workflow gets its own working directory -->
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_idreplacer/working_dir</value>
				</property>
				<property>
					<name>id_field_to_replace1</name>
					<value>id</value>
				</property>
				<property>
					<name>schema</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<!-- output of the objectstore-content-url-importer action -->
				<property>
					<name>input</name>
					<value>${workingDir}/imported-urls</value>
				</property>
				<property>
					<name>input_id_mapping</name>
					<value>${input_id_mapping}</value>
				</property>
				<!-- must match the elseCase path of input_id_mapping-path-setter -->
				<property>
					<name>output</name>
					<value>${workingDir}/transformers_idreplacer/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="input_id-path-setter"/>
		<error to="fail"/>
	</action>
186
	
187
	<!--
		Step 3: captures, as action output, the path the dispatcher should read
		after the optional existence-filter stage. The captured 'result' entry is
		used as mapred.input.dir by the content-url-dispatcher action.
		NOTE(review): presumably ConditionalPropertySetter emits inCaseOfTrue when
		the condition is true and elseCase otherwise; confirm against its source.
	-->
	<action name="input_id-path-setter">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.oozie.property.ConditionalPropertySetter</arg>
			<arg>-SworkingDir=${workingDir}</arg>
			<!-- true when no identifier list was supplied -->
			<arg>-Pcondition=${input_id eq "$UNDEFINED$"}</arg>
			<!-- path chosen by the previous path-setter -->
			<arg>-PinCaseOfTrue=${wf:actionData('input_id_mapping-path-setter')['result']}</arg>
			<!-- output of the transformers_common_existencefilter sub-workflow -->
			<arg>-PelseCase=${workingDir}/transformers_common_existencefilter/output</arg>
			<capture-output/>
		</java>
		<ok to="decision-existence-filter"/>
		<error to="fail"/>
	</action>
200
	
201
	<!-- Skips the existence-filter sub-workflow when input_id was left at $UNDEFINED$. -->
	<decision name="decision-existence-filter">
		<switch>
			<case to="content-url-dispatcher">${input_id eq "$UNDEFINED$"}</case>
			<default to="transformers_common_existencefilter"/>
		</switch>
	</decision>
207
	
208
	<!--
		Optional step: runs the transformers_common_existencefilter sub-workflow on
		the path chosen by input_id_mapping-path-setter, with input_id as the list
		of existent identifiers, and writes to
		${workingDir}/transformers_common_existencefilter/output.
	-->
	<action name="transformers_common_existencefilter">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_common_existencefilter</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- the sub-workflow gets its own working directory -->
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_common_existencefilter/working_dir</value>
				</property>
				<!-- importer output or id-replacer output, as captured earlier -->
				<property>
					<name>input_data</name>
					<value>${wf:actionData('input_id_mapping-path-setter')['result']}</value>
				</property>
				<property>
					<name>input_existent_id</name>
					<value>${input_id}</value>
				</property>
				<!-- must match the elseCase path of input_id-path-setter -->
				<property>
					<name>output_filtered</name>
					<value>${workingDir}/transformers_common_existencefilter/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="content-url-dispatcher"/>
		<error to="fail"/>
	</action>
234
	
235
	<!--
		Final step: map-only job (mapred.reduce.tasks = 0) that runs
		DocumentContentUrlDispatcher over the path captured by input_id-path-setter
		and writes DocumentContentUrl records to one named Avro output per content
		type (pdf, text, html, xml pmc, wos) under ${output_root}.
		Property names and values are kept on a single line so that no leading or
		trailing whitespace ends up in them.
	-->
	<action name="content-url-dispatcher">
		<map-reduce>
			<!-- the whole output root is removed before the job runs -->
			<prepare>
				<delete path="${nameNode}${output_root}"/>
			</prepare>
			<configuration>
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
				</property>
				<!-- The output format seems to be required, JSON is written when missing -->
				<property>
					<name>mapreduce.outputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyOutputFormat</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.output.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>
				<!-- comma-separated class list; no whitespace around or after the entries -->
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>
				<!-- required for multiple outputs only -->
				<property>
					<name>avro.mapreduce.multipleoutputs</name>
					<value>${output_name_pdf} ${output_name_text} ${output_name_html} ${output_name_xml_pmc} ${output_name_wos}</value>
				</property>
				<!-- output format of each named output -->
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_pdf}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_text}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_html}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_xml_pmc}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>avro.mapreduce.multipleoutputs.namedOutput.${output_name_wos}.format</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<!-- record class of each named output -->
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_pdf}</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_text}</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_html}</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_xml_pmc}</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.multipleoutputs.class.${output_name_wos}</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<!-- mimetypes related -->
				<property>
					<name>mimetypes.csv.${output_name_pdf}</name>
					<value>${mimetypes_pdf}</value>
				</property>
				<property>
					<name>mimetypes.csv.${output_name_text}</name>
					<value>${mimetypes_text}</value>
				</property>
				<property>
					<name>mimetypes.csv.${output_name_html}</name>
					<value>${mimetypes_html}</value>
				</property>
				<property>
					<name>mimetypes.csv.${output_name_xml_pmc}</name>
					<value>${mimetypes_xml_pmc}</value>
				</property>
				<property>
					<name>mimetypes.csv.${output_name_wos}</name>
					<value>${mimetypes_wos}</value>
				</property>
				<!-- mapper and its Avro input / map-output types -->
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.importer.content.DocumentContentUrlDispatcher</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.input.class</name>
					<value>eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.map.output.key.class</name>
					<value>org.apache.avro.Schema.Type.STRING</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.map.output.value.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>
				<!-- input path captured by the input_id-path-setter action -->
				<property>
					<name>mapred.input.dir</name>
					<value>${wf:actionData('input_id-path-setter')['result']}</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${output_root}</value>
				</property>
				<!-- no reduce phase -->
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="end"/>
		<error to="fail"/>
	</action>
400
	
401
	<!-- Error sink: every action's <error> transition ends here; the message reports the last failing node's error. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
405
	<!-- Successful completion, reached from content-url-dispatcher. -->
	<end name="end"/>
406
</workflow-app>
(2-2/2)