<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_importer_plaintext_cached">
	<!-- Formal parameters of this workflow. "input", "output" and
	     "default_cache_location" declare no default value; the remaining
	     parameters default to the values listed below. -->
	<parameters>
		<property>
			<name>input</name>
			<description>input document content directory</description>
		</property>
		<property>
			<name>output</name>
			<description>plaintext importer output directory</description>
		</property>
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>streaming content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>streaming content read timeout</description>
		</property>
		<property>
			<name>zk_session_timeout</name>
			<value>60000</value>
			<description>zookeeper session timeout when handling locks</description>
		</property>
		<property>
			<name>default_cache_location</name>
			<description>default cache location stored in HDFS</description>
		</property>
		<property>
			<name>mapred_max_split_size</name>
			<value>50000</value>
			<description>maximum input data split size, required by streaming version reading DocumentContentUrl to split input data into more chunks</description>
		</property>
	</parameters>

	<!-- Settings declared once for the workflow: job tracker, name node and
	     the MapReduce queue name. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>

	<!-- Entry point: processing begins with the input emptiness check. -->
	<start to="check_input_isempty" />

	<!-- Runs EmptyDatastoreVerifierProcess against ${input}. The output is
	     captured so that decision-is-input-empty can read the 'isEmpty'
	     entry. -->
	<action name="check_input_isempty">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.utils.EmptyDatastoreVerifierProcess</arg>
			<arg>-Iinput=${input}</arg>
			<capture-output />
		</java>
		<ok to="decision-is-input-empty" />
		<error to="fail" />
	</action>

	<!-- Branches on the result of check_input_isempty: when 'isEmpty' equals
	     "false" the cache lookup follows, in every other case an empty
	     output is generated. -->
	<decision name="decision-is-input-empty">
		<switch>
			<case to="get-existing-cache-id">${wf:actionData('check_input_isempty')['isEmpty'] eq "false"}</case>
			<default to="generate-empty-output" />
		</switch>
	</decision>

	<!-- Deletes ${output} and then runs the Producer process, which is given
	     the empty.json resource and the DocumentText schema for the
	     'plaintext' port written to ${output}. -->
	<action name="generate-empty-output">
		<java>
			<prepare>
				<!-- notice: the directory has to be aligned with the output of the skipped action -->
				<delete path="${nameNode}${output}" />
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- the line breaks and tabs inside the following argument are part of its value and are kept as they were -->
			<arg>-C{plaintext,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
			eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Oplaintext=${output}</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>

	<!-- Runs CacheMetadataManagingProcess in 'read_current_id' mode for
	     ${default_cache_location}. The captured 'cache_id' entry is read by
	     decision-is-cache-empty and by the cache based actions. -->
	<action name="get-existing-cache-id">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
			<arg>-Pdefault_cache_location=${default_cache_location}</arg>
			<arg>-Pmode=read_current_id</arg>
			<capture-output />
		</java>
		<ok to="decision-is-cache-empty" />
		<error to="fail" />
	</action>

	<!-- When the cache id read by get-existing-cache-id equals "$UNDEFINED$"
	     the full input is processed; otherwise the cache based block is
	     entered. -->
	<decision name="decision-is-cache-empty">
		<switch>
			<case to="importer_plaintext_on_full_input">${wf:actionData('get-existing-cache-id')['cache_id'] eq "$UNDEFINED$"}</case>
			<default to="transformers_importer_plaintext_skip_extracted" />
		</switch>
	</decision>

	<!-- cache based processing block, cache was provided as an input -->
	<!-- Sub-workflow fed with ${input} and the current cache directory. It
	     is configured with two output locations under ${workingDir}:
	     'tobeprocessed_content' and 'tobereturned_text'. -->
	<action name="transformers_importer_plaintext_skip_extracted">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_importer_plaintext_skip_extracted</app-path>
			<propagate-configuration />
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/working_dir</value>
				</property>
				<property>
					<name>input_document_content</name>
					<value>${input}</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${default_cache_location}/${wf:actionData('get-existing-cache-id')['cache_id']}</value>
				</property>
				<property>
					<name>output_document_content</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobeprocessed_content</value>
				</property>
				<property>
					<name>output_document_text</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobereturned_text</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="importer_plaintext_on_filtered_input" />
		<error to="fail" />
	</action>

	<!-- Runs the importer_plaintext sub-workflow on the
	     'tobeprocessed_content' directory written by
	     transformers_importer_plaintext_skip_extracted and stores the result
	     in ${workingDir}/importer_plaintext/output. -->
	<action name="importer_plaintext_on_filtered_input">
		<sub-workflow>
			<app-path>${wf:appPath()}/importer_plaintext</app-path>
			<propagate-configuration />
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/importer_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobeprocessed_content</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<property>
					<name>mapred_max_split_size</name>
					<value>${mapred_max_split_size}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/importer_plaintext/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="transformers_common_union_plaintext_merge_outputs" />
		<error to="fail" />
	</action>

    <!-- Runs the transformers_common_union sub-workflow with the
         'tobereturned_text' directory as input_a and the importer output as
         input_b; the result is written to ${output} using the DocumentText
         schema. -->
    <action name="transformers_common_union_plaintext_merge_outputs">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_common_union</app-path>
            <propagate-configuration />
            <configuration>
                <property>
                    <name>input_a</name>
                    <value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobereturned_text</value>
                </property>
                <property>
                    <name>input_b</name>
                    <value>${workingDir}/importer_plaintext/output</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${output}</value>
                </property>
                <property>
                    <name>schema</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="check_importer_plaintext_output_isempty" />
        <error to="fail" />
    </action>

    <!-- Runs EmptyDatastoreVerifierProcess against
         ${workingDir}/importer_plaintext/output. The captured 'isEmpty'
         entry is read by decision-is-importer_plaintext-output-empty. -->
    <action name="check_importer_plaintext_output_isempty">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.utils.EmptyDatastoreVerifierProcess</arg>
            <arg>-Iinput=${workingDir}/importer_plaintext/output</arg>
            <capture-output />
        </java>
        <ok to="decision-is-importer_plaintext-output-empty" />
        <error to="fail" />
    </action>

    <!-- When the importer output is not empty ('isEmpty' equals "false") the
         cache merging steps follow, starting with obtaining the lock; in
         every other case the workflow ends without touching the cache. -->
    <decision name="decision-is-importer_plaintext-output-empty">
        <switch>
            <case to="obtain-lock_for_merging">${wf:actionData('check_importer_plaintext_output_isempty')['isEmpty'] eq "false"}</case>
            <default to="end" />
        </switch>
    </decision>

    <!-- Runs LockManagingProcess in 'obtain' mode with
         ${default_cache_location} as the node id.
         NOTE(review): a failed obtain is also routed to
         release-lock-and-fail; confirm that releasing a lock which was not
         obtained is harmless. -->
    <action name="obtain-lock_for_merging">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=obtain</arg>
        </java>
        <ok to="get-new-cache-id_for_merging" />
        <error to="release-lock-and-fail" />
    </action>

    <!-- Runs CacheMetadataManagingProcess in 'generate_new_id' mode. The
         captured 'cache_id' entry names the directory the merged cache is
         written to. Any error releases the lock before failing. -->
    <action name="get-new-cache-id_for_merging">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
            <arg>-Pdefault_cache_location=${default_cache_location}</arg>
            <arg>-Pmode=generate_new_id</arg>
            <capture-output />
        </java>
        <ok to="transformers_common_union_plaintext_merge_cache" />
        <error to="release-lock-and-fail" />
    </action>

    <!-- Runs the transformers_common_union sub-workflow with the existing
         cache directory as input_a and the importer output as input_b; the
         result is written to the directory named by the newly generated
         cache id. On error the partially written directory is cleaned up.
         NOTE(review): the cache id used for input_a was read by
         get-existing-cache-id before the lock was obtained; confirm that
         the cache cannot be replaced by another run in between. -->
    <action name="transformers_common_union_plaintext_merge_cache">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_common_union</app-path>
            <propagate-configuration />
            <configuration>
                <property>
                    <name>input_a</name>
                    <value>${default_cache_location}/${wf:actionData('get-existing-cache-id')['cache_id']}</value>
                </property>
                <property>
                    <name>input_b</name>
                    <value>${workingDir}/importer_plaintext/output</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${default_cache_location}/${wf:actionData('get-new-cache-id_for_merging')['cache_id']}</value>
                </property>
                <property>
                    <name>schema</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="write-new-cache-id_for_merging" />
        <error to="fail-merge_cache-temp_files_cleanup" />
    </action>

	<!-- Deletes the directory named by the newly generated cache id. Both
	     the ok and the error transition lead to release-lock-and-fail. -->
	<action name="fail-merge_cache-temp_files_cleanup">
		<fs>
			<delete path="${nameNode}${default_cache_location}/${wf:actionData('get-new-cache-id_for_merging')['cache_id']}" />
		</fs>
		<ok to="release-lock-and-fail" />
		<error to="release-lock-and-fail" />
	</action>

	<!-- Runs CacheMetadataManagingProcess in 'write_id' mode with the id
	     produced by get-new-cache-id_for_merging. On error the new cache
	     directory is cleaned up before the lock is released. -->
	<action name="write-new-cache-id_for_merging">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
			<arg>-Pdefault_cache_location=${default_cache_location}</arg>
			<arg>-Pmode=write_id</arg>
			<arg>-Pid=${wf:actionData('get-new-cache-id_for_merging')['cache_id']}</arg>
			<capture-output />
		</java>
		<ok to="release-lock-and-end" />
		<error to="fail-merge_cache-temp_files_cleanup" />
	</action>

	<!-- end of cache based processing block, cache was provided as an input -->

    <!-- full input processing block, no cache was provided as an input -->
    <!-- Runs the importer_plaintext sub-workflow directly on ${input} and
         writes its result to ${output}; afterwards the lock is obtained in
         order to initialize the cache. -->
    <action name="importer_plaintext_on_full_input">
        <sub-workflow>
            <app-path>${wf:appPath()}/importer_plaintext</app-path>
            <propagate-configuration />
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/importer_plaintext/working_dir</value>
                </property>
                <property>
                    <name>input</name>
                    <value>${input}</value>
                </property>
                <property>
                    <name>content_connection_timeout</name>
                    <value>${content_connection_timeout}</value>
                </property>
                <property>
                    <name>content_read_timeout</name>
                    <value>${content_read_timeout}</value>
                </property>
                <property>
                    <name>mapred_max_split_size</name>
                    <value>${mapred_max_split_size}</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${output}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="obtain-lock_for_initializing" />
        <error to="fail" />
    </action>

    <!-- Runs LockManagingProcess in 'obtain' mode with
         ${default_cache_location} as the node id.
         NOTE(review): a failed obtain is also routed to
         release-lock-and-fail; confirm that releasing a lock which was not
         obtained is harmless. -->
    <action name="obtain-lock_for_initializing">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=obtain</arg>
        </java>
        <ok to="get-new-cache-id_for_initializing" />
        <error to="release-lock-and-fail" />
    </action>

    <!-- Runs CacheMetadataManagingProcess in 'generate_new_id' mode. The
         captured 'cache_id' entry names the directory the initial cache is
         copied to. Any error releases the lock before failing. -->
    <action name="get-new-cache-id_for_initializing">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
            <arg>-Pdefault_cache_location=${default_cache_location}</arg>
            <arg>-Pmode=generate_new_id</arg>
            <capture-output />
        </java>
        <ok to="initialize_plaintext_cache" />
        <error to="release-lock-and-fail" />
    </action>

    <!-- DistCp action whose first argument is ${output} and whose second
         argument is the cache directory named by the newly generated cache
         id. On error the target directory is cleaned up. -->
    <action name="initialize_plaintext_cache">
        <distcp xmlns="uri:oozie:distcp-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}${output}</arg>
            <arg>${nameNode}${default_cache_location}/${wf:actionData('get-new-cache-id_for_initializing')['cache_id']}</arg>
        </distcp>
        <ok to="write-new-cache-id_for_initializing" />
        <error to="fail-initialize_cache-temp_files_cleanup" />
    </action>

	<!-- Deletes the directory named by the newly generated cache id. Both
	     the ok and the error transition lead to release-lock-and-fail. -->
	<action name="fail-initialize_cache-temp_files_cleanup">
		<fs>
			<delete path="${nameNode}${default_cache_location}/${wf:actionData('get-new-cache-id_for_initializing')['cache_id']}" />
		</fs>
		<ok to="release-lock-and-fail" />
		<error to="release-lock-and-fail" />
	</action>

	<!-- Runs CacheMetadataManagingProcess in 'write_id' mode with the id
	     produced by get-new-cache-id_for_initializing. On error the new
	     cache directory is cleaned up before the lock is released. -->
	<action name="write-new-cache-id_for_initializing">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
			<arg>-Pdefault_cache_location=${default_cache_location}</arg>
			<arg>-Pmode=write_id</arg>
			<arg>-Pid=${wf:actionData('get-new-cache-id_for_initializing')['cache_id']}</arg>
			<capture-output />
		</java>
		<ok to="release-lock-and-end" />
		<error to="fail-initialize_cache-temp_files_cleanup" />
	</action>

	<!-- end of full input processing block, no cache was provided as an input -->

    <!-- lock releasing actions -->
    <!-- Runs LockManagingProcess in 'release' mode and then moves to the
         'fail' kill node, regardless of whether the release succeeded. -->
    <action name="release-lock-and-fail">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=release</arg>
        </java>
        <ok to="fail" />
        <error to="fail" />
    </action>

    <!-- Runs LockManagingProcess in 'release' mode; the workflow ends when
         the release succeeds and is killed when it fails. -->
    <action name="release-lock-and-end">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=release</arg>
        </java>
        <ok to="end" />
        <error to="fail" />
    </action>
    <!-- end of lock releasing actions -->

	<!-- Kill node reached from every error path; the message embeds the
	     error message of the last node that failed. The line break inside
	     the message is part of its text and is kept as it was. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<!-- Regular termination of the workflow. -->
	<end name="end" />
</workflow-app>