Project

General

Profile

1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_importer_plaintext_cached">
2
	
3
	<!-- Formal parameters of this workflow. The properties declared without a
	     value element (input, output, default_cache_location) have no default
	     in this file. -->
	<parameters>
		<!-- data locations -->
		<property>
			<name>input</name>
			<description>input document content directory</description>
		</property>
		<property>
			<name>output</name>
			<description>plaintext importer output directory</description>
		</property>
		<!-- timeouts, all defaulting to 60000 -->
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>streaming content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>streaming content read timeout</description>
		</property>
		<property>
			<name>zk_session_timeout</name>
			<value>60000</value>
			<description>zookeeper session timeout when handling locks</description>
		</property>
		<!-- cache root; also passed as the lock node_id by the lock actions of this workflow -->
		<property>
			<name>default_cache_location</name>
			<description>default cache location stored in HDFS</description>
		</property>
		<property>
			<name>mapred_max_split_size</name>
			<value>50000</value>
			<description>maximum input data split size, required by streaming version reading DocumentContentUrl to split input data into more chunks</description>
		</property>
	</parameters>
37

    
38
	<!-- Settings declared once for the whole workflow: job tracker, name node
	     and the job queue name. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
48

    
49
	<!-- entry point: the workflow begins with the check_input_isempty action -->
	<start to="check_input_isempty"/>
50
	
51
	<!-- Runs EmptyDatastoreVerifierProcess against ${input}. The output is
	     captured so that decision-is-input-empty can read its 'isEmpty' entry. -->
	<action name="check_input_isempty">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.utils.EmptyDatastoreVerifierProcess</arg>
			<arg>-Iinput=${input}</arg>
			<capture-output/>
		</java>
		<ok to="decision-is-input-empty"/>
		<error to="fail"/>
	</action>
61

    
62
	<!-- Routes on the 'isEmpty' value captured by check_input_isempty:
	     "false" continues with the cache lookup, any other value goes to
	     generate-empty-output. -->
	<decision name="decision-is-input-empty">
		<switch>
			<case to="get-existing-cache-id">${wf:actionData('check_input_isempty')['isEmpty'] eq "false"}</case>
			<default to="generate-empty-output"/>
		</switch>
	</decision>
69

    
70
	<!-- Reached when the input was not reported as non-empty. Runs the Producer
	     process with the empty.json resource and binds its 'plaintext' output
	     port to ${output}; then the workflow ends. -->
	<action name="generate-empty-output">
		<java>
			<prepare>
				<!-- note: this directory has to be aligned with the output of the skipped actions -->
				<delete path="${nameNode}${output}"/>
			</prepare>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- the text of the next arg, including its line breaks, is kept exactly as it was -->
			<arg>-C{plaintext,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
			eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Oplaintext=${output}</arg>
		</java>
		<ok to="end"/>
		<error to="fail"/>
	</action>
86

    
87
	<!-- Runs CacheMetadataManagingProcess in read_current_id mode for
	     ${default_cache_location}. The output is captured; later nodes read
	     its 'cache_id' entry. -->
	<action name="get-existing-cache-id">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
			<arg>-Pdefault_cache_location=${default_cache_location}</arg>
			<arg>-Pmode=read_current_id</arg>
			<capture-output/>
		</java>
		<ok to="decision-is-cache-empty"/>
		<error to="fail"/>
	</action>
98

    
99
	<!-- When the captured 'cache_id' equals "$UNDEFINED$" the full input is
	     processed; otherwise the cache based branch is taken. -->
	<decision name="decision-is-cache-empty">
		<switch>
			<case to="importer_plaintext_on_full_input">${wf:actionData('get-existing-cache-id')['cache_id'] eq "$UNDEFINED$"}</case>
			<default to="transformers_importer_plaintext_skip_extracted"/>
		</switch>
	</decision>
105
	
106
	<!-- beginning of cache based processing block, cache was provided as an input -->
107
	<!-- Sub-workflow fed with the input content (${input}) and the current
	     cache datastore. It is configured with two output locations under
	     ${workingDir}: tobeprocessed_content and tobereturned_text. -->
	<action name="transformers_importer_plaintext_skip_extracted">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_importer_plaintext_skip_extracted</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/working_dir</value>
				</property>
				<!-- inputs -->
				<property>
					<name>input_document_content</name>
					<value>${input}</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${default_cache_location}/${wf:actionData('get-existing-cache-id')['cache_id']}</value>
				</property>
				<!-- outputs -->
				<property>
					<name>output_document_content</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobeprocessed_content</value>
				</property>
				<property>
					<name>output_document_text</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobereturned_text</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="importer_plaintext_on_filtered_input"/>
		<error to="fail"/>
	</action>
137
	
138
	<!-- Runs the importer_plaintext sub-workflow on the tobeprocessed_content
	     directory and writes to ${workingDir}/importer_plaintext/output. -->
	<action name="importer_plaintext_on_filtered_input">
		<sub-workflow>
			<app-path>${wf:appPath()}/importer_plaintext</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/importer_plaintext/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobeprocessed_content</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/importer_plaintext/output</value>
				</property>
				<!-- all the other properties are automatically propagated -->
			</configuration>
		</sub-workflow>
		<ok to="transformers_common_union_plaintext_merge_outputs"/>
		<error to="fail"/>
	</action>
161

    
162
    <!-- Runs the transformers_common_union sub-workflow with the
         tobereturned_text directory as input_a and the importer output as
         input_b; the result goes to ${output} with the DocumentText schema. -->
    <action name="transformers_common_union_plaintext_merge_outputs">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_common_union</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>input_a</name>
                    <value>${workingDir}/transformers_importer_plaintext_skip_extracted/tobereturned_text</value>
                </property>
                <property>
                    <name>input_b</name>
                    <value>${workingDir}/importer_plaintext/output</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${output}</value>
                </property>
                <property>
                    <name>schema</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="check_importer_plaintext_output_isempty"/>
        <error to="fail"/>
    </action>
188
    
189
    <!-- Runs EmptyDatastoreVerifierProcess against the importer output
         directory. The output is captured so that the next decision node can
         read its 'isEmpty' entry. -->
    <action name="check_importer_plaintext_output_isempty">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.utils.EmptyDatastoreVerifierProcess</arg>
            <arg>-Iinput=${workingDir}/importer_plaintext/output</arg>
            <capture-output/>
        </java>
        <ok to="decision-is-importer_plaintext-output-empty"/>
        <error to="fail"/>
    </action>
199
    
200
    <!-- The cache merging steps run only when the importer output was
         reported with 'isEmpty' equal to "false"; otherwise the workflow
         ends here. -->
    <decision name="decision-is-importer_plaintext-output-empty">
        <switch>
            <case to="obtain-lock_for_merging">${wf:actionData('check_importer_plaintext_output_isempty')['isEmpty'] eq "false"}</case>
            <default to="end"/>
        </switch>
    </decision>
207
    
208
    <!-- Runs LockManagingProcess in obtain mode, using ${default_cache_location}
         as the node_id.
         NOTE(review): a failure of this action is routed to
         release-lock-and-fail, so a release is attempted even though the
         obtain step did not succeed; confirm this is intended. -->
    <action name="obtain-lock_for_merging">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=obtain</arg>
        </java>
        <ok to="get-new-cache-id_for_merging"/>
        <error to="release-lock-and-fail"/>
    </action>
219
    
220
    <!-- Runs CacheMetadataManagingProcess in generate_new_id mode. The output
         is captured; its 'cache_id' entry names the directory the merged
         cache is written to. -->
    <action name="get-new-cache-id_for_merging">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
            <arg>-Pdefault_cache_location=${default_cache_location}</arg>
            <arg>-Pmode=generate_new_id</arg>
            <capture-output/>
        </java>
        <ok to="transformers_common_union_plaintext_merge_cache"/>
        <error to="release-lock-and-fail"/>
    </action>
231
    
232
    <!-- Runs the transformers_common_union sub-workflow with the existing
         cache directory as input_a and the importer output as input_b; the
         result is written under the newly generated cache id.
         NOTE(review): input_a uses the id captured by get-existing-cache-id,
         which runs before obtain-lock_for_merging; confirm the current cache
         id cannot change in between. -->
    <action name="transformers_common_union_plaintext_merge_cache">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_common_union</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>input_a</name>
                    <value>${default_cache_location}/${wf:actionData('get-existing-cache-id')['cache_id']}</value>
                </property>
                <property>
                    <name>input_b</name>
                    <value>${workingDir}/importer_plaintext/output</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${default_cache_location}/${wf:actionData('get-new-cache-id_for_merging')['cache_id']}</value>
                </property>
                <property>
                    <name>schema</name>
                    <value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="write-new-cache-id_for_merging"/>
        <error to="fail-merge_cache-temp_files_cleanup"/>
    </action>
258
    
259
	<!-- Failure path of the cache merge: deletes the directory of the newly
	     generated cache id, then continues to release-lock-and-fail whether
	     or not the delete succeeded. -->
	<action name="fail-merge_cache-temp_files_cleanup">
		<fs>
			<delete path="${nameNode}${default_cache_location}/${wf:actionData('get-new-cache-id_for_merging')['cache_id']}"/>
		</fs>
		<ok to="release-lock-and-fail"/>
		<error to="release-lock-and-fail"/>
	</action>
266
	
267
	<!-- Runs CacheMetadataManagingProcess in write_id mode with the id captured
	     by get-new-cache-id_for_merging. On failure the new cache directory
	     is cleaned up before the lock is released. -->
	<action name="write-new-cache-id_for_merging">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
			<arg>-Pdefault_cache_location=${default_cache_location}</arg>
			<arg>-Pmode=write_id</arg>
			<arg>-Pid=${wf:actionData('get-new-cache-id_for_merging')['cache_id']}</arg>
			<capture-output/>
		</java>
		<ok to="release-lock-and-end"/>
		<error to="fail-merge_cache-temp_files_cleanup"/>
	</action>
279
	
280
	<!-- end of cache based processing block, cache was provided as an input -->
281
    
282
    <!-- full input processing block, no cache was provided as an input -->
283
    <!-- Runs the importer_plaintext sub-workflow with the propagated
         configuration; only workingDir is overridden here. -->
    <action name="importer_plaintext_on_full_input">
        <sub-workflow>
            <app-path>${wf:appPath()}/importer_plaintext</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/importer_plaintext/working_dir</value>
                </property>
                <!-- all the other properties are automatically propagated -->
            </configuration>
        </sub-workflow>
        <ok to="obtain-lock_for_initializing"/>
        <error to="fail"/>
    </action>
298
    
299
    <!-- Runs LockManagingProcess in obtain mode, using ${default_cache_location}
         as the node_id, before the cache is initialized.
         NOTE(review): a failure of this action is routed to
         release-lock-and-fail, so a release is attempted even though the
         obtain step did not succeed; confirm this is intended. -->
    <action name="obtain-lock_for_initializing">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=obtain</arg>
        </java>
        <ok to="get-new-cache-id_for_initializing"/>
        <error to="release-lock-and-fail"/>
    </action>
310
    
311
    <!-- Runs CacheMetadataManagingProcess in generate_new_id mode. The output
         is captured; its 'cache_id' entry names the directory the initial
         cache is copied to. -->
    <action name="get-new-cache-id_for_initializing">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
            <arg>-Pdefault_cache_location=${default_cache_location}</arg>
            <arg>-Pmode=generate_new_id</arg>
            <capture-output/>
        </java>
        <ok to="initialize_plaintext_cache"/>
        <error to="release-lock-and-fail"/>
    </action>
322
    
323
    <!-- DistCp action: the first arg is the source (${output}), the second is
         the directory of the newly generated cache id under
         ${default_cache_location}. -->
    <action name="initialize_plaintext_cache">
        <distcp xmlns="uri:oozie:distcp-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <arg>${nameNode}${output}</arg>
            <arg>${nameNode}${default_cache_location}/${wf:actionData('get-new-cache-id_for_initializing')['cache_id']}</arg>
        </distcp>
        <ok to="write-new-cache-id_for_initializing"/>
        <error to="fail-initialize_cache-temp_files_cleanup"/>
    </action>
333
	
334
	<!-- Failure path of the cache initialization: deletes the directory of the
	     newly generated cache id, then continues to release-lock-and-fail
	     whether or not the delete succeeded. -->
	<action name="fail-initialize_cache-temp_files_cleanup">
		<fs>
			<delete path="${nameNode}${default_cache_location}/${wf:actionData('get-new-cache-id_for_initializing')['cache_id']}"/>
		</fs>
		<ok to="release-lock-and-fail"/>
		<error to="release-lock-and-fail"/>
	</action>
341
	
342
	<!-- Runs CacheMetadataManagingProcess in write_id mode with the id captured
	     by get-new-cache-id_for_initializing. On failure the new cache
	     directory is cleaned up before the lock is released. -->
	<action name="write-new-cache-id_for_initializing">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.common.cache.CacheMetadataManagingProcess</arg>
			<arg>-Pdefault_cache_location=${default_cache_location}</arg>
			<arg>-Pmode=write_id</arg>
			<arg>-Pid=${wf:actionData('get-new-cache-id_for_initializing')['cache_id']}</arg>
			<capture-output/>
		</java>
		<ok to="release-lock-and-end"/>
		<error to="fail-initialize_cache-temp_files_cleanup"/>
	</action>
354
	
355
    <!-- end of full input processing block, no cache was provided as an input -->
356
    
357
    <!-- lock releasing actions -->
358
    <!-- Runs LockManagingProcess in release mode and then transitions to the
         'fail' kill node, regardless of whether the release succeeded. -->
    <action name="release-lock-and-fail">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=release</arg>
        </java>
        <ok to="fail"/>
        <error to="fail"/>
    </action>
369
    
370
    <!-- Runs LockManagingProcess in release mode on the success path: the
         workflow ends normally when the release succeeds and is killed when
         it does not. -->
    <action name="release-lock-and-end">
        <java>
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <arg>eu.dnetlib.iis.common.lock.LockManagingProcess</arg>
            <arg>-Pzk_session_timeout=${zk_session_timeout}</arg>
            <arg>-Pnode_id=${default_cache_location}</arg>
            <arg>-Pmode=release</arg>
        </java>
        <ok to="end"/>
        <error to="fail"/>
    </action>
381
    <!-- end of lock releasing actions -->
382
    
383
	<!-- Terminal nodes. The kill message embeds the error message of the last
	     node that failed; its text, including the line break, is kept exactly
	     as it was. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end"/>
388
</workflow-app>
(2-2/2)