<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="mainworkflows_metadataextraction_cached_by_checksum">
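	<!--
		Pipeline: preprocessing, then mainworkflows_metadataextraction_cached,
		then a fork into postprocessing-meta, postprocessing-text and
		postprocessing-fault, joined before the end node. Every action's error
		transition leads to the "fail" kill node.
	-->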
	<!--
		Formal workflow parameters.
		"input" and "output_root" declare no <value>, so they have no default
		and have to be provided when the job is submitted. Every other
		parameter falls back to the default given in its <value>.
	-->
	<parameters>
		<property>
			<name>input</name>
			<description>input document content directory</description>
		</property>
		<property>
			<name>output_root</name>
			<description>metadata extraction output directory</description>
		</property>
		<!-- NOTE(review): $UNDEFINED$ looks like a placeholder meaning "nothing excluded"; confirm with the consuming sub-workflow. -->
		<property>
			<name>excluded_ids</name>
			<value>$UNDEFINED$</value>
			<description>list of content identifiers excluded from metadataextraction processing</description>
		</property>
		<property>
			<name>max_file_size_mb</name>
			<value>500</value>
			<description>maximum allowed file size in Megabytes</description>
		</property>
		<!-- NOTE(review): the three timeouts below carry no unit in this file; 60000 is presumably milliseconds, to be confirmed. -->
		<property>
			<name>content_connection_timeout</name>
			<value>60000</value>
			<description>streaming content connection timeout</description>
		</property>
		<property>
			<name>content_read_timeout</name>
			<value>60000</value>
			<description>streaming content read timeout</description>
		</property>
		<property>
			<name>zk_session_timeout</name>
			<value>60000</value>
			<description>zookeeper session timeout when handling locks</description>
		</property>
		<property>
			<name>default_cache_location</name>
			<value>/cache/metadataextraction</value>
			<description>default cache location stored in HDFS</description>
		</property>
		<property>
			<name>mapred_max_split_size</name>
			<value>50000</value>
			<description>maximum input data split size, required by streaming version reading DocumentContentUrl to split input data into more chunks</description>
		</property>
		<!-- The three output_name_* values are used as subdirectory names under both the intermediate and the final output roots. -->
		<property>
			<name>output_name_meta</name>
			<value>meta</value>
			<description>metadata output subdirectory name</description>
		</property>
		<property>
			<name>output_name_plaintext</name>
			<value>plaintext</value>
			<description>plaintext output subdirectory name</description>
		</property>
		<property>
			<name>output_name_fault</name>
			<value>fault</value>
			<description>fault output subdirectory name</description>
		</property>
	</parameters>

	<!--
		Settings shared by the workflow's actions: job tracker, name node and
		the MapReduce queue. ${jobTracker}, ${nameNode} and ${queueName} are
		not declared in this file's parameters section, so they are expected
		to come from the job configuration.
	-->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>

	<!-- Entry point: execution begins at the "preprocessing" action. -->
	<start to="preprocessing" />

	<!--
		Step 1: runs the sub-workflow deployed under
		transformers_metadataextraction_checksum_preprocessing on ${input}.
		It writes to ${workingDir}/preprocessing/output, which the next action
		reads as its input.
	-->
	<action name="preprocessing">
		<sub-workflow>
			<app-path>${wf:appPath()}/transformers_metadataextraction_checksum_preprocessing</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/preprocessing/working_dir</value>
				</property>
				<property>
					<name>input</name>
					<value>${input}</value>
				</property>
				<property>
					<name>output</name>
					<value>${workingDir}/preprocessing/output</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="mainworkflows_metadataextraction_cached"/>
		<error to="fail" />
	</action>

	<!--
		Step 2: runs the mainworkflows_metadataextraction_cached sub-workflow on
		the preprocessing output. Its output_root is an intermediate location
		under ${workingDir}; the postprocessing actions write the final results
		to ${output_root}. The remaining properties forward this workflow's
		parameters under the same names.
	-->
	<action name="mainworkflows_metadataextraction_cached">
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_metadataextraction_cached</app-path>
			<propagate-configuration/>
			<configuration>
				<property>
					<name>workingDir</name>
					<value>${workingDir}/mainworkflows_metadataextraction_cached/working_dir</value>
				</property>
				<property>
					<!-- checksum identified input: same path as the "output" of the preprocessing action -->
					<name>input</name>
					<value>${workingDir}/preprocessing/output</value>
				</property>
				<property>
					<name>output_root</name>
					<value>${workingDir}/mainworkflows_metadataextraction_cached/out</value>
				</property>
				<property>
					<name>excluded_ids</name>
					<value>${excluded_ids}</value>
				</property>
				<property>
					<name>max_file_size_mb</name>
					<value>${max_file_size_mb}</value>
				</property>
				<property>
					<name>content_connection_timeout</name>
					<value>${content_connection_timeout}</value>
				</property>
				<property>
					<name>content_read_timeout</name>
					<value>${content_read_timeout}</value>
				</property>
				<property>
					<name>zk_session_timeout</name>
					<value>${zk_session_timeout}</value>
				</property>
				<property>
					<name>default_cache_location</name>
					<value>${default_cache_location}</value>
				</property>
				<property>
					<name>mapred_max_split_size</name>
					<value>${mapred_max_split_size}</value>
				</property>
				<property>
					<name>output_name_meta</name>
					<value>${output_name_meta}</value>
				</property>
				<property>
					<name>output_name_plaintext</name>
					<value>${output_name_plaintext}</value>
				</property>
				<property>
					<name>output_name_fault</name>
					<value>${output_name_fault}</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="postprocessing-forking"/>
		<error to="fail" />
	</action>

    <!-- Step 3: starts the three postprocessing paths (meta, text, fault) concurrently. -->
    <fork name="postprocessing-forking">
        <path start="postprocessing-meta"/>
        <path start="postprocessing-text"/>
        <path start="postprocessing-fault"/>
    </fork>

    <!--
        Fork path 1: runs the
        transformers_metadataextraction_checksum_postprocessing_meta sub-workflow.
        It receives the original ${input} and the intermediate metadata produced
        by mainworkflows_metadataextraction_cached, and writes the final result
        to ${output_root}/${output_name_meta}.
    -->
    <action name="postprocessing-meta">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_metadataextraction_checksum_postprocessing_meta</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/postprocessing_meta/working_dir</value>
                </property>
                <property>
                    <name>input_document_content_url</name>
                    <value>${input}</value>
                </property>
                <property>
                    <name>input_extracted_document_metadata</name>
                    <value>${workingDir}/mainworkflows_metadataextraction_cached/out/${output_name_meta}</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${output_root}/${output_name_meta}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="postprocessing-joining"/>
        <error to="fail" />
    </action>

    <!--
        Fork path 2: runs the
        transformers_metadataextraction_checksum_postprocessing_text sub-workflow.
        It receives the original ${input} and the intermediate plaintext produced
        by mainworkflows_metadataextraction_cached, and writes the final result
        to ${output_root}/${output_name_plaintext}.
    -->
    <action name="postprocessing-text">
        <sub-workflow>
            <app-path>${wf:appPath()}/transformers_metadataextraction_checksum_postprocessing_text</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>workingDir</name>
                    <value>${workingDir}/postprocessing_text/working_dir</value>
                </property>
                <property>
                    <name>input_document_content_url</name>
                    <value>${input}</value>
                </property>
                <property>
                    <name>input_document_text</name>
                    <value>${workingDir}/mainworkflows_metadataextraction_cached/out/${output_name_plaintext}</value>
                </property>
                <property>
                    <name>output</name>
                    <value>${output_root}/${output_name_plaintext}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="postprocessing-joining"/>
        <error to="fail" />
    </action>

    <!--
        Fork path 3: copies the intermediate fault output to
        ${output_root}/${output_name_fault} with DistCp. The prepare step
        deletes the destination path first. job-tracker and name-node are
        given explicitly on this action in addition to the global section.
    -->
    <action name="postprocessing-fault">
        <distcp xmlns="uri:oozie:distcp-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <prepare>
                <delete path="${nameNode}${output_root}/${output_name_fault}" />
            </prepare>
            <!-- first arg: source, second arg: destination -->
            <arg>${nameNode}${workingDir}/mainworkflows_metadataextraction_cached/out/${output_name_fault}</arg>
            <arg>${nameNode}${output_root}/${output_name_fault}</arg>
        </distcp>
        <ok to="postprocessing-joining"/>
        <error to="fail"/>
    </action>

    <!-- Target of the "ok" transition of all three postprocessing actions; continues to the end node. -->
    <join name="postprocessing-joining" to="end"/>

	<!--
		Failure sink: the error transition of every action in this workflow
		points here. The message embeds the error message of the last node
		that failed.
	-->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<!-- Terminal node, reached from the postprocessing join. -->
	<end name="end" />
</workflow-app>