<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.3" name="test-mainworkflows_integration_primary_processing">

	<!-- Entry point: execution begins at the "producer" action. -->
	<start to="producer" />

	<!--
		Java action that runs eu.dnetlib.iis.core.java.ProcessWrapper with
		jsonworkflownodes.Producer as its first argument.
		Each -C{...} argument lists three comma-separated items; they look like
		(port name, record schema class, classpath JSON resource). TODO confirm
		against the Producer implementation.
		Each -O<port>=<path> argument points at a directory below
		${workingDir}/producer.
		On success control passes to "document_text_producer"; on error to "fail".
	-->
	<action name="producer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/producer" />
				<mkdir path="${nameNode}${workingDir}/producer" />
			</prepare>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<!-- This is simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- Datastore definitions. The line breaks inside each argument are part of its text. -->
			<arg>-C{document_metadata,
				eu.dnetlib.iis.importer.schemas.DocumentMetadata,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/document_metadata.json}</arg>
			<arg>-C{document_to_project,
				eu.dnetlib.iis.importer.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{document_content_classpath,
				eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/document_content_classpath.json}</arg>
			<arg>-C{person,
				eu.dnetlib.iis.importer.schemas.Person,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/person.json}</arg>
			<arg>-C{project,
				eu.dnetlib.iis.importer.schemas.Project,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/project.json}</arg>
			<arg>-C{project_concept,
				eu.dnetlib.iis.importer.schemas.Concept,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{dataset,
				eu.dnetlib.iis.importer.schemas.DataSetReference,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/dataset.json}</arg>
			<arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/metadataextraction/extracted_document_metadata.json}</arg>
			<arg>-C{document_text_wos,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{dedup_mapping,
				eu.dnetlib.iis.common.schemas.IdentifierMapping,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{citation,
				eu.dnetlib.iis.citationmatching.schemas.Citation,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{document_text_classpath,
				eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/metadataextraction/document_text_classpath.json}</arg>
			<arg>-SworkingDir=${workingDir}/producer/working_dir</arg>
			<!-- One -O argument per datastore defined above. -->
			<arg>-Odocument_metadata=${workingDir}/producer/document_metadata</arg>
			<arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg>
			<arg>-Odocument_content_classpath=${workingDir}/producer/document_content_classpath</arg>
			<arg>-Operson=${workingDir}/producer/person</arg>
			<arg>-Oproject=${workingDir}/producer/project</arg>
			<arg>-Oproject_concept=${workingDir}/producer/project_concept</arg>
			<arg>-Odataset=${workingDir}/producer/dataset</arg>
			<arg>-Oextracted_document_metadata=${workingDir}/producer/extracted_document_metadata</arg>
			<arg>-Odocument_text_wos=${workingDir}/producer/document_text_wos</arg>
			<arg>-Odedup_mapping=${workingDir}/producer/dedup_mapping</arg>
			<arg>-Ocitation=${workingDir}/producer/citation</arg>
			<arg>-Odocument_text_classpath=${workingDir}/producer/document_text_classpath</arg>
		</java>
		<ok to="document_text_producer"/>
		<error to="fail"/>
	</action>

	<!--
		Map-reduce action with zero reduce tasks (mapred.reduce.tasks = 0).
		The configured mapper is DocumentClasspathToTextConverter; it reads
		${workingDir}/producer/document_text_classpath and writes
		${workingDir}/text-producer/document_text.
		On success control passes to "mainworkflows_primary_processing"; on
		error to "fail".
		NOTE(review): unlike the java actions in this workflow, this action does
		not set mapred.job.queue.name. Confirm whether that is intentional.
	-->
	<action name="document_text_producer">
		<map-reduce>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<!-- Start from an empty output directory on every run. -->
			<prepare>
				<delete path="${nameNode}${workingDir}/text-producer" />
				<mkdir path="${nameNode}${workingDir}/text-producer" />
			</prepare>
			<configuration>
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyInputFormat</value>
				</property>
				<property>
					<name>mapreduce.outputformat.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyOutputFormat</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.io.NullWritable</value>
				</property>
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<!--
					The value is kept on a single line with no whitespace before the
					closing tag. Element text is passed on as the property value, and
					whether it gets trimmed depends on the consumer.
				-->
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>eu.dnetlib.iis.core.javamapreduce.hack.KeyComparator</value>
				</property>
				<!-- The property name is kept free of surrounding whitespace for the same reason. -->
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.mainworkflows.converters.DocumentClasspathToTextConverter</value>
				</property>
				<!-- Project-specific eu.dnetlib.iis.avro.* settings naming the record classes. -->
				<property>
					<name>eu.dnetlib.iis.avro.input.class</name>
					<value>eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.map.output.key.class</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.map.output.value.class</name>
					<value>org.apache.avro.Schema.Type.NULL</value>
				</property>
				<property>
					<name>eu.dnetlib.iis.avro.output.class</name>
					<value>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</value>
				</property>
				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/producer/document_text_classpath</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/text-producer/document_text</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="mainworkflows_primary_processing" />
		<error to="fail" />
	</action>

	<!--
		Sub-workflow action that runs the application located at
		${wf:appPath()}/mainworkflows_primary_processing, with the parent
		configuration propagated.
		The properties below set the active_* switches, point the input_*
		parameters at directories under ${workingDir}/producer and
		${workingDir}/text-producer, and point the output_* parameters at
		directories under ${workingDir}/exported.
		On success control passes to "consumer"; on error to "fail".
	-->
	<action name="mainworkflows_primary_processing">
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_primary_processing</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- Module switches -->
				<property>
					<name>active_referenceextraction_project</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_dataset</name>
					<value>true</value>
				</property>
				<!--
					disabling not tested KDMs
					NOTE(review): the flag directly below is set to true although this
					comment talks about disabling. Of the switches that follow, only
					active_documentssimilarity and active_statistics are false.
					Confirm the intended values.
				-->
				<property>
					<name>active_referenceextraction_researchinitiative</name>
					<value>true</value>
				</property>
				<property>
					<name>active_documentsclassification</name>
					<value>true</value>
				</property>
				<property>
					<name>active_documentssimilarity</name>
					<value>false</value>
				</property>
				<property>
					<name>active_citationmatching</name>
					<value>true</value>
				</property>
				<property>
					<name>active_statistics</name>
					<value>false</value>
				</property>
				<!-- Inputs -->
				<property>
					<name>input_document_metadata</name>
					<value>${workingDir}/producer/document_metadata</value>
				</property>
				<property>
					<name>input_document_to_project</name>
					<value>${workingDir}/producer/document_to_project</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/text-producer/document_text</value>
				</property>
				<property>
					<name>input_document_text_wos</name>
					<value>${workingDir}/producer/document_text_wos</value>
				</property>
				<property>
					<name>input_project</name>
					<value>${workingDir}/producer/project</value>
				</property>
				<property>
					<name>input_project_concept</name>
					<value>${workingDir}/producer/project_concept</value>
				</property>
				<property>
					<name>input_person</name>
					<value>${workingDir}/producer/person</value>
				</property>
				<property>
					<name>input_dataset</name>
					<value>${workingDir}/producer/dataset</value>
				</property>
				<property>
					<name>input_extracted_document_metadata</name>
					<value>${workingDir}/producer/extracted_document_metadata</value>
				</property>
				<property>
					<name>input_citation_pmc</name>
					<value>${workingDir}/producer/citation</value>
				</property>
				<property>
					<name>input_deduplication_mapping</name>
					<value>${workingDir}/producer/dedup_mapping</value>
				</property>
				<!-- Outputs -->
				<property>
					<name>output_document_to_project</name>
					<value>${workingDir}/exported/document_to_project</value>
				</property>
				<property>
					<name>output_document_to_project_concepts</name>
					<value>${workingDir}/exported/document_to_project_concepts</value>
				</property>
				<property>
					<name>output_document_to_dataset</name>
					<value>${workingDir}/exported/document_to_dataset</value>
				</property>
				<property>
					<name>output_document_to_research_initiatives</name>
					<value>${workingDir}/exported/document_to_research_initiatives</value>
				</property>
				<property>
					<name>output_document_to_document_classes</name>
					<value>${workingDir}/exported/document_to_document_classes</value>
				</property>
				<property>
					<name>output_citation</name>
					<value>${workingDir}/exported/citation</value>
				</property>
				<property>
					<name>output_document_similarity</name>
					<value>${workingDir}/exported/document_similarity</value>
				</property>
				<property>
					<name>output_document_statistics</name>
					<value>${workingDir}/exported/document_statistics</value>
				</property>
				<property>
					<name>output_author_statistics</name>
					<value>${workingDir}/exported/author_statistics</value>
				</property>
				<property>
					<name>output_project_statistics</name>
					<value>${workingDir}/exported/project_statistics</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="consumer"/>
		<error to="fail" />
	</action>

	<!--
		Java action that runs eu.dnetlib.iis.core.java.ProcessWrapper with
		jsonworkflownodes.TestingConsumer as its first argument.
		Each -C{...} argument is followed by a matching -I<port>=<path> argument
		that points at a directory below ${workingDir}/exported. The -C items look
		like (port name, record schema class, classpath JSON resource), presumably
		the expected content for that port. TODO confirm against TestingConsumer.
		On success control passes to "end"; on error to "fail".
	-->
	<action name="consumer">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<configuration>
				<property>
					<name>mapred.job.queue.name</name>
					<value>${queueName}</value>
				</property>
			</configuration>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.TestingConsumer</arg>
			<arg>-SworkingDir=${workingDir}/consumer/working_dir</arg>
			<!-- TODO prepare proper json generated datastores -->
			<!-- The line breaks inside each -C argument are part of its text. -->
			<arg>-C{document_to_project,
				eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_project.json}</arg>
			<arg>-Idocument_to_project=${workingDir}/exported/document_to_project</arg>
			<arg>-C{document_to_document_classes,
				eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_classes.json}</arg>
			<arg>-Idocument_to_document_classes=${workingDir}/exported/document_to_document_classes</arg>
			<arg>-C{document_to_dataset,
				eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_to_dataset=${workingDir}/exported/document_to_dataset</arg>
			<arg>-C{document_to_research_initiatives,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_to_research_initiatives=${workingDir}/exported/document_to_research_initiatives</arg>
			<arg>-C{citation,
				eu.dnetlib.iis.export.schemas.Citations,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/citations.json}</arg>
			<arg>-Icitation=${workingDir}/exported/citation</arg>
			<arg>-C{document_similarity,
				eu.dnetlib.iis.documentssimilarity.schemas.DocumentSimilarity,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_similarity=${workingDir}/exported/document_similarity</arg>
			<arg>-C{document_statistics,
				eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_statistics=${workingDir}/exported/document_statistics</arg>
			<arg>-C{author_statistics,
				eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Iauthor_statistics=${workingDir}/exported/author_statistics</arg>
			<arg>-C{project_statistics,
				eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Iproject_statistics=${workingDir}/exported/project_statistics</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>

	<!--
		Terminal nodes. "fail" kills the workflow and reports the error message of
		the last node that failed; "end" marks successful completion.
	-->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end" />
</workflow-app>