<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="test-mainworkflows_integration_primary_processing">
2

    
3
	<!-- Settings shared by the actions of this workflow. The job tracker, the
	     name node and the MapReduce queue are all supplied as workflow
	     parameters (jobTracker, nameNode, queueName). -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>
13

    
14
	<!-- Entry point: execution begins at the "generate-schema" action. -->
	<start to="generate-schema" />
15

    
16
	<!-- Runs AvroSchemaGenerator for the three class names passed as arguments.
	     The output is captured so that the "document_text_producer" action can
	     read it with wf:actionData('generate-schema'), indexed by class name. -->
	<action name="generate-schema">
		<java>
			<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class>
			<arg>eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath</arg>
			<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg>
			<arg>org.apache.avro.Schema.Type.NULL</arg>
			<capture-output />
		</java>
		<ok to="producer" />
		<error to="fail" />
	</action>
27
	
28
	<!-- Creates the input data stores of the test under ${workingDir}/producer.
	     Each -C argument lists a data store name, an Avro schema class and a
	     JSON resource; each -O argument maps a data store name to its output
	     directory. -->
	<action name="producer">
		<java>
			<!-- Start from an empty output directory: anything left over from a
			     previous run is removed before the directory is recreated. -->
			<prepare>
				<delete path="${nameNode}${workingDir}/producer" />
				<mkdir path="${nameNode}${workingDir}/producer" />
			</prepare>
			<!-- ProcessWrapper is the entry point; the first argument names the
			     class it is asked to run. -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<arg>-C{document_metadata,
				eu.dnetlib.iis.importer.schemas.DocumentMetadata,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/document_metadata.json}</arg>
			<arg>-C{document_to_project,
				eu.dnetlib.iis.importer.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{document_content_classpath,
				eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/document_content_classpath.json}</arg>
			<arg>-C{person,
				eu.dnetlib.iis.importer.schemas.Person,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/person.json}</arg>
			<arg>-C{project,
				eu.dnetlib.iis.importer.schemas.Project,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/project.json}</arg>
			<arg>-C{project_concept,
				eu.dnetlib.iis.importer.schemas.Concept,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/concept.json}</arg>
			<arg>-C{dataset,
				eu.dnetlib.iis.importer.schemas.DataSetReference,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/dataset.json}</arg>
			<arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/metadataextraction/extracted_document_metadata.json}</arg>
			<arg>-C{document_text_wos,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{citation,
				eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/citation_pmc.json}</arg>
			<arg>-C{document_text_classpath,
				eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/metadataextraction/document_text_classpath.json}</arg>
			<arg>-Odocument_metadata=${workingDir}/producer/document_metadata</arg>
			<arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg>
			<arg>-Odocument_content_classpath=${workingDir}/producer/document_content_classpath</arg>
			<arg>-Operson=${workingDir}/producer/person</arg>
			<arg>-Oproject=${workingDir}/producer/project</arg>
			<arg>-Oproject_concept=${workingDir}/producer/project_concept</arg>
			<arg>-Odataset=${workingDir}/producer/dataset</arg>
			<arg>-Oextracted_document_metadata=${workingDir}/producer/extracted_document_metadata</arg>
			<arg>-Odocument_text_wos=${workingDir}/producer/document_text_wos</arg>
			<arg>-Ocitation=${workingDir}/producer/citation</arg>
			<arg>-Odocument_text_classpath=${workingDir}/producer/document_text_classpath</arg>
		</java>
		<ok to="document_text_producer" />
		<error to="fail" />
	</action>
86

    
87
	<!-- Map-only MapReduce job (zero reduce tasks) that runs
	     DocumentClasspathToTextConverter over
	     ${workingDir}/producer/document_text_classpath and writes its output to
	     ${workingDir}/text-producer/document_text. The Avro schemas come from
	     the captured output of the "generate-schema" action. -->
	<action name="document_text_producer">
		<map-reduce>
			<!-- Start from an empty output directory. -->
			<prepare>
				<delete path="${nameNode}${workingDir}/text-producer" />
				<mkdir path="${nameNode}${workingDir}/text-producer" />
			</prepare>
			<configuration>
				<!-- Avro input/output formats and key/value classes. -->
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
				</property>
				<property>
					<name>mapreduce.outputformat.class</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.io.NullWritable</value>
				</property>
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<!-- Kept on a single line with no surrounding whitespace so that
				     every entry of the comma-separated list is a clean class name. -->
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>
				<!-- Use the new (org.apache.hadoop.mapreduce) API for mapper and reducer. -->
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.mainworkflows.converters.DocumentClasspathToTextConverter</value>
				</property>

				<!-- Avro schemas, read from the output captured by "generate-schema". -->
				<property>
					<name>avro.schema.input.key</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath']}</value>
				</property>
				<property>
					<name>avro.serialization.key.reader.schema</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
				</property>
				<property>
					<name>avro.serialization.key.writer.schema</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
				</property>
				<property>
					<name>avro.serialization.value.reader.schema</name>
					<value>${wf:actionData('generate-schema')['org.apache.avro.Schema.Type.NULL']}</value>
				</property>
				<property>
					<name>avro.serialization.value.writer.schema</name>
					<value>${wf:actionData('generate-schema')['org.apache.avro.Schema.Type.NULL']}</value>
				</property>
				<property>
					<name>avro.schema.output.key</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
				</property>

				<!-- Input and output locations. -->
				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/producer/document_text_classpath</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/text-producer/document_text</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="mainworkflows_primary_processing" />
		<error to="fail" />
	</action>
191

    
192
	<!-- Runs the workflow under test as a sub-workflow located in the
	     "mainworkflows_primary_processing" directory of this application. The
	     parent configuration is propagated, and the properties below switch
	     modules on or off and wire the inputs and outputs to directories under
	     ${workingDir}. -->
	<action name="mainworkflows_primary_processing">
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_primary_processing</app-path>
			<propagate-configuration />
			<configuration>
				<!-- Module switches. -->
				<property>
					<name>active_referenceextraction_project</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_dataset</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_researchinitiative</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_pdb</name>
					<value>false</value>
				</property>
				<property>
					<name>active_documentsclassification</name>
					<value>true</value>
				</property>
				<property>
					<name>active_citationmatching</name>
					<value>true</value>
				</property>
				<!-- The KDMs that are not covered by this test are disabled. -->
				<property>
					<name>active_documentssimilarity</name>
					<value>false</value>
				</property>
				<property>
					<name>active_statistics</name>
					<value>false</value>
				</property>

				<!-- Inputs: data stores written by the "producer" and
				     "document_text_producer" actions. -->
				<property>
					<name>input_document_metadata</name>
					<value>${workingDir}/producer/document_metadata</value>
				</property>
				<property>
					<name>input_document_to_project</name>
					<value>${workingDir}/producer/document_to_project</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/text-producer/document_text</value>
				</property>
				<property>
					<name>input_document_text_wos</name>
					<value>${workingDir}/producer/document_text_wos</value>
				</property>
				<property>
					<name>input_project</name>
					<value>${workingDir}/producer/project</value>
				</property>
				<property>
					<name>input_project_concept</name>
					<value>${workingDir}/producer/project_concept</value>
				</property>
				<property>
					<name>input_person</name>
					<value>${workingDir}/producer/person</value>
				</property>
				<property>
					<name>input_dataset</name>
					<value>${workingDir}/producer/dataset</value>
				</property>
				<property>
					<name>input_extracted_document_metadata</name>
					<value>${workingDir}/producer/extracted_document_metadata</value>
				</property>
				<property>
					<name>input_citation_pmc</name>
					<value>${workingDir}/producer/citation</value>
				</property>

				<!-- Outputs: directories under ${workingDir}/exported, later read
				     by the "consumer" action. -->
				<property>
					<name>output_document_to_project</name>
					<value>${workingDir}/exported/document_to_project</value>
				</property>
				<property>
					<name>output_document_to_project_concepts</name>
					<value>${workingDir}/exported/document_to_project_concepts</value>
				</property>
				<property>
					<name>output_document_to_dataset</name>
					<value>${workingDir}/exported/document_to_dataset</value>
				</property>
				<property>
					<name>output_document_to_research_initiatives</name>
					<value>${workingDir}/exported/document_to_research_initiatives</value>
				</property>
				<property>
					<name>output_document_to_pdb</name>
					<value>${workingDir}/exported/document_to_pdb</value>
				</property>
				<property>
					<name>output_document_to_document_classes</name>
					<value>${workingDir}/exported/document_to_document_classes</value>
				</property>
				<property>
					<name>output_citation</name>
					<value>${workingDir}/exported/citation</value>
				</property>
				<property>
					<name>output_document_similarity</name>
					<value>${workingDir}/exported/document_similarity</value>
				</property>
				<property>
					<name>output_document_statistics</name>
					<value>${workingDir}/exported/document_statistics</value>
				</property>
				<property>
					<name>output_author_statistics</name>
					<value>${workingDir}/exported/author_statistics</value>
				</property>
				<property>
					<name>output_project_statistics</name>
					<value>${workingDir}/exported/project_statistics</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="consumer" />
		<error to="fail" />
	</action>
319
	
320
	<!-- Runs TestingConsumer (through ProcessWrapper) over the directories
	     under ${workingDir}/exported. Each -C argument lists a data store
	     name, an Avro schema class and a JSON resource; the -I argument that
	     follows it maps the same name to the directory to read. -->
	<action name="consumer">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.TestingConsumer</arg>
			<!-- TODO: prepare proper JSON files for the generated data stores. -->
			<arg>-C{document_to_document_classes,
				eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_classes.json}</arg>
			<arg>-Idocument_to_document_classes=${workingDir}/exported/document_to_document_classes</arg>
			<arg>-C{document_to_dataset,
				eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_dataset.json}</arg>
			<arg>-Idocument_to_dataset=${workingDir}/exported/document_to_dataset</arg>
			<arg>-C{document_to_project,
				eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_project.json}</arg>
			<arg>-Idocument_to_project=${workingDir}/exported/document_to_project</arg>
			<arg>-C{document_to_project_concepts,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_project_concepts.json}</arg>
			<arg>-Idocument_to_project_concepts=${workingDir}/exported/document_to_project_concepts</arg>
			<arg>-C{document_to_research_initiatives,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_research_initiatives.json}</arg>
			<arg>-Idocument_to_research_initiatives=${workingDir}/exported/document_to_research_initiatives</arg>
			<arg>-C{citation,
				eu.dnetlib.iis.export.schemas.Citations,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/citations.json}</arg>
			<arg>-Icitation=${workingDir}/exported/citation</arg>
			<arg>-C{document_similarity,
				eu.dnetlib.iis.documentssimilarity.schemas.DocumentSimilarity,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_similarity=${workingDir}/exported/document_similarity</arg>
			<arg>-C{document_statistics,
				eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_statistics=${workingDir}/exported/document_statistics</arg>
			<arg>-C{author_statistics,
				eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Iauthor_statistics=${workingDir}/exported/author_statistics</arg>
			<arg>-C{project_statistics,
				eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Iproject_statistics=${workingDir}/exported/project_statistics</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>
369
	
370
	<!-- Target of every action's error transition: ends the workflow and
	     reports the error message of the last node that failed. -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
374
	<!-- Normal completion, reached from the "consumer" action on success. -->
	<end name="end" />
375
</workflow-app>
(2-2/2)