<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="test-mainworkflows_integration_primary_processing">
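	<!-- Node chain: generate-schema, producer, document_text_producer,
	     mainworkflows_primary_processing (sub-workflow), consumer. Any node error goes to "fail". -->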
	<!-- Cluster endpoints and scheduler queue declared once for the workflow;
	     all three values come from job parameters. -->
	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
		<configuration>
			<property>
				<name>mapred.job.queue.name</name>
				<value>${queueName}</value>
			</property>
		</configuration>
	</global>

	<!-- Entry point: the first node to run is the schema generator. -->
	<start to="generate-schema" />

	<!-- Runs AvroSchemaGenerator with three type names as arguments. The output is captured so
	     that document_text_producer can look the schemas up with wf:actionData('generate-schema'),
	     keyed by those same type names. -->
	<action name="generate-schema">
		<java>
			<main-class>eu.dnetlib.iis.core.javamapreduce.hack.AvroSchemaGenerator</main-class>
			<arg>eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath</arg>
			<arg>eu.dnetlib.iis.metadataextraction.schemas.DocumentText</arg>
			<arg>org.apache.avro.Schema.Type.NULL</arg>
			<capture-output />
		</java>
		<ok to="producer" />
		<error to="fail" />
	</action>

	<!-- Test data producer. Each -C{name, schema class, JSON classpath resource} argument is
	     paired with an -O<name>=<path> argument further down that places the corresponding
	     datastore under ${workingDir}/producer. Several entries point at empty.json. -->
	<action name="producer">
		<java>
			<!-- The data generated by this node is deleted in this section -->
			<prepare>
				<delete path="${nameNode}${workingDir}/producer" />
				<mkdir path="${nameNode}${workingDir}/producer" />
			</prepare>
			<!-- This is a simple wrapper for the Java code -->
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.Producer</arg>
			<!-- Datastore declarations -->
			<arg>-C{document_metadata,
				eu.dnetlib.iis.importer.schemas.DocumentMetadata,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/document_metadata.json}</arg>
			<arg>-C{document_to_project,
				eu.dnetlib.iis.importer.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{document_content_classpath,
				eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/document_content_classpath.json}</arg>
			<arg>-C{person,
				eu.dnetlib.iis.importer.schemas.Person,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/person.json}</arg>
			<arg>-C{project,
				eu.dnetlib.iis.importer.schemas.Project,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/project.json}</arg>
			<arg>-C{project_concept,
				eu.dnetlib.iis.importer.schemas.Concept,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/concept.json}</arg>
			<arg>-C{dataset,
				eu.dnetlib.iis.importer.schemas.DataSetReference,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/dataset.json}</arg>
			<arg>-C{extracted_document_metadata,
				eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/metadataextraction/extracted_document_metadata.json}</arg>
			<arg>-C{document_text_wos,
				eu.dnetlib.iis.metadataextraction.schemas.DocumentText,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-C{citation,
				eu.dnetlib.iis.ingest.pmc.citations.schemas.Citation,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/meta/citation_pmc.json}</arg>
			<arg>-C{document_text_classpath,
				eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/input/metadataextraction/document_text_classpath.json}</arg>
			<!-- Output locations, one per datastore declared above -->
			<arg>-Odocument_metadata=${workingDir}/producer/document_metadata</arg>
			<arg>-Odocument_to_project=${workingDir}/producer/document_to_project</arg>
			<arg>-Odocument_content_classpath=${workingDir}/producer/document_content_classpath</arg>
			<arg>-Operson=${workingDir}/producer/person</arg>
			<arg>-Oproject=${workingDir}/producer/project</arg>
			<arg>-Oproject_concept=${workingDir}/producer/project_concept</arg>
			<arg>-Odataset=${workingDir}/producer/dataset</arg>
			<arg>-Oextracted_document_metadata=${workingDir}/producer/extracted_document_metadata</arg>
			<arg>-Odocument_text_wos=${workingDir}/producer/document_text_wos</arg>
			<arg>-Ocitation=${workingDir}/producer/citation</arg>
			<arg>-Odocument_text_classpath=${workingDir}/producer/document_text_classpath</arg>
		</java>
		<ok to="document_text_producer"/>
		<error to="fail"/>
	</action>

	<!-- Map-only job (mapred.reduce.tasks is 0) that reads producer/document_text_classpath,
	     runs the DocumentClasspathToTextConverter mapper and writes text-producer/document_text.
	     The Avro schemas are taken from the captured output of the generate-schema action. -->
	<action name="document_text_producer">
		<map-reduce>
			<prepare>
				<delete path="${nameNode}${workingDir}/text-producer" />
				<mkdir path="${nameNode}${workingDir}/text-producer" />
			</prepare>
			<configuration>
				<!-- Input/output formats and key/value classes -->
				<property>
					<name>mapreduce.inputformat.class</name>
					<value>org.apache.avro.mapreduce.AvroKeyInputFormat</value>
				</property>
				<property>
					<name>mapreduce.outputformat.class</name>
					<value>org.apache.avro.mapreduce.AvroKeyOutputFormat</value>
				</property>
				<property>
					<name>mapred.mapoutput.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.mapoutput.value.class</name>
					<value>org.apache.hadoop.io.NullWritable</value>
				</property>
				<property>
					<name>mapred.reduce.tasks</name>
					<value>0</value>
				</property>
				<property>
					<name>mapred.output.key.class</name>
					<value>org.apache.avro.mapred.AvroKey</value>
				</property>
				<property>
					<name>mapred.output.value.class</name>
					<value>org.apache.avro.mapred.AvroValue</value>
				</property>
				<property>
					<name>mapred.output.key.comparator.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<!-- Kept on one line with no surrounding whitespace so the last class name in the
				     comma-separated list does not carry a trailing newline and tabs. -->
				<property>
					<name>io.serializations</name>
					<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.avro.hadoop.io.AvroSerialization</value>
				</property>
				<property>
					<name>mapred.output.value.groupfn.class</name>
					<value>org.apache.avro.hadoop.io.AvroKeyComparator</value>
				</property>
				<property>
					<name>rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB</name>
					<value>org.apache.hadoop.ipc.ProtobufRpcEngine</value>
				</property>
				<property>
					<name>mapred.mapper.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapred.reducer.new-api</name>
					<value>true</value>
				</property>
				<property>
					<name>mapreduce.map.class</name>
					<value>eu.dnetlib.iis.mainworkflows.converters.DocumentClasspathToTextConverter</value>
				</property>

				<!-- Avro schemas looked up from the generate-schema action output -->
				<property>
					<name>avro.schema.input.key</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.mainworkflows.schemas.DocumentContentClasspath']}</value>
				</property>
				<property>
					<name>avro.serialization.key.reader.schema</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
				</property>
				<property>
					<name>avro.serialization.key.writer.schema</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
				</property>
				<property>
					<name>avro.serialization.value.reader.schema</name>
					<value>${wf:actionData('generate-schema')['org.apache.avro.Schema.Type.NULL']}</value>
				</property>
				<property>
					<name>avro.serialization.value.writer.schema</name>
					<value>${wf:actionData('generate-schema')['org.apache.avro.Schema.Type.NULL']}</value>
				</property>
				<property>
					<name>avro.schema.output.key</name>
					<value>${wf:actionData('generate-schema')['eu.dnetlib.iis.metadataextraction.schemas.DocumentText']}</value>
				</property>

				<!-- Job input and output directories -->
				<property>
					<name>mapred.input.dir</name>
					<value>${workingDir}/producer/document_text_classpath</value>
				</property>
				<property>
					<name>mapred.output.dir</name>
					<value>${workingDir}/text-producer/document_text</value>
				</property>
			</configuration>
		</map-reduce>
		<ok to="mainworkflows_primary_processing" />
		<error to="fail" />
	</action>

	<!-- Invokes the sub-workflow located under this application's path. Its inputs point at the
	     datastores written to ${workingDir}/producer and ${workingDir}/text-producer; its outputs
	     are directed to ${workingDir}/exported, where the consumer action reads them. -->
	<action name="mainworkflows_primary_processing">
		<sub-workflow>
			<app-path>${wf:appPath()}/mainworkflows_primary_processing</app-path>
			<propagate-configuration/>
			<configuration>
				<!-- Module switches -->
				<property>
					<name>active_referenceextraction_project</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_dataset</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_researchinitiative</name>
					<value>true</value>
				</property>
				<property>
					<name>active_referenceextraction_pdb</name>
					<value>false</value>
				</property>
				<property>
					<name>active_documentsclassification</name>
					<value>true</value>
				</property>
				<property>
					<name>active_citationmatching</name>
					<value>true</value>
				</property>
				<!-- Disabling KDMs that are not tested yet -->
				<property>
					<name>active_documentssimilarity</name>
					<value>false</value>
				</property>
				<property>
					<name>active_statistics</name>
					<value>false</value>
				</property>

				<!-- Inputs -->
				<property>
					<name>input_document_metadata</name>
					<value>${workingDir}/producer/document_metadata</value>
				</property>
				<property>
					<name>input_document_to_project</name>
					<value>${workingDir}/producer/document_to_project</value>
				</property>
				<property>
					<name>input_document_text</name>
					<value>${workingDir}/text-producer/document_text</value>
				</property>
				<property>
					<name>input_document_text_wos</name>
					<value>${workingDir}/producer/document_text_wos</value>
				</property>
				<property>
					<name>input_project</name>
					<value>${workingDir}/producer/project</value>
				</property>
				<property>
					<name>input_project_concept</name>
					<value>${workingDir}/producer/project_concept</value>
				</property>
				<property>
					<name>input_person</name>
					<value>${workingDir}/producer/person</value>
				</property>
				<property>
					<name>input_dataset</name>
					<value>${workingDir}/producer/dataset</value>
				</property>
				<property>
					<name>input_extracted_document_metadata</name>
					<value>${workingDir}/producer/extracted_document_metadata</value>
				</property>
				<property>
					<name>input_citation_pmc</name>
					<value>${workingDir}/producer/citation</value>
				</property>

				<!-- Outputs -->
				<property>
					<name>output_document_to_project</name>
					<value>${workingDir}/exported/document_to_project</value>
				</property>
				<property>
					<name>output_document_to_project_concepts</name>
					<value>${workingDir}/exported/document_to_project_concepts</value>
				</property>
				<property>
					<name>output_document_to_dataset</name>
					<value>${workingDir}/exported/document_to_dataset</value>
				</property>
				<property>
					<name>output_document_to_research_initiatives</name>
					<value>${workingDir}/exported/document_to_research_initiatives</value>
				</property>
				<property>
					<name>output_document_to_pdb</name>
					<value>${workingDir}/exported/document_to_pdb</value>
				</property>
				<property>
					<name>output_document_to_document_classes</name>
					<value>${workingDir}/exported/document_to_document_classes</value>
				</property>
				<property>
					<name>output_citation</name>
					<value>${workingDir}/exported/citation</value>
				</property>
				<property>
					<name>output_document_similarity</name>
					<value>${workingDir}/exported/document_similarity</value>
				</property>
				<property>
					<name>output_document_statistics</name>
					<value>${workingDir}/exported/document_statistics</value>
				</property>
				<property>
					<name>output_author_statistics</name>
					<value>${workingDir}/exported/author_statistics</value>
				</property>
				<property>
					<name>output_project_statistics</name>
					<value>${workingDir}/exported/project_statistics</value>
				</property>
			</configuration>
		</sub-workflow>
		<ok to="consumer"/>
		<error to="fail" />
	</action>

	<!-- Passes each datastore under ${workingDir}/exported to TestingConsumer. Every
	     -C{name, schema class, JSON classpath resource} argument is followed by the
	     -I<name>=<path> argument naming the datastore to read. The outputs of the modules that
	     the sub-workflow call switches off are paired with empty.json. -->
	<action name="consumer">
		<java>
			<main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
			<arg>eu.dnetlib.iis.core.java.jsonworkflownodes.TestingConsumer</arg>
			<!-- TODO prepare proper json generated datastores -->
			<arg>-C{document_to_document_classes,
				eu.dnetlib.iis.documentsclassification.schemas.DocumentToDocumentClasses,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_classes.json}</arg>
			<arg>-Idocument_to_document_classes=${workingDir}/exported/document_to_document_classes</arg>
			<arg>-C{document_to_dataset,
				eu.dnetlib.iis.referenceextraction.dataset.schemas.DocumentToDataSet,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_dataset.json}</arg>
			<arg>-Idocument_to_dataset=${workingDir}/exported/document_to_dataset</arg>
			<arg>-C{document_to_project,
				eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_project.json}</arg>
			<arg>-Idocument_to_project=${workingDir}/exported/document_to_project</arg>
			<arg>-C{document_to_project_concepts,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_project_concepts.json}</arg>
			<arg>-Idocument_to_project_concepts=${workingDir}/exported/document_to_project_concepts</arg>
			<arg>-C{document_to_research_initiatives,
				eu.dnetlib.iis.export.schemas.DocumentToConceptIds,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/document_to_research_initiatives.json}</arg>
			<arg>-Idocument_to_research_initiatives=${workingDir}/exported/document_to_research_initiatives</arg>
			<arg>-C{citation,
				eu.dnetlib.iis.export.schemas.Citations,
				eu/dnetlib/iis/mainworkflows/integration/primary/processing/output/citations.json}</arg>
			<arg>-Icitation=${workingDir}/exported/citation</arg>
			<arg>-C{document_similarity,
				eu.dnetlib.iis.documentssimilarity.schemas.DocumentSimilarity,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_similarity=${workingDir}/exported/document_similarity</arg>
			<arg>-C{document_statistics,
				eu.dnetlib.iis.statistics.schemas.DocumentToDocumentStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Idocument_statistics=${workingDir}/exported/document_statistics</arg>
			<arg>-C{author_statistics,
				eu.dnetlib.iis.statistics.schemas.AuthorToAuthorStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Iauthor_statistics=${workingDir}/exported/author_statistics</arg>
			<arg>-C{project_statistics,
				eu.dnetlib.iis.statistics.schemas.ProjectToProjectStatistics,
				eu/dnetlib/iis/mainworkflows/data/empty.json}</arg>
			<arg>-Iproject_statistics=${workingDir}/exported/project_statistics</arg>
		</java>
		<ok to="end" />
		<error to="fail" />
	</action>

	<!-- Terminal nodes: every action's error transition leads to "fail", which reports the
	     error message of the last failed node; the consumer's ok transition leads to "end". -->
	<kill name="fail">
		<message>Unfortunately, the process failed -- error message:
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
	<end name="end" />
</workflow-app>