<?xml version="1.0" encoding="UTF-8"?>
<!-- Note that documentation placed in comments in this file uses the
     "markdown" syntax (along with its way of dividing text into sections). -->
<workflow-app xmlns="uri:oozie:workflow:0.3" name="test-core_examples_hadoopstreaming_cloner_without_reducer_with_explicit_schema_file">
    <start to="data_producer"/>
    <action name="data_producer">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- The data generated by this node is deleted in this section -->
            <prepare>
                <delete path="${nameNode}${workingDir}/data_producer"/>
                <mkdir path="${nameNode}${workingDir}/data_producer"/>
            </prepare>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <!-- This is a simple wrapper for the Java code -->
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <!-- The business Java code that gets to be executed -->
            <arg>eu.dnetlib.iis.core.examples.java.SampleDataProducer</arg>
            <!-- All input and output ports have to be bound to paths in HDFS -->
            <arg>-Operson=${workingDir}/data_producer/person</arg>
            <arg>-Odocument=${workingDir}/data_producer/document</arg>
        </java>
        <ok to="python_cloner"/>
        <error to="fail"/>
    </action>
    <action name="python_cloner">
        <map-reduce>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- The data generated by this node is deleted in this section -->
            <prepare>
                <delete path="${nameNode}${workingDir}/python_cloner"/>
                <mkdir path="${nameNode}${workingDir}/python_cloner"/>
            </prepare>
            <streaming>
                <!-- Here, we give the relative path to the script and pass it
                     the parameters of the workflow node. The script is held
                     in a directory having the same name as the workflow node.

                     The parameters should be passed as **named** arguments. This
                     convention of passing them as named arguments makes the code
                     more readable/maintainable.
                -->
                <mapper>scripts/python_cloner/cloner.py --copies 3</mapper>
            </streaming>
            <configuration>
                <!-- # Standard settings for our framework -->
                <property>
                    <name>mapred.output.format.class</name>
                    <value>com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat</value>
                </property>
                <property>
                    <name>mapred.input.format.class</name>
                    <value>com.cloudera.science.avro.streaming.AvroAsJSONInputFormat</value>
                </property>
                <!-- # Custom settings for this workflow node -->
                <!-- NOTE(review): unlike the java actions in this workflow, this
                     action does not set mapred.job.queue.name, so the streaming
                     job runs on the cluster's default queue rather than
                     ${queueName}. Confirm this is intentional. -->
                <!-- We do not use any reducers, so we set their number to 0 -->
                <property>
                    <name>mapred.reduce.tasks</name>
                    <value>0</value>
                </property>
                <property>
                    <name>mapred.input.dir</name>
                    <value>${workingDir}/data_producer/person</value>
                </property>
                <!-- Path to the input schema. This is held in the same
                     directory as the script. -->
                <property>
                    <name>input.schema.url</name>
                    <value>${wf:appPath()}/lib/scripts/python_cloner/Person.avsc</value>
                </property>
                <property>
                    <name>mapred.output.dir</name>
                    <value>${workingDir}/python_cloner/output</value>
                </property>
                <!-- Path to the output schema. This is held in the same
                     directory as the script. -->
                <property>
                    <name>output.schema.url</name>
                    <value>${wf:appPath()}/lib/scripts/python_cloner/Person.avsc</value>
                </property>
            </configuration>
        </map-reduce>
        <ok to="cloner"/>
        <error to="fail"/>
    </action>
    <action name="cloner">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <!-- The data generated by this node is deleted in this section -->
            <prepare>
                <delete path="${nameNode}${workingDir}/cloner"/>
                <mkdir path="${nameNode}${workingDir}/cloner"/>
            </prepare>
            <configuration>
                <property>
                    <name>mapred.job.queue.name</name>
                    <value>${queueName}</value>
                </property>
            </configuration>
            <!-- This is a simple wrapper for the Java code -->
            <main-class>eu.dnetlib.iis.core.java.ProcessWrapper</main-class>
            <!-- The business Java code that gets to be executed -->
            <arg>eu.dnetlib.iis.core.examples.java.PersonCloner</arg>
            <!-- All input and output ports have to be bound to paths in HDFS -->
            <arg>-Iperson=${workingDir}/python_cloner/output</arg>
            <arg>-Operson=${workingDir}/cloner/person</arg>
        </java>
        <ok to="end"/>
        <error to="fail"/>
    </action>
    <kill name="fail">
        <message>Unfortunately, the process failed -- error message:
            [${wf:errorMessage(wf:lastErrorNode())}]
        </message>
    </kill>
    <end name="end"/>
</workflow-app>