Project

General

Profile

1 45704 claudio.at
<RESOURCE_PROFILE>
2
	<HEADER>
3 58106 sandro.lab
		<RESOURCE_IDENTIFIER value="1dd10bf0-5c97-470c-9938-ae8e57a422fc_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
4 45704 claudio.at
		<RESOURCE_TYPE value="WorkflowDSResourceType"/>
5
		<RESOURCE_KIND value="WorkflowDSResources"/>
6
		<RESOURCE_URI value=""/>
7 58106 sandro.lab
		<DATE_OF_CREATION value="2020-02-05T18:13:51.0Z"/>
8 45704 claudio.at
	</HEADER>
9
	<BODY>
10 58106 sandro.lab
		<WORKFLOW_NAME menuSection="InfoSpace Deduplication">InfoSpace Deduplication using Spark</WORKFLOW_NAME>
11
		<WORKFLOW_DESCRIPTION>InfoSpace Deduplication using Spark</WORKFLOW_DESCRIPTION>
12 45704 claudio.at
		<WORKFLOW_INFO/>
13
		<WORKFLOW_FAMILY>InfoSpace Deduplication</WORKFLOW_FAMILY>
14
		<WORKFLOW_PRIORITY>35</WORKFLOW_PRIORITY>
15
		<CONFIGURATION status="EXECUTABLE" start="MANUAL">
16
17
			<PARAMETERS>
18 58106 sandro.lab
				<!-- Oozie job selector; nodes below consume it through ref="oozieJobName" (hadoopJob / oozieJob). -->
				<PARAM name="oozieJobName" description="Oozie Job name" required="true" type="string" managedBy="user" function="listProfiles('HadoopJobConfigurationDSResourceType', '//HADOOP_JOB/@name','executeOozie')"/>
19
				<!-- Base directory referenced as ${workingDirPath} by the sourcePath/targetPath entries of the workflow nodes. -->
				<PARAM name="workingDirPath" description="working dir where all the intermediate versions of the graph are generated" required="true" type="string" managedBy="user"/>
20
				<!-- Logical cluster name, limited to the values listed in validValues; the workflow nodes read it via ref="cluster". -->
				<PARAM name="cluster" description="Hadoop cluster logical name" required="true" type="string" managedBy="user" function="validValues(['DM','IIS', 'GARR'])"/>
21 45704 claudio.at
				<PARAM name="reusePublication" description="reuse publications on HDFS?" required="true" type="boolean" managedBy="user"/>
22
				<PARAM name="reuseResolvedPublication" description="reuse resolved publications on HDFS?" required="true" type="boolean" managedBy="user"/>
23
				<PARAM name="reuseDataset" description="reuse datasets on HDFS?" required="true" type="boolean" managedBy="user"/>
24
				<PARAM name="reuseResolvedDataset" description="reuse resolved datasets on HDFS?" required="true" type="boolean" managedBy="user"/>
25
				<!-- NOTE(review): this parameter is required, but no NODE in the visible WORKFLOW references reuseUnresolved; confirm whether it is still needed. -->
				<PARAM name="reuseUnresolved" description="reuse unresolved objects on HDFS?" required="true" type="boolean" managedBy="user"/>
26 58106 sandro.lab
                <!-- NOTE(review): no NODE in the visible WORKFLOW references dedupConfigDataset, although it is required; confirm whether a dataset dedup step is missing. -->
                <PARAM name="dedupConfigDataset" description="dedup configuration orchestration name" required="true" type="string" function="obtainValues('dedupOrchestrations', {})" managedBy="user"/>
27
                <!-- Passed to the dedupPublication node as its dedup_conf entry. -->
                <!-- NOTE(review): values come from listProfiles(...), whereas dedupConfigDataset and dedupConfigUnknown use obtainValues('dedupOrchestrations', {}); confirm the difference is intended. -->
                <PARAM name="dedupConfigPublication" description="dedup configuration orchestration name" required="true" type="string" function="listProfiles('DedupConfigurationDSResources', '//DESCRIPTION', '')" managedBy="user"/>
28
				<!-- NOTE(review): no NODE in the visible WORKFLOW references dedupConfigUnknown, although it is required; confirm whether a dedup step for unknown entities is missing. -->
				<PARAM name="dedupConfigUnknown" description="dedup configuration orchestration name" required="true" type="string" function="obtainValues('dedupOrchestrations', {})" managedBy="user"/>
29 45704 claudio.at
			</PARAMETERS>
30
			<WORKFLOW>
31 58106 sandro.lab
				<NODE name="pmf2hdfs" type="LaunchWorkflowTemplate">
32
					<DESCRIPTION>import PMF Publications to HDFS DIR</DESCRIPTION>
33 45704 claudio.at
					<PARAMETERS>
34 58106 sandro.lab
						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
35 45704 claudio.at
						<PARAM name="wfTemplateParams">
36
							<MAP>
37
								<ENTRY key="cluster" ref="cluster"/>
38 58106 sandro.lab
								<ENTRY key="reuseMdRecords" ref="reusePublication"/>
39
								<ENTRY key="mdFormat" value="PMF"/>
40
								<ENTRY key="sourcePath" value="${workingDirPath}/xml/pmf.dli.seq"/>
41
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
42
								<ENTRY key="targetPath" value="${workingDirPath}/input/0"/>
43
								<ENTRY key="entity" value="publication"/>
44 45704 claudio.at
							</MAP>
45
						</PARAM>
46
					</PARAMETERS>
47
					<ARCS>
48 58106 sandro.lab
						<ARC to="extractPublication"/>
49 45704 claudio.at
					</ARCS>
50
				</NODE>
51 58106 sandro.lab
				<NODE name="extractPublication" type="SubmitHadoopJob">
52
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
53 45704 claudio.at
					<PARAMETERS>
54 58106 sandro.lab
						<PARAM name="cluster" ref="cluster"/>
55
						<PARAM name="hadoopJob" ref="oozieJobName"/>
56
						<PARAM name="jobParams">
57
							<MAP>
58
								<!-- NOTE(review): the application path points into a personal user directory (/user/sandro.labruzzo); the same prefix is hard-coded in the other SubmitHadoopJob nodes. Consider a shared location or a workflow parameter. -->
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
59
								<ENTRY key="sourcePath" value="${workingDirPath}/input/0"/>
60
								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
61
								<ENTRY key="targetDir" value="0"/>
62
								<ENTRY key="entities" value="publication,unknown,relation"/>
63
							</MAP>
64
						</PARAM>
65
					</PARAMETERS>
66
					<ARCS>
67
						<ARC to="pmfResolved2hdfs"/>
68
					</ARCS>
69
				</NODE>
70
				<NODE name="pmfResolved2hdfs" type="LaunchWorkflowTemplate">
71
					<!-- This node sets interpretation="resolved" and reads pmf.dli.resolved.seq, so the description names the resolved records. -->
					<DESCRIPTION>import resolved PMF Publications to HDFS DIR</DESCRIPTION>
72
					<PARAMETERS>
73
						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
74 45704 claudio.at
						<PARAM name="wfTemplateParams">
75
							<MAP>
76
								<ENTRY key="cluster" ref="cluster"/>
77 58106 sandro.lab
								<ENTRY key="reuseMdRecords" ref="reuseResolvedPublication"/>
78 45704 claudio.at
								<ENTRY key="mdFormat" value="PMF"/>
79 58106 sandro.lab
								<ENTRY key="interpretation" value="resolved"/>
80
								<ENTRY key="sourcePath" value="${workingDirPath}/xml/pmf.dli.resolved.seq"/>
81
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
82
								<ENTRY key="targetPath" value="${workingDirPath}/input/1"/>
83
								<ENTRY key="entity" value="publication"/>
84 45704 claudio.at
							</MAP>
85
						</PARAM>
86
					</PARAMETERS>
87
					<ARCS>
88 58106 sandro.lab
						<ARC to="extractPublicationResolved"/>
89 45704 claudio.at
					</ARCS>
90
				</NODE>
91 58106 sandro.lab
				<NODE name="extractPublicationResolved" type="SubmitHadoopJob">
92
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
93 45704 claudio.at
					<PARAMETERS>
94 58106 sandro.lab
						<PARAM name="cluster" ref="cluster"/>
95
						<PARAM name="hadoopJob" ref="oozieJobName"/>
96
						<PARAM name="jobParams">
97 45704 claudio.at
							<MAP>
98 58106 sandro.lab
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
99
								<ENTRY key="sourcePath" value="${workingDirPath}/input/1"/>
100
								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
101
								<ENTRY key="targetDir" value="1"/>
102
								<ENTRY key="entities" value="publication"/>
103 45704 claudio.at
							</MAP>
104
						</PARAM>
105
					</PARAMETERS>
106
					<ARCS>
107 58106 sandro.lab
						<ARC to="dmf2hdfs"/>
108 45704 claudio.at
					</ARCS>
109
				</NODE>
110 58106 sandro.lab
				<NODE name="dmf2hdfs" type="LaunchWorkflowTemplate">
111
					<!-- This node sets mdFormat="DMF" and entity="dataset"; the description previously repeated the PMF publication text. -->
					<DESCRIPTION>import DMF Datasets to HDFS DIR</DESCRIPTION>
112 45704 claudio.at
					<PARAMETERS>
113 58106 sandro.lab
						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
114 45704 claudio.at
						<PARAM name="wfTemplateParams">
115
							<MAP>
116
								<ENTRY key="cluster" ref="cluster"/>
117 58106 sandro.lab
								<ENTRY key="reuseMdRecords" ref="reuseDataset"/>
118
								<ENTRY key="mdFormat" value="DMF"/>
119
								<ENTRY key="sourcePath" value="${workingDirPath}/xml/dmf.dli.seq"/>
120
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
121
								<ENTRY key="targetPath" value="${workingDirPath}/input/2"/>
122
								<ENTRY key="entity" value="dataset"/>
123 45704 claudio.at
							</MAP>
124
						</PARAM>
125
					</PARAMETERS>
126
					<ARCS>
127 58106 sandro.lab
						<ARC to="extractDataset"/>
128 47088 sandro.lab
					</ARCS>
129
				</NODE>
130 58106 sandro.lab
				<NODE name="extractDataset" type="SubmitHadoopJob">
131
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
132 47088 sandro.lab
					<PARAMETERS>
133 58106 sandro.lab
						<PARAM name="cluster" ref="cluster"/>
134
						<PARAM name="hadoopJob" ref="oozieJobName"/>
135
						<PARAM name="jobParams">
136 47088 sandro.lab
							<MAP>
137 58106 sandro.lab
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
138
								<ENTRY key="sourcePath" value="${workingDirPath}/input/2"/>
139
								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
140
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
141
								<ENTRY key="targetDir" value="2"/>
142
								<ENTRY key="entities" value="dataset,unknown,relation"/>
143 47088 sandro.lab
							</MAP>
144
						</PARAM>
145
					</PARAMETERS>
146
					<ARCS>
147 58106 sandro.lab
						<ARC to="dmfResolved2hdfs"/>
148 45704 claudio.at
					</ARCS>
149
				</NODE>
150 58106 sandro.lab
				<NODE name="dmfResolved2hdfs" type="LaunchWorkflowTemplate">
151
					<!-- This node sets mdFormat="DMF", interpretation="resolved" and entity="dataset"; the description previously repeated the PMF publication text. -->
					<DESCRIPTION>import resolved DMF Datasets to HDFS DIR</DESCRIPTION>
152 45704 claudio.at
					<PARAMETERS>
153 58106 sandro.lab
						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
154 45704 claudio.at
						<PARAM name="wfTemplateParams">
155
							<MAP>
156
								<ENTRY key="cluster" ref="cluster"/>
157 58106 sandro.lab
								<ENTRY key="reuseMdRecords" ref="reuseResolvedDataset"/>
158 45704 claudio.at
								<ENTRY key="mdFormat" value="DMF"/>
159
								<ENTRY key="interpretation" value="resolved"/>
160 58106 sandro.lab
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
161
								<ENTRY key="sourcePath" value="${workingDirPath}/xml/dmf.dli.resolved.seq"/>
162
								<ENTRY key="targetPath" value="${workingDirPath}/input/3"/>
163
								<ENTRY key="entity" value="dataset"/>
164 45704 claudio.at
							</MAP>
165
						</PARAM>
166
					</PARAMETERS>
167
					<ARCS>
168 58106 sandro.lab
						<ARC to="extractDatasetResolved"/>
169 45704 claudio.at
					</ARCS>
170
				</NODE>
171 58106 sandro.lab
				<NODE name="extractDatasetResolved" type="SubmitHadoopJob">
172
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
173 45704 claudio.at
					<PARAMETERS>
174 58106 sandro.lab
						<PARAM name="cluster" ref="cluster"/>
175
						<PARAM name="hadoopJob" ref="oozieJobName"/>
176
						<PARAM name="jobParams">
177 45704 claudio.at
							<MAP>
178 58106 sandro.lab
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
179
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
180
								<ENTRY key="sourcePath" value="${workingDirPath}/input/3"/>
181
								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
182
								<ENTRY key="targetDir" value="3"/>
183
								<ENTRY key="entities" value="dataset"/>
184 45704 claudio.at
							</MAP>
185
						</PARAM>
186
					</PARAMETERS>
187
					<ARCS>
188 58106 sandro.lab
						<ARC to="mergeDataset"/>
189 45704 claudio.at
					</ARCS>
190
				</NODE>
191 58106 sandro.lab
				<NODE isStart="true" name="mergeDataset" type="SubmitHadoopJob">
192
					<!-- NOTE(review): the enclosing NODE is marked isStart="true", yet an ARC chain of eight import/extract nodes (pmf2hdfs ... extractDatasetResolved) leads into it. If execution begins at the isStart node, those eight nodes never run; confirm this is intentional. -->
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
193 45704 claudio.at
					<PARAMETERS>
194 58106 sandro.lab
						<PARAM name="cluster" ref="cluster"/>
195
						<PARAM name="hadoopJob" ref="oozieJobName"/>
196
						<PARAM name="jobParams">
197 45704 claudio.at
							<MAP>
198 58106 sandro.lab
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
199
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
200
								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
201
								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
202
								<ENTRY key="entity" value="dataset"/>
203 45704 claudio.at
							</MAP>
204
						</PARAM>
205
					</PARAMETERS>
206
					<ARCS>
207 58106 sandro.lab
						<ARC to="mergePublication"/>
208 45704 claudio.at
					</ARCS>
209
				</NODE>
210 58106 sandro.lab
				<NODE name="mergePublication" type="SubmitHadoopJob">
211
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
212 45704 claudio.at
					<PARAMETERS>
213 58106 sandro.lab
						<PARAM name="cluster" ref="cluster"/>
214
						<PARAM name="hadoopJob" ref="oozieJobName"/>
215
						<PARAM name="jobParams">
216
							<MAP>
217
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
218
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
219
								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
220
								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
221
								<ENTRY key="entity" value="publication"/>
222
							</MAP>
223
						</PARAM>
224
					</PARAMETERS>
225
					<ARCS>
226
						<ARC to="mergeUnknown"/>
227
					</ARCS>
228
				</NODE>
229
				<NODE name="mergeUnknown" type="SubmitHadoopJob">
230
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
231
					<PARAMETERS>
232
						<PARAM name="cluster" ref="cluster"/>
233
						<PARAM name="hadoopJob" ref="oozieJobName"/>
234
						<PARAM name="jobParams">
235
							<MAP>
236
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
237
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
238
								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
239
								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
240
								<ENTRY key="entity" value="unknown"/>
241
							</MAP>
242
						</PARAM>
243
					</PARAMETERS>
244
					<ARCS>
245
						<ARC to="mergeRelation"/>
246
					</ARCS>
247
				</NODE>
248
				<NODE name="mergeRelation" type="SubmitHadoopJob">
249
					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
250
					<PARAMETERS>
251
						<PARAM name="cluster" ref="cluster"/>
252
						<PARAM name="hadoopJob" ref="oozieJobName"/>
253
						<PARAM name="jobParams">
254
							<MAP>
255
								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
256
								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
257
								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
258
								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
259
								<ENTRY key="entity" value="relation"/>
260
							</MAP>
261
						</PARAM>
262
					</PARAMETERS>
263
					<ARCS>
264
						<!-- NOTE(review): the merge chain ends here. No ARC in the visible WORKFLOW targets the dedupPublication node defined after this one, so that node looks unreachable; confirm whether this ARC should point to dedupPublication. -->
						<ARC to="success"/>
265
					</ARCS>
266
				</NODE>
267
				<NODE name="dedupPublication" type="LaunchWorkflowTemplate">
268
					<!-- This node passes entity="publication" and dedup_conf (from dedupConfigPublication), with ${workingDirPath}/graph as source and ${workingDirPath}/dedupGraphWD as target; the description previously repeated the PMF import text. -->
					<DESCRIPTION>deduplicate Publications in the graph</DESCRIPTION>
269
					<PARAMETERS>
270
						<PARAM name="wfTemplateId" value="b8e1afcf-e5ca-47d0-9ee8-47da90e1a9c3_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
271 45704 claudio.at
						<PARAM name="wfTemplateParams">
272
							<MAP>
273
								<ENTRY key="cluster" ref="cluster"/>
274 58106 sandro.lab
								<ENTRY key="sourcePath" value="${workingDirPath}/graph"/>
275
								<ENTRY key="targetPath" value="${workingDirPath}/dedupGraphWD"/>
276
								<ENTRY key="entity" value="publication"/>
277
								<ENTRY key="dedup_conf" ref="dedupConfigPublication"/>
278
								<ENTRY key="oozieJob" ref="oozieJobName"/>
279 45704 claudio.at
							</MAP>
280
						</PARAM>
281
					</PARAMETERS>
282
					<ARCS>
283
						<ARC to="success"/>
284
					</ARCS>
285
				</NODE>
286
			</WORKFLOW>
287
		</CONFIGURATION>
288
		<NOTIFICATIONS/>
289
		<!-- Scheduling settings; enabled="false" here, which matches start="MANUAL" on CONFIGURATION. -->
		<!-- NOTE(review): the six-field CRON with "?" looks like a Quartz-style expression, and the MININTERVAL unit is not stated in this profile; confirm both before enabling. -->
		<SCHEDULING enabled="false">
290
			<CRON>29 5 22 ? * *</CRON>
291
			<MININTERVAL>10080</MININTERVAL>
292
		</SCHEDULING>
293
		<STATUS/>
294
	</BODY>
295
</RESOURCE_PROFILE>