Project

General

Profile

1
<?xml version="1.0" encoding="UTF-8"?>
2
<RESOURCE_PROFILE>
3
    <HEADER>
4
        <RESOURCE_IDENTIFIER value="251db67f-24ac-4d1d-9e7b-d38fe8898234_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
5
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
6
        <RESOURCE_KIND value="WorkflowDSResources"/>
7
        <RESOURCE_URI value=""/>
8
        <DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
9
    </HEADER>
10
    <BODY>
11
        <WORKFLOW_NAME>Offline Deduplication v2</WORKFLOW_NAME>
12
        <WORKFLOW_TYPE>Deduplication</WORKFLOW_TYPE>
13
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
14
        <CONFIGURATION start="manual">
15
			<NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
16
				<DESCRIPTION />
17
				<PARAMETERS>
18
					<PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
19
					<PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
20
				</PARAMETERS>
21
				<ARCS>
22
					<ARC to="checkConf" />
23
				</ARCS>
24
			</NODE>        
25
			<NODE name="setDedupConfigs" type="SetDedupConfiguration" isStart="true">
26
				<DESCRIPTION>Set Dedup conf</DESCRIPTION>
27
				<PARAMETERS>
28
					<PARAM function="obtainValues('dedupOrchestrations', {})" required="true" type="string" name="dedupConfigSequence" managedBy="user"></PARAM>
29
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
30
				</PARAMETERS>
31
				<ARCS>
32
					<ARC to="checkConf" />
33
				</ARCS>
34
			</NODE>
35
			<NODE name="hadoopConfig" type="SetClusterAndTable" isStart="true">
36
				<DESCRIPTION>Set table name</DESCRIPTION>
37
				<PARAMETERS>
38
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
39
					<PARAM required="true" type="string" name="tableParam" managedBy="system">hbaseTable</PARAM>
40
					<PARAM managedBy="user" name="table" required="true" type="string"></PARAM>
41
				</PARAMETERS>
42
				<ARCS>
43
					<ARC to="checkConf" />
44
				</ARCS>
45
			</NODE>
46

    
47
	        <NODE name="setWorkingPath" type="SetEnvParameter" isStart="true">
48
		        <DESCRIPTION>Set the working dir on HDFS for the MinDist Algo</DESCRIPTION>
49
		        <PARAMETERS>
50
			        <PARAM managedBy="system" name="parameterName" required="true" type="string">workDir</PARAM>
51
			        <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/dedup/mindist</PARAM>
52
		        </PARAMETERS>
53
		        <ARCS>
54
			        <ARC to="checkConf"/>
55
		        </ARCS>
56
	        </NODE>
57

    
58

    
59
        	<NODE name="checkConf" type="DedupCheckConfiguration" isJoin="true">
60
				<DESCRIPTION/>
61
				<PARAMETERS>
62
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
63
				</PARAMETERS>
64
				<ARCS>
65
					<ARC to="resetPath" />
66
				</ARCS>
67
			</NODE>
68

    
69
	        <NODE name="resetPath" type="CreateHdfsDirJob">
70
		        <DESCRIPTION>input files cleanup</DESCRIPTION>
71
		        <PARAMETERS>
72
			        <PARAM required="true" type="boolean" name="force" managedBy="system">true</PARAM>
73
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
74
				        {
75
				        'path' : 'workDir',
76
				        'cluster' : 'cluster'
77
				        }
78
			        </PARAM>
79
		        </PARAMETERS>
80
		        <ARCS>
81
			        <ARC to="deduplicateScan"/>
82
		        </ARCS>
83
	        </NODE>
84
			
85
			<NODE name="deduplicateScan" type="DuplicateScanJob">
86
				<DESCRIPTION>Dup Scan</DESCRIPTION>
87
				<PARAMETERS>
88
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupCandidateScanJob</PARAM>
89
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
90
					<PARAM required="true" type="string" name="envParams" managedBy="system">
91
						{ 	
92
							'entityTypeId' : 'entityTypeId',
93
							'entityType' : 'entityType',
94
							'cluster' : 'cluster',
95
							'hbase.mapred.inputtable' : 'hbaseTable', 
96
							'hbase.mapred.outputtable' : 'hbaseTable', 
97
							'hbase.mapreduce.inputtable' : 'hbaseTable', 
98
							'hbase.mapreduce.outputtable' : 'hbaseTable'
99
						}
100
					</PARAM>					
101
				</PARAMETERS>
102
				<!-- Outgoing transitions of deduplicateScan: the unnamed arc targets deduplicateScan itself,
				     and the two arcs named "done" target prepareActionSet and queryUserSimilarities.
				     Both of those chains later reach doneActions, which is declared with isJoin="true".
				     NOTE(review): presumably the node re-enters itself until it emits "done" and the two
				     "done" targets then run as parallel branches; confirm against the DuplicateScanJob node. -->
				<ARCS>
102
					<ARC to="deduplicateScan" />
103
					<ARC name="done" to="prepareActionSet" />
104
					<ARC name="done" to="queryUserSimilarities" />
105
				</ARCS>
107
			</NODE>
108
			
109
			
110

    
111
			
112
            <NODE name="queryUserSimilarities" type="QueryUserActionDbJob">
113
                <DESCRIPTION>query user similarity</DESCRIPTION>
114
                <PARAMETERS>
115
	                <PARAM name="dedupConfigSequenceParam" type="string" required="true" managedBy="system">dedup.conf.queue</PARAM>
116
                    <PARAM name="dbProperty" type="string" managedBy="system" required="true">dnet.dedup.db.name</PARAM>
117
                    <PARAM name="sql" type="string" managedBy="system" required="true">/eu/dnetlib/msro/workflows/dedup/querySimilaritiesBySet.sql.st</PARAM>
118
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">simEpr</PARAM>
119
                </PARAMETERS>
120
                <ARCS>
121
                    <ARC to="buildSimilarityMesh"/>
122
                </ARCS>
123
            </NODE>            
124
            <NODE name="buildSimilarityMesh" type="BuildSimilarityMeshJob">
125
                <DESCRIPTION>build mesh</DESCRIPTION>
126
                <PARAMETERS>
127
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">simEpr</PARAM>
128
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">simMeshEpr</PARAM>
129
                </PARAMETERS>
130
                <ARCS>
131
                    <ARC to="storeSimilarities"/>
132
                </ARCS>
133
            </NODE>
134

    
135

    
136
            <NODE name="storeSimilarities" type="StoreHBase">
137
                <DESCRIPTION>store similarity</DESCRIPTION>
138
                <PARAMETERS>
139
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">simMeshEpr</PARAM>
140
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
141
	                <PARAM name="mapping" type="string" managedBy="user" required="true" function="obtainValues('dbmf2hbaseMappings', {})"></PARAM>
142
                    <PARAM name="simulation" type="boolean" managedBy="user" required="false">false</PARAM>
143
                </PARAMETERS>
144
                <ARCS>
145
                    <ARC to="queryUserDissimilarities"/>
146
                </ARCS>
147
            </NODE>
148
            
149
            <NODE name="queryUserDissimilarities" type="QueryUserActionDbJob">
150
                <DESCRIPTION>query user dissimilarity</DESCRIPTION>
151
                <PARAMETERS>
152
	                <PARAM name="dedupConfigSequenceParam" type="string" required="true" managedBy="system">dedup.conf.queue</PARAM>
153
                    <PARAM name="dbProperty" type="string" managedBy="system" required="true">dnet.dedup.db.name</PARAM>
154
                    <PARAM name="sql" type="string" managedBy="system" required="true">/eu/dnetlib/msro/workflows/dedup/queryDissimilaritiesBySet.sql.st</PARAM>
155
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">dissimEpr</PARAM>
156
                </PARAMETERS>
157
                <ARCS>
158
                    <ARC to="storeDissimilarities"/>
159
                </ARCS>
160
            </NODE>
161

    
162
            <NODE name="storeDissimilarities" type="DeleteFromHBase">
163
                <DESCRIPTION>store dissimilarity</DESCRIPTION>
164
                <PARAMETERS>
165
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">dissimEpr</PARAM>
166
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
167
	                <PARAM name="mapping" type="string" managedBy="user" required="true" function="obtainValues('dbmf2hbaseMappings', {})"></PARAM>
168
                    <PARAM name="simulation" type="boolean" managedBy="user" required="false">false</PARAM>
169
                </PARAMETERS>
170
                <ARCS>
171
                    <ARC to="doneActions"/>
172
                </ARCS>
173
            </NODE>
174

    
175

    
176
			<NODE name="prepareActionSet" type="PrepareConfiguredActionSet">				
177
				<DESCRIPTION>prepare action sets</DESCRIPTION>
178
				<PARAMETERS>
179
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
180
					<PARAM required="true" type="string" name="jobProperty" managedBy="system">rawSetId</PARAM>
181
					<PARAM required="true" type="string" name="actionSetPathParam" managedBy="system">actionSetPath</PARAM>
182
				</PARAMETERS>
183
				<ARCS>
184
					<ARC to="similarity2actions" />
185
				</ARCS>
186
			</NODE>
187
			
188
			<NODE name="similarity2actions" type="DedupSimilarityToActionsJobNode">
189
				<DESCRIPTION>export the similarity rels as Actions</DESCRIPTION>
190
				<PARAMETERS>
191
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupSimilarity2HdfsActionsJob</PARAM>
192
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
193
					<PARAM required="true" type="string" name="envParams" managedBy="system">
194
						{ 	
195
							'dedup.conf' : 'dedup.conf',
196
							'entityTypeId' : 'entityTypeId',
197
							'entityType' : 'entityType',
198
							'cluster' : 'cluster',
199
							'rawSetId' : 'rawSetId',
200
							'hbase.mapred.inputtable' : 'hbaseTable', 
201
							'hbase.mapreduce.inputtable' : 'hbaseTable',
202
							'mapred.output.dir' : 'actionSetPath'
203
						}
204
					</PARAM>
205
				</PARAMETERS>
206
				<ARCS>
207
					<ARC to="updateActionSets" />
208
				</ARCS>
209
			</NODE>
210
			
211
			<NODE name="updateActionSets" type="UpdateActionSets">				
212
				<DESCRIPTION>update action sets</DESCRIPTION>
213
				<PARAMETERS/>
214
				<ARCS>
215
					<ARC to="doneActions" />
216
				</ARCS>
217
			</NODE>
218
			
219
             <NODE name="doneActions" isJoin="true">
220
                <DESCRIPTION>done actions</DESCRIPTION>
221
                <PARAMETERS/>
222
                <ARCS>
223
                    <ARC to="mindist"/>
224
                </ARCS>
225
            </NODE>
226

    
227

    
228
	        <NODE name="mindist" type="MinDistSearchHadoopJob">
229
		        <DESCRIPTION>find the minimum vertex in each adjacency list</DESCRIPTION>
230
		        <PARAMETERS>
231
			        <PARAM required="true" type="boolean" name="debug" managedBy="user">false</PARAM>
232
			        <!-- The value is a parameter name (a string), not a flag: the "components" node refers to
			             'outputPath' as the source of its 'mapred.input.dir' setting. -->
			        <PARAM required="true" type="string" name="outPathParam" managedBy="system">outputPath</PARAM>
233
			        <PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
234
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
235
				        {
236
				        'cluster' : 'cluster',
237
				        'entityTypeId' : 'entityTypeId',
238
				        'hbase.mapred.inputtable' : 'hbaseTable',
239
				        'hbase.mapreduce.inputtable' : 'hbaseTable',
240
				        'workDir' : 'workDir'
241
				        }
242
			        </PARAM>
243
		        </PARAMETERS>
244
		        <!-- Outgoing transitions of mindist: the arc named "depth_n" targets mindist itself,
		             and the unnamed arc continues to the components node.
		             NOTE(review): presumably the job is repeated via "depth_n" until the search has
		             converged, then falls through to components; confirm against MinDistSearchHadoopJob. -->
		        <ARCS>
245
			        <ARC name="depth_n" to="mindist"/>
246
			        <ARC to="components"/>
247
		        </ARCS>
248
	        </NODE>
249

    
250
	        <NODE name="components" type="DedupConfigurationAwareJob">
251
		        <DESCRIPTION>joins all the vertex ids to build the connected components in the graph</DESCRIPTION>
252
		        <PARAMETERS>
253
			        <PARAM required="true" type="string" name="hadoopJob" managedBy="system">connectedComponentsJob</PARAM>
254
			        <PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
255
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
256
				        {
257
				        'cluster' : 'cluster',
258
				        'entityType' : 'entityType',
259
				        'entityTypeId' : 'entityTypeId',
260
				        'mapred.input.dir' : 'outputPath',
261
				        'hbase.mapred.outputtable' : 'hbaseTable',
262
				        'hbase.mapreduce.outputtable' : 'hbaseTable'
263
				        }
264
			        </PARAM>
265
		        </PARAMETERS>
266
		        <ARCS>
267
			        <ARC to="markDeleted"/>
268
		        </ARCS>
269
	        </NODE>
270

    
271
	        <NODE name="markDeleted" type="DedupConfigurationAwareJob">
272
		        <DESCRIPTION>mark duplicates as deleted by inference</DESCRIPTION>
273
		        <PARAMETERS>
274
			        <PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupMarkDeletedEntityJob</PARAM>
275
			        <PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
276
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
277
				        {
278
				        'cluster' : 'cluster',
279
				        'entityType' : 'entityType',
280
				        'entityTypeId' : 'entityTypeId',
281
				        'hbase.mapred.inputtable' : 'hbaseTable',
282
				        'hbase.mapreduce.inputtable' : 'hbaseTable',
283
				        'hbase.mapred.outputtable' : 'hbaseTable',
284
				        'hbase.mapreduce.outputtable' : 'hbaseTable'
285
				        }
286
			        </PARAM>
287
		        </PARAMETERS>
288
		        <ARCS>
289
			        <ARC to="buildRoots"/>
290
		        </ARCS>
291
	        </NODE>
292

    
293
			<NODE name="buildRoots" type="SubmitHadoopJob">
294
				<DESCRIPTION>redirect rels</DESCRIPTION>
295
				<PARAMETERS>
296
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupBuildRootsJob</PARAM>
297
					<PARAM required="true" type="string" name="envParams" managedBy="system">
298
						{ 	
299
							'dedup.conf' : 'dedup.conf',
300
							'relClasses' : 'relClasses',
301
							'entityTypeId' : 'entityTypeId',
302
							'entityType' : 'entityType',
303
							'cluster' : 'cluster',
304
							'hbase.mapred.inputtable' : 'hbaseTable', 
305
							'hbase.mapreduce.inputtable' : 'hbaseTable', 
306
							'hbase.mapred.outputtable' : 'hbaseTable', 
307
							'hbase.mapreduce.outputtable' : 'hbaseTable'														
308
						}
309
					</PARAM>					
310
				</PARAMETERS>
311
				<ARCS>
312
					<ARC to="findIndex" />
313
				</ARCS>
314
			</NODE>			
315
			
316
			<NODE name="findIndex" type="FindIndex">
317
				<DESCRIPTION />
318
				<PARAMETERS>
319
					<PARAM name="mdFormat" type="string" managedBy="system" required="true">OPENAIRE</PARAM>
320
					<PARAM name="layout" type="string" managedBy="system" required="true">index</PARAM>
321
					<PARAM name="interpretation" type="string" managedBy="system" required="true">dedup</PARAM>
322
				</PARAMETERS>
323
				<ARCS>
324
					<ARC name="found" to="prepareIndex" />
325
					<ARC name="notFound" to="createIndex" />
326
				</ARCS>
327
			</NODE>
328

    
329
			<NODE name="createIndex" type="CreateIndex">
330
				<DESCRIPTION />
331
				<PARAMETERS />
332
				<ARCS>
333
					<ARC to="prepareIndex" />
334
				</ARCS>
335
			</NODE>
336

    
337
			<NODE name="prepareIndex" type="PrepareDedupIndexJob">
338
				<DESCRIPTION>Prepare indexing</DESCRIPTION>
339
				<PARAMETERS>
340
					<PARAM name="rottenRecordsPathParam"  type="string" required="true" managedBy="system">rottenRecordsPath</PARAM>
341
					<PARAM required="true" type="string" name="dedupConfig" managedBy="system">
342
						{ 	
343
							'dedupConfig' : 'dedup.conf'
344
						}
345
					</PARAM>					
346
				</PARAMETERS>
347
				<ARCS>
348
					<ARC to="cleanupRotten" />
349
				</ARCS>
350
			</NODE>
351

    
352
			<NODE name="cleanupRotten" type="DeleteHdfsPathJob">
353
				<DESCRIPTION>hdfs cleanup (rotten)</DESCRIPTION>
354
				<PARAMETERS>
355
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
356
					<PARAM required="true" type="string" name="envParams" managedBy="system">
357
						{ 	
358
							'path' : 'rottenRecordsPath'
359
						}
360
					</PARAM>					
361
				</PARAMETERS>
362
				<ARCS>
363
					<ARC to="updateIndex" />
364
				</ARCS>
365
			</NODE>
366
		
367
			<NODE name="updateIndex" type="SubmitHadoopJob">
368
				<DESCRIPTION>M/O index records</DESCRIPTION>
369
				<PARAMETERS>
370
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupIndexFeedJob</PARAM>					
371
					<PARAM required="true" type="string" name="envParams" managedBy="system">
372
						{ 	
373
							'mapred.output.dir' : 'rottenRecordsPath',
374
							'index.fields' : 'index.fields',
375
							'index.solr.url' : 'index.solr.url',
376
							'index.solr.collection' : 'index.solr.collection',
377
							'index.buffer.flush.threshold' : 'index.buffer.flush.threshold',
378
							'index.shutdown.wait.time' : 'index.shutdown.wait.time',
379
							'index.solr.sim.mode' : 'index.solr.sim.mode',
380
							'index.feed.timestamp' : 'index.feed.timestamp',
381
							'entityTypeId' : 'entityTypeId',
382
							'entityType' : 'entityType',
383
							'actionset' : 'actionset',
384
							'cluster' : 'cluster'
385
						}
386
					</PARAM>
387
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
388
						{ 	
389
							'hbase.mapred.inputtable' : 'hbase.mapred.datatable', 
390
							'hbase.mapreduce.inputtable' : 'hbase.mapred.datatable'
391
						}
392
					</PARAM>
393
				</PARAMETERS>
394
				<ARCS>
395
					<ARC to="finalize" />
396
				</ARCS>
397
			</NODE>
398

    
399
			<NODE name="finalize" type="FinalizeDedupIndexFeeding">
400
				<DESCRIPTION>commit changes</DESCRIPTION>
401
				<PARAMETERS />
402
				<ARCS>
403
					<ARC to="updateDs" />
404
				</ARCS>
405
			</NODE>
406

    
407
			<NODE name="updateDs" type="IndexDsUpdateJob">
408
				<DESCRIPTION>update DS</DESCRIPTION>
409
				<PARAMETERS />
410
				<ARCS>
411
					<ARC to="success" />
412
				</ARCS>
413
			</NODE>
414
			
415
        </CONFIGURATION>
416
        <STATUS />
417
    </BODY>
418
</RESOURCE_PROFILE>
(25-25/34)