<?xml version="1.0" encoding="UTF-8"?>
<!-- Project / General / Profile (labels that preceded this profile in the captured view) -->
<RESOURCE_PROFILE>
3
    <!-- Profile header: resource identifier, type and kind, an empty resource URI, and the creation timestamp. -->
    <HEADER>
        <RESOURCE_IDENTIFIER value="251db67f-24ac-4d1d-9e7b-d38fe8898234_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
    </HEADER>
10
    <BODY>
11
        <!-- Workflow metadata: display name, workflow type and numeric priority. -->
        <WORKFLOW_NAME>Offline Deduplication v2</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Deduplication</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
14
        <CONFIGURATION start="manual">
15
			<!-- Start node (isStart="true"); its single arc leads to the checkConf join.
			     relClassesName is 'relClasses', the same name referenced by buildRoots' envParams.
			     NOTE(review): relClassesProperty looks like a property key resolved by the node implementation; confirm. -->
			<NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
				<DESCRIPTION />
				<PARAMETERS>
					<PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
					<PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="checkConf" />
				</ARCS>
			</NODE>
25
			<!-- Start node; its single arc leads to the checkConf join.
			     dedupConfigSequence is user-managed and populated through obtainValues('dedupOrchestrations', {}).
			     dedupConfigSequenceParam carries 'dedup.conf.queue', the same value passed to the later dedup nodes. -->
			<NODE name="setDedupConfigs" type="SetDedupConfiguration" isStart="true">
				<DESCRIPTION>Set Dedup conf</DESCRIPTION>
				<PARAMETERS>
					<PARAM function="obtainValues('dedupOrchestrations', {})" required="true" type="string" name="dedupConfigSequence" managedBy="user"></PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="checkConf" />
				</ARCS>
			</NODE>
35
			<!-- Start node; its single arc leads to the checkConf join.
			     Sets cluster to 'DM' and tableParam to 'tableName'; 'tableName' and 'cluster' are the names
			     referenced by the envParams maps of the later job nodes. -->
			<NODE name="hadoopConfig" type="SetClusterAndTable" isStart="true">
				<DESCRIPTION>Set table name</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="tableParam" managedBy="system">tableName</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="checkConf" />
				</ARCS>
			</NODE>
45

    
46
	        <!-- Start node; its single arc leads to the checkConf join.
	             Binds the name 'workDir' (system-managed) to a user-editable value, default /tmp/dedup/mindist.
	             'workDir' is referenced by the envParams of resetPath and mindist. -->
	        <NODE name="setWorkingPath" type="SetEnvParameter" isStart="true">
		        <DESCRIPTION>Set the working dir on HDFS for the MinDist Algo</DESCRIPTION>
		        <PARAMETERS>
			        <PARAM managedBy="system" name="parameterName" required="true" type="string">workDir</PARAM>
			        <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/dedup/mindist</PARAM>
		        </PARAMETERS>
		        <ARCS>
			        <ARC to="checkConf"/>
		        </ARCS>
	        </NODE>
56

    
57

    
58
        	<!-- Join node (isJoin="true"): target of the arcs from the four start nodes
        	     (fetchRelClasses, setDedupConfigs, hadoopConfig, setWorkingPath). Continues to resetPath. -->
        	<NODE name="checkConf" type="DedupCheckConfiguration" isJoin="true">
				<DESCRIPTION/>
				<PARAMETERS>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="resetPath" />
				</ARCS>
			</NODE>
67

    
68
	        <!-- CreateHdfsDirJob node with force=true; continues to deduplicateScan.
	             envParams pairs 'path' with 'workDir' and 'cluster' with 'cluster'.
	             NOTE(review): presumably job-parameter name to env-attribute name; confirm against the node implementation. -->
	        <NODE name="resetPath" type="CreateHdfsDirJob">
		        <DESCRIPTION>input files cleanup</DESCRIPTION>
		        <PARAMETERS>
			        <PARAM required="true" type="boolean" name="force" managedBy="system">true</PARAM>
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
				        {
				        'path' : 'workDir',
				        'cluster' : 'cluster'
				        }
			        </PARAM>
		        </PARAMETERS>
		        <ARCS>
			        <ARC to="deduplicateScan"/>
		        </ARCS>
	        </NODE>
83
			
84
			<!-- DuplicateScanJob node running hadoopJob 'dedupCandidateScanJob'.
			     The unnamed arc points back to this node itself; the two arcs named "done" fan out to
			     prepareActionSet and queryUserActions.
			     All four hbase input/output table keys in envParams are paired with 'tableName'. -->
			<NODE name="deduplicateScan" type="DuplicateScanJob">
				<DESCRIPTION>Dup Scan</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupCandidateScanJob</PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="deduplicateScan" />
					<ARC name="done" to="prepareActionSet" />
					<ARC name="done" to="queryUserActions" />
				</ARCS>
			</NODE>
107
			
108
			
109
			
110
			
111
			
112
            <!-- Node without a type attribute and without parameters; reached through a "done" arc of
                 deduplicateScan and continuing to querySimilarities. -->
            <NODE name="queryUserActions">
                <DESCRIPTION>query user actions</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="querySimilarities"/>
                </ARCS>
            </NODE>
119
			
120
			
121
            <!-- QueryUserActionDbJob node using the querySimilaritiesBySet.sql.st template.
                 outputEprParam is 'simEpr', the name buildSimilarityMesh takes as inputEprParam. -->
            <NODE name="querySimilarities" type="QueryUserActionDbJob">
                <DESCRIPTION>query similarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="dedupConfigSequenceParam" type="string" required="true" managedBy="system">dedup.conf.queue</PARAM>
                    <PARAM name="dbProperty" type="string" managedBy="system" required="true">dnet.dedup.db.name</PARAM>
                    <PARAM name="sql" type="string" managedBy="system" required="true">/eu/dnetlib/msro/workflows/dedup/querySimilaritiesBySet.sql.st</PARAM>
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">simEpr</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="buildSimilarityMesh"/>
                </ARCS>
            </NODE>
133
            <!-- BuildSimilarityMeshJob node: takes 'simEpr' as inputEprParam and names its output 'simMeshEpr',
                 which storeSimilarities takes as inputEprParam. -->
            <NODE name="buildSimilarityMesh" type="BuildSimilarityMeshJob">
                <DESCRIPTION>build mesh</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">simEpr</PARAM>
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">simMeshEpr</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="storeSimilarities"/>
                </ARCS>
            </NODE>
143

    
144

    
145
            <!-- StoreHBase node: takes 'simMeshEpr' as inputEprParam, cluster 'DM'.
                 mapping is user-managed and populated through obtainValues('dbmf2hbaseMappings', {});
                 simulation is an optional user-managed boolean defaulting to false. Continues to queryDissimilarities. -->
            <NODE name="storeSimilarities" type="StoreHBase">
                <DESCRIPTION>store similarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">simMeshEpr</PARAM>
                    <PARAM name="hbaseTableProperty" type="string" managedBy="system" required="true">hbase.mapred.datatable</PARAM>
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
                    <PARAM name="mapping" type="string" managedBy="user" required="true" function="obtainValues('dbmf2hbaseMappings', {})"></PARAM>
                    <PARAM name="simulation" type="boolean" managedBy="user" required="false">false</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="queryDissimilarities"/>
                </ARCS>
            </NODE>
158
            
159
            <!-- QueryUserActionDbJob node using the queryDissimilaritiesBySet.sql.st template.
                 outputEprParam is 'dissimEpr', the name storeDissimilarities takes as inputEprParam. -->
            <NODE name="queryDissimilarities" type="QueryUserActionDbJob">
                <DESCRIPTION>query dissimilarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="dedupConfigSequenceParam" type="string" required="true" managedBy="system">dedup.conf.queue</PARAM>
                    <PARAM name="dbProperty" type="string" managedBy="system" required="true">dnet.dedup.db.name</PARAM>
                    <PARAM name="sql" type="string" managedBy="system" required="true">/eu/dnetlib/msro/workflows/dedup/queryDissimilaritiesBySet.sql.st</PARAM>
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">dissimEpr</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="storeDissimilarities"/>
                </ARCS>
            </NODE>
171

    
172
            <!-- DeleteFromHBase node: takes 'dissimEpr' as inputEprParam, cluster 'DM'; parameters otherwise
                 mirror storeSimilarities. Continues to the doneUserActions join.
                 NOTE(review): the description says "store" while the node type is DeleteFromHBase; confirm the wording. -->
            <NODE name="storeDissimilarities" type="DeleteFromHBase">
                <DESCRIPTION>store dissimilarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">dissimEpr</PARAM>
                    <PARAM name="hbaseTableProperty" type="string" managedBy="system" required="true">hbase.mapred.datatable</PARAM>
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
                    <PARAM name="mapping" type="string" managedBy="user" required="true" function="obtainValues('dbmf2hbaseMappings', {})"></PARAM>
                    <PARAM name="simulation" type="boolean" managedBy="user" required="false">false</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="doneUserActions"/>
                </ARCS>
            </NODE>
185

    
186
            <!-- Join node (isJoin="true") closing the branch that starts at queryUserActions;
                 no type attribute, no parameters. Continues to the doneActions join. -->
            <NODE name="doneUserActions" isJoin="true">
                <DESCRIPTION>done user actions</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="doneActions"/>
                </ARCS>
            </NODE>
193

    
194
			<!-- PrepareConfiguredActionSet node, reached through a "done" arc of deduplicateScan.
			     jobProperty is 'rawSetId', the name similarity2actions references in its envParams.
			     Continues to similarity2actions. -->
			<NODE name="prepareActionSet" type="PrepareConfiguredActionSet">
				<DESCRIPTION>prepare action sets</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="jobProperty" managedBy="system">rawSetId</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="similarity2actions" />
				</ARCS>
			</NODE>
204
			
205
			<!-- DedupSimilarityToActionsJobNode running hadoopJob 'dedupSimilarity2ActionsJob'.
			     envParams pairs the hbase input table keys with 'tableName'; sysParams pairs the hbase output
			     table keys with 'hbase.actions.table'.
			     NOTE(review): envParams vs sysParams presumably differ in where the right-hand name is resolved; confirm.
			     Continues to updateActionSets. -->
			<NODE name="similarity2actions" type="DedupSimilarityToActionsJobNode">
				<DESCRIPTION>export the similarity rels as Actions</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupSimilarity2ActionsJob</PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'rawSetId' : 'rawSetId',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName'
						}
					</PARAM>
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
						{
							'hbase.mapred.outputtable' : 'hbase.actions.table',
							'hbase.mapreduce.outputtable' : 'hbase.actions.table'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateActionSets" />
				</ARCS>
			</NODE>
232
			
233
			<!-- UpdateActionSets node without parameters; last step of the action-set branch,
			     continuing to the doneActions join. -->
			<NODE name="updateActionSets" type="UpdateActionSets">
				<DESCRIPTION>update action sets</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="doneActions" />
				</ARCS>
			</NODE>
240
			
241
             <!-- Join node (isJoin="true"): target of the arcs from doneUserActions and updateActionSets.
                  No type attribute, no parameters. Continues to mindist. -->
             <NODE name="doneActions" isJoin="true">
                <DESCRIPTION>done actions</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="mindist"/>
                </ARCS>
            </NODE>
248

    
249

    
250
	        <!-- MinDistSearchHadoopJob node. The arc named "depth_n" points back to this node itself;
	             the unnamed arc continues to components.
	             outPathParam holds the name 'outputPath' (a string, not a boolean), which components pairs
	             with 'mapred.input.dir' in its envParams. envParams also passes 'workDir'. -->
	        <NODE name="mindist" type="MinDistSearchHadoopJob">
		        <DESCRIPTION>find the minimum vertex in each adjacency list</DESCRIPTION>
		        <PARAMETERS>
			        <PARAM required="true" type="boolean" name="debug" managedBy="user">false</PARAM>
			        <PARAM required="true" type="string" name="outPathParam" managedBy="system">outputPath</PARAM>
			        <PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
				        {
				        'cluster' : 'cluster',
				        'entityTypeId' : 'entityTypeId',
				        'hbase.mapred.inputtable' : 'tableName',
				        'hbase.mapreduce.inputtable' : 'tableName',
				        'workDir' : 'workDir'
				        }
			        </PARAM>
		        </PARAMETERS>
		        <ARCS>
			        <ARC name="depth_n" to="mindist"/>
			        <ARC to="components"/>
		        </ARCS>
	        </NODE>
271

    
272
	        <!-- DedupConfigurationAwareJob node running hadoopJob 'connectedComponentsJob'.
	             envParams pairs 'mapred.input.dir' with 'outputPath' (the name set as mindist's outPathParam)
	             and the hbase output table keys with 'tableName'. Continues to markDeleted. -->
	        <NODE name="components" type="DedupConfigurationAwareJob">
		        <DESCRIPTION>joins all the vertex ids to build the connected components in the graph</DESCRIPTION>
		        <PARAMETERS>
			        <PARAM required="true" type="string" name="hadoopJob" managedBy="system">connectedComponentsJob</PARAM>
			        <PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
				        {
				        'cluster' : 'cluster',
				        'entityType' : 'entityType',
				        'entityTypeId' : 'entityTypeId',
				        'mapred.input.dir' : 'outputPath',
				        'hbase.mapred.outputtable' : 'tableName',
				        'hbase.mapreduce.outputtable' : 'tableName'
				        }
			        </PARAM>
		        </PARAMETERS>
		        <ARCS>
			        <ARC to="markDeleted"/>
		        </ARCS>
	        </NODE>
292

    
293
	        <!-- DedupConfigurationAwareJob node running hadoopJob 'dedupMarkDeletedEntityJob'.
	             All four hbase input/output table keys in envParams are paired with 'tableName'.
	             Continues to buildRoots. -->
	        <NODE name="markDeleted" type="DedupConfigurationAwareJob">
		        <DESCRIPTION>mark duplicates as deleted by inference</DESCRIPTION>
		        <PARAMETERS>
			        <PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupMarkDeletedEntityJob</PARAM>
			        <PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
			        <PARAM required="true" type="string" name="envParams" managedBy="system">
				        {
				        'cluster' : 'cluster',
				        'entityType' : 'entityType',
				        'entityTypeId' : 'entityTypeId',
				        'hbase.mapred.inputtable' : 'tableName',
				        'hbase.mapreduce.inputtable' : 'tableName',
				        'hbase.mapred.outputtable' : 'tableName',
				        'hbase.mapreduce.outputtable' : 'tableName'
				        }
			        </PARAM>
		        </PARAMETERS>
		        <ARCS>
			        <ARC to="buildRoots"/>
		        </ARCS>
	        </NODE>
314

    
315
			<!-- SubmitHadoopJob node running hadoopJob 'dedupBuildRootsJob'.
			     envParams passes 'dedup.conf' and 'relClasses' (the name set by fetchRelClasses) and pairs all
			     four hbase input/output table keys with 'tableName'. Continues to findIndex. -->
			<NODE name="buildRoots" type="SubmitHadoopJob">
				<DESCRIPTION>redirect rels</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupBuildRootsJob</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'relClasses' : 'relClasses',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="findIndex" />
				</ARCS>
			</NODE>
337
			
338
			<!-- FindIndex node parameterised with mdFormat 'OPENAIRE', layout 'index', interpretation 'dedup'.
			     Arc "found" goes straight to prepareIndex; arc "notFound" goes to createIndex first. -->
			<NODE name="findIndex" type="FindIndex">
				<DESCRIPTION />
				<PARAMETERS>
					<PARAM name="mdFormat" type="string" managedBy="system" required="true">OPENAIRE</PARAM>
					<PARAM name="layout" type="string" managedBy="system" required="true">index</PARAM>
					<PARAM name="interpretation" type="string" managedBy="system" required="true">dedup</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC name="found" to="prepareIndex" />
					<ARC name="notFound" to="createIndex" />
				</ARCS>
			</NODE>
350

    
351
			<!-- CreateIndex node without parameters; reached through findIndex's "notFound" arc and
			     continuing to prepareIndex. -->
			<NODE name="createIndex" type="CreateIndex">
				<DESCRIPTION />
				<PARAMETERS />
				<ARCS>
					<ARC to="prepareIndex" />
				</ARCS>
			</NODE>
358

    
359
			<!-- PrepareDedupIndexJob node; reached from findIndex ("found") or createIndex.
			     rottenRecordsPathParam is 'rottenRecordsPath', the name referenced by cleanupRotten and updateIndex.
			     NOTE(review): the map-valued parameter is named 'dedupConfig' here while other nodes use 'envParams'
			     for such maps; confirm this is the name the node implementation expects.
			     Continues to cleanupRotten. -->
			<NODE name="prepareIndex" type="PrepareDedupIndexJob">
				<DESCRIPTION>Prepare indexing</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="rottenRecordsPathParam"  type="string" required="true" managedBy="system">rottenRecordsPath</PARAM>
					<PARAM required="true" type="string" name="dedupConfig" managedBy="system">
						{
							'dedupConfig' : 'dedup.conf'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="cleanupRotten" />
				</ARCS>
			</NODE>
373

    
374
			<!-- DeleteHdfsPathJob node on cluster 'DM'; envParams pairs 'path' with 'rottenRecordsPath'
			     (the name set as prepareIndex's rottenRecordsPathParam). Continues to updateIndex. -->
			<NODE name="cleanupRotten" type="DeleteHdfsPathJob">
				<DESCRIPTION>hdfs cleanup (rotten)</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'path' : 'rottenRecordsPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateIndex" />
				</ARCS>
			</NODE>
388
		
389
			<!-- SubmitHadoopJob node running hadoopJob 'dedupIndexFeedJob'.
			     envParams pairs 'mapred.output.dir' with 'rottenRecordsPath' and passes the index.* settings,
			     entity type, actionset and cluster names through unchanged.
			     sysParams pairs the hbase input table keys with 'hbase.mapred.datatable'.
			     Continues to finalize. -->
			<NODE name="updateIndex" type="SubmitHadoopJob">
				<DESCRIPTION>M/O index records</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupIndexFeedJob</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'mapred.output.dir' : 'rottenRecordsPath',
							'index.fields' : 'index.fields',
							'index.solr.url' : 'index.solr.url',
							'index.solr.collection' : 'index.solr.collection',
							'index.buffer.flush.threshold' : 'index.buffer.flush.threshold',
							'index.shutdown.wait.time' : 'index.shutdown.wait.time',
							'index.solr.sim.mode' : 'index.solr.sim.mode',
							'index.feed.timestamp' : 'index.feed.timestamp',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'actionset' : 'actionset',
							'cluster' : 'cluster'
						}
					</PARAM>
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
						{
							'hbase.mapred.inputtable' : 'hbase.mapred.datatable',
							'hbase.mapreduce.inputtable' : 'hbase.mapred.datatable'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="finalize" />
				</ARCS>
			</NODE>
420

    
421
			<!-- FinalizeDedupIndexFeeding node without parameters; follows updateIndex and continues to updateDs. -->
			<NODE name="finalize" type="FinalizeDedupIndexFeeding">
				<DESCRIPTION>commit changes</DESCRIPTION>
				<PARAMETERS />
				<ARCS>
					<ARC to="updateDs" />
				</ARCS>
			</NODE>
428

    
429
			<!-- IndexDsUpdateJob node without parameters; last node defined in this configuration.
			     Its arc targets 'success', which is not declared as a NODE in this profile.
			     NOTE(review): presumably a terminal state provided by the workflow engine; confirm. -->
			<NODE name="updateDs" type="IndexDsUpdateJob">
				<DESCRIPTION>update DS</DESCRIPTION>
				<PARAMETERS />
				<ARCS>
					<ARC to="success" />
				</ARCS>
			</NODE>
436
			
437
        </CONFIGURATION>
438
        <STATUS />
439
    </BODY>
440
</RESOURCE_PROFILE>
<!-- (25-25/32) -->