<?xml version="1.0" encoding="UTF-8"?>
<RESOURCE_PROFILE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <!-- Profile header. The RESOURCE_IDENTIFIER suffix after '_' is the Base64 form of
         "WorkflowDSResources/WorkflowDSResourceType", i.e. RESOURCE_KIND/RESOURCE_TYPE below. -->
    <HEADER>
        <RESOURCE_IDENTIFIER value="ab5d7de1-b23e-495a-9928-be62a6bbd8b6_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
    </HEADER>
    <BODY>
        <!-- Workflow identity: name, type and priority. -->
        <WORKFLOW_NAME>Offline Deduplication</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Deduplication</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
        <CONFIGURATION start="manual">
			<!-- Start node (isStart="true"); like setDedupConfigs and hadoopConfig it leads to
			     the checkConf join. -->
			<NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
				<DESCRIPTION />
				<PARAMETERS>
					<PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
					<PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="checkConf" />
				</ARCS>
			</NODE>
			<!-- Start node. dedupConfigSequence is user-managed and declares the function
			     obtainValues('dedupOrchestrations', {}). dedup.conf.queue is the value carried
			     by every dedupConfigSequenceParam in this configuration. -->
			<NODE name="setDedupConfigs" type="SetDedupConfiguration" isStart="true">
				<DESCRIPTION>Set Dedup conf</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dedupConfigSequence" type="string" managedBy="user" required="true" function="obtainValues('dedupOrchestrations', {})"></PARAM>
					<PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="checkConf" />
				</ARCS>
			</NODE>
			<!-- Start node. Carries cluster DM and tableParam tableName; tableName is the value
			     that the hbase.* keys of later envParams maps point at. -->
			<NODE name="hadoopConfig" type="SetClusterAndTable" isStart="true">
				<DESCRIPTION>Set table name</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="tableParam" type="string" managedBy="system" required="true">tableName</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="checkConf" />
				</ARCS>
			</NODE>
        	<!-- Join node (isJoin="true") with incoming arcs from the three start nodes;
			     continues to deduplicateScan. -->
        	<NODE name="checkConf" type="DedupCheckConfiguration" isJoin="true">
				<DESCRIPTION/>
				<PARAMETERS>
					<PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="deduplicateScan" />
				</ARCS>
			</NODE>
			<!-- The unnamed arc points back at this node. The two "done" arcs fork into the
			     action-set branch (prepareActionSet) and the user-actions branch
			     (queryUserActions); both branches meet again at doneActions.
			     NOTE(review): the self arc presumably repeats the scan once per entry of
			     dedup.conf.queue; confirm against the node implementation. -->
			<NODE name="deduplicateScan" type="DuplicateScanJob">
				<DESCRIPTION>Dup Scan</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupCandidateScanJob</PARAM>
					<PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="deduplicateScan" />
					<ARC name="done" to="prepareActionSet" />
					<ARC name="done" to="queryUserActions" />
				</ARCS>
			</NODE>
            <!-- First node of the user-actions branch, entered through a "done" arc of
                 deduplicateScan. It has no type attribute and no parameters; its single arc
                 goes to querySimilarities. -->
            <NODE name="queryUserActions">
                <DESCRIPTION>query user actions</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="querySimilarities"/>
                </ARCS>
            </NODE>
            <!-- Parameters name the SQL template querySimilaritiesBySet.sql.st, the property
                 dnet.dedup.db.name and the output simEpr, which buildSimilarityMesh takes as
                 its inputEprParam. -->
            <NODE name="querySimilarities" type="QueryUserActionDbJob">
                <DESCRIPTION>query similarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
                    <PARAM name="dbProperty" type="string" managedBy="system" required="true">dnet.dedup.db.name</PARAM>
                    <PARAM name="sql" type="string" managedBy="system" required="true">/eu/dnetlib/msro/workflows/dedup/querySimilaritiesBySet.sql.st</PARAM>
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">simEpr</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="buildSimilarityMesh"/>
                </ARCS>
            </NODE>
            <!-- inputEprParam simEpr is the outputEprParam of querySimilarities; outputEprParam
                 simMeshEpr is the inputEprParam of storeSimilarities. -->
            <NODE name="buildSimilarityMesh" type="BuildSimilarityMeshJob">
                <DESCRIPTION>build mesh</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">simEpr</PARAM>
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">simMeshEpr</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="storeSimilarities"/>
                </ARCS>
            </NODE>
            <!-- Takes simMeshEpr from buildSimilarityMesh. mapping and simulation are the two
                 user-managed parameters; simulation is not required and carries the value
                 false. -->
            <NODE name="storeSimilarities" type="StoreHBase">
                <DESCRIPTION>store similarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">simMeshEpr</PARAM>
                    <PARAM name="hbaseTableProperty" type="string" managedBy="system" required="true">hbase.mapred.datatable</PARAM>
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
                    <PARAM name="mapping" type="string" managedBy="user" required="true" function="obtainValues('dbmf2hbaseMappings', {})"></PARAM>
                    <PARAM name="simulation" type="boolean" managedBy="user" required="false">false</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="queryDissimilarities"/>
                </ARCS>
            </NODE>
            <!-- Same node type and parameter set as querySimilarities, but with the
                 queryDissimilaritiesBySet.sql.st template and the output dissimEpr. -->
            <NODE name="queryDissimilarities" type="QueryUserActionDbJob">
                <DESCRIPTION>query dissimilarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
                    <PARAM name="dbProperty" type="string" managedBy="system" required="true">dnet.dedup.db.name</PARAM>
                    <PARAM name="sql" type="string" managedBy="system" required="true">/eu/dnetlib/msro/workflows/dedup/queryDissimilaritiesBySet.sql.st</PARAM>
                    <PARAM name="outputEprParam" type="string" managedBy="system" required="true">dissimEpr</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="storeDissimilarities"/>
                </ARCS>
            </NODE>
            <!-- Takes dissimEpr from queryDissimilarities and uses the same table, cluster,
                 mapping and simulation parameters as storeSimilarities.
                 NOTE(review): the DESCRIPTION says "store" while the node type is
                 DeleteFromHBase; confirm which wording is intended. -->
            <NODE name="storeDissimilarities" type="DeleteFromHBase">
                <DESCRIPTION>store dissimilarity</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="inputEprParam" type="string" managedBy="system" required="true">dissimEpr</PARAM>
                    <PARAM name="hbaseTableProperty" type="string" managedBy="system" required="true">hbase.mapred.datatable</PARAM>
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
                    <PARAM name="mapping" type="string" managedBy="user" required="true" function="obtainValues('dbmf2hbaseMappings', {})"></PARAM>
                    <PARAM name="simulation" type="boolean" managedBy="user" required="false">false</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="doneUserActions"/>
                </ARCS>
            </NODE>
            <!-- Join node closing the user-actions branch. The only arc that targets it in this
                 configuration comes from storeDissimilarities. No type attribute and no
                 parameters. -->
            <NODE name="doneUserActions" isJoin="true">
                <DESCRIPTION>done user actions</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="doneActions"/>
                </ARCS>
            </NODE>
			<!-- First node of the action-set branch, entered through a "done" arc of
			     deduplicateScan. jobProperty rawSetId matches the rawSetId key in the
			     envParams of similarity2actions. -->
			<NODE name="prepareActionSet" type="PrepareConfiguredActionSet">
				<DESCRIPTION>prepare action sets</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
					<PARAM name="jobProperty" type="string" managedBy="system" required="true">rawSetId</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="similarity2actions" />
				</ARCS>
			</NODE>
			<!-- envParams and sysParams are map literals kept as element text. Here the output
			     table keys sit in sysParams and point at hbase.actions.table; in the other job
			     nodes they sit in envParams and point at tableName. -->
			<NODE name="similarity2actions" type="DedupSimilarityToActionsJobNode">
				<DESCRIPTION>export the similarity rels as Actions</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupSimilarity2ActionsJob</PARAM>
					<PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'rawSetId' : 'rawSetId',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName'
						}
					</PARAM>
					<PARAM name="sysParams" type="string" managedBy="system" required="true">
						{
							'hbase.mapred.outputtable' : 'hbase.actions.table',
							'hbase.mapreduce.outputtable' : 'hbase.actions.table'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateActionSets" />
				</ARCS>
			</NODE>
			<!-- Last node of the action-set branch; no parameters. Continues to the doneActions
			     join. -->
			<NODE name="updateActionSets" type="UpdateActionSets">
				<DESCRIPTION>update action sets</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="doneActions" />
				</ARCS>
			</NODE>
             <!-- Join node where the user-actions branch (via doneUserActions) and the
                  action-set branch (via updateActionSets) meet before dedupGrouper. The
                  DESCRIPTION used to repeat the text of doneUserActions. -->
             <NODE name="doneActions" isJoin="true">
                <DESCRIPTION>done actions</DESCRIPTION>
                <PARAMETERS/>
                <ARCS>
                    <ARC to="dedupGrouper"/>
                </ARCS>
            </NODE>
			<!-- The unnamed arc points back at this node; the "done" arc continues to
			     findRoots. -->
			<NODE name="dedupGrouper" type="DedupGrouperJob">
				<DESCRIPTION>dedup grouper</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupGrouperJob</PARAM>
					<PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="dedupGrouper" />
					<ARC name="done" to="findRoots" />
				</ARCS>
			</NODE>
			<!-- SubmitHadoopJob node for dedupFindRootsJob. Unlike dedupGrouper it carries no
			     dedupConfigSequenceParam. -->
			<NODE name="findRoots" type="SubmitHadoopJob">
				<DESCRIPTION>find roots</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupFindRootsJob</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="buildRoots" />
				</ARCS>
			</NODE>
			<!-- SubmitHadoopJob node for dedupBuildRootsJob. Its envParams is the only active
			     map with a relClasses key, the name given by relClassesName in
			     fetchRelClasses. -->
			<NODE name="buildRoots" type="SubmitHadoopJob">
				<DESCRIPTION>redirect rels</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupBuildRootsJob</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'dedup.conf' : 'dedup.conf',
							'relClasses' : 'relClasses',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="findIndex" />
				</ARCS>
			</NODE>
			<!-- Two named arcs leave this node: "found" goes straight to prepareIndex,
			     "notFound" goes through createIndex first. -->
			<NODE name="findIndex" type="FindIndex">
				<DESCRIPTION />
				<PARAMETERS>
					<PARAM name="mdFormat" type="string" managedBy="system" required="true">OPENAIRE</PARAM>
					<PARAM name="layout" type="string" managedBy="system" required="true">index</PARAM>
					<PARAM name="interpretation" type="string" managedBy="system" required="true">dedup</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC name="found" to="prepareIndex" />
					<ARC name="notFound" to="createIndex" />
				</ARCS>
			</NODE>
			<!-- Targeted only by the "notFound" arc of findIndex; rejoins the main path at
			     prepareIndex. No parameters. -->
			<NODE name="createIndex" type="CreateIndex">
				<DESCRIPTION />
				<PARAMETERS />
				<ARCS>
					<ARC to="prepareIndex" />
				</ARCS>
			</NODE>
			<!-- rottenRecordsPathParam names rottenRecordsPath, which cleanupRotten and
			     updateIndex both reference in their envParams. -->
			<NODE name="prepareIndex" type="PrepareDedupIndexJob">
				<DESCRIPTION>Prepare indexing</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="rottenRecordsPathParam" type="string" managedBy="system" required="true">rottenRecordsPath</PARAM>
					<PARAM name="dedupConfig" type="string" managedBy="system" required="true">
						{
							'dedupConfig' : 'dedup.conf'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="cleanupRotten" />
				</ARCS>
			</NODE>
			<!-- DeleteHdfsPathJob node on cluster DM; the path entry refers to
			     rottenRecordsPath, the name given by prepareIndex. -->
			<NODE name="cleanupRotten" type="DeleteHdfsPathJob">
				<DESCRIPTION>hdfs cleanup (rotten)</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'path' : 'rottenRecordsPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateIndex" />
				</ARCS>
			</NODE>
			<!-- SubmitHadoopJob node for dedupIndexFeedJob. mapred.output.dir refers to
			     rottenRecordsPath, the same name cleanupRotten uses for path. The input table
			     keys sit in sysParams and point at hbase.mapred.datatable. -->
			<NODE name="updateIndex" type="SubmitHadoopJob">
				<DESCRIPTION>M/O index records</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupIndexFeedJob</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
							'mapred.output.dir' : 'rottenRecordsPath',
							'index.fields' : 'index.fields',
							'index.solr.url' : 'index.solr.url',
							'index.solr.collection' : 'index.solr.collection',
							'index.buffer.flush.threshold' : 'index.buffer.flush.threshold',
							'index.shutdown.wait.time' : 'index.shutdown.wait.time',
							'index.solr.sim.mode' : 'index.solr.sim.mode',
							'index.feed.timestamp' : 'index.feed.timestamp',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'actionset' : 'actionset',
							'cluster' : 'cluster'
						}
					</PARAM>
					<PARAM name="sysParams" type="string" managedBy="system" required="true">
						{
							'hbase.mapred.inputtable' : 'hbase.mapred.datatable',
							'hbase.mapreduce.inputtable' : 'hbase.mapred.datatable'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="finalize" />
				</ARCS>
			</NODE>
			<!-- Follows updateIndex; no parameters. Continues to updateDs. -->
			<NODE name="finalize" type="FinalizeDedupIndexFeeding">
				<DESCRIPTION>commit changes</DESCRIPTION>
				<PARAMETERS />
				<ARCS>
					<ARC to="updateDs" />
				</ARCS>
			</NODE>
			<!-- Last active node of the configuration.
			     NOTE(review): no NODE named "success" is defined in this configuration;
			     presumably the workflow engine provides that terminal state. Confirm. -->
			<NODE name="updateDs" type="IndexDsUpdateJob">
				<DESCRIPTION>update DS</DESCRIPTION>
				<PARAMETERS />
				<ARCS>
					<ARC to="success" />
				</ARCS>
			</NODE>
<!-- Disabled CSV export branch (setCsvPath, cleanupCsv, roots2CSV). No ARC in the active
     configuration targets setCsvPath and none of these nodes is marked isStart, so the
     branch would stay unreachable if it were simply uncommented.

			<NODE name="setCsvPath" type="SetEnvParameter">
				<DESCRIPTION>Set the CSV file path on HDFS</DESCRIPTION>
				<PARAMETERS>
					<PARAM managedBy="system" name="parameterName" required="true" type="string">csvPath</PARAM>
					<PARAM managedBy="user" name="parameterValue" required="false" type="string"></PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="cleanupCsv" />
				</ARCS>
			</NODE>

			<NODE name="cleanupCsv" type="DeleteHdfsPathJob">
				<DESCRIPTION>CSV files cleanup</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'path' : 'csvPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="roots2CSV" />
				</ARCS>
			</NODE>

			<NODE name="roots2CSV" type="DedupConfigurationAwareJobLoader">
				<DESCRIPTION>export the representative entities as CSV files</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupRootsToCSVJob</PARAM>
					<PARAM managedBy="system" name="dedupConfigSequenceParam" required="true" type="string">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'relClasses' : 'relClasses',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'mapred.output.dir' : 'csvPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="success" />
				</ARCS>
			</NODE>
 -->
        </CONFIGURATION>
        <STATUS />
    </BODY>
</RESOURCE_PROFILE>