<?xml version="1.0" encoding="UTF-8"?>
<RESOURCE_PROFILE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <!-- Resource header: identifier, type, kind, URI and creation timestamp of
         this profile. The part of RESOURCE_IDENTIFIER after the underscore is
         the Base64 encoding of "WorkflowDSResources/WorkflowDSResourceType",
         i.e. the RESOURCE_KIND and RESOURCE_TYPE values joined by "/". -->
    <HEADER>
        <RESOURCE_IDENTIFIER value="ab5d7de1-b23e-495a-9928-be62a6bbd8b6_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
    </HEADER>
    <BODY>
        <WORKFLOW_NAME>Offline Deduplication</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Deduplication</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
        <CONFIGURATION start="manual">
			<!-- fetchRelClasses: one of the three isStart="true" nodes.
			     Both parameters are system-managed. The value of relClassesName
			     ("relClasses") is the same name that buildRoots and roots2CSV map
			     in their envParams. The single arc leads to the SELECT_MODE join.
			     NOTE(review): presumably the node resolves the property named by
			     relClassesProperty and publishes the result under relClassesName;
			     confirm against the FetchRelClasses node implementation. -->
			<NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
				<DESCRIPTION />
				<PARAMETERS>
					<PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
					<PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="SELECT_MODE" />
				</ARCS>
			</NODE>
			<!-- setDedupConfigs: start node (isStart="true").
			     dedupConfigSequence is the only user-managed parameter; its
			     candidate values come from obtainValues('dedupOrchestrations', {}).
			     dedupConfigSequenceParam holds the name "dedup.conf.queue", the
			     same value the job nodes further down declare in their own
			     dedupConfigSequenceParam. The single arc leads to SELECT_MODE.
			     NOTE(review): presumably the chosen sequence is stored under
			     that name for the later nodes; confirm in SetDedupConfiguration. -->
			<NODE name="setDedupConfigs" type="SetDedupConfiguration" isStart="true">
				<DESCRIPTION>Set Dedup conf</DESCRIPTION>
				<PARAMETERS>
					<PARAM function="obtainValues('dedupOrchestrations', {})" required="true" type="string" name="dedupConfigSequence" managedBy="user"></PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="SELECT_MODE" />
				</ARCS>
			</NODE>
			<!-- hadoopConfig: start node (isStart="true").
			     cluster is fixed by the system to "DM"; table is user-managed and
			     pre-filled with "db_stdl". tableParam holds the name "tableName",
			     which is the value the job nodes map onto the hbase input and
			     output table properties in their envParams.
			     The single arc leads to SELECT_MODE. -->
			<NODE name="hadoopConfig" type="SetClusterAndTable" isStart="true">
				<DESCRIPTION>Set table name</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="table" managedBy="user">db_stdl</PARAM>
					<PARAM required="true" type="string" name="tableParam" managedBy="system">tableName</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="SELECT_MODE" />
				</ARCS>
			</NODE>
        	<!-- SELECT_MODE: join node (isJoin="true") targeted by the arcs of the
        	     three start nodes. The user-managed "selection" parameter is
        	     restricted by validValues to four modes, and each named ARC below
        	     carries one of those mode names and points at the first node of
        	     the matching branch. -->
        	<NODE name="SELECT_MODE" type="Selection" isJoin="true">
				<DESCRIPTION>Select Workflow execution MODE.</DESCRIPTION>
				<PARAMETERS>
					<PARAM function="validValues(['DUPLICATE_SCAN', 'CLOSE_MESH', 'EXPORT_CSV', 'EXPORT_ACTIONS'])" managedBy="user" name="selection" required="true" type="string"></PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC name="DUPLICATE_SCAN" to="deduplicateScan" />
					<ARC name="CLOSE_MESH" to="dedupGrouper" />
					<ARC name="EXPORT_CSV" to="setCsvPath" />
					<ARC name="EXPORT_ACTIONS" to="prepareActionSet" />
				</ARCS>
			</NODE>
			<!-- deduplicateScan: entry of the DUPLICATE_SCAN branch.
			     Submits the job named in hadoopJob (dedupCandidateScanJob).
			     envParams is a map literal pairing job property names (left) with
			     workflow env names (right); every table property maps to
			     "tableName", the name declared by hadoopConfig/tableParam.
			     The unnamed arc points back to this node and the arc named "done"
			     leads to "success".
			     NOTE(review): presumably the node re-enters itself once per
			     configuration queued under dedup.conf.queue; confirm in
			     DuplicateScanJob.
			     NOTE(review): unlike the other job nodes, this map has no
			     'dedup.conf' entry; confirm that is intentional. -->
			<NODE name="deduplicateScan" type="DuplicateScanJob">
				<DESCRIPTION>Dup Scan</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupCandidateScanJob</PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="deduplicateScan" />
					<ARC name="done" to="success" />
				</ARCS>
			</NODE>
			<!-- dedupGrouper: entry of the CLOSE_MESH branch.
			     Submits the job named in hadoopJob (dedupGrouperJob).
			     envParams is a map literal pairing job property names (left) with
			     workflow env names (right). The unnamed arc points back to this
			     node and the arc named "done" continues to findRoots.
			     NOTE(review): presumably the node re-enters itself once per
			     configuration queued under dedup.conf.queue; confirm in
			     DedupGrouperJob. -->
			<NODE name="dedupGrouper" type="DedupGrouperJob">
				<DESCRIPTION>dedup grouper</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupGrouperJob</PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="dedupGrouper" />
					<ARC name="done" to="findRoots" />
				</ARCS>
			</NODE>
			<!-- findRoots: second step of the CLOSE_MESH branch, reached from
			     the "done" arc of dedupGrouper.
			     Submits the job named in hadoopJob (dedupFindRootsJob) and has no
			     dedupConfigSequenceParam of its own. envParams pairs job property
			     names (left) with workflow env names (right).
			     The single arc continues to buildRoots. -->
			<NODE name="findRoots" type="SubmitHadoopJob">
				<DESCRIPTION>find roots</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupFindRootsJob</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="buildRoots" />
				</ARCS>
			</NODE>
			<!-- buildRoots: last step of the CLOSE_MESH branch, reached from
			     findRoots. Submits the job named in hadoopJob (dedupBuildRootsJob).
			     Besides the entries shared with findRoots, envParams also maps
			     'relClasses', the name declared by fetchRelClasses/relClassesName.
			     The single arc leads to "success".
			     NOTE(review): the DESCRIPTION text "redirect rels" does not match
			     the node or job name; confirm whether it is a leftover from
			     another node before changing it. -->
			<NODE name="buildRoots" type="SubmitHadoopJob">
				<DESCRIPTION>redirect rels</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupBuildRootsJob</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'relClasses' : 'relClasses',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'hbase.mapred.outputtable' : 'tableName',
							'hbase.mapreduce.outputtable' : 'tableName'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="success" />
				</ARCS>
			</NODE>
        	<!-- setCsvPath: entry of the EXPORT_CSV branch.
        	     parameterName is fixed by the system to "csvPath", the env name
        	     that cleanupCsv and roots2CSV map in their envParams.
        	     parameterValue is user-managed, optional (required="false") and
        	     empty by default. The single arc continues to cleanupCsv.
        	     NOTE(review): the path is optional here but is later passed to a
        	     delete node and used as the job output directory; confirm how an
        	     empty value is handled downstream. -->
        	<NODE name="setCsvPath" type="SetEnvParameter">
				<DESCRIPTION>Set the CSV file path on HDFS</DESCRIPTION>
				<PARAMETERS>
					<PARAM managedBy="system" name="parameterName" required="true" type="string">csvPath</PARAM>
					<PARAM managedBy="user" name="parameterValue" required="false" type="string"></PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="cleanupCsv" />
				</ARCS>
			</NODE>
			<!-- cleanupCsv: second step of the EXPORT_CSV branch.
			     cluster is fixed by the system to "DM". envParams maps the
			     property 'path' to the env name "csvPath" declared by setCsvPath.
			     The single arc continues to roots2CSV.
			     NOTE(review): presumably this removes the previous export at that
			     HDFS path before roots2CSV writes to it; confirm in
			     DeleteHdfsPathJob. -->
			<NODE name="cleanupCsv" type="DeleteHdfsPathJob">
				<DESCRIPTION>CSV files cleanup</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'path' : 'csvPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="roots2CSV" />
				</ARCS>
			</NODE>
			<!-- roots2CSV: last step of the EXPORT_CSV branch.
			     Submits the job named in hadoopJob (dedupRootsToCSVJob).
			     envParams maps only the hbase input table properties (to
			     "tableName") and sends 'mapred.output.dir' to the env name
			     "csvPath"; there are no hbase output table entries.
			     The single arc leads to "success". -->
			<NODE name="roots2CSV" type="DedupConfigurationAwareJobLoader">
				<DESCRIPTION>export the representative entities as CSV files</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupRootsToCSVJob</PARAM>
					<PARAM managedBy="system" name="dedupConfigSequenceParam" required="true" type="string">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'relClasses' : 'relClasses',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName',
							'mapred.output.dir' : 'csvPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="success" />
				</ARCS>
			</NODE>
			<!-- prepareActionSet: entry of the EXPORT_ACTIONS branch.
			     Both parameters are system-managed. jobProperty holds the name
			     "rawSetId", which similarity2actions maps in its envParams.
			     The single arc continues to similarity2actions.
			     NOTE(review): presumably the node publishes the prepared raw set
			     id under that name; confirm in PrepareConfiguredActionSet. -->
			<NODE name="prepareActionSet" type="PrepareConfiguredActionSet">
				<DESCRIPTION>prepare action sets</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="jobProperty" managedBy="system">rawSetId</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="similarity2actions" />
				</ARCS>
			</NODE>
			<!-- similarity2actions: second step of the EXPORT_ACTIONS branch.
			     Submits the job named in hadoopJob (dedupSimilarity2ActionsJob).
			     envParams maps the hbase input table properties to "tableName"
			     and passes "rawSetId" through. The hbase output table properties
			     are listed separately in sysParams and map to
			     'hbase.actions.table'. The single arc continues to
			     updateActionSets.
			     NOTE(review): presumably sysParams values are looked up as system
			     properties rather than workflow env entries; confirm in
			     DedupSimilarityToActionsJobNode. -->
			<NODE name="similarity2actions" type="DedupSimilarityToActionsJobNode">
				<DESCRIPTION>export the similarity rels as Actions</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">dedupSimilarity2ActionsJob</PARAM>
					<PARAM required="true" type="string" name="dedupConfigSequenceParam" managedBy="system">dedup.conf.queue</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
							'dedup.conf' : 'dedup.conf',
							'entityTypeId' : 'entityTypeId',
							'entityType' : 'entityType',
							'cluster' : 'cluster',
							'rawSetId' : 'rawSetId',
							'hbase.mapred.inputtable' : 'tableName',
							'hbase.mapreduce.inputtable' : 'tableName'
						}
					</PARAM>
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
						{
							'hbase.mapred.outputtable' : 'hbase.actions.table',
							'hbase.mapreduce.outputtable' : 'hbase.actions.table'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateActionSets" />
				</ARCS>
			</NODE>
			<!-- updateActionSets: last step of the EXPORT_ACTIONS branch, reached
			     from similarity2actions. It declares no parameters and its
			     single arc leads to "success". -->
			<NODE name="updateActionSets" type="UpdateActionSets">
				<DESCRIPTION>update action sets</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="success" />
				</ARCS>
			</NODE>
        </CONFIGURATION>
        <STATUS />
    </BODY>
</RESOURCE_PROFILE>