<?xml version="1.0" encoding="UTF-8"?>
<!--
    Workflow profile "Offline Deduplication (Person Full)"
    (WORKFLOW_TYPE Deduplication, CONFIGURATION start="manual").

    Node graph, as wired by the ARC elements below:

      fetchRelClasses, setDedupConfigs, hadoopConfig, setPath   (all isStart="true")
          => checkConf          (isJoin="true")
          => resetCountersJob
          => deduplicateScan    (unnamed arc loops back to itself; "done" => checkDone)
          => checkDone          ("continue" => dedupGrouper; unnamed arc => deleteSimRels)
          => dedupGrouper       (unnamed arc loops back to itself; "done" => findRoots)
          => findRoots => cleanupMap => buildAnchorMap => updateCoAuthors
          => deleteSimRels
          => checkExit          ("continue" => resetCountersJob; unnamed arc => success)

    NOTE(review): no node named "success" is declared in this profile; presumably the
    workflow engine supplies it as the terminal node. Confirm.
-->
<RESOURCE_PROFILE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <HEADER>
        <RESOURCE_IDENTIFIER value="31483043-7dd0-435f-b76e-bad9107aecc4_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
        <RESOURCE_KIND value="WorkflowDSResources"/>
        <RESOURCE_URI value=""/>
        <DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
    </HEADER>
    <BODY>
        <WORKFLOW_NAME>Offline Deduplication (Person Full)</WORKFLOW_NAME>
        <WORKFLOW_TYPE>Deduplication</WORKFLOW_TYPE>
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
        <CONFIGURATION start="manual">

            <!-- Start nodes: each one publishes a setting and then joins at checkConf. -->

            <NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
                <DESCRIPTION/>
                <PARAMETERS>
                    <PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
                    <PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="checkConf"/>
                </ARCS>
            </NODE>

            <!-- The only node with a user-managed, initially empty value: dedupConfigSequence. -->
            <NODE name="setDedupConfigs" type="SetDedupConfiguration" isStart="true">
                <DESCRIPTION>Set Dedup conf</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="dedupConfigSequence" type="string" managedBy="user" required="true" function="obtainValues('dedupOrchestrations', {})"></PARAM>
                    <PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="checkConf"/>
                </ARCS>
            </NODE>

            <NODE name="hadoopConfig" type="SetClusterAndTable" isStart="true">
                <DESCRIPTION>Set table name</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
                    <PARAM name="tableParam" type="string" managedBy="system" required="true">tableName</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="checkConf"/>
                </ARCS>
            </NODE>

            <!-- "mapPath" is the name later referenced by cleanupMap, buildAnchorMap and updateCoAuthors. -->
            <NODE name="setPath" type="SetHdfsPathJob" isStart="true">
                <DESCRIPTION>Set map path name</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="path" type="string" managedBy="user" required="true">/tmp/anchorMap</PARAM>
                    <PARAM name="pathParam" type="string" managedBy="system" required="true">mapPath</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="checkConf"/>
                </ARCS>
            </NODE>

            <!-- Join point for the four start nodes above. -->
            <NODE name="checkConf" type="DedupCheckConfiguration" isJoin="true">
                <DESCRIPTION/>
                <PARAMETERS>
                    <PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="resetCountersJob"/>
                </ARCS>
            </NODE>

            <!-- Re-entered from checkExit through its "continue" arc. -->
            <NODE name="resetCountersJob" type="ResetCountersJob">
                <DESCRIPTION>resets the loop counter</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="attributesCSV" type="string" managedBy="system" required="true">blackboard:param:person.dedupSimilarity (x2),dedup.grouper.looper</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="deduplicateScan"/>
                </ARCS>
            </NODE>

            <!--
                The unnamed arc points back at this node; the "done" arc leaves the loop.
                NOTE(review): unlike dedupGrouper, findRoots and deleteSimRels, envParams here
                carries no 'dedup.conf' entry. Confirm that this is intended.
            -->
            <NODE name="deduplicateScan" type="DuplicateScanJob">
                <DESCRIPTION>Dup Scan</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupCandidateScanJob</PARAM>
                    <PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'entityTypeId' : 'entityTypeId',
                            'entityType' : 'entityType',
                            'cluster' : 'cluster',
                            'hbase.mapred.inputtable' : 'tableName',
                            'hbase.mapred.outputtable' : 'tableName',
                            'hbase.mapreduce.inputtable' : 'tableName',
                            'hbase.mapreduce.outputtable' : 'tableName'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="deduplicateScan"/>
                    <ARC name="done" to="checkDone"/>
                </ARCS>
            </NODE>

            <!--
                Branches on the same blackboard parameter that resetCountersJob lists:
                "continue" goes on to grouping, the unnamed arc skips ahead to deleteSimRels.
            -->
            <NODE name="checkDone" type="CheckDoneJob">
                <DESCRIPTION>checks if iteration is completed</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="param" type="string" managedBy="system" required="true">blackboard:param:person.dedupSimilarity (x2)</PARAM>
                    <PARAM name="exitArc" type="string" managedBy="system" required="true">continue</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC name="continue" to="dedupGrouper"/>
                    <ARC to="deleteSimRels"/>
                </ARCS>
            </NODE>

            <!-- The unnamed arc points back at this node; the "done" arc leaves the loop. -->
            <NODE name="dedupGrouper" type="DedupGrouperJob">
                <DESCRIPTION>dedup grouper</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupGrouperJob</PARAM>
                    <PARAM name="dedupConfigSequenceParam" type="string" managedBy="system" required="true">dedup.conf.queue</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'dedup.conf' : 'dedup.conf',
                            'entityTypeId' : 'entityTypeId',
                            'entityType' : 'entityType',
                            'cluster' : 'cluster',
                            'hbase.mapred.inputtable' : 'tableName',
                            'hbase.mapred.outputtable' : 'tableName',
                            'hbase.mapreduce.inputtable' : 'tableName',
                            'hbase.mapreduce.outputtable' : 'tableName'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="dedupGrouper"/>
                    <ARC name="done" to="findRoots"/>
                </ARCS>
            </NODE>

            <NODE name="findRoots" type="SubmitHadoopJob">
                <DESCRIPTION>find roots</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="hadoopJob" type="string" managedBy="system" required="true">dedupFindPersonRootsJob</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'dedup.conf' : 'dedup.conf',
                            'entityTypeId' : 'entityTypeId',
                            'entityType' : 'entityType',
                            'cluster' : 'cluster',
                            'hbase.mapred.inputtable' : 'tableName',
                            'hbase.mapred.outputtable' : 'tableName',
                            'hbase.mapreduce.inputtable' : 'tableName',
                            'hbase.mapreduce.outputtable' : 'tableName'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="cleanupMap"/>
                </ARCS>
            </NODE>

            <!-- postprocess -->

            <!-- Targets the path registered under "mapPath" by setPath; runs right before buildAnchorMap. -->
            <NODE name="cleanupMap" type="DeleteHdfsPathJob">
                <DESCRIPTION>hdfs cleanup (xml)</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'path' : 'mapPath'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="buildAnchorMap"/>
                </ARCS>
            </NODE>

            <!-- Passes "mapPath" as 'mapred.output.dir'. -->
            <NODE name="buildAnchorMap" type="SubmitHadoopJob">
                <DESCRIPTION>redirect rels</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="hadoopJob" type="string" managedBy="system" required="true">buildMergedToAnchorMapJob</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'cluster' : 'cluster',
                            'hbase.mapred.inputtable' : 'tableName',
                            'hbase.mapreduce.inputtable' : 'tableName',
                            'hbase.mapred.outputtable' : 'tableName',
                            'hbase.mapreduce.outputtable' : 'tableName',
                            'mapred.output.dir' : 'mapPath'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="updateCoAuthors"/>
                </ARCS>
            </NODE>

            <!--
                NOTE(review): envParams is identical to buildAnchorMap's, including
                'mapred.output.dir' : 'mapPath'. Confirm that this job is meant to use the
                same path as an output directory rather than as its input.
            -->
            <NODE name="updateCoAuthors" type="SubmitHadoopJob">
                <DESCRIPTION>update co-authors</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="hadoopJob" type="string" managedBy="system" required="true">coauthorUpdateJob</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'cluster' : 'cluster',
                            'hbase.mapred.inputtable' : 'tableName',
                            'hbase.mapreduce.inputtable' : 'tableName',
                            'hbase.mapred.outputtable' : 'tableName',
                            'hbase.mapreduce.outputtable' : 'tableName',
                            'mapred.output.dir' : 'mapPath'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="deleteSimRels"/>
                </ARCS>
            </NODE>

            <!-- Reached either from updateCoAuthors or directly from checkDone's unnamed arc. -->
            <NODE name="deleteSimRels" type="SubmitHadoopJob">
                <DESCRIPTION>delete similarity rels</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="hadoopJob" type="string" managedBy="system" required="true">deleteSimRelJob</PARAM>
                    <PARAM name="envParams" type="string" managedBy="system" required="true">
                        {
                            'dedup.conf' : 'dedup.conf',
                            'entityTypeId' : 'entityTypeId',
                            'entityType' : 'entityType',
                            'cluster' : 'cluster',
                            'hbase.mapred.inputtable' : 'tableName',
                            'hbase.mapreduce.inputtable' : 'tableName',
                            'hbase.mapred.outputtable' : 'tableName',
                            'hbase.mapreduce.outputtable' : 'tableName'
                        }
                    </PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC to="checkExit"/>
                </ARCS>
            </NODE>

            <!--
                Same parameters as checkDone: "continue" starts another pass at
                resetCountersJob, the unnamed arc ends the workflow at "success".
            -->
            <NODE name="checkExit" type="CheckDoneJob">
                <DESCRIPTION>checks if iteration is completed</DESCRIPTION>
                <PARAMETERS>
                    <PARAM name="param" type="string" managedBy="system" required="true">blackboard:param:person.dedupSimilarity (x2)</PARAM>
                    <PARAM name="exitArc" type="string" managedBy="system" required="true">continue</PARAM>
                </PARAMETERS>
                <ARCS>
                    <ARC name="continue" to="resetCountersJob"/>
                    <ARC to="success"/>
                </ARCS>
            </NODE>

        </CONFIGURATION>
        <STATUS/>
    </BODY>
</RESOURCE_PROFILE>