Project

General

Profile

1
<RESOURCE_PROFILE>
2
	<!-- Profile header: resource identifier, type (WorkflowDSResourceType), kind
	     (WorkflowDSResources), an empty resource URI and the creation timestamp. -->
	<HEADER>
		<RESOURCE_IDENTIFIER value="a4434d62-d4cd-4c73-a107-bc7c62e6f815_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
		<RESOURCE_TYPE value="WorkflowDSResourceType"/>
		<RESOURCE_KIND value="WorkflowDSResources"/>
		<RESOURCE_URI value=""/>
		<DATE_OF_CREATION value="2014-08-01T18:13:51.0Z"/>
	</HEADER>
	<BODY>
10
		<WORKFLOW_NAME>Data Provision</WORKFLOW_NAME>
11
		<WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
12
		<WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
13
		<CONFIGURATION start="manual">
14

    
15
			<!-- PREPARE NODES -->
16
			<!-- Start node (isStart="true"). Sets format=DMF, layout=index and
			     interpretation=openaire (all three user-managed), then continues to findIndex. -->
			<NODE name="setInfo" type="SetFormatInfo" isStart="true">
				<DESCRIPTION>set mdformat, layout, interpretation</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="format" type="string" managedBy="user" required="true">DMF</PARAM>
					<PARAM name="layout" type="string" managedBy="user" required="true">index</PARAM>
					<PARAM name="interpretation" type="string" managedBy="user" required="true">openaire</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="findIndex"/>
				</ARCS>
			</NODE>
			<!-- Start node (isStart="true"). System-managed params name the property
			     dnet.openaire.model.relclasses.xquery and the name relClasses; continues to the
			     groupEntities join.
			     NOTE(review): presumably the result is published as 'relClasses', which the
			     groupEntities envParams reference; confirm against the FetchRelClasses node. -->
			<NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
				<DESCRIPTION/>
				<PARAMETERS>
					<PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
					<PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<!-- Start node (isStart="true") of type LoadContextsJob, no parameters; continues to
			     the groupEntities join.
			     NOTE(review): presumably supplies the 'contextmap' value referenced by the
			     groupEntities envParams; confirm against the LoadContextsJob node. -->
			<NODE name="fetchContexts" type="LoadContextsJob" isStart="true">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<!-- Start node (isStart="true") of type LoadEntityLinksJob, no parameters; continues
			     to the groupEntities join.
			     NOTE(review): presumably supplies the 'index.entity.links' value referenced by
			     the groupEntities and updateStats envParams; confirm against LoadEntityLinksJob. -->
			<NODE name="fetchEntityLinks" type="LoadEntityLinksJob" isStart="true">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>

    
52
			<!-- UPDATE INDEX -->
53
			<!-- Two-way branch, reached from setInfo: arc "found" goes straight to
			     prepareIndexing, arc "notFound" goes to createIndex first. -->
			<NODE name="findIndex" type="FindIndex">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC name="found" to="prepareIndexing"/>
					<ARC name="notFound" to="createIndex"/>
				</ARCS>
			</NODE>
			<!-- Reached only through the "notFound" arc of findIndex; no parameters.
			     Continues to prepareIndexing. -->
			<NODE name="createIndex" type="CreateIndex">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="prepareIndexing"/>
				</ARCS>
			</NODE>
			<!-- PrepareIndexJob node. System-managed params give the names hdfsRecordsPath and
			     rottenRecordsPath (the same names the cleanup and Hadoop nodes reference in their
			     envParams), the layout-to-record stylesheet path and the oaf.schema.location
			     property name. Fans out to cleanupXml and cleanupRotten. -->
			<NODE name="prepareIndexing" type="PrepareIndexJob">
				<DESCRIPTION>Prepare indexing</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="outputRecordsPathParam" type="string" managedBy="system" required="true">hdfsRecordsPath</PARAM>
					<PARAM name="rottenRecordsPathParam" type="string" managedBy="system" required="true">rottenRecordsPath</PARAM>
					<PARAM name="layoutToRecordStylesheet" type="string" managedBy="system" required="true">/eu/dnetlib/msro/openaireplus/workflows/index/openaireLayoutToRecordStylesheet.xsl</PARAM>
					<PARAM name="oafSchemaLocationProperty" type="string" managedBy="system" required="true">oaf.schema.location</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="cleanupXml"/>
					<ARC to="cleanupRotten"/>
				</ARCS>
			</NODE>
			<!-- DeleteHdfsPathJob on cluster DM; envParams maps 'path' to 'hdfsRecordsPath'.
			     Continues to the groupEntities join.
			     NOTE(review): presumably the right-hand value is the name of a workflow env
			     attribute holding the actual HDFS path; confirm against DeleteHdfsPathJob. -->
			<NODE name="cleanupXml" type="DeleteHdfsPathJob">
				<DESCRIPTION>hdfs cleanup (xml)</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
						'path' : 'hdfsRecordsPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<!-- DeleteHdfsPathJob on cluster DM; envParams maps 'path' to 'rottenRecordsPath'.
			     Continues to the groupEntities join.
			     NOTE(review): presumably the right-hand value is the name of a workflow env
			     attribute holding the actual HDFS path; confirm against DeleteHdfsPathJob. -->
			<NODE name="cleanupRotten" type="DeleteHdfsPathJob">
				<DESCRIPTION>hdfs cleanup (rotten)</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
						'path' : 'rottenRecordsPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<!-- Join node (isJoin="true"): arc target of fetchRelClasses, fetchContexts,
			     fetchEntityLinks, cleanupXml and cleanupRotten. Submits Hadoop job
			     prepareIndexDataJob on cluster DM with 'hdfsRecordsPath' given for
			     mapred.output.dir, then fans out to updateIndex, setOAIFormat and prepareStats.
			     NOTE(review): presumably sysParams values name system properties and envParams
			     values name workflow env attributes; confirm against SubmitHadoopJob. -->
			<NODE name="groupEntities" type="SubmitHadoopJob" isJoin="true">
				<DESCRIPTION>M/R group entities</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">prepareIndexDataJob</PARAM>
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
						{
						'hbase.mapred.inputtable' : 'hbase.mapred.datatable',
						'hbase.mapreduce.inputtable' : 'hbase.mapred.datatable'
						}
					</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
						'mapred.output.dir' : 'hdfsRecordsPath',
						'index.entity.links' : 'index.entity.links',
						'oaf.schema.location' : 'oaf.schema.location',
						'contextmap' : 'contextmap',
						'relClasses' : 'relClasses'
						}
					</PARAM>
					<PARAM required="true" type="boolean" name="simulation" managedBy="user">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateIndex"/>
					<ARC to="setOAIFormat"/>
					<ARC to="prepareStats"/>
				</ARCS>
			</NODE>
			<!-- Submits Hadoop job indexFeedJob on cluster DM. envParams give 'hdfsRecordsPath'
			     for mapred.input.dir (the output of groupEntities) and 'rottenRecordsPath' for
			     mapred.output.dir, plus the index.* settings. Continues to finalize.
			     The user-managed "simulation" flag defaults to false here. -->
			<NODE name="updateIndex" type="SubmitHadoopJob">
				<DESCRIPTION>M/O index records</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">indexFeedJob</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
						'mapred.input.dir' : 'hdfsRecordsPath',
						'mapred.output.dir' : 'rottenRecordsPath',
						'index.xslt' : 'index.xslt',
						'index.solr.url' : 'index.solr.url',
						'index.solr.collection' : 'index.solr.collection',
						'index.buffer.flush.threshold' : 'index.buffer.flush.threshold',
						'index.shutdown.wait.time' : 'index.shutdown.wait.time',
						'index.solr.sim.mode' : 'index.solr.sim.mode',
						'index.feed.timestamp' : 'index.feed.timestamp'
						}
					</PARAM>
					<PARAM required="true" type="boolean" name="simulation" managedBy="user">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="finalize"/>
				</ARCS>
			</NODE>
			<!-- FinalizeIndexFeeding node, reached from updateIndex; no parameters.
			     Continues to updateDs. -->
			<NODE name="finalize" type="FinalizeIndexFeeding">
				<DESCRIPTION>commit changes</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="updateDs"/>
				</ARCS>
			</NODE>
			<!-- IndexDsUpdateJob node, reached from finalize; no parameters.
			     Last step of the index branch: continues to the waitAll join. -->
			<NODE name="updateDs" type="IndexDsUpdateJob">
				<DESCRIPTION>update DS</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>

    
176
			<!--  UPDATE STATS -->
177
			<!-- PrepareStatsParams node, reached from groupEntities; no parameters.
			     Continues to updateStats. -->
			<NODE name="prepareStats" type="PrepareStatsParams">
				<DESCRIPTION>set params for stats</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="updateStats"/>
				</ARCS>
			</NODE>
			<!-- Submits Hadoop job StatsExportJob on cluster DM. sysParams give
			     'hbase.mapred.datatable' for Stats_Hbase_Source_Table; envParams give nameNode,
			     jobTracker, isLookupEndpoint and 'index.entity.links' (as Stats_indexConf).
			     Last step of the stats branch: continues to the waitAll join. -->
			<NODE name="updateStats" type="SubmitHadoopJob">
				<DESCRIPTION>Update stats DB</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">StatsExportJob</PARAM>
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
						{
						'Stats_Hbase_Source_Table' : 'hbase.mapred.datatable'
						}
					</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
						'nameNode' : 'nameNode',
						'jobTracker' : 'jobTracker',
						'isLookupEndpoint' : 'isLookupEndpoint',
						'Stats_indexConf' : 'index.entity.links'
						}
					</PARAM>
					<PARAM required="true" type="boolean" name="simulation" managedBy="user">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>

    
209
			<!-- OAI STORE UPDATE -->
210
			<!-- Same node type as setInfo (SetFormatInfo), but for the OAI branch: format=oaf,
			     layout=index, interpretation=openaire. The extra formatParam, layoutParam and
			     interpretationParam give the names oai_format, oai_layout and oai_interpretation,
			     which the later OAI nodes reference. Continues to prepareOAI. -->
			<NODE name="setOAIFormat" type="SetFormatInfo">
				<DESCRIPTION>set format, layout, interpretation for OAI publisher</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="format" type="string" managedBy="user" required="true">oaf</PARAM>
					<PARAM name="layout" type="string" managedBy="user" required="true">index</PARAM>
					<PARAM name="interpretation" type="string" managedBy="user" required="true">openaire</PARAM>
					<PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
					<PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
					<PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="prepareOAI"/>
				</ARCS>
			</NODE>
			<!-- PrepareOaiJob node. User-managed settings: OAI database name (oaistore),
			     skipDuplicates (true) and the duplicateXPath that matches entities whose
			     datainfo/deletedbyinference text is 'true'. System-managed params give the names
			     oaiStoreCollection, oai_dbName, oai_format, oai_layout and oai_interpretation.
			     Continues to OAIDropStore. -->
			<NODE name="prepareOAI" type="PrepareOaiJob">
				<DESCRIPTION>Prepare params for OAI store feeding</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
					<PARAM name="oaiDBName" type="string" managedBy="user" required="true">oaistore</PARAM>
					<PARAM name="oaiDBNameParam" type="string" managedBy="system" required="true">oai_dbName</PARAM>
					<PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
					<PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
					<PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
					<PARAM name="skipDuplicates" type="boolean" managedBy="user" required="true">true</PARAM>
					<PARAM name="duplicateXPath" type="string" managedBy="user" required="true">//*[local-name()='entity']//*[local-name()='datainfo']/*[local-name()='deletedbyinference'][./text() = 'true']</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="OAIDropStore"/>
				</ARCS>
			</NODE>
			<!-- OAIDropStore node: per its description the store is dropped and rebuilt rather
			     than upserted. Params reference oai_dbName, oai_format, oai_layout and
			     oai_interpretation. Continues to OAICreateStore. -->
			<NODE name="OAIDropStore" type="OAIDropStore">
				<DESCRIPTION>Drop the store: upserts are too expensive!</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
					<PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
					<PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
					<PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="OAICreateStore"/>
				</ARCS>
			</NODE>
			<!-- OAICreateStore node, reached from OAIDropStore, with the same four param
			     references (oai_dbName, oai_format, oai_layout, oai_interpretation).
			     Continues to feedOAI. -->
			<NODE name="OAICreateStore" type="OAICreateStore">
				<DESCRIPTION/>
				<PARAMETERS>
					<PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
					<PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
					<PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
					<PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="feedOAI"/>
				</ARCS>
			</NODE>
			<!-- Submits Hadoop job oaiFeedJob on cluster DM. envParams give 'hdfsRecordsPath' for
			     mapred.input.dir (the output of groupEntities), 'oaiStoreCollection' and
			     'oai_dbName' for the OAI collection and db, plus the OAI configuration, feed date
			     and duplicate-handling settings; sysParams give the OAI publisher host and port.
			     Continues to CompoundIndexes. -->
			<NODE name="feedOAI" type="SubmitHadoopJob">
				<DESCRIPTION>M/O oai feeding</DESCRIPTION>
				<PARAMETERS>
					<PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
					<PARAM required="true" type="string" name="hadoopJob" managedBy="system">oaiFeedJob</PARAM>
					<PARAM required="true" type="string" name="envParams" managedBy="system">
						{
						'mapred.input.dir' : 'hdfsRecordsPath',
						'services.publisher.oai.collection' : 'oaiStoreCollection',
						'oaiConfiguration' : 'oaiConfiguration',
						'oai.feed.date' : 'oai.feed.date',
						'services.publisher.oai.db' : 'oai_dbName',
						'services.publisher.oai.skipDuplicates':'services.publisher.oai.skipDuplicates',
						'services.publisher.oai.duplicateXPath':'services.publisher.oai.duplicateXPath'
						}
					</PARAM>
					<PARAM required="true" type="string" name="sysParams" managedBy="system">
						{
						'services.publisher.oai.host' : 'services.publisher.oai.host',
						'services.publisher.oai.port' : 'services.publisher.oai.port'
						}
					</PARAM>
					<PARAM required="true" type="boolean" name="simulation" managedBy="user">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="CompoundIndexes"/>
				</ARCS>
			</NODE>
			<!-- OAICreateIndex node, reached from feedOAI. The user-managed fieldNames value is a
			     list of five entries separated by ';', each a pair of field names separated by ','.
			     NOTE(review): presumably each entry becomes one composite index; confirm against
			     OAICreateIndex. Continues to ConfigIndexes. -->
			<NODE name="CompoundIndexes" type="OAICreateIndex">
				<DESCRIPTION>Create composite indexes for the OAI store</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
					<PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
					<PARAM required="true" type="string" name="fieldNames" managedBy="user">set,datestamp;license,oaftype;set,oaftype;oaftype,funder;resulttypeid,oaftype</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="ConfigIndexes"/>
				</ARCS>
			</NODE>
			<!-- OAIEnsureIndexes node, reached from CompoundIndexes; params reference oai_dbName
			     and oaiStoreCollection. Fans out to SetsCount and ConfigSets, both of which end
			     at the waitAll join. -->
			<NODE name="ConfigIndexes" type="OAIEnsureIndexes">
				<DESCRIPTION>Ensure an index exists on fields as specified in the configuration profile</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
					<PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="SetsCount"/>
					<ARC to="ConfigSets"/>
				</ARCS>
			</NODE>
			<!-- OAISetsCountUpdate node, one of the two branches leaving ConfigIndexes. Params
			     reference oai_dbName and oaiStoreCollection; the user-managed configuredOnly flag
			     is false here. Continues to the waitAll join. -->
			<NODE name="SetsCount" type="OAISetsCountUpdate">
				<DESCRIPTION>Count records in each OAI set, for each exported metadata format</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
					<PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
					<PARAM name="configuredOnly" type="boolean" managedBy="user" required="true">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>
			<!-- RefreshSetsFromConfig node, the other branch leaving ConfigIndexes. Its only
			     param references oai_dbName. Continues to the waitAll join. -->
			<NODE name="ConfigSets" type="RefreshSetsFromConfig">
				<DESCRIPTION>Reads the current OAI configuration and upsert OAI sets accordingly (counts are updated as well)</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>

    
335
			<!-- WAIT FOR ALL THE WF BRANCHES TO COMPLETE -->
336
			<!-- Final join (isJoin="true", no type attribute): arc target of updateDs,
			     updateStats, SetsCount and ConfigSets. Its single arc goes to "success", which is
			     not declared as a NODE in this profile.
			     NOTE(review): presumably a terminal node provided by the workflow engine; confirm. -->
			<NODE name="waitAll" isJoin="true">
				<DESCRIPTION>wait for all the branches to complete</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="success"/>
				</ARCS>
			</NODE>
		</CONFIGURATION>
344
		<STATUS>
345
		</STATUS>
346
	</BODY>
347
</RESOURCE_PROFILE>
(6-6/11)