Project

General

Profile

1
<RESOURCE_PROFILE xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="a4434d62-d4cd-4c73-a107-bc7c62e6f815_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
4
        <RESOURCE_TYPE value="WorkflowDSResourceType"/>
5
        <RESOURCE_KIND value="WorkflowDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
8
    </HEADER>
9
    <BODY>
10
        <WORKFLOW_NAME>Data Provision</WORKFLOW_NAME>
11
        <WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
12
        <WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
13
        <CONFIGURATION start="manual">
14
        
15
        	<!-- PREPARE NODES -->
16
            <NODE name="setInfo" type="SetFormatInfo" isStart="true">
17
                <DESCRIPTION>set mdformat, layout, interpretation</DESCRIPTION>
18
                <PARAMETERS>
19
                    <PARAM name="format" type="string" managedBy="user" required="true">DMF</PARAM>
20
                    <PARAM name="layout" type="string" managedBy="user" required="true">index</PARAM>
21
                    <PARAM name="interpretation" type="string" managedBy="user" required="true">openaire</PARAM>
22
                </PARAMETERS>
23
                <ARCS>
24
                    <ARC to="findIndex"/>
25
                </ARCS>
26
            </NODE>
27
            <NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
28
                <DESCRIPTION/>
29
                <PARAMETERS>
30
                    <PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
31
                    <PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
32
                </PARAMETERS>
33
                <ARCS>
34
                    <ARC to="groupEntities"/>
35
                </ARCS>
36
            </NODE>
37
            <NODE name="fetchContexts" type="LoadContextsJob" isStart="true">
38
                <DESCRIPTION/>
39
                <PARAMETERS/>
40
                <ARCS>
41
                    <ARC to="groupEntities"/>
42
                </ARCS>
43
            </NODE>
44
            <NODE name="fetchEntityLinks" type="LoadEntityLinksJob" isStart="true">
45
                <DESCRIPTION/>
46
                <PARAMETERS/>
47
                <ARCS>
48
                    <ARC to="groupEntities"/>
49
                    <ARC to="prepareStats"/>
50
                </ARCS>
51
            </NODE>
52
			
53
			<!-- UPDATE INDEX -->
54
            <NODE name="findIndex" type="FindIndex">
55
                <DESCRIPTION/>
56
                <PARAMETERS/>
57
                <ARCS>
58
                    <ARC name="found" to="prepareIndexing"/>
59
                    <ARC name="notFound" to="createIndex"/>
60
                </ARCS>
61
            </NODE>
62
            <NODE name="createIndex" type="CreateIndex">
63
                <DESCRIPTION/>
64
                <PARAMETERS/>
65
                <ARCS>
66
                    <ARC to="prepareIndexing"/>
67
                </ARCS>
68
            </NODE>
69
            <NODE name="prepareIndexing" type="PrepareIndexJob">
70
                <DESCRIPTION>Prepare indexing</DESCRIPTION>
71
                <PARAMETERS>
72
                    <PARAM name="outputRecordsPathParam" type="string" managedBy="system" required="true">hdfsRecordsPath</PARAM>
73
                    <PARAM name="rottenRecordsPathParam" type="string" managedBy="system" required="true">rottenRecordsPath</PARAM>
74
                    <PARAM name="layoutToRecordStylesheet" type="string" managedBy="system" required="true">/eu/dnetlib/msro/openaireplus/workflows/index/openaireLayoutToRecordStylesheet.xsl</PARAM>
75
                    <PARAM name="oafSchemaLocationProperty" type="string" managedBy="system" required="true">oaf.schema.location</PARAM>
76
                </PARAMETERS>
77
                <ARCS>
78
                    <ARC to="cleanupXml"/>
79
                    <ARC to="cleanupRotten"/>
80
                </ARCS>
81
            </NODE>
82
            <NODE name="cleanupXml" type="DeleteHdfsPathJob">
83
                <DESCRIPTION>hdfs cleanup (xml)</DESCRIPTION>
84
                <PARAMETERS>
85
                    <PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
86
                    <PARAM required="true" type="string" name="envParams" managedBy="system">
87
						{ 	
88
							'path' : 'hdfsRecordsPath'
89
						}
90
					</PARAM>
91
                </PARAMETERS>
92
                <ARCS>
93
                    <ARC to="groupEntities"/>
94
                </ARCS>
95
            </NODE>
96
            <NODE name="cleanupRotten" type="DeleteHdfsPathJob">
97
                <DESCRIPTION>hdfs cleanup (rotten)</DESCRIPTION>
98
                <PARAMETERS>
99
                    <PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
100
                    <PARAM required="true" type="string" name="envParams" managedBy="system">
101
						{ 	
102
							'path' : 'rottenRecordsPath'
103
						}
104
					</PARAM>
105
                </PARAMETERS>
106
                <ARCS>
107
                    <ARC to="groupEntities"/>
108
                </ARCS>
109
            </NODE>
110
            <NODE name="groupEntities" type="SubmitHadoopJob" isJoin="true">
111
                <DESCRIPTION>M/R group entities</DESCRIPTION>
112
                <PARAMETERS>
113
                    <PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
114
                    <PARAM required="true" type="string" name="hadoopJob" managedBy="system">prepareIndexDataJob</PARAM>
115
                    <PARAM required="true" type="string" name="sysParams" managedBy="system">
116
						{ 	
117
							'hbase.mapred.inputtable' : 'hbase.mapred.datatable', 
118
							'hbase.mapreduce.inputtable' : 'hbase.mapred.datatable'
119
						}
120
					</PARAM>
121
                    <PARAM required="true" type="string" name="envParams" managedBy="system">
122
						{ 	
123
							'mapred.output.dir' : 'hdfsRecordsPath',
124
							'index.entity.links' : 'index.entity.links',
125
							'oaf.schema.location' : 'oaf.schema.location',
126
							'contextmap' : 'contextmap',
127
							'relClasses' : 'relClasses'
128
						}
129
					</PARAM>
130
                </PARAMETERS>
131
                <ARCS>
132
                    <ARC to="updateIndex"/>
133
                    <ARC to="setOAIFormat"/>
134
                </ARCS>
135
            </NODE>
136
            <NODE name="updateIndex" type="SubmitHadoopJob">
137
                <DESCRIPTION>M/O index records</DESCRIPTION>
138
                <PARAMETERS>
139
                    <PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
140
                    <PARAM required="true" type="string" name="hadoopJob" managedBy="system">indexFeedJob</PARAM>
141
                    <PARAM required="true" type="string" name="envParams" managedBy="system">
142
						{ 	
143
							'mapred.input.dir' : 'hdfsRecordsPath',
144
							'mapred.output.dir' : 'rottenRecordsPath',
145
							'index.xslt' : 'index.xslt',
146
							'index.solr.url' : 'index.solr.url',
147
							'index.solr.collection' : 'index.solr.collection',
148
							'index.buffer.flush.threshold' : 'index.buffer.flush.threshold',
149
							'index.shutdown.wait.time' : 'index.shutdown.wait.time',
150
							'index.solr.sim.mode' : 'index.solr.sim.mode',
151
							'index.feed.timestamp' : 'index.feed.timestamp'
152
						}
153
					</PARAM>
154
                </PARAMETERS>
155
                <ARCS>
156
                    <ARC to="finalize"/>
157
                </ARCS>
158
            </NODE>
159
            <NODE name="finalize" type="FinalizeIndexFeeding">
160
                <DESCRIPTION>commit changes</DESCRIPTION>
161
                <PARAMETERS/>
162
                <ARCS>
163
                    <ARC to="updateDs"/>
164
                </ARCS>
165
            </NODE>
166
            <NODE name="updateDs" type="IndexDsUpdateJob">
167
                <DESCRIPTION>update DS</DESCRIPTION>
168
                <PARAMETERS/>
169
                <ARCS>
170
                    <ARC to="sync"/>
171
                </ARCS>
172
            </NODE>				
173

    
174
			<!--  UPDATE STATS -->
175
            <NODE name="prepareStats" type="PrepareStatsParams">
176
                <DESCRIPTION>set params for stats</DESCRIPTION>
177
                <PARAMETERS>
178
                    <!-- Parameter map for the stats export. Each key defined here is referenced by name from the envParams of the "updateStats" node. -->
                    <!-- NOTE(review): statsDbUser / statsDbPass are stored in clear text in this profile; consider supplying them from a protected property instead. -->
                    <PARAM required="true" type="string" name="paramsMapJson" managedBy="user">
179
					{
180
						'oozieWfApplicationPath' : '/user/dnet/lib/stats/oozie_app',
181
						'statsDbUrl' : 'jdbc:postgresql://stats.openaire.eu:5432/stats',
182
						'statsDbUser' : 'sqoop',
183
						'statsDbPass' : 'sqoop',
184
						'statsDbDriver' : 'org.postgresql.Driver',
185
						'statsDbTableMap' : 'datasourceLanguage=datasource_languages,datasource=datasource,project=project,result=result,organization=organization,datasourceOrganization=datasource_organizations,datasourceTopic=datasource_topics,projectOrganization=project_organizations,resultClaim=result_claims,resultClassification=result_classifications,resultConcept=result_concepts,resultLanguage=result_languages,resultOrganization=result_organizations,resultResult=result_results,resultProject=project_results,resultTopic=result_topics,category=category,context=context,claim=claim,concept=concept,resultDatasource=result_datasources',
186
						'statsSqoopRecsPerStatement' : '10000',
187
						'statsSqoopStatementPerTrans' : '1000000',
188
						'statsSqoopReducersCount' : '4',
189

    
190
						'statsOutputPath' : '/tmp/stats/',
191
						'statsNullStringField' : 'NULL',
192
						'statsNullNumericField' : '-1',
193
						'statsEnclosingCharacter' : '#',
194
						'statsDelimCharacter' : '!',
195
						'out1' : 'datasource',
196
						'out2' : 'project',
197
						'out3' : 'organization',
198
						'out4' : 'datasourceOrganization',
199
						'out5' : 'datasourceTopic',
200
						'out6' : 'datasourceLanguage',
201
						'out7' : 'projectOrganization',
202
						'out8' : 'resultClaim',
203
						'out9' : 'resultClassification',
204
						'out10' : 'resultConcept',
205
						'out11' : 'resultLanguage',
206
						'out12' : 'resultOrganization',
207
						'out13' : 'resultResult',
208
						'out14' : 'resultProject',
209
						'out15' : 'category',
210
						'out16' : 'resultTopic',
211
						'out17' : 'resultDatasource',
212
						'out18' : 'result',
213
						'out19' : 'claim',
214
						'out20' : 'concept'
215
					}
216
					</PARAM>
217
                </PARAMETERS>
218
                <ARCS>
219
                    <ARC to="updateStats"/>
220
                </ARCS>
221
            </NODE>
222
            <NODE name="updateStats" type="SubmitHadoopJob">
223
                <DESCRIPTION>Update stats DB</DESCRIPTION>
224
                <PARAMETERS>
225
                    <PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
226
                    <PARAM required="true" type="string" name="hadoopJob" managedBy="system">StatsExportJob</PARAM>
227
                    <PARAM required="true" type="string" name="sysParams" managedBy="system">
228
						{ 	
229
							'Stats_Hbase_Source_Table' : 'hbase.mapred.datatable'
230
						}
231
					</PARAM>
232
                    <PARAM required="true" type="string" name="envParams" managedBy="system">
233
						{ 	
234
							'nameNode' : 'nameNode',
235
							'jobTracker' : 'jobTracker',
236
							'oozie.wf.application.path' : 'oozieWfApplicationPath',
237
							'Stats_db_Url' : 'statsDbUrl',
238
							'Stats_db_User' : 'statsDbUser',
239
							'Stats_db_Pass' : 'statsDbPass',
240
							'Stats_db_Driver' : 'statsDbDriver',
241
							'Stats_db_table_map' : 'statsDbTableMap',
242
							'Stats_sqoop_RecsPerStatement' : 'statsSqoopRecsPerStatement',
243
							'Stats_sqoop_StatementPerTrans' : 'statsSqoopStatementPerTrans',
244
							'Stats_sqoop_ReducersCount' : 'statsSqoopReducersCount',
245
							'Stats_output_Path' : 'statsOutputPath',
246
							'Stats_null_String_Field' : 'statsNullStringField',
247
							'Stats_null_Numeric_Field' : 'statsNullNumericField',
248
							'Stats_enclosing_Character' : 'statsEnclosingCharacter',
249
				            		'Stats_delim_Character' : 'statsDelimCharacter',
250
                                                        'out1' : 'out1',
251
							'out2' : 'out2',
252
							'out3' : 'out3',
253
							'out4' : 'out4',
254
							'out5' : 'out5',
255
							'out6' : 'out6',
256
							'out7' : 'out7',
257
							'out8' : 'out8',
258
							'out9' : 'out9',
259
							'out10' : 'out10',
260
							'out11' : 'out11',
261
							'out12' : 'out12',
262
							'out13' : 'out13',
263
							'out14' : 'out14',
264
							'out15' : 'out15',
265
							'out16' : 'out16',
266
							'out17' : 'out17',
267
							'out18' : 'out18',
268
							'out19' : 'out19',
269
							'out20' : 'out20',
270
							'isLookupEndpoint' : 'isLookupEndpoint',
271
							'Stats_indexConf' : 'index.entity.links'
272
						}
273
					</PARAM>
274
                </PARAMETERS>
275
                <ARCS>
276
                    <ARC to="sync"/>
277
                </ARCS>
278
            </NODE>
279
			<!-- WAIT FOR INDEX AND STATS TO COMPLETE -->
280
            <NODE name="sync" isJoin="true">
281
                <DESCRIPTION/>
282
                <PARAMETERS/>
283
                <ARCS>
284
                    <ARC to="findSearchService"/>
285
                </ARCS>
286
            </NODE>
287
            <NODE name="findSearchService" type="FindSearchService">
288
                <DESCRIPTION>find search service</DESCRIPTION>
289
                <PARAMETERS>
290
                    <PARAM name="xquery" type="string" managedBy="user" required="false">/RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='SearchServiceResourceType' and .//RESOURCE_KIND/@value='ServiceResources']/HEADER/RESOURCE_IDENTIFIER/@value/string()</PARAM>
291
                    <PARAM name="xqueryForServiceIdParam" type="string" managedBy="system" required="true">xqueryForSearchService</PARAM>
292
                </PARAMETERS>
293
                <ARCS>
294
                    <ARC name="found" to="switch"/>
295
                    <ARC name="notFound" to="success"/>
296
                </ARCS>
297
            </NODE>
298
            <NODE name="switch" type="SwitchIndex">
299
                <DESCRIPTION>switch index</DESCRIPTION>
300
                <PARAMETERS>
301
                    <PARAM name="xqueryForServiceIdParam" type="string" managedBy="system" required="true">xqueryForSearchService</PARAM>
302
                    <PARAM name="inputIndexIdParam" type="string" managedBy="system" required="true">index_id</PARAM>
303
                    <PARAM name="outputIndexIdParam" type="string" managedBy="system" required="true">IndexId</PARAM>
304
                </PARAMETERS>
305
                <ARCS>
306
                    <ARC to="success"/>
307
                </ARCS>
308
            </NODE>	
309
			
310
			<!-- OAI STORE UPDATE -->
311
            <NODE name="setOAIFormat" type="SetFormatInfo">
312
                <DESCRIPTION>set format, layout, interpretation for OAI publisher</DESCRIPTION>
313
                <PARAMETERS>
314
                    <PARAM name="format" type="string" managedBy="user" required="true">oaf</PARAM>
315
                    <PARAM name="layout" type="string" managedBy="user" required="true">index</PARAM>
316
                    <PARAM name="interpretation" type="string" managedBy="user" required="true">openaire</PARAM>
317
                    <PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
318
                    <PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
319
                    <PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
320
                </PARAMETERS>
321
                <ARCS>
322
                    <ARC to="prepareOAI"/>
323
                </ARCS>
324
            </NODE>
325
            <NODE name="prepareOAI" type="PrepareOaiJob">
326
                <DESCRIPTION>Prepare params for OAI store feeding</DESCRIPTION>
327
                <PARAMETERS>
328
                    <PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
329
                    <PARAM name="oaiDBName" type="string" managedBy="user" required="true">oaistore</PARAM>
330
                    <PARAM name="oaiDBNameParam" type="string" managedBy="system" required="true">oai_dbName</PARAM>
331
                    <PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
332
                    <PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
333
                    <PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
334
                </PARAMETERS>
335
                <ARCS>
336
                    <ARC to="OAIDropStore"/>
337
                </ARCS>
338
            </NODE>
339
            <NODE name="OAIDropStore" type="OAIDropStore">
340
                <DESCRIPTION>Drop the store: upserts are too expensive!</DESCRIPTION>
341
                <PARAMETERS>
342
                    <PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
343
                    <PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
344
                    <PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
345
                    <PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
346
                </PARAMETERS>
347
                <ARCS>
348
                    <ARC to="OAICreateStore"/>
349
                </ARCS>
350
            </NODE>
351
            <NODE name="OAICreateStore" type="OAICreateStore">
352
                <DESCRIPTION/>
353
                <PARAMETERS>
354
                    <PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
355
                    <PARAM name="formatParam" type="string" managedBy="system" required="true">oai_format</PARAM>
356
                    <PARAM name="layoutParam" type="string" managedBy="system" required="true">oai_layout</PARAM>
357
                    <PARAM name="interpretationParam" type="string" managedBy="system" required="true">oai_interpretation</PARAM>
358
                </PARAMETERS>
359
                <ARCS>
360
                    <ARC to="feedOAI"/>
361
                </ARCS>
362
            </NODE>
363
            <NODE name="feedOAI" type="SubmitHadoopJob">
364
                <DESCRIPTION>M/O oai feeding</DESCRIPTION>
365
                <PARAMETERS>
366
                    <PARAM required="true" type="string" name="cluster" managedBy="system">DM</PARAM>
367
                    <PARAM required="true" type="string" name="hadoopJob" managedBy="system">oaiFeedJob</PARAM>
368
                    <PARAM required="true" type="string" name="envParams" managedBy="system">
369
						{ 	
370
							'mapred.input.dir' : 'hdfsRecordsPath',
371
							'services.publisher.oai.collection' : 'oaiStoreCollection',
372
							'oaiConfiguration' : 'oaiConfiguration',
373
							'oai.feed.date' : 'oai.feed.date',
374
							'services.publisher.oai.db' : 'oai_dbName'
375
						}
376
					</PARAM>
377
                    <PARAM required="true" type="string" name="sysParams" managedBy="system">
378
						{ 	
379
							'services.publisher.oai.host' : 'services.publisher.oai.host',
380
							'services.publisher.oai.port' : 'services.publisher.oai.port'			
381
						}
382
					</PARAM>
383
                </PARAMETERS>
384
                <ARCS>
385
                    <ARC to="CompoundIndexes"/>
386
                </ARCS>
387
            </NODE>
388
            <NODE name="CompoundIndexes" type="OAICreateIndex">
389
                <DESCRIPTION>Create composite indexes for the OAI store</DESCRIPTION>
390
                <PARAMETERS>
391
                    <PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
392
                    <PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
393
                    <PARAM required="true" type="string" name="fieldNames" managedBy="user">set,datestamp;license,oaftype;set,oaftype;oaftype,funder;resulttypeid,oaftype</PARAM>
394
                </PARAMETERS>
395
                <ARCS>
396
                    <ARC to="ConfigIndexes"/>
397
                </ARCS>
398
            </NODE>
399
            <NODE name="ConfigIndexes" type="OAIEnsureIndexes">
400
                <DESCRIPTION>Ensure an index exists on fields as specified in the configuration profile</DESCRIPTION>
401
                <PARAMETERS>
402
                    <PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
403
                    <PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
404
                </PARAMETERS>
405
                <ARCS>
406
                    <ARC to="SetsCount"/>
407
                    <ARC to="ConfigSets"/>
408
                </ARCS>
409
            </NODE>
410
            <NODE name="SetsCount" type="OAISetsCountUpdate">
411
                <DESCRIPTION>Count records in each OAI set, for each exported metadata format</DESCRIPTION>
412
                <PARAMETERS>
413
                    <PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
414
                    <PARAM name="oaiStoreCollectionParam" type="string" managedBy="system" required="true">oaiStoreCollection</PARAM>
415
                    <PARAM name="configuredOnly" type="boolean" managedBy="user" required="true">false</PARAM>
416
                </PARAMETERS>
417
                <ARCS>
418
                    <ARC to="SetOAIDB"/>
419
                </ARCS>
420
            </NODE>
421
            <NODE name="ConfigSets" type="RefreshSetsFromConfig">
422
                <DESCRIPTION>Reads the current OAI configuration and upsert OAI sets accordingly (counts are updated as well)</DESCRIPTION>
423
                <PARAMETERS>
424
                    <PARAM name="dbNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
425
                </PARAMETERS>
426
                <ARCS>
427
                    <ARC to="SetOAIDB"/>
428
                </ARCS>
429
            </NODE>
430
            <NODE name="SetOAIDB" type="SetCurrentOAIDB" isJoin="true">
431
                <DESCRIPTION>Update the currentdb in the OAI configuration profile to the value in oai_dbName</DESCRIPTION>
432
                <PARAMETERS>
433
                    <PARAM name="oaiDBNameParam" required="true" type="string" managedBy="system">oai_dbName</PARAM>
434
                </PARAMETERS>
435
                <ARCS>
436
                    <ARC to="success"/>
437
                </ARCS>
438
            </NODE>
439
        </CONFIGURATION>
440
        <STATUS>           
441
        </STATUS>
442
    </BODY>
443
</RESOURCE_PROFILE>
(2-2/2)