Project

General

Profile

1
<RESOURCE_PROFILE>
	<HEADER>
		<RESOURCE_IDENTIFIER value="a4434d62-d4cd-4c73-a107-bc7c62e6f815_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
		<RESOURCE_TYPE value="WorkflowDSResourceType"/>
		<RESOURCE_KIND value="WorkflowDSResources"/>
		<RESOURCE_URI value=""/>
		<DATE_OF_CREATION value="2014-08-01T18:13:51.0Z"/>
	</HEADER>
	<BODY>
		<WORKFLOW_NAME>Data Provision</WORKFLOW_NAME>
		<WORKFLOW_TYPE>Data Provision</WORKFLOW_TYPE>
		<WORKFLOW_PRIORITY>30</WORKFLOW_PRIORITY>
		<CONFIGURATION start="manual">

			<!--
				Workflow graph overview (derived from the ARC elements below).

				Start nodes (isStart="true"): setInfo, fetchRelClasses, fetchContexts, fetchEntityLinks.

				1. setInfo leads to findIndex. findIndex goes to prepareIndexing on arc "found",
				   or to createIndex (which then goes to prepareIndexing) on arc "notFound".
				2. prepareIndexing fans out to cleanupXml and cleanupRotten. Both of them, as well as
				   the three fetch* start nodes, point at the join node groupEntities.
				3. groupEntities fans out into three branches:
				     updateIndex, finalize, updateDs                              (UPDATE INDEX)
				     prepareStats, updateStats                                    (UPDATE STATS)
				     prepareOAI, OAIDropStore, OAICreateStore, feedOAI,
				     CompoundIndexes, ConfigIndexes, then SetsCount + ConfigSets  (OAI STORE UPDATE)
				4. Every branch ends at the join node waitAll, whose only arc points at "success".

				NOTE(review): no node named "success" is declared in this profile; presumably it is
				a terminal node provided by the workflow engine. Confirm before renaming or removing.

				NOTE(review): the envParams / sysParams bodies look like maps of
				'job property' : 'name of a workflow env attribute or system property'
				(e.g. prepareIndexing declares hdfsRecordsPath and rottenRecordsPath, which are then
				referenced as values further down). Confirm against the node implementations.
				The text of those PARAM elements is data: keep it free of anything but the map itself.
			-->

			<!-- PREPARE NODES -->
			<NODE name="setInfo" type="SetFormatInfo" isStart="true">
				<DESCRIPTION>set mdformat, layout, interpretation</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="format" type="string" managedBy="user" required="true">DMF</PARAM>
					<PARAM name="layout" type="string" managedBy="user" required="true">index</PARAM>
					<PARAM name="interpretation" type="string" managedBy="user" required="true">openaire</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="findIndex"/>
				</ARCS>
			</NODE>
			<NODE name="fetchRelClasses" type="FetchRelClasses" isStart="true">
				<DESCRIPTION/>
				<PARAMETERS>
					<PARAM name="relClassesProperty" type="string" managedBy="system" required="true">dnet.openaire.model.relclasses.xquery</PARAM>
					<PARAM name="relClassesName" type="string" managedBy="system" required="true">relClasses</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<NODE name="fetchContexts" type="LoadContextsJob" isStart="true">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<NODE name="fetchEntityLinks" type="LoadEntityLinksJob" isStart="true">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>

			<!-- UPDATE INDEX -->
			<NODE name="findIndex" type="FindIndex">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC name="found" to="prepareIndexing"/>
					<ARC name="notFound" to="createIndex"/>
				</ARCS>
			</NODE>
			<NODE name="createIndex" type="CreateIndex">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="prepareIndexing"/>
				</ARCS>
			</NODE>
			<NODE name="prepareIndexing" type="PrepareIndexJob">
				<DESCRIPTION>Prepare indexing</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="outputRecordsPathParam" type="string" managedBy="system" required="true">hdfsRecordsPath</PARAM>
					<PARAM name="rottenRecordsPathParam" type="string" managedBy="system" required="true">rottenRecordsPath</PARAM>
					<PARAM name="layoutToRecordStylesheet" type="string" managedBy="system" required="true">/eu/dnetlib/msro/openaireplus/workflows/index/openaireLayoutToRecordStylesheet.xsl</PARAM>
					<PARAM name="oafSchemaLocationProperty" type="string" managedBy="system" required="true">oaf.schema.location</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="cleanupXml"/>
					<ARC to="cleanupRotten"/>
				</ARCS>
			</NODE>
			<NODE name="cleanupXml" type="DeleteHdfsPathJob">
				<DESCRIPTION>hdfs cleanup (xml)</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
						'path' : 'hdfsRecordsPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<NODE name="cleanupRotten" type="DeleteHdfsPathJob">
				<DESCRIPTION>hdfs cleanup (rotten)</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
						'path' : 'rottenRecordsPath'
						}
					</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="groupEntities"/>
				</ARCS>
			</NODE>
			<!-- Join node: targeted by fetchRelClasses, fetchContexts, fetchEntityLinks, cleanupXml and cleanupRotten -->
			<NODE name="groupEntities" type="SubmitHadoopJob" isJoin="true">
				<DESCRIPTION>M/R group entities</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">prepareIndexDataJob</PARAM>
					<PARAM name="sysParams" type="string" managedBy="system" required="true">
						{
						'hbase.mapred.inputtable' : 'hbase.mapred.datatable',
						'hbase.mapreduce.inputtable' : 'hbase.mapred.datatable'
						}
					</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
						'mapred.output.dir' : 'hdfsRecordsPath',
						'index.entity.links' : 'index.entity.links',
						'oaf.schema.location' : 'oaf.schema.location',
						'contextmap' : 'contextmap',
						'relClasses' : 'relClasses'
						}
					</PARAM>
					<PARAM name="simulation" type="boolean" managedBy="user" required="true">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="updateIndex"/>
					<ARC to="prepareOAI"/>
					<ARC to="prepareStats"/>
				</ARCS>
			</NODE>
			<NODE name="updateIndex" type="SubmitHadoopJob">
				<DESCRIPTION>M/O index records</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">indexFeedJob</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
						'mapred.input.dir' : 'hdfsRecordsPath',
						'mapred.output.dir' : 'rottenRecordsPath',
						'index.xslt' : 'index.xslt',
						'index.solr.url' : 'index.solr.url',
						'index.solr.collection' : 'index.solr.collection',
						'index.buffer.flush.threshold' : 'index.buffer.flush.threshold',
						'index.shutdown.wait.time' : 'index.shutdown.wait.time',
						'index.solr.sim.mode' : 'index.solr.sim.mode',
						'index.feed.timestamp' : 'index.feed.timestamp'
						}
					</PARAM>
					<PARAM name="simulation" type="boolean" managedBy="user" required="true">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="finalize"/>
				</ARCS>
			</NODE>
			<NODE name="finalize" type="FinalizeIndexFeeding">
				<DESCRIPTION>commit changes</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="updateDs"/>
				</ARCS>
			</NODE>
			<NODE name="updateDs" type="IndexDsUpdateJob">
				<DESCRIPTION>update DS</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>

			<!-- UPDATE STATS -->
			<NODE name="prepareStats" type="PrepareStatsParams">
				<DESCRIPTION>set params for stats</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="updateStats"/>
				</ARCS>
			</NODE>
			<NODE name="updateStats" type="SubmitHadoopJob">
				<DESCRIPTION>Update stats DB</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">StatsExportJob</PARAM>
					<PARAM name="sysParams" type="string" managedBy="system" required="true">
						{
						'Stats_Hbase_Source_Table' : 'hbase.mapred.datatable'
						}
					</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
						'nameNode' : 'nameNode',
						'jobTracker' : 'jobTracker',
						'isLookupEndpoint' : 'isLookupEndpoint',
						'Stats_indexConf' : 'index.entity.links'
						}
					</PARAM>
					<PARAM name="simulation" type="boolean" managedBy="user" required="true">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>

			<!-- OAI STORE UPDATE -->
			<NODE name="prepareOAI" type="PrepareOaiJob">
				<DESCRIPTION>Prepare target OAI store</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="oaiDbName" type="string" managedBy="user" required="true">oaistore</PARAM>
					<PARAM name="oaiFormat" type="string" managedBy="user" required="true">oaf</PARAM>
					<PARAM name="oaiLayout" type="string" managedBy="user" required="true">index</PARAM>
					<PARAM name="oaiInterpretation" type="string" managedBy="user" required="true">openaire</PARAM>
					<PARAM name="oaiSource" type="string" managedBy="user" required="false"/>
					<PARAM name="skipDuplicates" type="boolean" managedBy="user" required="true">true</PARAM>
					<PARAM name="duplicateXPath" type="string" managedBy="user" required="true">//*[local-name()='entity']//*[local-name()='datainfo']/*[local-name()='deletedbyinference'][./text() = 'true']</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="OAIDropStore"/>
				</ARCS>
			</NODE>
			<NODE name="OAIDropStore" type="OAIDropStore">
				<DESCRIPTION>Drop the store: upserts are too expensive!</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="OAICreateStore"/>
				</ARCS>
			</NODE>
			<NODE name="OAICreateStore" type="OAICreateStore">
				<DESCRIPTION/>
				<PARAMETERS/>
				<ARCS>
					<ARC to="feedOAI"/>
				</ARCS>
			</NODE>
			<NODE name="feedOAI" type="SubmitHadoopJob">
				<DESCRIPTION>M/O oai feeding</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="cluster" type="string" managedBy="system" required="true">DM</PARAM>
					<PARAM name="hadoopJob" type="string" managedBy="system" required="true">oaiFeedJob</PARAM>
					<PARAM name="envParams" type="string" managedBy="system" required="true">
						{
						'mapred.input.dir' : 'hdfsRecordsPath',
						'services.publisher.oai.collection' : 'oai_collection',
						'oaiConfiguration' : 'oai_configuration',
						'oai.feed.date' : 'oai_actionDate',
						'services.publisher.oai.db' : 'oai_dbName',
						'services.publisher.oai.skipDuplicates':'oai_skipDuplicates',
						'services.publisher.oai.duplicateXPath':'oai_duplicateXPath'
						}
					</PARAM>
					<PARAM name="sysParams" type="string" managedBy="system" required="true">
						{
						'services.publisher.oai.host' : 'services.publisher.oai.host',
						'services.publisher.oai.port' : 'services.publisher.oai.port'
						}
					</PARAM>
					<PARAM name="simulation" type="boolean" managedBy="user" required="true">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="CompoundIndexes"/>
				</ARCS>
			</NODE>
			<NODE name="CompoundIndexes" type="OAICreateIndex">
				<DESCRIPTION>Create composite indexes for the OAI store</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="fieldNames" type="string" managedBy="user" required="true">set,resulttypeid;set,datestamp;license,oaftype;set,oaftype;oaftype,funder;resulttypeid,oaftype;oaftype,funding;resulttypeid,funder;resulttypeid,funding</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="ConfigIndexes"/>
				</ARCS>
			</NODE>
			<NODE name="ConfigIndexes" type="OAIEnsureIndexes">
				<DESCRIPTION>Ensure an index exists on fields as specified in the configuration profile</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="SetsCount"/>
					<ARC to="ConfigSets"/>
				</ARCS>
			</NODE>
			<NODE name="SetsCount" type="OAISetsCountUpdate">
				<DESCRIPTION>Count records in each OAI set, for each exported metadata format</DESCRIPTION>
				<PARAMETERS>
					<PARAM name="configuredOnly" type="boolean" managedBy="user" required="true">false</PARAM>
				</PARAMETERS>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>
			<NODE name="ConfigSets" type="OAIRefreshConfiguration">
				<DESCRIPTION>Reads the current OAI configuration and upsert OAI sets accordingly (counts are updated as well)</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="waitAll"/>
				</ARCS>
			</NODE>

			<!-- WAIT FOR ALL THE WF BRANCHES TO COMPLETE -->
			<!-- Join node: targeted by updateDs, updateStats, SetsCount and ConfigSets -->
			<NODE name="waitAll" isJoin="true">
				<DESCRIPTION>wait for all the branches to complete</DESCRIPTION>
				<PARAMETERS/>
				<ARCS>
					<ARC to="success"/>
				</ARCS>
			</NODE>
		</CONFIGURATION>
		<STATUS>
		</STATUS>
	</BODY>
</RESOURCE_PROFILE>
(6-6/16)