Project

General

Profile

1
<RESOURCE_PROFILE>
2
	<!-- Resource header: identity and classification of this profile. -->
	<HEADER>
		<!-- The part after "_" is the base64 encoding of
		     "WorkflowDSResources/WorkflowDSResourceType", i.e. RESOURCE_KIND/RESOURCE_TYPE below. -->
		<RESOURCE_IDENTIFIER value="80f76a00-0eb3-4df0-a256-99f1df7b5fd8_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
		<RESOURCE_TYPE value="WorkflowDSResourceType"/>
		<RESOURCE_KIND value="WorkflowDSResources"/>
		<!-- Intentionally empty in this profile. -->
		<RESOURCE_URI value=""/>
		<DATE_OF_CREATION value="2006-05-04T18:13:51.0Z"/>
	</HEADER>
9
	<BODY>
10
		<!-- Descriptive metadata of the workflow: display name (with its menu section),
		     description, family and priority. -->
		<WORKFLOW_NAME menuSection="InfoSpace Deduplication">InfoSpace Deduplication</WORKFLOW_NAME>
		<WORKFLOW_DESCRIPTION>OpenAIRE Deduplication</WORKFLOW_DESCRIPTION>
		<WORKFLOW_INFO/>
		<WORKFLOW_FAMILY>InfoSpace Deduplication</WORKFLOW_FAMILY>
		<WORKFLOW_PRIORITY>35</WORKFLOW_PRIORITY>
		<!-- ADMIN_EMAIL is disabled; the original declaration is kept below for reference. -->
		<!-- <ADMIN_EMAIL>alessia.bardi@isti.cnr.it,claudio.atzori@isti.cnr.it</ADMIN_EMAIL> -->
16
		<CONFIGURATION status="EXECUTABLE" start="MANUAL">
17

    
18
			<!-- Workflow-level parameters. Nodes in the WORKFLOW section refer to them by name
			     through ref="...". managedBy="user" parameters are supplied by the operator;
			     managedBy="system" parameters carry a fixed value declared here.
			     Attribute order is normalised to: name, description, required, type, managedBy, function. -->
			<PARAMETERS>
				<!-- Target HBase table and Hadoop cluster. -->
				<PARAM name="table" description="HBase table to be used in this workflow" required="true" type="string" managedBy="user"/>
				<!-- The validValues function lists 'DM' and 'IIS' as the accepted cluster names. -->
				<PARAM name="cluster" description="Hadoop cluster logical name" required="true" type="string" managedBy="user" function="validValues(['DM','IIS'])"/>
				<PARAM name="reuseRegionInfo" description="Reuse table splits" required="true" type="boolean" managedBy="user">true</PARAM>

				<!-- Names of the parameters holding the HBase table configuration and schema. -->
				<PARAM name="tableConf" description="HBase table configuration param name" required="true" type="string" managedBy="system">hbase.table.conf</PARAM>
				<PARAM name="tableCols" description="HBase table schema param name" required="true" type="string" managedBy="system">hbase.table.cols</PARAM>

				<!-- Relational database used as a source. -->
				<PARAM name="db" description="relational db name" required="true" type="string" managedBy="user"/>

				<!-- Organizations: SQL query resource path and XSLT mapping. -->
				<PARAM name="sqlQueryORG" description="the SQL query" required="true" type="string" managedBy="system">/eu/dnetlib/msro/workflows/hbase/queryOrganizations.sql</PARAM>
				<PARAM name="mappingORG" description="xslt mapping function" required="true" type="string" managedBy="user" function="obtainValues('dbmf2hbaseMappings', {})"/>

				<!-- Action set promoted by the webcrawl2hbase node. -->
				<PARAM name="webcrawlActionSet" description="action set name for WebCrawl" required="true" type="string" managedBy="system">iis-wos-entities</PARAM>

				<!-- Publications: XSLT mapping and HDFS location. -->
				<PARAM name="mappingPublication" description="xslt mapping for publications" required="true" type="string" managedBy="user" function="obtainValues('oaf2hbaseMappings', {})"/>
				<PARAM name="reusePublication" description="reuse publications on HDFS?" required="true" type="boolean" managedBy="user"/>
				<PARAM name="hdfsPathPublication" description="hdfs path for publications" required="true" type="string" managedBy="user"/>

				<!-- Deduplication configuration. -->
				<PARAM name="dedupConfigSequence" description="dedup configuration orchestration name" required="true" type="string" managedBy="user" function="obtainValues('dedupOrchestrations', {})"/>
				<PARAM name="minDistWorkDir" description="work directory for the minDist algorithm" required="true" type="string" managedBy="user">/tmp/dedup/prod/mindist</PARAM>

				<!-- XSLT mappings for similarity and dissimilarity relations. -->
				<PARAM name="mappingSimilarities" description="xslt mapping function for similarities" required="true" type="string" managedBy="user" function="obtainValues('dbmf2hbaseMappings', {})"/>
				<PARAM name="mappingDissimilarities" description="xslt mapping function for dissimilarities" required="true" type="string" managedBy="user" function="obtainValues('dbmf2hbaseMappings', {})"/>
			</PARAMETERS>
43
			<WORKFLOW>
44

    
45
				<!-- Start node (isStart="true"). Launches the workflow template identified by
				     wfTemplateId, handing over the table, cluster and reuseRegionInfo workflow
				     parameters, then continues to the "select" node. -->
				<NODE name="resetHbase" type="LaunchWorkflowTemplate" isStart="true">
					<DESCRIPTION>Reset HBase table</DESCRIPTION>
					<PARAMETERS>
						<!-- The id suffix after "_" is base64 for
						     "WorkflowTemplateDSResources/WorkflowTemplateDSResourceType". -->
						<PARAM name="wfTemplateId" value="75345aba-c069-43f4-90aa-e13688d9845e_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
						<!-- key = parameter name seen by the template, ref = workflow parameter supplying the value. -->
						<PARAM name="wfTemplateParams">
							<MAP>
								<ENTRY key="table" ref="table"/>
								<ENTRY key="cluster" ref="cluster"/>
								<ENTRY key="reuseRegionInfo" ref="reuseRegionInfo"/>
							</MAP>
						</PARAM>
					</PARAMETERS>
					<ARCS>
						<ARC to="select"/>
					</ARCS>
				</NODE>
62

    
63
				<!-- Branching node: receives dedupConfigSequence and has two named outgoing arcs.
				     NOTE(review): presumably the arc is chosen from the entity type of the dedup
				     configuration; confirm against the SelectWfPathByDedupEntity node implementation. -->
				<NODE name="select" type="SelectWfPathByDedupEntity">
					<DESCRIPTION>Select the workflow path</DESCRIPTION>
					<PARAMETERS>
						<PARAM name="dedupConfigSequence" ref="dedupConfigSequence"/>
					</PARAMETERS>
					<ARCS>
						<!-- "organization" path loads organizations; "result" path promotes actions and loads publications. -->
						<ARC name="organization" to="orgs2hbase"/>
						<ARC name="result" to="webcrawl2hbase"/>
					</ARCS>
				</NODE>
73

    
74
				<!-- Organization branch: launches the workflow template identified by wfTemplateId
				     with the organization SQL query, its mapping, the source db and the target
				     HBase table/cluster, then continues to "deduplication". -->
				<NODE name="orgs2hbase" type="LaunchWorkflowTemplate">
					<DESCRIPTION>Organizations to HBase</DESCRIPTION>
					<PARAMETERS>
						<PARAM name="wfTemplateId" value="9bb04513-be1d-4d13-8e86-785cc0375635_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
						<!-- Entries with ref take their value from a workflow parameter;
						     sqlPrepareQuery is the only entry given as a literal value. -->
						<PARAM name="wfTemplateParams">
							<MAP>
								<ENTRY key="sqlQuery" ref="sqlQueryORG"/>
								<ENTRY key="mapping" ref="mappingORG"/>
								<ENTRY key="sqlPrepareQuery" value="/eu/dnetlib/msro/workflows/hbase/prepareQueryDefault.sql"/>
								<ENTRY key="db" ref="db"/>
								<ENTRY key="hbaseTable" ref="table"/>
								<ENTRY key="cluster" ref="cluster"/>
							</MAP>
						</PARAM>
					</PARAMETERS>
					<ARCS>
						<ARC to="deduplication"/>
					</ARCS>
				</NODE>
93

    
94
				<!-- Result branch, first step: node of type PromoteActionsHDFS that receives the
				     action set named by webcrawlActionSet and the target table, then continues
				     to "oaf2hbase". -->
				<NODE name="webcrawl2hbase" type="PromoteActionsHDFS">
					<DESCRIPTION>Promote Actions</DESCRIPTION>
					<PARAMETERS>
						<!-- NOTE(review): the list below looks like the known action set names; confirm before relying on it. -->
						<!-- dedup-similarity-organization-simple,dedup-similarity-result-levenstein,iis-dataset-entities-main,iis-referenced-datasets-preprocessing,iis-dataset-entities-preprocessing,iis-document-citations,iis-document-classes,iis-referenced-projects-main,iis-document-similarities,iis-document-statistics,iis-extracted-metadata,iis-referenced-datasets-main,iis-referenced-projects-preprocessing,iis-researchinitiative,iis-wos-entities,iis-referenceextraction-pdb -->
						<PARAM name="set" ref="webcrawlActionSet"/>
						<PARAM name="tableName" ref="table"/>
					</PARAMETERS>
					<ARCS>
						<ARC to="oaf2hbase"/>
					</ARCS>
				</NODE>
105

    
106

    
107
				<!-- Result branch, second step: launches the workflow template identified by
				     wfTemplateId with the publication mapping, reuse flag and HDFS path, using
				     the fixed metadata format "OAF", then continues to "deduplication". -->
				<NODE name="oaf2hbase" type="LaunchWorkflowTemplate">
					<DESCRIPTION>OAF Publications to HBase</DESCRIPTION>
					<PARAMETERS>
						<PARAM name="wfTemplateId" value="5b05a65a-4eeb-4862-bc55-b35c7ec3baf0_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
						<!-- mdFormat is the only literal entry; all others come from workflow parameters. -->
						<PARAM name="wfTemplateParams">
							<MAP>
								<ENTRY key="hbaseTable" ref="table"/>
								<ENTRY key="cluster" ref="cluster"/>
								<ENTRY key="mdFormat" value="OAF"/>
								<ENTRY key="mapping" ref="mappingPublication"/>
								<ENTRY key="reuseMdRecords" ref="reusePublication"/>
								<ENTRY key="hdfsPath" ref="hdfsPathPublication"/>
							</MAP>
						</PARAM>
					</PARAMETERS>
					<ARCS>
						<ARC to="deduplication"/>
					</ARCS>
				</NODE>
126

    
127

    
128
				<!-- Join point of both branches (orgs2hbase and oaf2hbase both lead here).
				     Launches the workflow template identified by wfTemplateId with the dedup
				     configuration, the minDist work directory and the similarity/dissimilarity
				     mappings, then continues to "index". -->
				<NODE name="deduplication" type="LaunchWorkflowTemplate">
					<DESCRIPTION>Deduplication</DESCRIPTION>
					<PARAMETERS>
						<PARAM name="wfTemplateId" value="01ed11e8-e874-4478-a8ac-83e63e9699e4_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
						<!-- Every entry is taken from the workflow parameter named in ref. -->
						<PARAM name="wfTemplateParams">
							<MAP>
								<ENTRY key="hbaseTable" ref="table"/>
								<ENTRY key="cluster" ref="cluster"/>
								<ENTRY key="dedupConfigSequence" ref="dedupConfigSequence"/>
								<ENTRY key="minDistWorkDir" ref="minDistWorkDir"/>
								<ENTRY key="mappingSimilarities" ref="mappingSimilarities"/>
								<ENTRY key="mappingDissimilarities" ref="mappingDissimilarities"/>
							</MAP>
						</PARAM>
					</PARAMETERS>
					<ARCS>
						<ARC to="index"/>
					</ARCS>
				</NODE>
147

    
148

    
149
				<!-- Last declared node: launches the workflow template identified by wfTemplateId
				     with the table, cluster and dedup configuration, then follows the arc to "success".
				     NOTE(review): no node named "success" is declared in this profile; presumably it
				     is the engine's built-in terminal node. Confirm. -->
				<NODE name="index" type="LaunchWorkflowTemplate">
					<DESCRIPTION>Update dedup index</DESCRIPTION>
					<PARAMETERS>
						<PARAM name="wfTemplateId" value="a1b0b30c-7e19-46e2-888c-67c0b880b346_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
						<PARAM name="wfTemplateParams">
							<MAP>
								<ENTRY key="hbaseTable" ref="table"/>
								<ENTRY key="cluster" ref="cluster"/>
								<ENTRY key="dedupConfigSequence" ref="dedupConfigSequence"/>
							</MAP>
						</PARAM>
					</PARAMETERS>
					<ARCS>
						<ARC to="success"/>
					</ARCS>
				</NODE>
165

    
166

    
167
			</WORKFLOW>
168
		</CONFIGURATION>
169
		<NOTIFICATIONS/>
170
		<!-- Scheduling is declared but switched off (enabled="false"); CONFIGURATION uses start="MANUAL". -->
		<SCHEDULING enabled="false">
			<!-- NOTE(review): six-field expression using "?", which suggests Quartz syntax
			     (second minute hour day-of-month month day-of-week), i.e. 22:05:29 every day. Confirm. -->
			<CRON>29 5 22 ? * *</CRON>
			<!-- NOTE(review): 10080 = 7 x 24 x 60, presumably minutes (one week). Confirm the unit. -->
			<MININTERVAL>10080</MININTERVAL>
		</SCHEDULING>
174
		<STATUS/>
175
	</BODY>
176
</RESOURCE_PROFILE>
(4-4/23)