Revision 58106

Migration to Spark


modules/dnet-hadoop-services/trunk/src/main/resources/eu/dnetlib/data/hadoop/applicationContext-dnet-hadoop-service.properties
-services.hadoop.clients={"DM":{"oozie":"true","mapred":"true","hbase":"true"},"IIS":{"oozie":"true","mapred":"false","hbase":"false"}}
+services.hadoop.clients={"DM":{"oozie":"true","mapred":"true","hbase":"true"},"IIS":{"oozie":"true","mapred":"false","hbase":"false"},"GARR":{"oozie":"true","mapred":"false","hbase":"false"}}
 services.hadoop.hbase.tablefeeder.batchsize=500
 services.hadoop.jobregistry.size=100
 services.hadoop.lib.path=/user/dnet/lib/dnet-mapreduce-jobs-assembly-1.0.0-SNAPSHOT.jar
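
Note: services.hadoop.clients is a JSON map from cluster name to client flags; with this change the new GARR cluster gets only an Oozie client (no mapred, no hbase). A minimal sketch of how such a value could be parsed, assuming Gson (the actual service may use a different JSON binding):

import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class HadoopClientsConfigDemo {

    public static void main(String[] args) {
        // Same shape as the services.hadoop.clients property above.
        final String json = "{\"DM\":{\"oozie\":\"true\",\"mapred\":\"true\",\"hbase\":\"true\"},"
                + "\"IIS\":{\"oozie\":\"true\",\"mapred\":\"false\",\"hbase\":\"false\"},"
                + "\"GARR\":{\"oozie\":\"true\",\"mapred\":\"false\",\"hbase\":\"false\"}}";

        // Cluster name -> (client name -> enabled flag), e.g. GARR only enables the oozie client.
        final Map<String, Map<String, String>> clients =
                new Gson().fromJson(json, new TypeToken<Map<String, Map<String, String>>>() {}.getType());

        clients.forEach((cluster, flags) ->
                System.out.println(cluster + " oozie enabled: " + Boolean.parseBoolean(flags.get("oozie"))));
    }
}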
modules/dnet-hadoop-services/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/applicationContext-hadoop.xml
 	<bean id="DM" class="eu.dnetlib.data.hadoop.config.ConfigurationFactory"
 	      p:defaults="${services.data.hadoop.dm.properties}" />
 
+	<bean id="GARR" class="eu.dnetlib.data.hadoop.config.ConfigurationFactory"
+	      p:defaults="${services.data.hadoop.garr.properties}" />
+
 	<bean id="IIS" class="eu.dnetlib.data.hadoop.config.ConfigurationFactory"
 	      p:defaults="${services.data.hadoop.iis.properties}" />
 
modules/dnet-hadoop-services/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.dm.garr.properties
+dnet.clustername				=	GARR
+##CORE-SITE
+#fs.defaultFS					=	hdfs://dm-cluster-nn
+#
+#hadoop.security.authentication	=	simple
+#hadoop.security.auth_to_local	=	DEFAULT
+#
+#hadoop.rpc.socket.factory.class.default	=	org.apache.hadoop.net.StandardSocketFactory
+#
+##HBASE-SITE
+#hbase.rootdir					=	hdfs://dm-cluster-nn/hbase
+#
+#hbase.security.authentication	=	simple
+#zookeeper.znode.rootserver		=	root-region-server
+#hbase.zookeeper.quorum			=	namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu
+#hbase.zookeeper.property.clientPort	=	2181
+#hbase.zookeeper.client.port		=	2181
+#zookeeper.znode.parent			=	/hbase
+#
+##HDFS-SITE
+#dfs.replication					=	2
+#dfs.nameservices				=	dm-cluster-nn
+#dfs.ha.namenodes.dm-cluster-nn	=	nn1,nn2
+#
+#dfs.namenode.rpc-address.dm-cluster-nn.nn1=namenode1.hadoop.dm.openaire.eu:8020
+#dfs.namenode.http-address.dm-cluster-nn.nn1=namenode1.hadoop.dm.openaire.eu:50070
+#dfs.namenode.rpc-address.dm-cluster-nn.nn2=namenode2.hadoop.dm.openaire.eu:8020
+#dfs.namenode.http-address.dm-cluster-nn.nn2=namenode2.hadoop.dm.openaire.eu:50070
+#
+#dfs.client.failover.proxy.provider.dm-cluster-nn=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
+
+#OOZIE SERVER
+oozie.service.loc = http://hadoop-edge3.garr-pa1.d4science.org:11000/oozie/
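
Note: oozie.service.loc points the D-Net Hadoop service at the GARR Oozie server. A minimal sketch of submitting a workflow against such an endpoint with the standard Oozie client API (the application path and user properties below are hypothetical, for illustration only):

import java.util.Properties;

import org.apache.oozie.client.OozieClient;

public class GarrOozieSubmitDemo {

    public static void main(String[] args) throws Exception {
        // Endpoint taken from oozie.service.loc above.
        final OozieClient client = new OozieClient("http://hadoop-edge3.garr-pa1.d4science.org:11000/oozie/");

        final Properties conf = client.createConfiguration();
        // Hypothetical application path; the real paths come from the job profiles below.
        conf.setProperty(OozieClient.APP_PATH, "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/someuser/app");
        conf.setProperty("queueName", "default");

        final String jobId = client.run(conf); // submit and start the workflow
        System.out.println("Submitted Oozie job: " + jobId);
    }
}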
modules/dnet-hadoop-services/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties
 mapred.reducer.new-api			=	true
 
 #OOZIE SERVER
-oozie.service.loc				=	http://oozie.hadoop.iis.openaire.eu:11000/oozie
+oozie.service.loc				=	http://iis-cdh5-test-m1.ocean.icm.edu.pl:11000/oozie/
modules/dnet-hadoop-services/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/applicationContext-hadoop.properties
 services.data.hadoop.dm.properties = classpath:/eu/dnetlib/data/hadoop/config/hadoop-default.dm.cnr.properties
 services.data.hadoop.iis.properties = classpath:/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties
+services.data.hadoop.garr.properties = classpath:/eu/dnetlib/data/hadoop/config/hadoop-default.dm.garr.properties
 
 services.data.hadoop.hdfs.seqfilewriterfactory.keyclass = org.apache.hadoop.io.Text
 services.data.hadoop.hdfs.seqfilewriterfactory.valueclass = org.apache.hadoop.io.Text
modules/dnet-dli/trunk/src/main/java/eu/dnetlib/msro/workflows/nodes/dedup/FindDedupConfigurationJobNode.java
+package eu.dnetlib.msro.workflows.nodes.dedup;
+
+import eu.dnetlib.enabling.locators.UniqueServiceLocator;
+import eu.dnetlib.msro.workflows.graph.Arc;
+import eu.dnetlib.msro.workflows.nodes.SimpleJobNode;
+import eu.dnetlib.msro.workflows.procs.Env;
+import eu.dnetlib.rmi.enabling.ISLookUpException;
+import eu.dnetlib.rmi.enabling.ISLookUpService;
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.io.ByteArrayOutputStream;
+import java.util.List;
+import java.util.zip.GZIPOutputStream;
+
+public class FindDedupConfigurationJobNode extends SimpleJobNode {
+
+    private String configName;
+
+    @Autowired
+    UniqueServiceLocator uniqueServiceLocator;
+
+    private final static String query = "for $x in collection('/db/DRIVER/DedupConfigurationDSResources/DedupConfigurationDSResourceType') where $x//RESOURCE_IDENTIFIER/@value ='%s' return $x//DEDUPLICATION/text()";
+
+    @Override
+    protected String execute(Env env) throws Exception {
+        if (StringUtils.isBlank(configName)) throw new IllegalArgumentException("missing configuration name");
+
+        env.setAttribute("dconf", getProfile());
+        return Arc.DEFAULT_ARC;
+    }
+
+    private String getProfile() throws Exception {
+        ISLookUpService service = uniqueServiceLocator.getService(ISLookUpService.class);
+
+        List<String> resourceProfile = service.quickSearchProfile(String.format(query, configName));
+
+        if (resourceProfile.size() != 1) throw new IllegalStateException("Query should return exactly one result, query: " + String.format(query, configName));
+
+        return compressArgument(resourceProfile.get(0));
+    }
+
+    private static String compressArgument(final String value) throws Exception {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        GZIPOutputStream gzip = new GZIPOutputStream(out);
+        gzip.write(value.getBytes());
+        gzip.close();
+        return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
+    }
+
+    public String getConfigName() {
+        return configName;
+    }
+
+    public void setConfigName(String configName) {
+        this.configName = configName;
+    }
+}
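
Note: compressArgument() gzips the dedup configuration profile and Base64-encodes it before it is stored in the workflow Env as "dconf" (and later handed to the Oozie job as dedupConf, see the template below). Presumably the consumer reverses the transformation along these lines; a sketch, not the actual consumer code:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.Base64;
import java.util.zip.GZIPInputStream;

public class DedupConfDecoderDemo {

    // Inverse of compressArgument(...) above: Base64-decode, then gunzip.
    public static String decompressArgument(final String encoded) throws Exception {
        final byte[] compressed = Base64.getDecoder().decode(encoded);
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressed));
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            final byte[] buffer = new byte[4096];
            int n;
            while ((n = gzip.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            return out.toString("UTF-8");
        }
    }
}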
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/msro/workflows/nodes/applicationContext-msro-dli-nodes.xml
           p:mongoDBName="${services.mdstore.mongodb.db}"
           scope="prototype"/>
 
+
+	<bean id="wfNodeFindDedupConfiguration" class="eu.dnetlib.msro.workflows.nodes.dedup.FindDedupConfigurationJobNode"
+	      scope="prototype"/>
+
 </beans>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/dedup.dli.spark.global.xml
+<RESOURCE_PROFILE>
+	<HEADER>
+		<RESOURCE_IDENTIFIER value="1dd10bf0-5c97-470c-9938-ae8e57a422fc_V29ya2Zsb3dEU1Jlc291cmNlcy9Xb3JrZmxvd0RTUmVzb3VyY2VUeXBl"/>
+		<RESOURCE_TYPE value="WorkflowDSResourceType"/>
+		<RESOURCE_KIND value="WorkflowDSResources"/>
+		<RESOURCE_URI value=""/>
+		<DATE_OF_CREATION value="2020-02-05T18:13:51.0Z"/>
+	</HEADER>
+	<BODY>
+		<WORKFLOW_NAME menuSection="InfoSpace Deduplication">InfoSpace Deduplication using Spark</WORKFLOW_NAME>
+		<WORKFLOW_DESCRIPTION>InfoSpace Deduplication using Spark</WORKFLOW_DESCRIPTION>
+		<WORKFLOW_INFO/>
+		<WORKFLOW_FAMILY>InfoSpace Deduplication</WORKFLOW_FAMILY>
+		<WORKFLOW_PRIORITY>35</WORKFLOW_PRIORITY>
+		<CONFIGURATION status="EXECUTABLE" start="MANUAL">
+			<PARAMETERS>
+				<PARAM description="Oozie Job name" function="listProfiles('HadoopJobConfigurationDSResourceType', '//HADOOP_JOB/@name','executeOozie')" managedBy="user" name="oozieJobName" required="true" type="string"/>
+				<PARAM name="workingDirPath" description="working dir where all the intermediate versions of the graph are generated" required="true" type="string" managedBy="user"/>
+				<PARAM name="cluster" description="Hadoop cluster logical name" required="true" managedBy="user" type="string" function="validValues(['DM','IIS','GARR'])"/>
+				<PARAM name="reusePublication" description="reuse publications on HDFS?" required="true" type="boolean" managedBy="user"/>
+				<PARAM name="reuseResolvedPublication" description="reuse resolved publications on HDFS?" required="true" type="boolean" managedBy="user"/>
+				<PARAM name="reuseDataset" description="reuse datasets on HDFS?" required="true" type="boolean" managedBy="user"/>
+				<PARAM name="reuseResolvedDataset" description="reuse resolved datasets on HDFS?" required="true" type="boolean" managedBy="user"/>
+				<PARAM name="reuseUnresolved" description="reuse unresolved objects on HDFS?" required="true" type="boolean" managedBy="user"/>
+				<PARAM name="dedupConfigDataset" description="dedup configuration orchestration name" required="true" type="string" function="obtainValues('dedupOrchestrations', {})" managedBy="user"/>
+				<PARAM name="dedupConfigPublication" description="dedup configuration orchestration name" required="true" type="string" function="listProfiles('DedupConfigurationDSResources', '//DESCRIPTION', '')" managedBy="user"/>
+				<PARAM name="dedupConfigUnknown" description="dedup configuration orchestration name" required="true" type="string" function="obtainValues('dedupOrchestrations', {})" managedBy="user"/>
+			</PARAMETERS>
+			<WORKFLOW>
+				<NODE name="pmf2hdfs" type="LaunchWorkflowTemplate">
+					<DESCRIPTION>import PMF Publications to HDFS DIR</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+						<PARAM name="wfTemplateParams">
+							<MAP>
+								<ENTRY key="cluster" ref="cluster"/>
+								<ENTRY key="reuseMdRecords" ref="reusePublication"/>
+								<ENTRY key="mdFormat" value="PMF"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/xml/pmf.dli.seq"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/input/0"/>
+								<ENTRY key="entity" value="publication"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="extractPublication"/>
+					</ARCS>
+				</NODE>
+				<NODE name="extractPublication" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/input/0"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetDir" value="0"/>
+								<ENTRY key="entities" value="publication,unknown,relation"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="pmfResolved2hdfs"/>
+					</ARCS>
+				</NODE>
+				<NODE name="pmfResolved2hdfs" type="LaunchWorkflowTemplate">
+					<DESCRIPTION>import resolved PMF Publications to HDFS DIR</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+						<PARAM name="wfTemplateParams">
+							<MAP>
+								<ENTRY key="cluster" ref="cluster"/>
+								<ENTRY key="reuseMdRecords" ref="reuseResolvedPublication"/>
+								<ENTRY key="mdFormat" value="PMF"/>
+								<ENTRY key="interpretation" value="resolved"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/xml/pmf.dli.resolved.seq"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/input/1"/>
+								<ENTRY key="entity" value="publication"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="extractPublicationResolved"/>
+					</ARCS>
+				</NODE>
+				<NODE name="extractPublicationResolved" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/input/1"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetDir" value="1"/>
+								<ENTRY key="entities" value="publication"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="dmf2hdfs"/>
+					</ARCS>
+				</NODE>
+				<NODE name="dmf2hdfs" type="LaunchWorkflowTemplate">
+					<DESCRIPTION>import DMF Datasets to HDFS DIR</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+						<PARAM name="wfTemplateParams">
+							<MAP>
+								<ENTRY key="cluster" ref="cluster"/>
+								<ENTRY key="reuseMdRecords" ref="reuseDataset"/>
+								<ENTRY key="mdFormat" value="DMF"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/xml/dmf.dli.seq"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/input/2"/>
+								<ENTRY key="entity" value="dataset"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="extractDataset"/>
+					</ARCS>
+				</NODE>
+				<NODE name="extractDataset" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/input/2"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="targetDir" value="2"/>
+								<ENTRY key="entities" value="dataset,unknown,relation"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="dmfResolved2hdfs"/>
+					</ARCS>
+				</NODE>
+				<NODE name="dmfResolved2hdfs" type="LaunchWorkflowTemplate">
+					<DESCRIPTION>import resolved DMF Datasets to HDFS DIR</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="wfTemplateId" value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+						<PARAM name="wfTemplateParams">
+							<MAP>
+								<ENTRY key="cluster" ref="cluster"/>
+								<ENTRY key="reuseMdRecords" ref="reuseResolvedDataset"/>
+								<ENTRY key="mdFormat" value="DMF"/>
+								<ENTRY key="interpretation" value="resolved"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/xml/dmf.dli.resolved.seq"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/input/3"/>
+								<ENTRY key="entity" value="dataset"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="extractDatasetResolved"/>
+					</ARCS>
+				</NODE>
+				<NODE name="extractDatasetResolved" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/extractentities/oozie_app"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/input/3"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetDir" value="3"/>
+								<ENTRY key="entities" value="dataset"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="mergeDataset"/>
+					</ARCS>
+				</NODE>
+				<NODE isStart="true" name="mergeDataset" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
+								<ENTRY key="entity" value="dataset"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="mergePublication"/>
+					</ARCS>
+				</NODE>
+				<NODE name="mergePublication" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
+								<ENTRY key="entity" value="publication"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="mergeUnknown"/>
+					</ARCS>
+				</NODE>
+				<NODE name="mergeUnknown" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
+								<ENTRY key="entity" value="unknown"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="mergeRelation"/>
+					</ARCS>
+				</NODE>
+				<NODE name="mergeRelation" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJobName"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app"/>
+								<ENTRY key="workingDirPath" value="${workingDirPath}"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/extracted"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/graph"/>
+								<ENTRY key="entity" value="relation"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="success"/>
+					</ARCS>
+				</NODE>
+				<NODE name="dedupPublication" type="LaunchWorkflowTemplate">
+					<DESCRIPTION>Deduplicate publications</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="wfTemplateId" value="b8e1afcf-e5ca-47d0-9ee8-47da90e1a9c3_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+						<PARAM name="wfTemplateParams">
+							<MAP>
+								<ENTRY key="cluster" ref="cluster"/>
+								<ENTRY key="sourcePath" value="${workingDirPath}/graph"/>
+								<ENTRY key="targetPath" value="${workingDirPath}/dedupGraphWD"/>
+								<ENTRY key="entity" value="publication"/>
+								<ENTRY key="dedup_conf" ref="dedupConfigPublication"/>
+								<ENTRY key="oozieJob" ref="oozieJobName"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="success"/>
+					</ARCS>
+				</NODE>
+			</WORKFLOW>
+		</CONFIGURATION>
+		<NOTIFICATIONS/>
+		<SCHEDULING enabled="false">
+			<CRON>29 5 22 ? * *</CRON>
+			<MININTERVAL>10080</MININTERVAL>
+		</SCHEDULING>
+		<STATUS/>
+	</BODY>
+</RESOURCE_PROFILE>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/dedupEntitywf.xml
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value="b8e1afcf-e5ca-47d0-9ee8-47da90e1a9c3_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+        <RESOURCE_TYPE value="WorkflowTemplateDSResourceType"/>
+        <RESOURCE_KIND value="WorkflowTemplateDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2020-02-14T12:27:20+01:00"/>
+    </HEADER>
+    <BODY>
+        <CONFIGURATION>
+            <PARAMETERS>
+                <PARAM description="HDFS path" name="sourcePath" required="true" type="string"/>
+                <PARAM description="HDFS path" name="targetPath" required="true" type="string"/>
+                <PARAM description="entity type" name="entity" required="true" type="string"/>
+                <PARAM description="Hadoop cluster name" name="cluster" required="true" type="string"/>
+                <PARAM description="Dedup Configuration" name="dedup_conf" required="true" type="string"/>
+            </PARAMETERS>
+            <WORKFLOW>
+                <NODE isStart="true" name="findDedupConfiguration" type="FindDedupConfiguration">
+                    <DESCRIPTION>Find Dedup configuration</DESCRIPTION>
+                    <PARAMETERS>
+                        <PARAM name="configName" ref="dedup_conf"/>
+                    </PARAMETERS>
+                    <ARCS>
+                        <ARC to="dedupPublication"/>
+                    </ARCS>
+                </NODE>
+
+                <NODE name="dedupPublication" type="SubmitHadoopJob">
+                    <DESCRIPTION>Run M/R import Job</DESCRIPTION>
+                    <PARAMETERS>
+                        <PARAM name="cluster" ref="cluster"/>
+                        <PARAM name="hadoopJob" value="executeOozieJobGARR"/>
+                        <PARAM name="jobParams">
+                            <MAP>
+                                <ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/dedup/oozie_app"/>
+                                <ENTRY key="sourcePath" ref="sourcePath"/>
+                                <ENTRY key="targetPath" ref="targetPath"/>
+                                <ENTRY key="entity" ref="entity"/>
+                                <ENTRY key="dedupConf" env="dconf"/>
+                            </MAP>
+                        </PARAM>
+                    </PARAMETERS>
+                    <ARCS>
+                        <ARC to="success"/>
+                    </ARCS>
+                </NODE>
+            </WORKFLOW>
+        </CONFIGURATION>
+    </BODY>
+</RESOURCE_PROFILE>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/mergeDLIEntities.xml
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value="6f111858-1719-48a9-8e90-8f829497f7a8_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+        <RESOURCE_TYPE value="WorkflowTemplateDSResourceType"/>
+        <RESOURCE_KIND value="WorkflowTemplateDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2016-06-20T07:52:08+00:00"/>
+    </HEADER>
+    <BODY>
+        <CONFIGURATION>
+            <PARAMETERS>
+                <PARAM name="targetDir" description="metadata interpretation name" required="false" type="string" default="cleaned"/>
+                <PARAM name="sourcePath" description="HDFS path" required="true" type="string"/>
+                <PARAM name="targetPath" description="HDFS path" required="true" type="string"/>
+                <PARAM name="entity" description="entity type" required="true" type="string"/>
+                <PARAM name="cluster" description="Hadoop cluster name" required="true" type="string"/>
+            </PARAMETERS>
+            <WORKFLOW>
+                <NODE name="convertEntity" type="SubmitHadoopJob">
+                    <DESCRIPTION>Run M/R import Job</DESCRIPTION>
+                    <PARAMETERS>
+                        <PARAM name="cluster" ref="cluster"/>
+                        <PARAM name="hadoopJob" value="importMetadataToHDFS"/>
+                        <PARAM name="jobParams">
+                            <MAP>
+                                <ENTRY key="sourcePath" ref="sourcePath"/>
+                                <ENTRY key="targetPath" ref="targetPath"/>
+                                <ENTRY key="entity" ref="entity"/>
+                            </MAP>
+                        </PARAM>
+                    </PARAMETERS>
+                    <ARCS>
+                        <ARC to="success"/>
+                    </ARCS>
+                </NODE>
+            </WORKFLOW>
+        </CONFIGURATION>
+    </BODY>
+</RESOURCE_PROFILE>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResourceType/importMetadataToHDFS.xml
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value="ea5ea1a0-a750-42db-8dfa-1606158698da_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
+        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
+        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2019-04-12T13:16:20+02:00"/>
+    </HEADER>
+    <BODY>
+        <HADOOP_JOB name="importMetadataToHDFS" type="oozie">
+            <DESCRIPTION>Import XML Data into sequence File</DESCRIPTION>
+            <STATIC_CONFIGURATION><!-- Cluster wide -->
+                <PROPERTY key="queueName" value="default"/>
+                <PROPERTY key="user.name" value="sandro.labruzzo"/><!-- Runtime -->
+                <PROPERTY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/oozie_app/"/>
+                <PROPERTY key="oozie.wf.validate.ForkJoin" value="false"/>
+                <PROPERTY key="oozie.use.system.libpath" value="True"/>
+                <PROPERTY key="security_enabled" value="False"/>
+                <PROPERTY key="dryrun" value="True"/>
+                <PROPERTY key="oozie.action.sharelib.for.spark" value="spark2"/>
+                <PROPERTY key="sparkDriverMemory" value="3G"/>
+                <PROPERTY key="sparkExecutorMemory" value="3G"/>
+                <PROPERTY key="metadataEncoding" value="XML"/>
+            </STATIC_CONFIGURATION>
+            <JOB_INTERFACE>
+                <PARAM description="the path of the input hdfs file containing xml" name="sourcePath" required="true"/>
+                <PARAM description="the path of the result hdfs containing OAF entities" name="targetPath" required="true"/>
+                <PARAM description="The entity type" name="entity" required="true"/>
+            </JOB_INTERFACE>
+        </HADOOP_JOB>
+        <STATUS>
+            <LAST_SUBMISSION_DATE value="2019-04-17T17:46:31+02:00"/>
+            <RUNNING_INSTANCES value="2"/>
+            <CUMULATIVE_RUN value="83"/>
+        </STATUS>
+        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
+    </BODY>
+</RESOURCE_PROFILE>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResourceType/mergeEntitiesToHDFS.xml
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value="80b642da-533c-4c6f-b896-fbba12146175_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
+        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
+        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2019-04-12T13:16:20+02:00"/>
+    </HEADER>
+    <BODY>
+        <HADOOP_JOB name="mergeEntitiesToHDFS" type="oozie">
+            <DESCRIPTION>Merge OAF entities on HDFS</DESCRIPTION>
+            <STATIC_CONFIGURATION><!-- Cluster wide -->
+                <PROPERTY key="queueName" value="default"/>
+                <PROPERTY key="user.name" value="sandro.labruzzo"/><!-- Runtime -->
+                <PROPERTY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/mergeentities/oozie_app/"/>
+                <PROPERTY key="oozie.wf.validate.ForkJoin" value="false"/>
+                <PROPERTY key="oozie.use.system.libpath" value="True"/>
+                <PROPERTY key="security_enabled" value="False"/>
+                <PROPERTY key="dryrun" value="True"/>
+                <PROPERTY key="oozie.action.sharelib.for.spark" value="spark2"/>
+                <PROPERTY key="sparkDriverMemory" value="4G"/>
+                <PROPERTY key="sparkExecutorMemory" value="4G"/>
+                <PROPERTY key="metadataEncoding" value="XML"/>
+            </STATIC_CONFIGURATION>
+            <JOB_INTERFACE>
+                <PARAM description="the path of the input hdfs file containing the extracted entities" name="sourcePath" required="true"/>
+                <PARAM description="the baseDir path of the result hdfs containing OAF merged entities" name="targetPath" required="true"/>
+                <PARAM description="The entity type" name="entity" required="true"/>
+            </JOB_INTERFACE>
+        </HADOOP_JOB>
+        <STATUS>
+            <LAST_SUBMISSION_DATE value="2019-04-17T17:46:31+02:00"/>
+            <RUNNING_INSTANCES value="2"/>
+            <CUMULATIVE_RUN value="83"/>
+        </STATUS>
+        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
+    </BODY>
+</RESOURCE_PROFILE>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResourceType/executeOozieJobICM.xml
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value="ed671880-d053-4692-90e4-5901ddde661b_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
+        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
+        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2020-02-17T15:36:24+01:00"/>
+    </HEADER>
+    <BODY>
+        <HADOOP_JOB name="executeOozieJobICM" type="oozie">
+            <DESCRIPTION>Execute an Oozie job on the ICM cluster</DESCRIPTION>
+            <STATIC_CONFIGURATION><!-- Cluster wide -->
+                <PROPERTY key="queueName" value="default"/>
+                <PROPERTY key="user.name" value="sandro.labruzzo"/>
+                <PROPERTY key="oozie.wf.validate.ForkJoin" value="false"/>
+                <PROPERTY key="oozie.use.system.libpath" value="True"/>
+                <PROPERTY key="security_enabled" value="False"/>
+                <PROPERTY key="dryrun" value="True"/>
+                <PROPERTY key="oozie.action.sharelib.for.spark" value="spark2"/>
+                <PROPERTY key="sparkDriverMemory" value="4G"/>
+                <PROPERTY key="sparkExecutorMemory" value="4G"/>
+                <PROPERTY key="metadataEncoding" value="XML"/>
+                <PROPERTY key="jobTracker" value="yarnRM"/>
+                <PROPERTY key="nameNode" value="hdfs://nameservice1"/>
+                <PROPERTY key="sparkExtraOPT" value="--conf spark.extraListeners=&#34;com.cloudera.spark.lineage.NavigatorAppListener&#34; --conf spark.sql.queryExecutionListeners=&#34;com.cloudera.spark.lineage.NavigatorQueryListener&#34; --conf spark.sql.warehouse.dir=&#34;/user/hive/warehouse&#34;"/>
+                <PROPERTY key="projectVersion" value="1.0.5-SNAPSHOT"/>
+            </STATIC_CONFIGURATION>
+            <JOB_INTERFACE>
+                <PARAM description="the hdfs path of the Oozie workflow application" name="oozie.wf.application.path" required="true"/>
+            </JOB_INTERFACE>
+        </HADOOP_JOB>
+        <STATUS>
+            <LAST_SUBMISSION_DATE value="2020-02-17T14:18:37+01:00"/>
+            <RUNNING_INSTANCES value="4"/>
+            <CUMULATIVE_RUN value="105"/>
+        </STATUS>
+        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
+    </BODY>
+</RESOURCE_PROFILE>
modules/dnet-dli/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/HadoopJobConfigurationDSResourceType/executeOozieJobGARR.xml
+<RESOURCE_PROFILE>
+    <HEADER>
+        <RESOURCE_IDENTIFIER value="c277d01e-d10b-4150-8d9f-caf524e40fe3_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
+        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
+        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
+        <RESOURCE_URI value=""/>
+        <DATE_OF_CREATION value="2019-04-12T13:16:20+02:00"/>
+    </HEADER>
+    <BODY>
+        <HADOOP_JOB name="executeOozieJobGARR" type="oozie">
+            <DESCRIPTION>Execute an Oozie job on the GARR cluster</DESCRIPTION>
+            <STATIC_CONFIGURATION><!-- Cluster wide -->
+                <PROPERTY key="queueName" value="default"/>
+                <PROPERTY key="user.name" value="sandro.labruzzo"/><!-- Runtime -->
+<!--                <PROPERTY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/oozie_app/"/>-->
+                <PROPERTY key="oozie.wf.validate.ForkJoin" value="false"/>
+                <PROPERTY key="oozie.use.system.libpath" value="True"/>
+                <PROPERTY key="security_enabled" value="False"/>
+                <PROPERTY key="dryrun" value="True"/>
+                <PROPERTY key="oozie.action.sharelib.for.spark" value="spark2"/>
+                <PROPERTY key="sparkDriverMemory" value="4G"/>
+                <PROPERTY key="sparkExecutorMemory" value="4G"/>
+                <PROPERTY key="metadataEncoding" value="XML"/>
+                <PROPERTY key="jobTracker" value="hadoop-rm3.garr-pa1.d4science.org:8032"/>
+                <PROPERTY key="nameNode" value="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020"/>
+                <PROPERTY key="projectVersion" value="1.0.5-SNAPSHOT"/>
+            </STATIC_CONFIGURATION>
+            <JOB_INTERFACE>
+                <PARAM description="the hdfs path of the Oozie workflow application" name="oozie.wf.application.path" required="true"/>
+            </JOB_INTERFACE>
+        </HADOOP_JOB>
+        <STATUS>
+            <LAST_SUBMISSION_DATE value="2019-04-17T17:46:31+02:00"/>
+            <RUNNING_INSTANCES value="2"/>
+            <CUMULATIVE_RUN value="83"/>
+        </STATUS>
+        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
+    </BODY>
+</RESOURCE_PROFILE>
modules/dnet-graph/trunk/src/main/resources/eu/dnetlib/bootstrap/profiles/workflows/metadata2hdfs.xml
+<RESOURCE_PROFILE>
+	<HEADER>
+		<RESOURCE_IDENTIFIER value="4a268738-b635-4d86-9a4a-52bec6d20866_V29ya2Zsb3dUZW1wbGF0ZURTUmVzb3VyY2VzL1dvcmtmbG93VGVtcGxhdGVEU1Jlc291cmNlVHlwZQ=="/>
+		<RESOURCE_TYPE value="WorkflowTemplateDSResourceType"/>
+		<RESOURCE_KIND value="WorkflowTemplateDSResources"/>
+		<RESOURCE_URI value=""/>
+		<DATE_OF_CREATION value="2020-02-17T16:16:24+01:00"/>
+	</HEADER>
+	<BODY>
+		<CONFIGURATION>
+			<PARAMETERS>
+				<PARAM description="reuse metadata records?" name="reuseMdRecords" required="true" type="boolean"/>
+				<PARAM description="metadata format name" name="mdFormat" required="true" type="string"/>
+				<PARAM default="cleaned" description="metadata interpretation name" name="interpretation" required="false" type="string"/>
+				<PARAM description="HDFS path" name="sourcePath" required="true" type="string"/>
+				<PARAM description="HDFS path" name="targetPath" required="true" type="string"/>
+				<PARAM description="entity type" name="entity" required="true" type="string"/>
+				<PARAM description="Hadoop cluster name" name="cluster" required="true" type="string"/>
+				<PARAM description="Oozie Job Name" name="oozieJob" required="true" type="string"/>
+			</PARAMETERS>
+			<WORKFLOW>
+				<NODE isStart="true" name="reuseHdfsRecords" type="ReuseHdfsRecords">
+					<DESCRIPTION>reuse mdstore records</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="reuseMdRecords" ref="reuseMdRecords"/>
+					</PARAMETERS>
+					<ARCS>
+						<ARC name="true" to="doneExport"/>
+						<ARC name="false" to="exportRecords"/>
+					</ARCS>
+				</NODE>
+				<NODE name="exportRecords" type="MDStoreBatchExporter">
+					<DESCRIPTION>Fetch mdstore records</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="format" ref="mdFormat"/>
+						<PARAM name="layout" value="store"/>
+						<PARAM name="interpretation" ref="interpretation"/>
+						<PARAM name="outputEprParam" value="records_epr"/>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="storeHdfsRecords"/>
+					</ARCS>
+				</NODE>
+				<NODE name="storeHdfsRecords" type="StoreHdfsRecords">
+					<DESCRIPTION>Store records to HDFS</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="inputEprParam" value="records_epr"/>
+						<PARAM name="hdfsPath" ref="hdfsPath"/>
+						<PARAM name="cluster" ref="cluster"/>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="doneExport"/>
+					</ARCS>
+				</NODE>
+				<NODE name="doneExport">
+					<DESCRIPTION/>
+					<PARAMETERS/>
+					<ARCS>
+						<ARC to="convertEntity"/>
+					</ARCS>
+				</NODE>
+				<NODE name="convertEntity" type="SubmitHadoopJob">
+					<DESCRIPTION>Run M/R import Job</DESCRIPTION>
+					<PARAMETERS>
+						<PARAM name="cluster" ref="cluster"/>
+						<PARAM name="hadoopJob" ref="oozieJob"/>
+						<PARAM name="jobParams">
+							<MAP>
+								<ENTRY key="sourcePath" ref="sourcePath"/>
+								<ENTRY key="targetPath" ref="targetPath"/>
+								<ENTRY key="entity" ref="entity"/>
+								<ENTRY key="oozie.wf.application.path" value="/user/sandro.labruzzo/graph/scholexplorer/oozie_app"/>
+							</MAP>
+						</PARAM>
+					</PARAMETERS>
+					<ARCS>
+						<ARC to="success"/>
+					</ARCS>
+				</NODE>
+			</WORKFLOW>
+		</CONFIGURATION>
+	</BODY>
+</RESOURCE_PROFILE>
modules/dnet-core-components/trunk/src/main/java/eu/dnetlib/rmi/data/hadoop/ClusterName.java
  */
 public enum ClusterName {
 	DM, // Data Management
+	GARR, // Data Management at GARR with spark2
 	IIS; // Information Inference Service(s)
 
 	public static List<String> asStringList() {
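
Note: the new GARR constant is what makes the cluster selectable in the validValues(['DM','IIS','GARR']) workflow parameter above. The body of asStringList() is outside this hunk; a plausible sketch (the class name below is hypothetical, for illustration only):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public enum ClusterNameSketch {
    DM, GARR, IIS;

    // Plausible implementation of ClusterName.asStringList(); the real body is not shown in this diff.
    public static List<String> asStringList() {
        return Arrays.stream(values()).map(Enum::name).collect(Collectors.toList());
    }

    public static void main(String[] args) {
        System.out.println(asStringList());                     // [DM, GARR, IIS]
        System.out.println(ClusterNameSketch.valueOf("GARR"));  // resolves the new cluster
    }
}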
modules/dnet-msro-service/trunk/src/test/java/eu/dnetlib/graph/GraphLoaderTest.java
+package eu.dnetlib.graph;
+
+import org.junit.Test;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class GraphLoaderTest {
+
+    final String regex = "\\$\\{(\\w*)\\}";
+
+    final Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
+
+    @Test
+    public void testRegEx() {
+        String string = "${1234}/sdfjasdfpojawpdf/${dd}";
+        final Matcher matcher = pattern.matcher(string);
+        int k = 0;
+        while (matcher.find()) {
+            System.out.println("Full match: " + matcher.group(0));
+            for (int i = 1; i <= matcher.groupCount(); i++) {
+                System.out.println("Group " + i + ": " + matcher.group(i));
+                string = string.replaceAll(Pattern.quote(matcher.group(0)), "VALORE" + k++);
+                System.out.println("new String  = " + string);
+            }
+        }
+    }
+}
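
Note: the test above only prints its results and never asserts. A tighter variant that pins down the expected substitution could look like this (same regex, JUnit 4 as already imported; the class name is hypothetical):

import static org.junit.Assert.assertEquals;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

public class PlaceholderRegexAssertTest {

    private final Pattern pattern = Pattern.compile("\\$\\{(\\w*)\\}", Pattern.MULTILINE);

    @Test
    public void replacesEveryPlaceholder() {
        String s = "${1234}/sdfjasdfpojawpdf/${dd}";
        int k = 0;
        final Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            // Replace each ${...} occurrence with a numbered token, as the original test does.
            s = s.replaceAll(Pattern.quote(matcher.group(0)), "VALORE" + k++);
        }
        assertEquals("VALORE0/sdfjasdfpojawpdf/VALORE1", s);
    }
}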
modules/dnet-msro-service/trunk/src/main/java/eu/dnetlib/msro/workflows/graph/GraphLoader.java
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import javax.annotation.Resource;
......
  */
 public class GraphLoader {
 
-	private static final Log log = LogFactory.getLog(GraphLoader.class);
+    private static final Log log = LogFactory.getLog(GraphLoader.class);
 
-	private NodeHelper nodeHelper;
+    private NodeHelper nodeHelper;
 
-	@Resource(name = "propertyFetcher")
-	private PropertyFetcher propertyFetcher;
+    private String regExRef = "\\$\\{(\\w*)\\}";
 
-	public Graph loadGraph(final Document doc, final Map<String, String> globalParams) throws MSROException {
-		final Graph graph = new Graph();
+    final Pattern pattern = Pattern.compile(regExRef, Pattern.MULTILINE);
 
-		for (final Object o : doc.selectNodes("//CONFIGURATION/WORKFLOW/NODE")) {
-			final Element n = (Element) o;
-			final String nodeName = n.valueOf("@name");
-			final String nodeType = n.valueOf("@type");
-			final boolean isStart = StringUtils.equalsIgnoreCase(n.valueOf("@isStart"), "true");
-			final boolean isJoin = StringUtils.equalsIgnoreCase(n.valueOf("@isJoin"), "true");
 
-			final Map<String, GraphNodeParameter> params = calculateParamsForNode(n, globalParams);
+    @Resource(name = "propertyFetcher")
+    private PropertyFetcher propertyFetcher;
 
-			if (isStart) {
-				graph.addNode(GraphNode.newStartNode(nodeName, nodeType, params));
-			} else if (isJoin) {
-				graph.addNode(GraphNode.newJoinNode(nodeName, nodeType, params));
-			} else {
-				graph.addNode(GraphNode.newNode(nodeName, nodeType, params));
-			}
+    public Graph loadGraph(final Document doc, final Map<String, String> globalParams) throws MSROException {
+        final Graph graph = new Graph();
 
-			for (final Object o1 : n.selectNodes(".//ARC")) {
-				final Element a = (Element) o1;
-				final String arcName = a.valueOf("@name");
-				final String to = a.valueOf("@to");
-				graph.addArc(new Arc(StringUtils.isNotBlank(arcName) ? arcName : Arc.DEFAULT_ARC, nodeName, to));
-			}
+        for (final Object o : doc.selectNodes("//CONFIGURATION/WORKFLOW/NODE")) {
+            final Element n = (Element) o;
+            final String nodeName = n.valueOf("@name");
+            final String nodeType = n.valueOf("@type");
+            final boolean isStart = StringUtils.equalsIgnoreCase(n.valueOf("@isStart"), "true");
+            final boolean isJoin = StringUtils.equalsIgnoreCase(n.valueOf("@isJoin"), "true");
 
-			graph.addNode(GraphNode.newSuccessNode());
-		}
+            final Map<String, GraphNodeParameter> params = calculateParamsForNode(n, globalParams);
 
-		checkValidity(graph);
+            if (isStart) {
+                graph.addNode(GraphNode.newStartNode(nodeName, nodeType, params));
+            } else if (isJoin) {
+                graph.addNode(GraphNode.newJoinNode(nodeName, nodeType, params));
+            } else {
+                graph.addNode(GraphNode.newNode(nodeName, nodeType, params));
+            }
 
-		return graph;
-	}
+            for (final Object o1 : n.selectNodes(".//ARC")) {
+                final Element a = (Element) o1;
+                final String arcName = a.valueOf("@name");
+                final String to = a.valueOf("@to");
+                graph.addArc(new Arc(StringUtils.isNotBlank(arcName) ? arcName : Arc.DEFAULT_ARC, nodeName, to));
+            }
 
-	public Map<String, GraphNodeParameter> calculateParamsForNode(final Node node, final Map<String, String> globalParams) {
+            graph.addNode(GraphNode.newSuccessNode());
+        }
 
-		final Map<String, GraphNodeParameter> params = new HashMap<>();
+        checkValidity(graph);
 
-		if (node != null) {
-			for (final Object o : node.selectNodes(".//PARAM")) {
-				final Element p = (Element) o;
+        return graph;
+    }
 
-				final String pName = p.valueOf("@name");
-				final GraphNodeParameter pValue = calculateSimpleValue((Element) o, globalParams);
+    public Map<String, GraphNodeParameter> calculateParamsForNode(final Node node, final Map<String, String> globalParams) {
 
-				if (pValue != null) {
-					params.put(pName, pValue);
-				} else if (p.selectSingleNode("./MAP") != null) {
+        final Map<String, GraphNodeParameter> params = new HashMap<>();
 
-					@SuppressWarnings("unchecked")
-					final Map<String, GraphNodeParameter> map = ((List<Element>) p.selectNodes("./MAP/ENTRY"))
-							.stream()
-							.collect(Collectors.toMap(
-											e -> e.valueOf("@key"),
-											e -> {
-												final GraphNodeParameter gnp = calculateSimpleValue(e, globalParams);
-												if (gnp == null) {
-													final String msg = String.format("missing value for param: \"%s\"", e.valueOf("@key"));
-													log.debug(msg);
-													return GraphNodeParameter.newNullParam();
-												}
-												return gnp;
-											}));
+        if (node != null) {
+            for (final Object o : node.selectNodes(".//PARAM")) {
+                final Element p = (Element) o;
 
-					params.put(pName, GraphNodeParameter.newMapParam(map));
+                final String pName = p.valueOf("@name");
+                final GraphNodeParameter pValue = calculateSimpleValue((Element) o, globalParams);
 
-				} else if (p.selectSingleNode("./LIST") != null) {
-					@SuppressWarnings("unchecked")
-					final List<GraphNodeParameter> list = ((List<Element>) p.selectNodes("./LIST/ITEM"))
-							.stream()
-							.map(e -> calculateSimpleValue(e, globalParams))
-							.collect(Collectors.toList());
-					params.put(pName, GraphNodeParameter.newListParam(list));
-				}
-			}
-		}
+                if (pValue != null) {
+                    params.put(pName, pValue);
+                } else if (p.selectSingleNode("./MAP") != null) {
 
-		return params;
-	}
+                    @SuppressWarnings("unchecked") final Map<String, GraphNodeParameter> map = ((List<Element>) p.selectNodes("./MAP/ENTRY"))
+                            .stream()
+                            .collect(Collectors.toMap(
+                                    e -> e.valueOf("@key"),
+                                    e -> {
+                                        final GraphNodeParameter gnp = calculateSimpleValue(e, globalParams);
+                                        if (gnp == null) {
+                                            final String msg = String.format("missing value for param: \"%s\"", e.valueOf("@key"));
+                                            log.debug(msg);
+                                            return GraphNodeParameter.newNullParam();
+                                        }
+                                        return gnp;
+                                    }));
 
-	private GraphNodeParameter calculateSimpleValue(final Element elem, final Map<String, String> globalParams) {
-		final String value = elem.valueOf("@value");
-		final String ref = elem.valueOf("@ref");
-		final String prop = elem.valueOf("@property");
-		final String envRef = elem.valueOf("@env");
+                    params.put(pName, GraphNodeParameter.newMapParam(map));
 
-		if (StringUtils.isNotBlank(ref) && StringUtils.isNotBlank(globalParams.get(ref))) {
-			return GraphNodeParameter.newSimpleParam(globalParams.get(ref));
-		} else if (StringUtils.isNotBlank(envRef)) {
-			return GraphNodeParameter.newEnvParam(envRef);
-		} else if (StringUtils.isNotBlank(value)) {
-			return GraphNodeParameter.newSimpleParam(value);
-		} else if (StringUtils.isNotBlank(prop)) {
-			return GraphNodeParameter.newSimpleParam(this.propertyFetcher.getProperty(prop));
-		} else {
-			return null;
-		}
-	}
+                } else if (p.selectSingleNode("./LIST") != null) {
+                    @SuppressWarnings("unchecked") final List<GraphNodeParameter> list = ((List<Element>) p.selectNodes("./LIST/ITEM"))
+                            .stream()
+                            .map(e -> calculateSimpleValue(e, globalParams))
+                            .collect(Collectors.toList());
+                    params.put(pName, GraphNodeParameter.newListParam(list));
+                }
+            }
+        }
 
-	private void checkValidity(final Graph graph) throws MSROException {
+        return params;
+    }
 
-		final Set<String> nodesFromArcs = new HashSet<String>();
+    private GraphNodeParameter calculateSimpleValue(final Element elem, final Map<String, String> globalParams) {
+        String value = elem.valueOf("@value");
+        final String ref = elem.valueOf("@ref");
+        final String prop = elem.valueOf("@property");
+        final String envRef = elem.valueOf("@env");
 
-		boolean foundSuccess = false;
-		boolean foundStart = false;
+        if (StringUtils.isNotBlank(ref) && StringUtils.isNotBlank(globalParams.get(ref))) {
+            return GraphNodeParameter.newSimpleParam(globalParams.get(ref));
+        } else if (StringUtils.isNotBlank(envRef)) {
+            return GraphNodeParameter.newEnvParam(envRef);
+        } else if (StringUtils.isNotBlank(value)) {
+            // Resolve ${name} placeholders against the global workflow params.
+            Matcher matcher = pattern.matcher(value);
+            while (matcher.find()) {
+                final String rName = matcher.group(1);
+                final String rValue = globalParams.get(rName);
+                if (StringUtils.isBlank(rValue)) {
+                    return null;
+                }
+                value = value.replaceAll(Pattern.quote(matcher.group(0)), rValue);
+                log.debug("NEW VALUE " + value);
+            }
+            return GraphNodeParameter.newSimpleParam(value);
+        } else if (StringUtils.isNotBlank(prop)) {
+            return GraphNodeParameter.newSimpleParam(this.propertyFetcher.getProperty(prop));
+        } else {
+            return null;
+        }
 
-		for (final Arc arc : graph.getArcs()) {
-			if (StringUtils.isBlank(arc.getFrom()) || StringUtils.isBlank(arc.getFrom())) { throw new MSROException("Invalid arc: missing from e/o to"); }
-			if (StringUtils.equals(arc.getTo(), GraphNode.SUCCESS_NODE)) {
-				foundSuccess = true;
-			}
-			nodesFromArcs.add(arc.getFrom());
-			nodesFromArcs.add(arc.getTo());
-		}
+    }
 
-		if (!foundSuccess) { throw new MSROException("Arc to success not found"); }
+    private void checkValidity(final Graph graph) throws MSROException {
 
-		final Set<String> diff = Sets.symmetricDifference(graph.nodeNames(), nodesFromArcs);
-		if (!diff.isEmpty()) { throw new MSROException("Missing or invalid nodes in arcs: " + diff); }
+        final Set<String> nodesFromArcs = new HashSet<String>();
 
-		for (final GraphNode n : graph.nodes()) {
-			if (StringUtils.isBlank(n.getName())) { throw new MSROException("Invalid node: missing name"); }
-			if (n.isStart()) {
-				foundStart = true;
-			}
-			if (!this.nodeHelper.isValidType(n.getType())) { throw new MSROException("Invalid node type: " + n.getType()); }
-		}
-		if (!foundStart) { throw new MSROException("Start node not found"); }
-	}
+        boolean foundSuccess = false;
+        boolean foundStart = false;
 
-	public NodeHelper getNodeHelper() {
-		return this.nodeHelper;
-	}
+        for (final Arc arc : graph.getArcs()) {
+            if (StringUtils.isBlank(arc.getFrom()) || StringUtils.isBlank(arc.getFrom())) {
+                throw new MSROException("Invalid arc: missing from e/o to");
+            }
+            if (StringUtils.equals(arc.getTo(), GraphNode.SUCCESS_NODE)) {
+                foundSuccess = true;
+            }
+            nodesFromArcs.add(arc.getFrom());
+            nodesFromArcs.add(arc.getTo());
+        }
 
-	@Required
-	public void setNodeHelper(final NodeHelper nodeHelper) {
-		this.nodeHelper = nodeHelper;
-	}
+        if (!foundSuccess) {
+            throw new MSROException("Arc to success not found");
+        }
 
+        final Set<String> diff = Sets.symmetricDifference(graph.nodeNames(), nodesFromArcs);
+        if (!diff.isEmpty()) {
+            throw new MSROException("Missing or invalid nodes in arcs: " + diff);
+        }
+
+        for (final GraphNode n : graph.nodes()) {
+            if (StringUtils.isBlank(n.getName())) {
+                throw new MSROException("Invalid node: missing name");
+            }
+            if (n.isStart()) {
+                foundStart = true;
+            }
+            if (!this.nodeHelper.isValidType(n.getType())) {
+                throw new MSROException("Invalid node type: " + n.getType());
+            }
+        }
+        if (!foundStart) {
+            throw new MSROException("Start node not found");
+        }
+    }
+
+    public NodeHelper getNodeHelper() {
+        return this.nodeHelper;
+    }
+
+    @Required
+    public void setNodeHelper(final NodeHelper nodeHelper) {
+        this.nodeHelper = nodeHelper;
+    }
+
 }
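
Note: the net effect of the new calculateSimpleValue() logic is that ${name} placeholders inside a PARAM @value are resolved against the workflow's global params, which is what lets the profiles above write values like ${workingDirPath}/input/0. A standalone sketch of that substitution (class and method names are hypothetical, for illustration only):

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ParamInterpolationDemo {

    private static final Pattern PLACEHOLDER = Pattern.compile("\\$\\{(\\w*)\\}", Pattern.MULTILINE);

    // Mirrors the substitution loop added to GraphLoader.calculateSimpleValue.
    static String interpolate(String value, final Map<String, String> globalParams) {
        final Matcher matcher = PLACEHOLDER.matcher(value);
        while (matcher.find()) {
            final String rValue = globalParams.get(matcher.group(1));
            if (rValue == null || rValue.trim().isEmpty()) {
                return null; // as in GraphLoader: an unresolved placeholder yields no simple param
            }
            value = value.replaceAll(Pattern.quote(matcher.group(0)), rValue);
        }
        return value;
    }

    public static void main(String[] args) {
        final Map<String, String> globals = new HashMap<>();
        globals.put("workingDirPath", "/user/dnet/scholexplorer");
        // Prints /user/dnet/scholexplorer/input/0
        System.out.println(interpolate("${workingDirPath}/input/0", globals));
    }
}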
