Project

General

Profile

« Previous | Next » 

Revision 42411

[maven-release-plugin] copy for tag dnet-openaireplus-profiles-1.0.11

View differences:

modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/coauthorUpdateJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="6d91b311-a7fd-48ff-98d2-1fed70850e3a_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="coauthorUpdateJob" type="mapreduce">
11
 			<DESCRIPTION>update coauthors using a map {merged author id --> anchorId}</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
17
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />			
18
        	
19
        		<!-- MAPPER -->
20
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.gt.CoAuthorUpdateMapper" />
21
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put" />
23
				
24
				
25
				<!-- MISC -->
26
				<PROPERTY key="mapred.output.compress" value="false" />
27
				<PROPERTY key="mapred.compress.map.output" value="true" />	
28
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
29
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
30
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
31
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />
32
							
33
				<PROPERTY key="mapred.reduce.tasks" value="0" />
34
				
35
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
36
				
37
		<!--  	Uncomment to override the default lib path -->			
38
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
39
        	</STATIC_CONFIGURATION>
40
        	<JOB_INTERFACE>
41
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
42
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
43
        		<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs" /> 
44
        	</JOB_INTERFACE>
45
        	<SCAN>
46
                <FILTERS operator="MUST_PASS_ALL">
47
                    <FILTER type="prefix" value="30"/>
48
                </FILTERS>
49
                <FAMILIES>
50
                    <FAMILY value="person"/>
51
                </FAMILIES>
52
        	</SCAN>
53
        </HADOOP_JOB>
54
        <STATUS>
55
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
56
            <RUNNING_INSTANCES value="0"/>
57
            <CUMULATIVE_RUN value="0" />
58
        </STATUS>
59
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
60
    </BODY>
61
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupDeleteSimRelsJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="6363b833-ac88-421c-8596-440a3dc735db_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="deleteSimRelJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that deletes the similarity rels used in the deduplication process (person)</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupDeleteSimRelMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Writable" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
25
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
27
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />		
28
			
29
				<PROPERTY key="mapred.reduce.tasks" value="0" />
30
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
31
				
32
		<!--  	Uncomment to override the default lib path -->			
33
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
34
        	</STATIC_CONFIGURATION>
35
        	<JOB_INTERFACE>
36
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
37
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
38
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
39
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
40
        	</JOB_INTERFACE>
41
        	<SCAN>
42
        		<FILTERS operator="MUST_PASS_ALL">
43
        			<FILTER type="prefix" param="entityTypeId" />
44
        		</FILTERS>
45
        		<FAMILIES>
46
        			<FAMILY param="entityType" />
47
        			<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" />
48
        			<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" />
49
     				<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" />
50
        		</FAMILIES>
51
        	</SCAN>
52
        </HADOOP_JOB>
53
        <STATUS>
54
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
55
            <RUNNING_INSTANCES value="0"/>
56
            <CUMULATIVE_RUN value="0" />
57
        </STATUS>
58
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
59
    </BODY>
60
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/offlineHbaseLoadJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="073e55eb-c6f4-49a9-80b3-1a927612ba5b_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="offlineHbaseLoad" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that loads a given entity type in the offline dedup table</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.OfflineHbaseLoadMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.compress.map.output" value="true" />	
25
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
27
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
28
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />				
29
			
30
				<PROPERTY key="mapred.reduce.tasks" value="0" />
31
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
32
				
33
		<!--  	Uncomment to override the default lib path -->			
34
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
35
        	</STATIC_CONFIGURATION>
36
        	<JOB_INTERFACE>
37
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
38
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
39
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
40
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />
41
        	</JOB_INTERFACE>
42
        	<SCAN>
43
        		<FILTERS operator="MUST_PASS_ALL">
44
        			<FILTER type="prefix" param="entityTypeId" />
45
        		</FILTERS>
46
        		<FAMILIES>
47
        			<FAMILY param="entityType" />
48
        		</FAMILIES>
49
        	</SCAN>
50
        </HADOOP_JOB>
51
        <STATUS>
52
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
53
            <RUNNING_INSTANCES value="0"/>
54
            <CUMULATIVE_RUN value="0" />
55
        </STATUS>
56
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
57
    </BODY>
58
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupPersonJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="29638605-235b-4cc1-9bf5-a5dd2fc84915_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupPersonJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that scans a given entity type and creates the similarRel graph</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT  -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.SimpleDedupPersonMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				
23
				<!-- REDUCER -->
24
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.SimpleDedupPersonReducer" />
25
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />				
26
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Writable" />				
27
				
28
				<!-- MISC -->				
29
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
30
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
31
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
32
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />				
33
			
34
				<PROPERTY key="mapred.reduce.tasks" value="1000" />
35
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
36
				
37
		<!--  	Uncomment to override the default lib path -->			
38
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
39
        	</STATIC_CONFIGURATION>
40
        	<JOB_INTERFACE>
41
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
42
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
43
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
44
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
45
        	</JOB_INTERFACE>
46
        	<SCAN>
47
        		<FILTERS operator="MUST_PASS_ALL">
48
        			<FILTER type="prefix" param="entityTypeId" />
49
        		</FILTERS>
50
        		<FAMILIES>
51
        			<FAMILY param="entityType" />
52
        		</FAMILIES>
53
        	</SCAN>
54
        </HADOOP_JOB>
55
        <STATUS>
56
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
57
            <RUNNING_INSTANCES value="0"/>
58
            <CUMULATIVE_RUN value="0" />
59
        </STATUS>
60
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
61
    </BODY>
62
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupSimilarity2ActionsJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="5c4b4dbf-8198-4f7a-9a35-367c7b0a7391_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupSimilarity2ActionsJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that scans a given entity type and creates the similarRel graph</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupSimilarityToActionsMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.compress.map.output" value="true" />	
25
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
27
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
28
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />				
29
			
30
				<PROPERTY key="mapred.reduce.tasks" value="0" />
31
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
32
				
33
		<!--  	Uncomment to override the default lib path -->			
34
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
35
        	</STATIC_CONFIGURATION>
36
        	<JOB_INTERFACE>
37
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
38
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
39
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
40
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />
41
        		<PARAM name="dedup.conf" required="true" description="dedup configuration" />
42
        		<PARAM name="rawSetId" required="true" description="raw set identifier" />
43
        		<PARAM name="similarityCF" required="true" description="similarity column family name" />      			
44
        	</JOB_INTERFACE>
45
        	<SCAN>
46
        		<FILTERS operator="MUST_PASS_ALL">
47
        			<FILTER type="prefix" param="entityTypeId" />
48
        		</FILTERS>
49
        		<FAMILIES>
50
        			<FAMILY param="entityType" />
51
           			<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" />
52
        			<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" />
53
     				<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" />        			
54
        		</FAMILIES>
55
        	</SCAN>
56
        </HADOOP_JOB>
57
        <STATUS>
58
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
59
            <RUNNING_INSTANCES value="0"/>
60
            <CUMULATIVE_RUN value="0" />
61
        </STATUS>
62
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
63
    </BODY>
64
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/indexFeedJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="1c34963b-75b3-4440-9f42-72445a26c077_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="indexFeedJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that joins the entities on the hbase table and produces a sequence file containing the xml records</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.index.IndexFeedMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" />
22
				
23
				<!-- JOB GLOBAL -->		
24
                <PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" />
25
                <PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text"/>
26
				
27
				<!-- MISC -->		
28
				<PROPERTY key="mapred.task.timeout" value="1800000"/>			
29
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
30
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
31
				<PROPERTY key="mapred.reduce.tasks" value="0" />
32
				<PROPERTY key="mapred.fairscheduler.pool" value="solr"/>
33
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
34
				
35
		<!--  	Uncomment to override the default lib path -->			
36
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
37
        	</STATIC_CONFIGURATION>
38
        	<JOB_INTERFACE>
39
        		<PARAM name="mapred.input.dir" required="true" description="source sequence file on hdfs" />
40
        		<PARAM name="mapred.output.dir" required="true" description="destination path on hdfs for rotten index xml records" />
41
        		
42
        		<PARAM name="index.solr.url" required="false" description="url used to instantiate the solr client" /> 
43
       			<PARAM name="index.solr.collection" required="true" description="target solr collection to be fed" />
44

  
45
       			<PARAM name="id" required="true" description="index DS id" />
46
				<PARAM name="index.shutdown.wait.time" required="true" description="wait time before shut down the solr client pool" />
47
       			<PARAM name="index.buffer.flush.threshold" required="true" description="indexing buffer flush threshold" />
48
       			<PARAM name="index.feed.timestamp" required="true" description="timestamp used as ds_version" />
49
				<PARAM name="index.solr.sim.mode" required="true" description="boolean value, allows to run this job in simulation mode" />
50
				<PARAM name="index.xslt" required="true" description="record transformer created by the MSRO service" />
51
        	</JOB_INTERFACE>
52
        	<SCAN>
53
        		<FILTERS />
54
        		<FAMILIES />
55
        	</SCAN>
56
        </HADOOP_JOB>
57
        <STATUS>
58
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
59
            <RUNNING_INSTANCES value="0"/>
60
            <CUMULATIVE_RUN value="0" />
61
        </STATUS>
62
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
63
    </BODY>
64
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupGrouperJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="6b2d8db3-346f-4ddc-8591-39fd488c1191_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupGrouperJob" type="mapreduce">
11
 			<DESCRIPTION>map only job that closes the similarity mesh</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13
        	
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />		        	
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupGrouperMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
25
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapreduce.map.speculative" value="false" />
27
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />	
28
			
29
				<PROPERTY key="mapred.reduce.tasks" value="0" />
30
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
31
				
32
		<!--  	Uncomment to override the default lib path -->			
33
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
34
        	</STATIC_CONFIGURATION>
35
        	<JOB_INTERFACE>
36
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
37
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
38
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
39
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
40
        	</JOB_INTERFACE>
41
        	<SCAN>
42
        		<FILTERS operator="MUST_PASS_ALL">
43
        			<FILTER type="prefix" param="entityTypeId" />
44
        		</FILTERS>
45
        		<FAMILIES>
46
        			<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" />
47
        			<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" />
48
     				<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" />
49
        		</FAMILIES>
50
        	</SCAN>
51
        </HADOOP_JOB>
52
        <STATUS>
53
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
54
            <RUNNING_INSTANCES value="0"/>
55
            <CUMULATIVE_RUN value="0" />
56
        </STATUS>
57
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
58
    </BODY>
59
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/pom.xml
1
<?xml version="1.0" ?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-parent</artifactId>
6
		<version>1.0.0</version>
7
		<relativePath />
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-openaireplus-profiles</artifactId>
12
	<packaging>jar</packaging>
13
	<version>1.0.11</version>
14
	<scm>
15
   		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11</developerConnection>
16
	</scm>
17

  
18
	<dependencies>
19

  
20
	</dependencies>
21
</project>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/oaiFeedJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="03d7af20-63bb-4790-a052-6cdbc1e05fce_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2015-02-09T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="oaiFeedJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that feeds the OAI store</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.NullOutputFormat" />	
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.oai.OaiFeedMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.NullWritable" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.NullWritable" />				
22
				
23
				<!-- JOB GLOBAL -->		
24
<!--                 <PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.NullWritable" /> -->
25
<!--                 <PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.NullWritable"/> -->
26
				
27
				<!-- MISC -->					
28
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
29
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
30
				<PROPERTY key="mapred.reduce.tasks" value="0" />
31
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
32
				
33
				<!-- Overrides the default lib path; comment out the property below to use the default -->
34
				<PROPERTY key="job.lib" value="/user/dnet/lib/dnet-mapreduce-jobs-assembly-0.0.6.3-SNAPSHOT.jar"/> 
35
        	</STATIC_CONFIGURATION>
36
        	<JOB_INTERFACE>
37
        		<PARAM name="mapred.input.dir" required="true" description="source sequence file on hdfs" />
38
				<PARAM name="services.publisher.oai.collection" required="true" description="target mongodb collection" />
39
				<PARAM name="oaiConfiguration" required="true" description="configuration bean used to guide the OAI feeding" />
40
				<PARAM name="oai.feed.date" required="true" description="timestamp" />				
41
				<PARAM name="services.publisher.oai.host" required="true" description="mongodb host" />
42
				<PARAM name="services.publisher.oai.port" required="true" description="mongodb port" />
43
				<PARAM name="services.publisher.oai.db" required="true" description="mongodb database name" />	
44
				<PARAM name="services.publisher.oai.skipDuplicates" required="true" description="skip duplicated records." />	
45
				<PARAM name="services.publisher.oai.duplicateXPath" required="true" description="records with this xpath are identified as duplicates" />																
46
        	</JOB_INTERFACE>
47
        	<SCAN>
48
        		<FILTERS />
49
        		<FAMILIES />
50
        	</SCAN>
51
        </HADOOP_JOB>
52
        <STATUS>
53
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
54
            <RUNNING_INSTANCES value="0"/>
55
            <CUMULATIVE_RUN value="0" />
56
        </STATUS>
57
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
58
    </BODY>
59
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupExportPersonFullnameJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="ba309300-76f2-40d1-afe3-b77016f443e9_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <!-- Job profile "exportPersonFullnamesJob" (type "mapreduce"): the static configuration reads
             through the HBase TableInputFormat, writes through TextOutputFormat, and pairs
             ExportFullnameMapper with ExportFullnameReducer on a single reduce task. -->
        <HADOOP_JOB name="exportPersonFullnamesJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that exports the person fullnames on a text file on HDFS</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
17

  
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.preprocess.ExportFullnameMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" />
22

  
23
				<!-- REDUCER -->
24
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.preprocess.ExportFullnameReducer" />
25
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" />				
26
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" />				
27
	
28
				<!-- MISC -->				
				<!-- Speculative execution is switched off for map and reduce tasks under both the
				     mapred.* and the mapreduce.* spellings of the keys. -->
29
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
30
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
31
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
32
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />
33
				
34
				<PROPERTY key="mapred.reduce.tasks" value="1" />
35
				<PROPERTY key="dfs.blocksize" value="256M" />
				<!-- The separator value is the empty string. NOTE(review): presumably intended so that
				     nothing is written between key and value in the text output; confirm. -->
36
				<PROPERTY key="mapred.textoutputformat.separator" value="" />
37
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
38
	
39
		<!--  	Uncomment to override the default lib path -->			
40
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
41
        	</STATIC_CONFIGURATION>
        	<!-- Parameters the caller must supply; the table names are declared under both the
        	     hbase.mapred.* and the hbase.mapreduce.* key spellings.
        	     NOTE(review): the output format above is TextOutputFormat, yet the interface requires
        	     output table names and declares no output directory parameter; confirm how the HDFS
        	     target path reaches the job. -->
42
        	<JOB_INTERFACE>
43
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
44
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
45
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
46
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
47
        	</JOB_INTERFACE>
        	<!-- Scan definition: caching attribute set to 10, one prefix filter taking its value from
        	     the entityTypeId parameter, and an empty FAMILIES element. -->
48
        	<SCAN caching="10">
49
        		<FILTERS operator="MUST_PASS_ALL">
50
        			<FILTER type="prefix" param="entityTypeId" />
51
        		</FILTERS>
52
        		<FAMILIES/>        		
53
        	</SCAN>
54
        </HADOOP_JOB>
55
        <STATUS>
56
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
57
            <RUNNING_INSTANCES value="0"/>
58
            <CUMULATIVE_RUN value="0" />
59
        </STATUS>
60
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
61
    </BODY>
62
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/iisPreprocessingQuickJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="13beed98-81bf-4fbd-ab4f-de071177997c_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
    	<!-- Job profile "iisPreprocessingQuickJob" (type "oozie"): static workflow properties plus the
    	     list of parameters a caller must provide when submitting the workflow. -->
    	<HADOOP_JOB name="iisPreprocessingQuickJob" type="oozie">
11
        	<DESCRIPTION>IIS preprocessing</DESCRIPTION>
12
            <STATIC_CONFIGURATION>
13
				<!-- Cluster wide -->
14
                <PROPERTY key="queueName" value="default"/>
15
				<PROPERTY key="user.name" value="dnet" />
16

  
17
				<!-- Runtime -->
				<!-- NOTE(review): the two input_referenceextraction_* values below are fixed, dated
				     paths under one user's HDFS home; confirm they are meant to be static rather
				     than supplied through JOB_INTERFACE. -->
18
                <PROPERTY key="workingDir" value="/tmp/integration/working_dir/preprocessing_quick_test"/>
19
                <PROPERTY key="oozie.wf.validate.ForkJoin" value="false"/>
20
                <PROPERTY key="input_referenceextraction_project" value="/user/marek.horst/share/referenceextraction/document_projects/2014-04-11"/>
21
                <PROPERTY key="input_referenceextraction_dataset" value="/user/marek.horst/share/referenceextraction/document_datasets/all/2014-04-11"/>
22
                <PROPERTY key="export_action_hbase_table_initialize" value="false"/>
23
            </STATIC_CONFIGURATION>
        	<!-- Every parameter below is required. Descriptions are human-readable hints only; the
        	     object store description and the "objectstore" spelling were corrected, parameter
        	     names are unchanged. -->
24
        	<JOB_INTERFACE>
25
 		       	<PARAM name="import_content_object_store_location" required="true" description="objectstore service location" />
26
        		<PARAM name="import_mdstore_service_location" required="true" description="mdstore service location" />
27
        		<PARAM name="import_dataset_mdstore_ids_csv" required="true" description="mdstore id for dataset records" />
28
        		<PARAM name="import_wos_mdstore_id" required="true" description="mdstore id for WoS records" />
29
        		<PARAM name="import_database_service_location" required="true" description="database service endpoint" />
30
        		<PARAM name="import_content_datacite_objectstores_csv" required="true" description="objectstore ids subject to dataset reference extraction" />
31
        		<PARAM name="import_content_wos_plaintext_objectstores_csv" required="true" description="objectstore ids for WoS plaintext" />
32
	      		<PARAM name="export_action_hbase_table_name" required="true" description="destination action manager table" />
33
        		<PARAM name="export_action_hbase_remote_zookeeper_quorum" required="true" description="ZK quorum" />
34
        		<PARAM name="export_action_hbase_remote_zookeeper_clientport" required="true" description="ZK port" />
35
        		<PARAM name="nameNode" required="true" description="hdfs name node" />
36
        		<PARAM name="jobTracker" required="true" description="job tracker name" />
37
        		<PARAM name="oozie.wf.application.path" required="true" description="oozie job application absolute path" />
38
       			<PARAM name="export_action_set_id_document_referencedProjects" required="true" description="target action set for project references" />
39
       			<PARAM name="export_action_set_id_document_referencedDatasets" required="true" description="target action set for dataset references" />
40
       			<PARAM name="export_action_set_id_entity_wos" required="true" description="target action set for WoS entities" />
41
     			<PARAM name="export_action_set_id_entity_dataset" required="true" description="target action set for dataset entities" />
42
        	</JOB_INTERFACE>
43
        </HADOOP_JOB>
44
        <STATUS>
45
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
46
            <RUNNING_INSTANCES value="0"/>
47
            <CUMULATIVE_RUN value="0" />
48
        </STATUS>
49
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
50
    </BODY>
51
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupRootsPersonExportJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="4c63a9ab-057f-442c-8da2-9b956c41e645_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <!-- Job profile "dedupRootsPersonExportJob" (type "mapreduce"): reads through the HBase
             TableInputFormat, maps with RootPersonExportMapper (Text keys and values) and writes
             through TextOutputFormat.
             NOTE(review): DESCRIPTION says "map only" and "publications", while mapred.reduce.tasks
             is 1 and the mapper class is the person export mapper; confirm which side is intended. -->
        <HADOOP_JOB name="dedupRootsPersonExportJob" type="mapreduce">
11
 			<DESCRIPTION>map only job that exports the representative publications as json</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13
        	
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.gt.RootPersonExportMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" />
22
			
23
			
24
				<!-- MISC -->
				<!-- Speculative execution is switched off for map and reduce tasks under both the
				     mapred.* and the mapreduce.* spellings of the keys. -->
25

  
26
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
27
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
28
				<PROPERTY key="mapreduce.map.speculative" value="false" />
29
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />	
30

  
31
				<PROPERTY key="dfs.blocksize" value="256M" />
32
			
33
				<PROPERTY key="mapred.reduce.tasks" value="1" />
34
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
35
				
36
		<!--  	Uncomment to override the default lib path -->			
37
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
38
        	</STATIC_CONFIGURATION>
        	<!-- Required parameters: the source table under both key spellings, and the output
        	     directory. NOTE(review): the mapred.output.dir description mentions a sequence file
        	     although the configured output format is TextOutputFormat; confirm the wording. -->
39
        	<JOB_INTERFACE>
40
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
41
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
42
        		
43
        		<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs" />         		
44
        	</JOB_INTERFACE>
        	<!-- Scan definition: one prefix filter fed by the entityTypeId parameter and one column
        	     family fed by the entityType parameter. -->
45
        	<SCAN>
46
        		<FILTERS operator="MUST_PASS_ALL">
47
        			<FILTER type="prefix" param="entityTypeId" />
48
        		</FILTERS>
49
        		<FAMILIES>
50
	       			<FAMILY param="entityType" />
51
        		</FAMILIES>
52
        	</SCAN>
53
        </HADOOP_JOB>
54
        <STATUS>
55
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
56
            <RUNNING_INSTANCES value="0"/>
57
            <CUMULATIVE_RUN value="0" />
58
        </STATUS>
59
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
60
    </BODY>
61
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/resetDedupJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="bc4f377a-af07-403d-a019-af60aa557652_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <!-- Job profile "resetDedupJob" (type "mapreduce"): reads through the HBase TableInputFormat,
             maps with HBaseResetMapper (Text keys, Mutation values) and writes through the HBase
             TableOutputFormat; mapred.reduce.tasks is 0. -->
        <HADOOP_JOB name="resetDedupJob" type="mapreduce">
11
 			<DESCRIPTION>map only job that reset the dedup jobs</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13
        	
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat"  />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />		
17

  
18
				<!-- MAPPER -->        	
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.reset.HBaseResetMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text"  />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Mutation"  />
22

  
23
				<!-- MISC -->
				<!-- Only the map-side speculative execution keys are set here (both spellings);
				     the reduce task count is 0. -->
24
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false"  />
25
				<PROPERTY key="mapreduce.map.speculative" value="false"  />
26
				<PROPERTY key="mapred.reduce.tasks" value="0"  />
27
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
28
				
29
		<!--  	Uncomment to override the default lib path -->			
30
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
31
				
32
        	</STATIC_CONFIGURATION>
        	<!-- Required parameters: source and target table names, each declared under both the
        	     hbase.mapred.* and the hbase.mapreduce.* key spellings. -->
33
        	<JOB_INTERFACE>
34
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
35
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
36
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
37
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />        		
38
        	</JOB_INTERFACE>
39
        </HADOOP_JOB>
40
        <STATUS>
41
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
42
            <RUNNING_INSTANCES value="0"/>
43
            <CUMULATIVE_RUN value="0" />
44
        </STATUS>
45
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
46
    </BODY>
47
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupGTCleanerJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="888ef72f-701a-4d59-8b8a-2ad01986f975_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <!-- Job profile "gtCleanerJob" (type "mapreduce"): reads through the HBase TableInputFormat,
             maps with GTCleanerMapper (ImmutableBytesWritable keys, Delete values) and writes
             through the HBase TableOutputFormat.
             NOTE(review): DESCRIPTION says "map reduce job", but no reducer class is configured and
             mapred.reduce.tasks is 0; confirm the wording. -->
        <HADOOP_JOB name="gtCleanerJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that deletes the non-GT rows</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.gt.GTCleanerMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Delete" />
22
				
23
				<!-- MISC -->				
				<!-- Speculative execution is switched off for map and reduce tasks under both the
				     mapred.* and the mapreduce.* spellings of the keys. -->
24
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
25
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
27
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />		
28
			
29
				<PROPERTY key="mapred.reduce.tasks" value="0" />
30
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
31
				
32
		<!--  	Uncomment to override the default lib path -->			
33
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
34
        	</STATIC_CONFIGURATION>
        	<!-- Required parameters: source and target table names, each declared under both the
        	     hbase.mapred.* and the hbase.mapreduce.* key spellings. -->
35
        	<JOB_INTERFACE>
36
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
37
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
38
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
39
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
40
        	</JOB_INTERFACE>
        	<!-- Scan definition: one prefix filter fed by the entityTypeId parameter; the column
        	     families are the one named by the entityType parameter plus the three fixed
        	     dedupSimilarity isSimilarTo families for results, persons and organizations. -->
41
        	<SCAN>
42
        		<FILTERS operator="MUST_PASS_ALL">
43
        			<FILTER type="prefix" param="entityTypeId" />
44
        		</FILTERS>
45
        		<FAMILIES>
46
        			<FAMILY param="entityType" />
47
        			<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" />
48
        			<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" />
49
     				<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" />
50
        		</FAMILIES>
51
        	</SCAN>
52
        </HADOOP_JOB>
53
        <STATUS>
54
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
55
            <RUNNING_INSTANCES value="0"/>
56
            <CUMULATIVE_RUN value="0" />
57
        </STATUS>
58
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
59
    </BODY>
60
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/elasticsearchTestJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="719b5d2b-4156-4936-bbc3-41d908ec3c57_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <!-- Job profile "elastisearchTestJob" (type "mapreduce"): reads through the HBase
             TableInputFormat, maps with ElasticsearchFeedMapper (NullWritable keys, BytesWritable
             values) and writes through EsOutputFormat; mapred.reduce.tasks is 0.
             NOTE(review): the name attribute lacks the "c" present in the profile file name
             (elasticsearchTestJob.xml). It is left as is because callers presumably look the job up
             by this name; confirm before renaming. -->
        <HADOOP_JOB name="elastisearchTestJob" type="mapreduce">
11
 			<DESCRIPTION>map only job that indexes over ES</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13
        	
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.elasticsearch.hadoop.mr.EsOutputFormat" />
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.es.ElasticsearchFeedMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.NullWritable" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.BytesWritable" />
22
				
23
			
24
				<!-- MISC -->
				<!-- Elasticsearch target: a fixed node address, the openaire/oaf resource, and JSON
				     input enabled. NOTE(review): the node address is hard-coded in the profile;
				     confirm it should not be a JOB_INTERFACE parameter instead. -->
25
				<PROPERTY key="es.nodes" value="146.48.87.110:9200" />
26
				<PROPERTY key="es.resource" value="openaire/oaf" />
27
				<PROPERTY key="es.input.json" value="yes" />
28
				
29
				<PROPERTY key="mapred.reduce.tasks" value="0" />
30

  
				<!-- Only the mapred.* spellings of the speculative execution keys are set in this
				     profile. -->
31
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
32
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
33
	
34
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
35
				
36
		<!--  	Uncomment to override the default lib path -->			
37
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
38
        	</STATIC_CONFIGURATION>
        	<!-- Required parameters: the source table under both key spellings, and an output
        	     directory. NOTE(review): mapred.output.dir is required and described as a sequence
        	     file although the output format is EsOutputFormat; confirm whether it is used. -->
39
        	<JOB_INTERFACE>
40
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
41
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
42
        		
43
        		<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs" />         		
44
        	</JOB_INTERFACE>
        	<!-- Scan definition with empty FILTERS and FAMILIES elements. -->
45
        	<SCAN>
46
        		<FILTERS />
47
        		<FAMILIES />
48
        	</SCAN>
49
        </HADOOP_JOB>
50
        <STATUS>
51
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
52
            <RUNNING_INSTANCES value="0"/>
53
            <CUMULATIVE_RUN value="0" />
54
        </STATUS>
55
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
56
    </BODY>
57
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupRootsToCSVJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="027554bd-3d5c-4c50-9170-90d8c4402bc3_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <!-- Job profile "dedupRootsToCSVJob" (type "mapreduce"): reads through the HBase
             TableInputFormat, pairs DedupRootsToCsvMapper with DedupRootsToCsvReducer on three
             reduce tasks, and writes text through LazyOutputFormat wrapping TextOutputFormat with
             three named outputs. The DESCRIPTION below was corrected from "map only" because a
             reducer class and a non-zero reduce task count are configured. -->
        <HADOOP_JOB name="dedupRootsToCSVJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that exports the representatives as CSV files</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13
        	
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat" />
17
				<PROPERTY key="mapreduce.output.lazyoutputformat.outputformat" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
18
        	
19
        		<!-- MAPPER -->
20
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvMapper" />
21
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
23
			
24
				<!-- REDUCER -->
25
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvReducer" />
26
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" />				
27
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" />				
28
				
29
				<!-- MULTIPLE OUTPUT -->
				<!-- Three named outputs (NativeGroups, Groups, NativeEntities), each declared with
				     TextOutputFormat and Text key and value classes. -->
30
				<PROPERTY key="mapreduce.multipleoutputs" value="NativeGroups Groups NativeEntities" />
31

  
32
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
33
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.key" value="org.apache.hadoop.io.Text" />
34
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.value" value="org.apache.hadoop.io.Text" />
35
				
36
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
37
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.key" value="org.apache.hadoop.io.Text" />
38
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.value" value="org.apache.hadoop.io.Text" />				
39
				
40
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
41
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.key" value="org.apache.hadoop.io.Text" />
42
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.value" value="org.apache.hadoop.io.Text" />				
43
				
44
				<!-- MISC -->
				<!-- NOTE(review): mapred.textoutputformat.wrapper is presumably read by the project's
				     own mapper or reducer code rather than by Hadoop itself; confirm. -->
45
				
46
                <PROPERTY key="mapred.textoutputformat.wrapper" value="#"/>
47
                <PROPERTY key="mapred.textoutputformat.separator" value="!"/>
48

  
49
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
50
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
51
				<PROPERTY key="mapreduce.map.speculative" value="false" />
52
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />	
53
			
54
				<PROPERTY key="mapred.reduce.tasks" value="3" />
55
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
56
				
57
		<!--  	Uncomment to override the default lib path -->			
58
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
59
        	</STATIC_CONFIGURATION>
        	<!-- Required parameters: the source table under both key spellings, and the output
        	     directory. The output directory description no longer mentions a sequence file,
        	     since the configured output format produces text. -->
60
        	<JOB_INTERFACE>
61
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
62
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
63
        		
64
        		<PARAM name="mapred.output.dir" required="true" description="target output directory on hdfs" />         		
65
        	</JOB_INTERFACE>
        	<!-- Scan definition: one prefix filter fed by the entityTypeId parameter; the column
        	     families are the one named by the entityType parameter plus the three fixed dedup
        	     merges families for results, persons and organizations. -->
66
        	<SCAN>
67
        		<FILTERS operator="MUST_PASS_ALL">
68
        			<FILTER type="prefix" param="entityTypeId" />
69
        		</FILTERS>
70
        		<FAMILIES>
71
	       			<FAMILY param="entityType" />
72
        			<FAMILY value="resultResult_dedup_merges" />
73
        			<FAMILY value="personPerson_dedup_merges" />
74
     				<FAMILY value="organizationOrganization_dedup_merges" />
75
        		</FAMILIES>
76
        	</SCAN>
77
        </HADOOP_JOB>
78
        <STATUS>
79
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
80
            <RUNNING_INSTANCES value="0"/>
81
            <CUMULATIVE_RUN value="0" />
82
        </STATUS>
83
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
84
    </BODY>
85
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.step.01.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baf_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
	        <DESCRIPTION>1 - Publication: Match against the title, whose numbers must match</DESCRIPTION>
12
            <DEDUPLICATION>
13
 			{ 
14
				"wf" : { 
15
			        "threshold" : "0.99", 
16
			        "dedupRun" : "001", 
17
			        "entityType" : "result", 
18
			        "orderField" : "title", 
19
			        "queueMaxSize" : "2000",
20
			        "groupMaxSize" : "10",
21
			        "slidingWindowSize" : "200",
22
			        "rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
23
			        "includeChildren" : "true" 
24
			    },
25
				"pace" : {		
26
					"clustering" : [
27
						{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
28
						{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } 
29
					],	
30
			  		"conditions" : [ 
31
			  			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
32
			  			{ "name" : "sizeMatch", "fields" : [ "authors" ] }
33
			  		],		
34
					"model" : [
35
						{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title/value" },
36
						{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } 
37
					],
38
					"blacklists" : { 
39
						"title" : [
40
				
41
				"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
42
				"^Problems with perinatal pathology\.?$",
43
				
44
				"(?i)^Cases? of Puerperal Convulsions$",
45
				"(?i)^Operative Gyna?ecology$",
46
				"(?i)^Mind the gap\!?\:?$",
47
				"^Chronic fatigue syndrome\.?$",
48
				"^Cartas? ao editor Letters? to the Editor$",
49
				"^Note from the Editor$",
50
				"^Anesthesia Abstract$",
51
				
52
				"^Annual report$",
53
				"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\.?”?$",
54
				"(?i)^Graph and Table of Infectious Diseases?$",
55
				"^Presentation$",
56
				"(?i)^Reviews and Information on Publications$",
57
				"(?i)^PUBLIC HEALTH SERVICES?$",
58
				"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
59
				"(?i)^Adrese autora$",
60
				"(?i)^Systematic Part .*\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
61
				"(?i)^Acknowledgement to Referees$",
62
				"(?i)^Behçet's disease\.?$",
63
				"(?i)^Isolation and identification of restriction endonuclease.*$",
64
				"(?i)^CEREBROVASCULAR DISEASES?.?$",
65
				"(?i)^Screening for abdominal aortic aneurysms?\.?$",
66
				"^Event management$",
67
				"(?i)^Breakfast and Crohn's disease.*\.?$",
68
				"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\..*\.$",
69
				"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\.?$",
70
				"^Gushi hakubutsugaku$",
71

  
72
				"^Starobosanski nadpisi u Bosni i Hercegovini \(.*\)$",							
73
				"^Intestinal spirocha?etosis$",
74
				"^Treatment of Rodent Ulcer$",
75
				"(?i)^\W*Cloud Computing\W*$",
76
				"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",				
77
				"^Free Communications, Poster Presentations: Session [A-F]$",
78
				
79
				"^“The Historical Aspects? of Quackery\.?”$",
80
				"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
81
				"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
82
				"(?i)^Case Report$",							
83
				"^Boletín Informativo$",
84
				"(?i)^Glioblastoma Multiforme$",
85
				"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
86
				"^Zaměstnanecké výhody$",
87
				"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
88
				"(?i)^Carotid body tumours?\\.?$", 
89
				"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
90
				"^Avant-propos$",
91
				"(?i)^St\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
92
				"(?i)^St\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
93
				"(?i)^PUBLIC HEALTH VERSUS THE STATE$",							
94
				"^Viñetas de Cortázar$",
95
				"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\.)?$",
96
				"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\.?)$",				
97
				"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
98
				"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
99

  
100
				"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
101
				"^Aus der AGMB$",				
102

  
103
				"^Znanstveno-stručni prilozi$",
104
				"^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
105
				"^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
106
				"^Finanční analýza podniku$",
107
				"^Financial analysis( of business)?$",
108
				"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
109
				"^Jikken nihon shūshinsho$",
110
				"(?i)^CORONER('|s)(s|') INQUESTS$",
111
				"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",				
112
				"(?i)^Consultants' contract(s)?$",
113
				"(?i)^Upute autorima$",
114
				"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
115
				"^Joshi shin kokubun$",
116
				"^Kōtō shōgaku dokuhon nōson'yō$",
117
				"^Jinjō shōgaku shōka$",
118
				"^Shōgaku shūjichō$",
119
				"^Nihon joshi dokuhon$",
120
				"^Joshi shin dokuhon$",
121
				"^Chūtō kanbun dokuhon$",
122
				"^Wabun dokuhon$",
123
				"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
124
				"(?i)^cardiac rehabilitation$",
125
				"(?i)^Analytical summary$",
126
				"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
127
				"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", 
128
				"^Prikazi i osvrti$",
129
				"^Rodinný dům s provozovnou$",
130
				"^Family house with an establishment$",
131
				"^Shinsei chūtō shin kokugun$",
132
				"^Pulmonary alveolar proteinosis(\\.?)$",
133
				"^Shinshū kanbun$",
134
				"^Viñeta(s?) de Rodríguez$",
135
				"(?i)^RUBRIKA UREDNIKA$",
136
				"^A Matching Model of the Academic Publication Market$",
137
				"^Yōgaku kōyō$",
138

  
139
				"^Internetový marketing$",
140
				"^Internet marketing$",
141
				"^Chūtō kokugo dokuhon$",
142
				"^Kokugo dokuhon$",
143
				"^Antibiotic Cover for Dental Extraction(s?)$",
144
				"^Strategie podniku$",				
145
				"^Strategy of an Enterprise$",
146
				"(?i)^respiratory disease(s?)(\.?)$",
147
				"^Award(s?) for Gallantry in Civil Defence$",
148
				"^Podniková kultura$",
149
				"^Corporate Culture$",
150
				"^Severe hyponatraemia in hospital inpatient(s?)(\.?)$",
151
				"^Pracovní motivace$",
152
				"^Work Motivation$",
153
				"^Kaitei kōtō jogaku dokuhon$",
154
				"^Konsolidovaná účetní závěrka$",
155
				"^Consolidated Financial Statements$",
156
				"(?i)^intracranial tumour(s?)$",
157
				"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
158
				"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
159
				"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
160
				"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
161
				"^Úroveň motivačního procesu jako způsobu vedení lidí$",
162
				"^The level of motivation process as a leadership$",
163
				"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
164
				"(?i)^news and events$",
165
				"(?i)^NOVOSTI I DOGAĐAJI$",
166
				"^Sansū no gakushū$",
167
				"^Posouzení informačního systému firmy a návrh změn$",
168
				"^Information System Assessment and Proposal for ICT Modification$",
169
				"^Stresové zatížení pracovníků ve vybrané profesi$",
170
				"^Stress load in a specific job$",
171
				
172
				"^Sunday: Poster Sessions, Pt.*$",
173
				"^Monday: Poster Sessions, Pt.*$",
174
				"^Wednesday: Poster Sessions, Pt.*$",
175
				"^Tuesday: Poster Sessions, Pt.*$",
176
				
177
				"^Analýza reklamy$",
178
				"^Analysis of advertising$",
179

  
180
				"^Shōgaku shūshinsho$",
181
				"^Shōgaku sansū$",
182
				"^Shintei joshi kokubun$",
183
				"^Taishō joshi kokubun dokuhon$",
184
				"^Joshi kokubun$",				
185
												
186
				"^Účetní uzávěrka a účetní závěrka v ČR$",
187
				"(?i)^The \"?Causes\"? of Cancer$",
188
				"^Normas para la publicación de artículos$",
189
				"^Editor('|s)(s|') [Rr]eply$",
190
				"^Editor(’|s)(s|’) letter$",
191
				"^Redaktoriaus žodis$",		
192
				"^DISCUSSION ON THE PRECEDING PAPER$",
193
				"^Kōtō shōgaku shūshinsho jidōyō$",
194
				"^Shōgaku nihon rekishi$",
195
				"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
196
				"^Préface$",
197
				"^Occupational [Hh]ealth [Ss]ervices.$",
198
				"^In Memoriam Professor Toshiyuki TAKESHIMA$",
199
				"^Účetní závěrka ve vybraném podniku.*$",
200
				"^Financial statements in selected company$",
201
				"^Abdominal [Aa]ortic [Aa]neurysms.*$",
202
				"^Pseudomyxoma peritonei$",
203
				"^Kazalo autora$",			
204
			
205
				"(?i)^uvodna riječ$",
206
				"^Motivace jako způsob vedení lidí$",
207
				"^Motivation as a leadership$",
208
				"^Polyfunkční dům$",
209
				"^Multi\\-funkcional building$",
210
				"^Podnikatelský plán$",
211
				"^Business Plan$",
212
				"^Oceňování nemovitostí$",
213
				"^Marketingová komunikace$",
214
				"^Marketing communication$",
215
				"^Sumario Analítico$",
216
				"^Riječ uredništva$",
217
				"^Savjetovanja i priredbe$",
218
				"^Índice$",
219
				"^(Starobosanski nadpisi).*$",
220
				"^Vzdělávání pracovníků v organizaci$",
221
				"^Staff training in organization$",
222
				"^(Life Histories of North American Geometridae).*$",
223
				"^Strategická analýza podniku$",
224
				"^Strategic Analysis of an Enterprise$",
225
				"^Sadržaj$",
226
				"^Upute suradnicima$",
227
				"^Rodinný dům$",
228
				"(?i)^Fami(l)?ly house$",
229
				"^Upute autorima$",
230
				"^Strategic Analysis$",
231
				"^Finanční analýza vybraného podniku$",
232
				"^Finanční analýza$",
233
				"^Riječ urednika$",
234
				"(?i)^Content(s?)$",
235
				"(?i)^Inhalt$",
236
				"^Jinjō shōgaku shūshinsho jidōyō$",
237
				"(?i)^Index$",
238
				"^Chūgaku kokubun kyōkasho$",
239
				"^Retrato de una mujer$",
240
				"^Retrato de un hombre$",
241
				"^Kōtō shōgaku dokuhon$",
242
				"^Shotōka kokugo$",
243
				"^Shōgaku dokuhon$",
244
				"^Jinjō shōgaku kokugo dokuhon$",
245
				"^Shinsei kokugo dokuhon$",
246
				"^Teikoku dokuhon$",
247
				"^Instructions to Authors$",
248
				"^KİTAP TAHLİLİ$",
249
				"^PRZEGLĄD PIŚMIENNICTWA$",
250
				"(?i)^Presentación$",
251
				"^İçindekiler$",
252
				"(?i)^Tabl?e of contents$",
253
				"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
254
				"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
255
				"^Editorial( Board)?$",
256
				"(?i)^Editorial \\(English\\)$",
257
				"^Editörden$",			
258
				"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
259
				"^(Kiri Karl Morgensternile).*$",
260
				"^(\\[Eksliibris Aleksandr).*\\]$",
261
				"^(\\[Eksliibris Aleksandr).*$",
262
				"^(Eksliibris Aleksandr).*$",
263
				"^(Kiri A\\. de Vignolles).*$",
264
				"^(2 kirja Karl Morgensternile).*$",
265
				"^(Pirita kloostri idaosa arheoloogilised).*$",
266
				"^(Kiri tundmatule).*$",
267
				"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
268
				"^(Eksliibris Nikolai Birukovile).*$",
269
				"^(Eksliibris Nikolai Issakovile).*$",
270
				"^(WHP Cruise Summary Information of section).*$",
271
				"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
272
				"^(Measurement of the spin\\-dependent structure function).*"
273
						]
274
					} 		
275
				}
276
			}
277
            </DEDUPLICATION>
278
        </CONFIGURATION>
279
        <STATUS>
280
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
281
        </STATUS>
282
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
283
    </BODY>
284
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupDeleteDedupRelsJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="5626c94e-0005-416a-9ea4-48fc8af85ecd_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="deleteDedupRelsJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that deletes the dedup rels used in the deduplication process</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupDeleteRelMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.Writable" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
25
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
27
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />		
28
			
29
				<PROPERTY key="mapred.reduce.tasks" value="0" />
30
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
31
				
32
		<!--  	Uncomment to override the default lib path -->			
33
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
34
        	</STATIC_CONFIGURATION>
35
        	<JOB_INTERFACE>
36
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
37
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
38
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
39
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />     
40
        	</JOB_INTERFACE>
41
        	<SCAN>
42
        		<FILTERS operator="MUST_PASS_ALL">
43
        			<FILTER type="prefix" param="entityTypeId" />
44
        		</FILTERS>
45
        		<FAMILIES>
46
        			<FAMILY param="entityType" />
47
			        <FAMILY value="resultResult_dedup_merges" />
48
			        <FAMILY value="resultResult_dedup_isMergedIn" />
49
			        <FAMILY value="resultResult_dedupSimilarity_isSimilarTo" />
50

  
51
			        <FAMILY value="personPerson_dedup_merges" />
52
			        <FAMILY value="personPerson_dedup_isMergedIn" />
53
			        <FAMILY value="personPerson_dedupSimilarity_isSimilarTo" />
54

  
55
			        <FAMILY value="organizationOrganization_dedup_merges" />
56
			        <FAMILY value="organizationOrganization_dedup_isMergedIn" />
57
			        <FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" />
58
        		</FAMILIES>
59
        	</SCAN>
60
        </HADOOP_JOB>
61
        <STATUS>
62
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
63
            <RUNNING_INSTANCES value="0"/>
64
            <CUMULATIVE_RUN value="0" />
65
        </STATUS>
66
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
67
    </BODY>
68
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/person.step.02.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="82b1c7fb-c36c-4291-8863-0393c7c588ee_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
           	<DESCRIPTION>1 - Person: Decision tree</DESCRIPTION>
12
            <DEDUPLICATION>
13
			{
14
				"wf" : {
15
			        "threshold" : "1.0",
16
			        "dedupRun" : "001",
17
			        "entityType" : "person",
18
			        "orderField" : "fullname",
19
			        "queueMaxSize" : "2000",
20
			        "groupMaxSize" : "10",
21
			        "slidingWindowSize" : "200",
22
			        "rootBuilder" : [ "person" ],
23
			        "includeChildren" : "true"
24
			    },
25
				"pace" : {
26
					"clustering" : [
27
						{ "name" : "personclustering", "fields" : [ "person" ], "params" : { } }
28
					],
29
					"model" : [
30
			            { "name" : "fullname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "false", "path" : "person/metadata/fullname/value", "params" : { } },
31
			            { "name" : "person", "algo" : "PersonDistance", "type" : "JSON", "weight" : "0.7", "ignoreMissing" : "false", "path" : "person", "params" : { "common.anchors" : "1", "common.surnames" : "3" } },
32
			            { "name" : "lastname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "person/metadata/secondnames/value" }
33
					],
34
					"blacklists" : {
35
			            "lastname" : [
36
				            "(?i)^wang$",
37
				            "(?i)^~wang$",
38
				            "(?i)^zhang$",
39
				            "(?i)^zhou$",
40
				            "(?i)^zhao$",
41
				            "(?i)^li$",
42
				            "(?i)^~li$",
43
				            "(?i)^liu$",
44
				            "(?i)^chen$",
45
				            "(?i)^yang$",
46
				            "(?i)^kim$",
47
				            "(?i)^xu$",
48
				            "(?i)^huang$",
49
				            "(?i)^sun$",
50
				            "(?i)^lee$",
51
				            "(?i)^ma$",
52
				            "(?i)^kim$",
53
				            "(?i)^hu$",
54
				            "(?i)^wu$",
55
				            "(?i)^zhu$",
56
				            "(?i)^lu$"
57
			            ]
58
	                }
59
				}
60
			}
61
            </DEDUPLICATION>
62
        </CONFIGURATION>
63
        <STATUS>
64
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
65
        </STATUS>
66
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
67
    </BODY>
68
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/personCsvJoinJob.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER value="3f544a36-f123-4f5c-acf4-7c25f6591ec4_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
		<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
		<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
		<RESOURCE_URI value=""/>
7
		<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
	</HEADER>
9
	<BODY>
10
		<HADOOP_JOB name="personCsvJoinJob" type="mapreduce">
11
			<DESCRIPTION>map reduce job that joins person entities by "surname+first name letter" and serialise the output as csv</DESCRIPTION>
12
			<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat" />
17
				<PROPERTY key="mapreduce.output.lazyoutputformat.outputformat" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
18

  
19
				<!-- MAPPER -->
20
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.experiment.JoinPersonGroupMapper" />
21
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
22
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" />
23

  
24
				<!-- REDUCER -->
25
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.experiment.JoinPersonGroupReducer" />
26
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" />
27
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" />
28

  
29
				<!-- MISC -->
30
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />
31
				<PROPERTY key="mapreduce.map.speculative" value="false" />
32
				<PROPERTY key="mapred.reduce.tasks" value="10" />
33

  
34
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
35

  
36
				<!--  	Uncomment to override the default lib path -->
37
				<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
38

  
39
			</STATIC_CONFIGURATION>
40
			<JOB_INTERFACE>
41
				<PARAM name="mapred.input.dir" required="true" description="input sequence file" />
42
			</JOB_INTERFACE>
43
		</HADOOP_JOB>
44
		<STATUS>
45
			<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
46
			<RUNNING_INSTANCES value="0"/>
47
			<CUMULATIVE_RUN value="0" />
48
		</STATUS>
49
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
50
	</BODY>
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff