Project

General

Profile

« Previous | Next » 

Revision 36726

merged branch dedupConf

View differences:

modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/5f52f22e-b077-43ac-bf22-83de1543c9e1.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="5f52f22e-b077-43ac-bf22-83de1543c9e1_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2015-01-18T11:37:10+00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION id="00">
11
           	<DESCRIPTION>00 - Generic configuration</DESCRIPTION>
12
            <DEDUPLICATION>
13
                <ENTITY name="result">
14
                    <PACE>pace.conf {
15
                        	clustering {
16
                        		ngrampairs   { fields = [title], params = { max = 1, ngramLen = 3} },
17
                        		suffixprefix { fields = [title], params = { max = 1, len = 3 } } },
18
                        	conditions {
19
                        		titleVersionMatch { fields = [title] } },
20
                        	model {
21
                        		title { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = result/metadata/title/value } },
22
                        	blacklists = { title = [
23
			                    "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
24
								"^(Kiri Karl Morgensternile).*$",
25
								"^(\\[Eksliibris Aleksandr).*\\]$",
26
								"^(\\[Eksliibris Aleksandr).*$",
27
								"^(Kiri A\\. de Vignolles).*$",
28
								"^(2 kirja Karl Morgensternile).*$",
29
								"^(Pirita kloostri idaosa arheoloogilised).*$",
30
								"^(Kiri tundmatule).*$",
31
								"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
32
								"^(Eksliibris Nikolai Birukovile).*$",
33
								"^(Eksliibris Nikolai Issakovile).*$",
34
								"^(WHP Cruise Summary Information of section).*$",
35
								"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
36
								"^(Measurement of the spin\\-dependent structure function).*"
37
                        	]}}</PACE>
38
                    <WORKFLOW>dedup.conf { 
39
                            threshold = 0.99, 
40
                            run = '001', 
41
                            entity.type = result, 
42
                            order.field = title, 
43
                            queue.max.size = 2000,
44
                            group.max.size = 10,
45
                            sliding.window.size = 200,
46
                            rootbuilder = [result,personResult_authorship_hasAuthor,resultProject_outcome_isProducedBy,resultResult_publicationDataset_isRelatedTo,resultResult_similarity_isAmongTopNSimilarDocuments,resultResult_similarity_hasAmongTopNSimilarDocuments] }</WORKFLOW>
47
                </ENTITY>
48
                <ENTITY name="person">
49
                    <PACE>pace.conf {
50
                        	clustering {
51
                        		ngrampairs   { fields = [fullname], params = { max = 1, ngramLen = 3} },
52
                        		suffixprefix { fields = [fullname], params = { max = 1, len = 3 } } },
53
                        	model {
54
                        		fullname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = person/metadata/fullname/value },
55
                        		coauthors { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = person/metadata/coauthors/value } } }</PACE>
56
                    <WORKFLOW>dedup.conf { 
57
                            threshold = 0.99, 
58
                            run = '001', 
59
                            entity.type = person, 
60
                            queue.max.size = 2000,
61
                            group.max.size = 10,
62
                            sliding.window.size = 200,                            
63
                            order.field = fullname, rootbuilder = [person,personResult_authorship_isAuthorOf,projectPerson_contactPerson_isContact] }</WORKFLOW>
64
                </ENTITY>
65
                <ENTITY name="organization">
66
                    <PACE>pace.conf {
67
                        	clustering {
68
                        		ngrampairs   { fields = [legalname], params = { max = 1, ngramLen = 3} },
69
                        		suffixprefix { fields = [legalname], params = { max = 1, len = 3 } } },
70
                        	model {
71
                        		legalname 	   { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = organization/metadata/legalname/value },
72
                        		legalshortname   { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = organization/metadata/legalshortname/value  } } }</PACE>
73
                    <WORKFLOW>dedup.conf { 
74
                            threshold = 0.99, 
75
                            run = '001', 
76
                            entity.type = organization, 
77
                            order.field = legalname, 
78
                            queue.max.size = 2000,
79
                            group.max.size = 10,
80
                            sliding.window.size = 200,                            
81
                            rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy] }</WORKFLOW>
82
                </ENTITY>
83
            </DEDUPLICATION>
84
        </CONFIGURATION>
85
        <STATUS>
86
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
87
        </STATUS>
88
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
89
    </BODY>
90
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/offlineHbaseLoadJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="073e55eb-c6f4-49a9-80b3-1a927612ba5b_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="offlineHbaseLoad" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that loads a given entity type in the offline dedup table</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.OfflineHbaseLoadMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.compress.map.output" value="true" />	
25
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
27
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
28
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />				
29
			
30
				<PROPERTY key="mapred.reduce.tasks" value="0" />
31
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
32
				
33
		<!--  	Uncomment to override the default lib path -->			
34
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
35
        	</STATIC_CONFIGURATION>
36
        	<JOB_INTERFACE>
37
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
38
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
39
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
40
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />
41
        	</JOB_INTERFACE>
42
        	<SCAN>
43
        		<FILTERS operator="MUST_PASS_ALL">
44
        			<FILTER type="prefix" param="entityTypeId" />
45
        		</FILTERS>
46
        		<FAMILIES>
47
        			<FAMILY param="entityType" />
48
        		</FAMILIES>
49
        	</SCAN>
50
        </HADOOP_JOB>
51
        <STATUS>
52
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
53
            <RUNNING_INSTANCES value="0"/>
54
            <CUMULATIVE_RUN value="0" />
55
        </STATUS>
56
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
57
    </BODY>
58
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupCandidateScanJob.xml
43 43
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
44 44
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
45 45
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />
46
        		<PARAM name="dedup.pace.conf" required="true" description="dedup pace configuration" />
47
        		<PARAM name="dedup.wf.conf" required="true" description="dedup workflow configuration" />        		
46
        		<PARAM name="dedup.conf" required="true" description="dedup configuration" />
48 47
        	</JOB_INTERFACE>
49 48
        	<SCAN>
50 49
        		<FILTERS operator="MUST_PASS_ALL">
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupRootsToCSVJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="027554bd-3d5c-4c50-9170-90d8c4402bc3_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupRootsToCSVJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that exports the representatives as CSV files</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13
        	
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<!-- Output format class for the job. The value must be the bare fully-qualified class name:
				     a trailing TAB previously sat inside the quotes and became part of the configured value.
				     The wrapped format is configured separately via mapreduce.output.lazyoutputformat.outputformat. -->
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat" />
17
				<PROPERTY key="mapreduce.output.lazyoutputformat.outputformat" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
18
        	
19
        		<!-- MAPPER -->
20
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvMapper" />
21
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
23
			
24
				<!-- REDUCER -->
25
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvReducer" />
26
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" />				
27
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" />				
28
				
29
				<!-- MULTIPLE OUTPUT -->
30
				<PROPERTY key="mapreduce.multipleoutputs" value="NativeGroups Groups NativeEntities" />
31

  
32
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
33
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.key" value="org.apache.hadoop.io.Text" />
34
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.value" value="org.apache.hadoop.io.Text" />
35
				
36
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
37
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.key" value="org.apache.hadoop.io.Text" />
38
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.value" value="org.apache.hadoop.io.Text" />				
39
				
40
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" />
41
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.key" value="org.apache.hadoop.io.Text" />
42
				<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.value" value="org.apache.hadoop.io.Text" />				
43
				
44
				<!-- MISC -->
45
				
46
                <PROPERTY key="mapred.textoutputformat.wrapper" value="#"/>
47
                <PROPERTY key="mapred.textoutputformat.separator" value="!"/>
48

  
49
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
50
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
51
				<PROPERTY key="mapreduce.map.speculative" value="false" />
52
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />	
53
			
54
				<PROPERTY key="mapred.reduce.tasks" value="3" />
55
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
56
				
57
		<!--  	Uncomment to override the default lib path -->			
58
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
59
        	</STATIC_CONFIGURATION>
60
        	<JOB_INTERFACE>
61
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
62
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
63
        		
64
        		<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs" />         		
65
        	</JOB_INTERFACE>
66
        	<SCAN>
67
        		<FILTERS operator="MUST_PASS_ALL">
68
        			<FILTER type="prefix" param="entityTypeId" />
69
        		</FILTERS>
70
        		<FAMILIES>
71
	       			<FAMILY param="entityType" />
72
        			<FAMILY value="resultResult_dedup_merges" />
73
        			<FAMILY value="personPerson_dedup_merges" />
74
     				<FAMILY value="organizationOrganization_dedup_merges" />
75
        		</FAMILIES>
76
        	</SCAN>
77
        </HADOOP_JOB>
78
        <STATUS>
79
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
80
            <RUNNING_INSTANCES value="0"/>
81
            <CUMULATIVE_RUN value="0" />
82
        </STATUS>
83
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
84
    </BODY>
85
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupBuildRootsJob.xml
14 14
				<!-- I/O FORMAT -->
15 15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16 16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />
17
        	
17

  
18 18
        		<!-- MAPPER -->
19 19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupBuildRootsMapper" />
20 20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21 21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				
22

  
23 23
				<!-- REDUCER -->
24 24
				<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupBuildRootsReducer" />
25 25
				<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />				
26 26
				<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Writable" />				
27
				
27
	
28 28
				<!-- MISC -->				
29 29
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
30 30
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupSimilarity2ActionsJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="5c4b4dbf-8198-4f7a-9a35-367c7b0a7391_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupSimilarity2ActionsJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that scans a given entity type and creates the similarRel graph</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupSimilarityToActionsMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" />
22
				
23
				<!-- MISC -->				
24
				<PROPERTY key="mapred.compress.map.output" value="true" />	
25
				<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" />	
26
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
27
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
28
				<PROPERTY key="mapreduce.reduce.speculative" value="false" />				
29
			
30
				<PROPERTY key="mapred.reduce.tasks" value="0" />
31
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
32
				
33
		<!--  	Uncomment to override the default lib path -->			
34
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
35
        	</STATIC_CONFIGURATION>
36
        	<JOB_INTERFACE>
37
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
38
        		<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
39
        		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
40
        		<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />
41
        		<PARAM name="dedup.conf" required="true" description="dedup configuration" />
42
        		<PARAM name="rawSetId" required="true" description="raw set identifier" />
43
        		<PARAM name="similarityCF" required="true" description="similarity column family name" />      			
44
        	</JOB_INTERFACE>
45
        	<SCAN>
46
        		<FILTERS operator="MUST_PASS_ALL">
47
        			<FILTER type="prefix" param="entityTypeId" />
48
        		</FILTERS>
49
        		<FAMILIES>
50
        			<FAMILY param="entityType" />
51
           			<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" />
52
        			<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" />
53
     				<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" />        			
54
        		</FAMILIES>
55
        	</SCAN>
56
        </HADOOP_JOB>
57
        <STATUS>
58
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
59
            <RUNNING_INSTANCES value="0"/>
60
            <CUMULATIVE_RUN value="0" />
61
        </STATUS>
62
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
63
    </BODY>
64
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/promoteActions.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER
4
			value="8bb6c559-edf3-4da1-87d7-cdee4fba21dd_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ==" />
5
		<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType" />
6
		<RESOURCE_KIND value="HadoopJobConfigurationDSResources" />
7
		<RESOURCE_URI value="" />
8
		<DATE_OF_CREATION value="2001-12-31T12:00:00" />
9
	</HEADER>
10
	<BODY>
11
		<HADOOP_JOB name="promoteActionsJob" type="mapreduce">
12
			<DESCRIPTION>map only job that promotes actions from a set to the data table</DESCRIPTION>
13
			<STATIC_CONFIGURATION>
14
				<PROPERTY key="mapreduce.map.class"	value="eu.dnetlib.data.mapreduce.hbase.actions2.PromoteActionsMapper" />
15
				<PROPERTY key="mapreduce.inputformat.class"	value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" />
17
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
18
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put" />
19
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />
20
				<PROPERTY key="mapreduce.map.speculative" value="false" />
21
				<PROPERTY key="mapred.reduce.tasks" value="0" />
22
				<!-- <PROPERTY key="job.lib" value="/tmp/dnet-mapreduce-jobs-0.0.3-SNAPSHOT-jar-with-dependencies.jar" />  -->
23
			</STATIC_CONFIGURATION>
24

  
25
			<JOB_INTERFACE>
26
				<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
27
				<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" />
28
				<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
29
				<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" />
30
				<PARAM name="latestRawSets" required="true" description="the latest action raw sets" />
31
			</JOB_INTERFACE>
32

  
33
			<SCAN>
34
				<FILTERS operator="MUST_PASS_ALL">
35
					<FILTER type="prefix" value="aac|" />
36
				</FILTERS>
37
				<FAMILIES />
38
			</SCAN>
39
			
40
		</HADOOP_JOB>
41
		
42
		<STATUS>
43
			<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00" />
44
			<RUNNING_INSTANCES value="0" />
45
			<CUMULATIVE_RUN value="0" />
46
		</STATUS>
47
		
48
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
49
	</BODY>
50
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupIndexFeedJob.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="fa0e188c-aec4-4877-93b4-43e3c5acae1d_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/>
4
        <RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="HadoopJobConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <HADOOP_JOB name="dedupIndexFeedJob" type="mapreduce">
11
 			<DESCRIPTION>map reduce job that updates the dedup index</DESCRIPTION>
12
        	<STATIC_CONFIGURATION>
13

  
14
				<!-- I/O FORMAT -->
15
				<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" />
16
				<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat" />				
17
        	
18
        		<!-- MAPPER -->
19
				<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.index.DedupIndexFeedMapper" />
20
				<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" />
21
				<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" />
22
				
23
				<!-- JOB GLOBAL -->		
24
                <PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" />
25
                <PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text"/>
26
				
27
				<!-- MISC -->					
28
				<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" />	
29
				<PROPERTY key="mapreduce.map.speculative" value="false" />		
30
				<PROPERTY key="mapred.reduce.tasks" value="0" />
31
				<PROPERTY key="mapred.fairscheduler.pool" value="solr"/>
32
				<!-- <PROPERTY key="user.name" value="dnet" /> -->
33
				
34
		<!--  	Uncomment to override the default lib path -->			
35
		<!--	<PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> -->
36
        	</STATIC_CONFIGURATION>
37
        	<JOB_INTERFACE>
38
        		<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" />
39
	       		<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" />
40
        		<PARAM name="mapred.output.dir" required="true" description="destination path on hdfs for rotten index xml records" />
41
        		
42
        		<PARAM name="index.solr.url" required="false" description="url used to instantiate the solr client" /> 
43
       			<PARAM name="index.solr.collection" required="true" description="target solr collection to be fed" />
44

  
45
       			<PARAM name="id" required="true" description="index DS id" />
46
				<PARAM name="index.shutdown.wait.time" required="true" description="wait time before shut down the solr client pool" />
47
       			<PARAM name="index.buffer.flush.threshold" required="true" description="indexing buffer flush threshold" />
48
       			<PARAM name="index.feed.timestamp" required="true" description="timestamp used as ds_version" />
49
				<PARAM name="index.solr.sim.mode" required="true" description="boolean value, allows to run this job in simulation mode" />
50
				<PARAM name="index.fields" required="true" description="fields from a given MDFormatDSResourceType" />
51
				<PARAM name="entityType" required="true" description="entity Type name" />
52
        	</JOB_INTERFACE>
53
        	<SCAN>
54
        		<FILTERS operator="MUST_PASS_ALL">
55
        			<FILTER type="prefix" param="entityTypeId" />
56
        		</FILTERS>
57
        		<FAMILIES>
58
        			<FAMILY param="entityType" />
59
        		</FAMILIES>
60
        	</SCAN>
61
        </HADOOP_JOB>
62
        <STATUS>
63
            <LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/>
64
            <RUNNING_INSTANCES value="0"/>
65
            <CUMULATIVE_RUN value="0" />
66
        </STATUS>
67
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
68
    </BODY>
69
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupOrchestrationDSResources/DedupOrchestrationDSResourceType/organization.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="4b9a58d4-a048-4161-b5b1-a5cdf78a9956_RGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupOrchestrationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
            <DEDUPLICATION>
12
                <ENTITY name="organization" code="20" label="Organization" />
13
                <ACTION_SET id="dedup-similarity-organization" />
14
                <SCAN_SEQUENCE>
15
                	<SCAN id="98494a63-f5d1-46f7-9afd-e026c1dda913_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
16
                	<SCAN id="1d52dba7-1902-4c25-bf5b-3598f29ef11c_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
17
                </SCAN_SEQUENCE>
18
            </DEDUPLICATION>
19
        </CONFIGURATION>
20
        <STATUS>
21
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
22
        </STATUS>
23
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
24
    </BODY>
25
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupOrchestrationDSResources/DedupOrchestrationDSResourceType/result.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="acaf1433-8a35-4708-b903-ab35c899851d_RGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupOrchestrationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
            <DEDUPLICATION>
12
                <ENTITY name="result" code="50" label="Publication" />
13
                <ACTION_SET id="dedup-similarity-result" />
14
                <SCAN_SEQUENCE>
15
                	<SCAN id="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baf_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
16
                </SCAN_SEQUENCE>
17
            </DEDUPLICATION>
18
        </CONFIGURATION>
19
        <STATUS>
20
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
21
        </STATUS>
22
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
23
    </BODY>
24
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.step.01.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baf_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION id="01">
11
            <DEDUPLICATION>
12
 			{ 
13
				"wf" : { 
14
			        "threshold" : "0.99", 
15
			        "dedupRun" : "001", 
16
			        "entityType" : "result", 
17
			        "orderField" : "title", 
18
			        "queueMaxSize" : "2000",
19
			        "groupMaxSize" : "10",
20
			        "slidingWindowSize" : "200",
21
			        "rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
22
			        "includeChildren" : "true" 
23
			    },
24
				"pace" : {		
25
					"clustering" : [
26
						{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
27
						{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } 
28
					],		
29
			  		"conditions" : [ 
30
			  			{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
31
			  		],		
32
					"model" : [
33
						{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.8", "ignoreMissing" : "false", "path" : "result/metadata/title/value" }
34
					],
35
					"blacklists" : { 
36
						"title" : [
37
							"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
38
							"^(Kiri Karl Morgensternile).*$",
39
							"^(\\[Eksliibris Aleksandr).*\\]$",
40
							"^(\\[Eksliibris Aleksandr).*$",
41
							"^(Eksliibris Aleksandr).*$",
42
							"^(Kiri A\\. de Vignolles).*$",
43
							"^(2 kirja Karl Morgensternile).*$",
44
							"^(Pirita kloostri idaosa arheoloogilised).*$",
45
							"^(Kiri tundmatule).*$",
46
							"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
47
							"^(Eksliibris Nikolai Birukovile).*$",
48
							"^(Eksliibris Nikolai Issakovile).*$",
49
							"^(WHP Cruise Summary Information of section).*$",
50
							"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
51
							"^(Measurement of the spin\\-dependent structure function).*"
52
						]
53
					} 		
54
				}
55
			}
56
            </DEDUPLICATION>
57
        </CONFIGURATION>
58
        <STATUS>
59
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
60
        </STATUS>
61
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
62
    </BODY>
63
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/organization.step.01.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="98494a63-f5d1-46f7-9afd-e026c1dda913_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
           	<DESCRIPTION>1 - Organization: Merge by legalshortname</DESCRIPTION>
12
            <DEDUPLICATION>
13
			{ 
14
				"wf" : { 
15
			        "threshold" : "1.0", 
16
			        "dedupRun" : "001", 
17
			        "entityType" : "organization", 
18
			        "orderField" : "legalshortname", 
19
			        "queueMaxSize" : "2000",
20
			        "groupMaxSize" : "10",
21
			        "slidingWindowSize" : "200",
22
			        "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
23
			        "includeChildren" : "true" 
24
			    },
25
				"pace" : {		
26
					"clustering" : [
27
						{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }
28
					],		
29
					"model" : [
30
						{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }
31
					],
32
					"blacklists" : { } 		
33
				}
34
			}            
35
            </DEDUPLICATION>
36
        </CONFIGURATION>
37
        <STATUS>
38
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
39
        </STATUS>
40
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
41
    </BODY>
42
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/organization.step.02.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="1d52dba7-1902-4c25-bf5b-3598f29ef11c_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
        	<DESCRIPTION>2 - Organization: Match against the legalname</DESCRIPTION>
12
            <DEDUPLICATION>
13
			{ 
14
				"wf" : { 
15
			        "threshold" : "0.95", 
16
			        "dedupRun" : "001", 
17
			        "entityType" : "organization", 
18
			        "orderField" : "legalname", 
19
			        "queueMaxSize" : "2000",
20
			        "groupMaxSize" : "10",
21
			        "slidingWindowSize" : "200",
22
			        "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
23
			        "includeChildren" : "true" 
24
			    },
25
				"pace" : {		
26
					"clustering" : [
27
						{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
28
						{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } 
29
					],		
30
					"model" : [
31
						{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }
32
					],
33
					"blacklists" : { } 		
34
				}
35
			} 	
36
            </DEDUPLICATION>
37
        </CONFIGURATION>
38
        <STATUS>
39
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
40
        </STATUS>
41
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
42
    </BODY>
43
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/MDFormatDSResources/MDFormatDSResourceType/919cd504-6a4b-4de1-a33c-ad26ece21350.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="919cd504-6a4b-4de1-a33c-ad26ece21350_TURGb3JtYXREU1Jlc291cmNlcy9NREZvcm1hdERTUmVzb3VyY2VUeXBl"/>
4
        <RESOURCE_TYPE value="MDFormatDSResourceType"/>
5
        <RESOURCE_KIND value="MDFormatDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2008-05-22T14:40:04+02:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
            <NAME>OPENAIRE</NAME>
12
            <DESCRIPTION>Openaire Metadata Format</DESCRIPTION>
13
            <INTERPRETATION>dedup</INTERPRETATION>
14
            <SCHEMA uri=""/>
15
        </CONFIGURATION>
16
        <STATUS>
17
            <LAYOUTS>
18
                <LAYOUT name="index">
19
                    <FIELDS>
20
                        <!-- COMMON FIELDS -->
21
                        <FIELD name="oaftype" path="entity/type" tokenizable="false" indexable="true" stat="false" result="false"/>
22
                        <FIELD name="objIdentifier" path="entity/id" tokenizable="false" indexable="true" stat="false" result="false"/>
23
                        <FIELD name="collectedfrom" path="entity/collectedfrom/value" tokenizable="false" indexable="true" result="false" stat="false" />
24
                        <FIELD name="pid" path="entity/pid/value" tokenizable="false" stat="false" result="false" indexable="true"/>
25
                        <FIELD name="deletedbyinference" path="dataInfo/deletedbyinference" tokenizable="false" stat="false" result="false" indexable="true"/>
26
                        <FIELD name="inferred" path="dataInfo/inferred" tokenizable="false" stat="false" result="false" indexable="true"/>
27
                        <FIELD name="actionset" indexable="true" stat="false" tokenizable="false" result="false"/>
28

  
29
                        <!-- ORGANIZATION FIELDS -->
30
                        <FIELD name="organizationlegalname" path="entity/organization/metadata/legalname/value | entity/children/organization/metadata/legalname/value" type="ngramtext" stat="false" indexable="true" result="false"/>
31
                        <FIELD name="organizationlegalshortname" path="entity/organization/metadata/legalshortname/value | entity/children/organization/metadata/legalshortname/value" type="ngramtext" stat="false" indexable="true" result="false"/>
32
                        <FIELD name="organizationwebsiteurl" path="entity/organization/metadata/websiteurl/value" stat="false" indexable="true" result="false"/>
33

  
34
                        <!-- PERSON FIELDS -->
35
                        <FIELD name="personfirstname" path="entity/person/metadata/firstname/value" stat="false" indexable="true" result="false"/>
36
                        <FIELD name="personsecondnames" path="entity/person/metadata/secondnames/value" stat="false" indexable="true" result="false"/>
37
                        <FIELD name="personfullname" path="entity/person/metadata/fullname/value" stat="false" indexable="true" result="false"/>
38

  
39
                        <!-- RESULT FIELDS -->
40
                        <FIELD name="resulttitle" path="entity/result/metadata/title/value | entity/children/result/metadata/title/value" stat="false" result="false" indexable="true"/>
41
                        <FIELD name="resultdescription" path="entity/result/metadata/description/value | entity/children/result/metadata/description/value" result="false" indexable="true" stat="false"/>
42

  
43
                    </FIELDS>
44
                </LAYOUT>
45
            </LAYOUTS>
46
        </STATUS>
47
    </BODY>
48
</RESOURCE_PROFILE>

Also available in: Unified diff