Project

General

Profile

1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="5f52f22e-b077-43ac-bf22-83de1543c9e1_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
            <DEDUPLICATION>
12
                <ENTITY name="result">
13
                    <PACE>pace.conf {
14
                        	clustering {
15
                        		ngrampairs   { fields = [title], params = { max = 1, ngramLen = 3} },
16
                        		suffixprefix { fields = [title], params = { max = 1, len = 3 } } },
17
                        	conditions { 
18
                        		yearMatch { fields = [dateofacceptance] },
19
                        		titleVersionMatch { fields = [title] },
20
                        		sizeMatch { fields = [authors] } 
21
                        	},
22
                        	model {
23
                        		title { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = result/metadata/title/value },
24
                        		dateofacceptance { algo = Null, type = String, weight = 0.0, ignoreMissing = true, path = result/metadata/dateofacceptance/value },
25
                        		authors { algo = Null, type = List, weight = 0.0, ignoreMissing = true, path = result/author/metadata/fullname/value } } }</PACE>
26
                    <WORKFLOW>dedup.conf { 
27
                            threshold = 0.99, 
28
                            run = '001', 
29
                            entity.type = result, 
30
                            order.field = title, 
31
                            queue.max.size = 2000,
32
                            group.max.size = 10,
33
                            sliding.window.size = 200,
34
                            rootbuilder = [result] }</WORKFLOW>
35
                </ENTITY>
36
                <ENTITY name="person">
37
                    <PACE>pace.conf {
38
                        	clustering {
39
                        		ngrampairs   { fields = [fullname], params = { max = 1, ngramLen = 3} },
40
                        		suffixprefix { fields = [fullname], params = { max = 1, len = 3 } } },
41
                        	model {
42
                        		fullname/value { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false },
43
                        		coauthors/value { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true } } }</PACE>
44
                    <WORKFLOW>dedup.conf { 
45
                            threshold = 0.99, 
46
                            run = '001', 
47
                            entity.type = person, 
48
                            queue.max.size = 2000,
49
                            group.max.size = 10,
50
                            sliding.window.size = 200,                            
51
                            order.field = fullname, rootbuilder = [person,personResult_authorship_isAuthorOf,projectPerson_contactPerson_isContact] }</WORKFLOW>
52
                </ENTITY>
53
                <ENTITY name="organization">
54
                    <PACE>pace.conf {
55
                        	clustering {
56
                        		ngrampairs   { fields = [legalname], params = { max = 1, ngramLen = 3} },
57
                        		suffixprefix { fields = [legalname], params = { max = 1, len = 3 } } },
58
                        	model {
59
                        		legalname/value 	   { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false },
60
                        		legalshortname/value   { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true  } } }</PACE>
61
                    <WORKFLOW>dedup.conf { 
62
                            threshold = 0.99, 
63
                            run = '001', 
64
                            entity.type = organization, 
65
                            order.field = legalname, 
66
                            queue.max.size = 2000,
67
                            group.max.size = 10,
68
                            sliding.window.size = 200,                            
69
                            rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy] }</WORKFLOW>
70
                </ENTITY>
71
            </DEDUPLICATION>
72
        </CONFIGURATION>
73
        <STATUS>
74
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
75
        </STATUS>
76
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
77
    </BODY>
78
</RESOURCE_PROFILE>
    (1-1/1)