Project

General

Profile

« Previous | Next » 

Revision 35387

added configuration id and some more clustering functions

View differences:

modules/dnet-pace-core/branches/configurationId/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
25 25
		params.put("max", 8);
26 26
		params.put("maxPerToken", 1);
27 27
		params.put("minNgramLen", 3);
28
		
29
		ClusteringFunction ngram = new Ngrams(params);
30
		
31
		String s = "Search for the Standard Model Higgs Boson";
28

  
29
		final ClusteringFunction ngram = new Ngrams(params);
30

  
31
		final String s = "Search for the Standard Model Higgs Boson";
32 32
		System.out.println(s);
33 33
		System.out.println(ngram.apply(Lists.newArrayList(title(s))));
34 34
	}
35
	
35

  
36 36
	@Test
37 37
	public void testNgramPairs() {
38 38
		params.put("ngramLen", 2);
39 39
		params.put("max", 4);
40
		
41
		ClusteringFunction np = new NgramPairs(params);
42
		
43
		String s = "Search for the Standard Model Higgs Boson";
40

  
41
		final ClusteringFunction np = new NgramPairs(params);
42

  
43
		final String s = "Search for the Standard Model Higgs Boson";
44 44
		System.out.println(s);
45 45
		System.out.println(np.apply(Lists.newArrayList(title(s))));
46
	}	
47
	
46
	}
47

  
48 48
	@Test
49 49
	public void testAcronym() {
50 50
		params.put("max", 4);
51 51
		params.put("minLen", 1);
52 52
		params.put("maxLen", 3);
53
		
54
		ClusteringFunction acro = new Acronyms(params);
55
		
56
		String s = "Search for the Standard Model Higgs Boson";
53

  
54
		final ClusteringFunction acro = new Acronyms(params);
55

  
56
		final String s = "Search for the Standard Model Higgs Boson";
57 57
		System.out.println(s);
58 58
		System.out.println(acro.apply(Lists.newArrayList(title(s))));
59 59
	}
60
	
60

  
61 61
	@Test
62 62
	public void testSuffixPrefix() {
63 63
		params.put("len", 2);
64 64
		params.put("max", 3);
65
		
66
		ClusteringFunction sp = new SuffixPrefix(params);
67
		
68
		String s = "Search for the Standard Model Higgs Boson";
65

  
66
		final ClusteringFunction sp = new SuffixPrefix(params);
67

  
68
		final String s = "Search for the Standard Model Higgs Boson";
69 69
		System.out.println(s);
70 70
		System.out.println(sp.apply(Lists.newArrayList(title(s))));
71 71
	}
72 72

  
73
	@Test
74
	public void testFieldValue() {
75
		final ClusteringFunction sp = new SpaceTrimmingFieldValue(params);
76

  
77
		final String s = "Search for the Standard Model Higgs Boson";
78
		System.out.println(s);
79
		System.out.println(sp.apply(Lists.newArrayList(title(s))));
80
	}
81

  
73 82
}
modules/dnet-pace-core/branches/configurationId/src/test/java/eu/dnetlib/pace/util/DedupConfigTest.java
14 14

  
15 15
	@Test
16 16
	public void testCfg1() {
17
		String s =
17
		final String s =
18 18
				"dedup.conf { " +
19
						"id = '01', " +
19 20
						"threshold = 0.99, " +
20
						"run = '001', " + 
21
						"run = '001', " +
21 22
						"entity.type = organization, " +
22 23
						"order.field = legalname, " +
23 24
						"ngram.fields = [legalname], " +
24
                        "queue.max.size = 2000, " +
25
                        "group.max.size = 10, " +
26
                        "sliding.window.size = 200, " +
25
						"queue.max.size = 2000, " +
26
						"group.max.size = 10, " +
27
						"sliding.window.size = 200, " +
27 28
						"rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy], " +
28 29
						"skiplist = [od_______908,od________18] }";
29 30

  
30 31
		final DedupConfig cfg = DedupConfigLoader.load(s);
31 32

  
32 33
		assertNotNull(cfg);
34
		assertTrue(cfg.getConfigurationId().equals("01"));
33 35
		assertTrue(cfg.getThreshold() == 0.99);
34 36
		assertTrue(cfg.getDedupRun().equals("001"));
35 37
		assertTrue(cfg.getEntityType().equals("organization"));
36 38
		assertTrue(cfg.getOrderField().equals("legalname"));
37
		assertTrue(Lists.newArrayList("organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy").equals(cfg.getRootBuilderFamilies()));
39
		assertTrue(Lists.newArrayList("organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy")
40
				.equals(cfg.getRootBuilderFamilies()));
38 41
		assertTrue(Sets.newHashSet("od_______908", "od________18").equals(cfg.getSkipList()));
39 42
		assertTrue(cfg.getSlidingWindowSize() == 200);
40 43
		assertTrue(cfg.getGroupMaxSize() == 10);
41 44
		assertTrue(cfg.getQueueMaxSize() == 2000);
42 45
	}
43
	
46

  
44 47
	@Test
45 48
	public void testCfg2() {
46
		String s =
49
		final String s =
47 50
				"dedup.conf { " +
51
						"id = '01', " +
48 52
						"threshold = 0.99, " +
49
						"run = '001', " + 
53
						"run = '001', " +
50 54
						"entity.type = organization, " +
51 55
						"order.field = legalname, " +
52 56
						"ngram.fields = [legalname], " +
53
                        "queue.max.size = 2000, " +
54
                        "group.max.size = 10, " +
55
                        "sliding.window.size = 200, " +						
57
						"queue.max.size = 2000, " +
58
						"group.max.size = 10, " +
59
						"sliding.window.size = 200, " +
56 60
						"rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy] }";
57 61

  
58 62
		final DedupConfig cfg = DedupConfigLoader.load(s);
59 63

  
60 64
		assertNotNull(cfg);
65
		assertTrue(cfg.getConfigurationId().equals("01"));
61 66
		assertTrue(cfg.getThreshold() == 0.99);
62 67
		assertTrue(cfg.getEntityType().equals("organization"));
63 68
		assertTrue(cfg.getOrderField().equals("legalname"));
64
		assertTrue(Lists.newArrayList("organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy").equals(cfg.getRootBuilderFamilies()));
69
		assertTrue(Lists.newArrayList("organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy")
70
				.equals(cfg.getRootBuilderFamilies()));
65 71
		assertTrue(new HashSet<String>().equals(cfg.getSkipList()));
66 72
		assertTrue(cfg.getSlidingWindowSize() == 200);
67 73
		assertTrue(cfg.getGroupMaxSize() == 10);
68
		assertTrue(cfg.getQueueMaxSize() == 2000);		
74
		assertTrue(cfg.getQueueMaxSize() == 2000);
69 75
	}
70
	
71 76

  
72 77
}
modules/dnet-pace-core/branches/configurationId/src/test/resources/eu/dnetlib/pace/config/organization.pace.conf
1 1
pace.conf { 
2
	clustering {
3
		fieldvalue { fields = [legalshortname], params = { } }
4
	},
2 5
	conditions { },
3 6
	model {
4
		legalname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = organization/metadata/legalname/value }, 
5
		legalshortname { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = organization/metadata/legalshortname/value } 
7
		legalname { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = organization/metadata/legalname/value }
8

  
6 9
	} 
7 10
}
modules/dnet-pace-core/branches/configurationId/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
1 1
package eu.dnetlib.pace.clustering;
2 2

  
3 3
public enum Clustering {
4
	acronyms, ngrams, ngrampairs, suffixprefix
4
	acronyms, ngrams, ngrampairs, suffixprefix, spacetrimmingfieldvalue
5 5
}
modules/dnet-pace-core/branches/configurationId/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import org.apache.commons.lang.RandomStringUtils;
8
import org.apache.commons.lang.StringUtils;
9

  
10
import com.google.common.collect.Lists;
11

  
12
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
13

  
14
	public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
15
		super(params);
16
	}
17

  
18
	@Override
19
	protected Collection<String> doApply(final String s) {
20
		final List<String> res = Lists.newArrayList();
21

  
22
		res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
23

  
24
		return res;
25
	}
26

  
27
}
modules/dnet-pace-core/branches/configurationId/src/main/java/eu/dnetlib/pace/config/ConfigurableModel.java
19 19
import eu.dnetlib.pace.clustering.NgramPairs;
20 20
import eu.dnetlib.pace.clustering.Ngrams;
21 21
import eu.dnetlib.pace.clustering.RandomClusteringFunction;
22
import eu.dnetlib.pace.clustering.SpaceTrimmingFieldValue;
22 23
import eu.dnetlib.pace.clustering.SuffixPrefix;
23 24
import eu.dnetlib.pace.condition.AlwaysTrueCondition;
24 25
import eu.dnetlib.pace.condition.ConditionAlgo;
......
98 99
				final String name = e.getKey();
99 100

  
100 101
				final String path = config.getString(String.format("pace.conf.model%s.%s.path", base, name));
101
				double weight = config.getDouble(String.format("pace.conf.model%s.%s.weight", base, name));
102
				Boolean ignoreMissing = config.getBoolean(String.format("pace.conf.model%s.%s.ignoreMissing", base, name));
102
				final double weight = config.getDouble(String.format("pace.conf.model%s.%s.weight", base, name));
103
				final Boolean ignoreMissing = config.getBoolean(String.format("pace.conf.model%s.%s.ignoreMissing", base, name));
103 104
				// Type type = Type.valueOf(config.getString(String.format("pace.conf.model%s.%s.type", base, name)));
104 105

  
105 106
				return new FieldDef(name, path, getAlgo(base, name, weight), ignoreMissing);
......
203 204
					return new NgramPairs(params);
204 205
				case suffixprefix:
205 206
					return new SuffixPrefix(params);
207
				case spacetrimmingfieldvalue:
208
					return new SpaceTrimmingFieldValue(params);
206 209
				default:
207 210
					return new RandomClusteringFunction(params);
208 211
				}
......
212 215

  
213 216
	@SuppressWarnings("unchecked")
214 217
	public Map<String, Iterable<String>> parseBlacklists() {
215
		if (!config.hasPath("pace.conf.blacklists")) { return Maps.newHashMap(); }
218
		if (!config.hasPath("pace.conf.blacklists")) return Maps.newHashMap();
216 219
		return (Map<String, Iterable<String>>) config.getObject("pace.conf.blacklists");
217 220
	}
218 221

  
modules/dnet-pace-core/branches/configurationId/src/main/java/eu/dnetlib/pace/util/DedupConfig.java
49 49
	/** The sliding window size. */
50 50
	private final int slidingWindowSize;
51 51

  
52
	/** The configuration id. */
53
	private final String configurationId;
54

  
52 55
	/**
53 56
	 * Instantiates a new dedup config.
54
	 * 
57
	 *
55 58
	 * @param entityType
56 59
	 *            the entity type
57 60
	 * @param orderField
......
70 73
	 *            the group max size
71 74
	 * @param slidingWindowSize
72 75
	 *            the sliding window size
76
	 * @param configurationId
77
	 *            the configuration identifier
73 78
	 */
74
	public DedupConfig(final String entityType, final String orderField, final List<String> rootBuilderFamilies, final String dedupRun, final double threshold,
79
	public DedupConfig(final String entityType, final String orderField, final List<String> rootBuilderFamilies, final String dedupRun,
80
			final String configurationId, final double threshold,
75 81
			final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize) {
76 82
		super();
77 83
		this.entityType = entityType;
78 84
		this.orderField = orderField;
79 85
		this.rootBuilderFamilies = rootBuilderFamilies;
80
		this.dedupRun = dedupRun.contains("'") ? dedupRun.replaceAll("'", "") : dedupRun;
86
		this.dedupRun = cleanupStringNumber(dedupRun);
87
		this.configurationId = cleanupStringNumber(configurationId);
81 88
		this.threshold = threshold;
82 89
		this.skipList = skipList;
83 90
		this.queueMaxSize = queueMaxSize;
......
85 92
		this.slidingWindowSize = slidingWindowSize;
86 93
	}
87 94

  
95
	private String cleanupStringNumber(final String s) {
96
		return s.contains("'") ? s.replaceAll("'", "") : s;
97
	}
98

  
88 99
	/**
89 100
	 * Gets the entity type.
90
	 * 
101
	 *
91 102
	 * @return the entity type
92 103
	 */
93 104
	public String getEntityType() {
......
96 107

  
97 108
	/**
98 109
	 * Gets the entity name bytes.
99
	 * 
110
	 *
100 111
	 * @return the entity name bytes
101 112
	 */
102 113
	public byte[] getEntityNameBytes() {
......
105 116

  
106 117
	/**
107 118
	 * Gets the order field.
108
	 * 
119
	 *
109 120
	 * @return the order field
110 121
	 */
111 122
	public String getOrderField() {
......
114 125

  
115 126
	/**
116 127
	 * Gets the root builder families.
117
	 * 
128
	 *
118 129
	 * @return the root builder families
119 130
	 */
120 131
	public List<String> getRootBuilderFamilies() {
......
123 134

  
124 135
	/**
125 136
	 * Gets the dedup run.
126
	 * 
137
	 *
127 138
	 * @return the dedup run
128 139
	 */
129 140
	public String getDedupRun() {
......
132 143

  
133 144
	/**
134 145
	 * Gets the threshold.
135
	 * 
146
	 *
136 147
	 * @return the threshold
137 148
	 */
138 149
	public double getThreshold() {
......
141 152

  
142 153
	/**
143 154
	 * Gets the skip list.
144
	 * 
155
	 *
145 156
	 * @return the skip list
146 157
	 */
147 158
	public Set<String> getSkipList() {
......
150 161

  
151 162
	/**
152 163
	 * Gets the queue max size.
153
	 * 
164
	 *
154 165
	 * @return the queue max size
155 166
	 */
156 167
	public int getQueueMaxSize() {
......
159 170

  
160 171
	/**
161 172
	 * Gets the group max size.
162
	 * 
173
	 *
163 174
	 * @return the group max size
164 175
	 */
165 176
	public int getGroupMaxSize() {
......
168 179

  
169 180
	/**
170 181
	 * Gets the sliding window size.
171
	 * 
182
	 *
172 183
	 * @return the sliding window size
173 184
	 */
174 185
	public int getSlidingWindowSize() {
......
177 188

  
178 189
	/*
179 190
	 * (non-Javadoc)
180
	 * 
191
	 *
181 192
	 * @see java.lang.Object#toString()
182 193
	 */
183 194
	@Override
......
185 196
		return new GsonBuilder().setPrettyPrinting().create().toJson(this);
186 197
	}
187 198

  
199
	public String getConfigurationId() {
200
		return configurationId;
201
	}
202

  
188 203
}
modules/dnet-pace-core/branches/configurationId/src/main/java/eu/dnetlib/pace/util/DedupConfigLoader.java
16 16
	private com.typesafe.config.Config config;
17 17

  
18 18
	/**
19
	 * Load.
19
	 * Loads the configuration. Static accessor.
20 20
	 *
21
	 * @param s the s
21
	 * @param s
22
	 *            the string
22 23
	 * @return the dedup config
23 24
	 */
24 25
	public static DedupConfig load(final String s) {
25
		DedupConfigLoader cfg = new DedupConfigLoader();
26
		final DedupConfigLoader cfg = new DedupConfigLoader();
26 27
		cfg.loadConf(s);
27 28
		return cfg.parseFields("");
28 29
	}
29 30

  
30 31
	/**
31
	 * Load conf.
32
	 * Loads the configuration.
32 33
	 *
33
	 * @param s the s
34
	 * @param s
35
	 *            the string
34 36
	 */
35 37
	protected void loadConf(final String s) {
36 38
		config = ConfigFactory.parseReader(new StringReader(s));
......
39 41
	/**
40 42
	 * Parses the fields.
41 43
	 *
42
	 * @param base the base
44
	 * @param base
45
	 *            the base
43 46
	 * @return the dedup config
44 47
	 */
45 48
	private DedupConfig parseFields(final String base) {
......
48 51
				getString("dedup.conf.order.field"),
49 52
				getStringList("dedup.conf.rootbuilder"),
50 53
				getString("dedup.conf.run"),
54
				getString("dedup.conf.id"),
51 55
				getDouble("dedup.conf.threshold"),
52 56
				getStringSet("dedup.conf.skiplist"),
53 57
				getInt("dedup.conf.queue.max.size"),
54 58
				getInt("dedup.conf.group.max.size"),
55 59
				getInt("dedup.conf.sliding.window.size"));
56 60
	}
57
	
61

  
58 62
	/**
59 63
	 * Safe.
60 64
	 *
61
	 * @param path the path
65
	 * @param path
66
	 *            the path
62 67
	 * @return the object
63 68
	 */
64 69
	protected Object safe(final String path) {
65 70
		return config.hasPath(path) ? config.getAnyRef(path) : null;
66 71
	}
67
	
72

  
68 73
	/**
69 74
	 * Gets the string.
70 75
	 *
71
	 * @param path the path
76
	 * @param path
77
	 *            the path
72 78
	 * @return the string
73 79
	 */
74 80
	public String getString(final String path) {
75 81
		return (String) safe(path);
76 82
	}
77
	
83

  
78 84
	/**
79 85
	 * Gets the double.
80 86
	 *
81
	 * @param path the path
87
	 * @param path
88
	 *            the path
82 89
	 * @return the double
83 90
	 */
84 91
	public Double getDouble(final String path) {
85
		Object safe = safe(path);
86
		if (safe instanceof Integer) {
87
			return Double.parseDouble(safe.toString());
88
		}
92
		final Object safe = safe(path);
93
		if (safe instanceof Integer) return Double.parseDouble(safe.toString());
89 94
		return (Double) safe;
90 95
	}
91
	
96

  
92 97
	/**
93 98
	 * Gets the int.
94 99
	 *
95
	 * @param path the path
100
	 * @param path
101
	 *            the path
96 102
	 * @return the int
97 103
	 */
98 104
	public Integer getInt(final String path) {
99
		Object safe = safe(path);
100
		if (safe instanceof Integer) {
101
			return Integer.parseInt(safe.toString());
102
		}
105
		final Object safe = safe(path);
106
		if (safe instanceof Integer) return Integer.parseInt(safe.toString());
103 107
		return (Integer) safe;
104
	}		
105
	
108
	}
109

  
106 110
	/**
107 111
	 * Gets the string list.
108 112
	 *
109
	 * @param path the path
113
	 * @param path
114
	 *            the path
110 115
	 * @return the string list
111 116
	 */
112 117
	@SuppressWarnings("unchecked")
113 118
	public List<String> getStringList(final String path) {
114 119
		return (List<String>) safe(path);
115 120
	}
116
	
121

  
117 122
	/**
118 123
	 * Gets the string set.
119 124
	 *
120
	 * @param path the path
125
	 * @param path
126
	 *            the path
121 127
	 * @return the string set
122 128
	 */
123 129
	@SuppressWarnings("unchecked")
124 130
	public Set<String> getStringSet(final String path) {
125 131
		final List<String> list = getStringList(path);
126 132
		return (Set<String>) (list != null ? Sets.newHashSet(list) : Sets.newHashSet());
127
	}	
133
	}
128 134

  
129 135
}

Also available in: Unified diff