Revision 49442
Added by Claudio Atzori over 6 years ago
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/deploy.info | ||
---|---|---|
1 |
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-pace-core/trunk/", "deploy_repository": "dnet45-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots", "name": "dnet-pace-core"} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Map; |
|
4 |
|
|
5 |
import com.google.common.collect.Lists; |
|
6 |
import com.google.common.collect.Maps; |
|
7 |
import eu.dnetlib.pace.AbstractPaceTest; |
|
8 |
import org.junit.Before; |
|
9 |
import org.junit.Test; |
|
10 |
|
|
11 |
public class ClusteringFunctionTest extends AbstractPaceTest { |
|
12 |
|
|
13 |
private Map<String, Integer> params; |
|
14 |
|
|
15 |
@Before |
|
16 |
public void setUp() throws Exception { |
|
17 |
params = Maps.newHashMap(); |
|
18 |
} |
|
19 |
|
|
20 |
@Test |
|
21 |
public void testNgram() { |
|
22 |
params.put("ngramLen", 3); |
|
23 |
params.put("max", 8); |
|
24 |
params.put("maxPerToken", 2); |
|
25 |
params.put("minNgramLen", 1); |
|
26 |
|
|
27 |
final ClusteringFunction ngram = new Ngrams(params); |
|
28 |
|
|
29 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
30 |
System.out.println(s); |
|
31 |
System.out.println(ngram.apply(Lists.newArrayList(title(s)))); |
|
32 |
} |
|
33 |
|
|
34 |
@Test |
|
35 |
public void testNgramPairs() { |
|
36 |
params.put("ngramLen", 3); |
|
37 |
params.put("max", 3); |
|
38 |
|
|
39 |
final ClusteringFunction np = new NgramPairs(params); |
|
40 |
|
|
41 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
42 |
System.out.println(s); |
|
43 |
System.out.println(np.apply(Lists.newArrayList(title(s)))); |
|
44 |
} |
|
45 |
|
|
46 |
@Test |
|
47 |
public void testSortedNgramPairs() { |
|
48 |
params.put("ngramLen", 3); |
|
49 |
params.put("max", 1); |
|
50 |
|
|
51 |
final ClusteringFunction np = new SortedNgramPairs(params); |
|
52 |
|
|
53 |
final String s1 = "University of Pisa"; |
|
54 |
System.out.println(s1); |
|
55 |
System.out.println(np.apply(Lists.newArrayList(title(s1)))); |
|
56 |
|
|
57 |
final String s2 = "Pisa University"; |
|
58 |
System.out.println(s2); |
|
59 |
System.out.println(np.apply(Lists.newArrayList(title(s2)))); |
|
60 |
} |
|
61 |
|
|
62 |
@Test |
|
63 |
public void testAcronym() { |
|
64 |
params.put("max", 4); |
|
65 |
params.put("minLen", 1); |
|
66 |
params.put("maxLen", 3); |
|
67 |
|
|
68 |
final ClusteringFunction acro = new Acronyms(params); |
|
69 |
|
|
70 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
71 |
System.out.println(s); |
|
72 |
System.out.println(acro.apply(Lists.newArrayList(title(s)))); |
|
73 |
} |
|
74 |
|
|
75 |
@Test |
|
76 |
public void testSuffixPrefix() { |
|
77 |
params.put("len", 3); |
|
78 |
params.put("max", 4); |
|
79 |
|
|
80 |
final ClusteringFunction sp = new SuffixPrefix(params); |
|
81 |
|
|
82 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
83 |
System.out.println(s); |
|
84 |
System.out.println(sp.apply(Lists.newArrayList(title(s)))); |
|
85 |
} |
|
86 |
|
|
87 |
@Test |
|
88 |
public void testFieldValue() { |
|
89 |
|
|
90 |
params.put("randomLength", 5); |
|
91 |
|
|
92 |
final ClusteringFunction sp = new SpaceTrimmingFieldValue(params); |
|
93 |
|
|
94 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
95 |
System.out.println(s); |
|
96 |
System.out.println(sp.apply(Lists.newArrayList(title(s)))); |
|
97 |
} |
|
98 |
|
|
99 |
@Test |
|
100 |
public void testPersonClustering2() { |
|
101 |
final ClusteringFunction cf = new PersonClustering(params); |
|
102 |
|
|
103 |
final String s = readFromClasspath("gt.author.json"); |
|
104 |
System.out.println(s); |
|
105 |
System.out.println(cf.apply(Lists.newArrayList(person(s)))); |
|
106 |
} |
|
107 |
|
|
108 |
} |
|
0 | 109 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertFalse; |
|
4 |
import static org.junit.Assert.assertTrue; |
|
5 |
|
|
6 |
import org.junit.Test; |
|
7 |
|
|
8 |
public class PersonComparatorUtilsSimilarityTest { |
|
9 |
|
|
10 |
@Test |
|
11 |
public void testSimilarity_0() { |
|
12 |
assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini")); |
|
13 |
} |
|
14 |
|
|
15 |
@Test |
|
16 |
public void testSimilarity_1() { |
|
17 |
assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele")); |
|
18 |
} |
|
19 |
|
|
20 |
@Test |
|
21 |
public void testSimilarity_2() { |
|
22 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele")); |
|
23 |
} |
|
24 |
|
|
25 |
@Test |
|
26 |
public void testSimilarity_3() { |
|
27 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele")); |
|
28 |
} |
|
29 |
|
|
30 |
@Test |
|
31 |
public void testSimilarity_4() { |
|
32 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G.")); |
|
33 |
} |
|
34 |
|
|
35 |
@Test |
|
36 |
public void testSimilarity_5() { |
|
37 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M. (sig.)", "Artini, Michele")); |
|
38 |
} |
|
39 |
|
|
40 |
@Test |
|
41 |
public void testSimilarity_6() { |
|
42 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G.")); |
|
43 |
} |
|
44 |
|
|
45 |
@Test |
|
46 |
public void testSimilarity_7() { |
|
47 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A.")); |
|
48 |
} |
|
49 |
|
|
50 |
@Test |
|
51 |
public void testSimilarity_8() { |
|
52 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe")); |
|
53 |
} |
|
54 |
|
|
55 |
@Test |
|
56 |
public void testSimilarity_9() { |
|
57 |
assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele")); |
|
58 |
} |
|
59 |
|
|
60 |
@Test |
|
61 |
public void testSimilarity_10() { |
|
62 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni")); |
|
63 |
} |
|
64 |
|
|
65 |
@Test |
|
66 |
public void testSimilarity_11() { |
|
67 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G.")); |
|
68 |
} |
|
69 |
|
|
70 |
@Test |
|
71 |
public void testSimilarity_12() { |
|
72 |
assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele")); |
|
73 |
} |
|
74 |
|
|
75 |
@Test |
|
76 |
public void testSimilarity_13() { |
|
77 |
assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele")); |
|
78 |
} |
|
79 |
|
|
80 |
@Test |
|
81 |
public void testSimilarity_14() { |
|
82 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini")); |
|
83 |
} |
|
84 |
|
|
85 |
@Test |
|
86 |
public void testSimilarity_15() { |
|
87 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI")); |
|
88 |
} |
|
89 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/model/PersonTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
|
|
5 |
import java.text.Normalizer; |
|
6 |
import java.util.Queue; |
|
7 |
|
|
8 |
import org.junit.Test; |
|
9 |
|
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
public class PersonTest { |
|
13 |
|
|
14 |
@Test |
|
15 |
public void test_1() { |
|
16 |
check("Atzori, Claudio", "Atzori, Claudio"); |
|
17 |
} |
|
18 |
|
|
19 |
@Test |
|
20 |
public void test_2() { |
|
21 |
check("Atzori, Claudio A.", "Atzori, Claudio A."); |
|
22 |
} |
|
23 |
|
|
24 |
@Test |
|
25 |
public void test_3() { |
|
26 |
check("Claudio ATZORI", "Atzori, Claudio"); |
|
27 |
} |
|
28 |
|
|
29 |
@Test |
|
30 |
public void test_4() { |
|
31 |
check("ATZORI, Claudio", "Atzori, Claudio"); |
|
32 |
} |
|
33 |
|
|
34 |
@Test |
|
35 |
public void test_5() { |
|
36 |
check("Claudio Atzori", "Claudio Atzori"); |
|
37 |
} |
|
38 |
|
|
39 |
@Test |
|
40 |
public void test_6() { |
|
41 |
check(" Manghi , Paolo", "Manghi, Paolo"); |
|
42 |
} |
|
43 |
|
|
44 |
@Test |
|
45 |
public void test_7() { |
|
46 |
check("ATZORI, CLAUDIO", "Atzori, Claudio"); |
|
47 |
} |
|
48 |
|
|
49 |
@Test |
|
50 |
public void test_8() { |
|
51 |
check("ATZORI, CLAUDIO A", "Atzori, Claudio A."); |
|
52 |
} |
|
53 |
|
|
54 |
@Test |
|
55 |
public void test_9() { |
|
56 |
check("Bølviken, B.", "Bølviken, B."); |
|
57 |
} |
|
58 |
|
|
59 |
@Test |
|
60 |
public void test_10() { |
|
61 |
check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B."); |
|
62 |
} |
|
63 |
|
|
64 |
@Test |
|
65 |
public void test_11() { |
|
66 |
check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true); |
|
67 |
} |
|
68 |
|
|
69 |
@Test |
|
70 |
public void test_12() { |
|
71 |
check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false); |
|
72 |
} |
|
73 |
|
|
74 |
@Test |
|
75 |
public void test_13() { |
|
76 |
check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false); |
|
77 |
} |
|
78 |
|
|
79 |
@Test |
|
80 |
public void test_hashes() { |
|
81 |
checkHash(" Claudio ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio"); |
|
82 |
} |
|
83 |
|
|
84 |
private void checkHash(String... ss) { |
|
85 |
Queue<String> q = Lists.newLinkedList(Lists.newArrayList(ss)); |
|
86 |
String h1 = new Person(q.remove(), false).hash(); |
|
87 |
while (!q.isEmpty()) { |
|
88 |
assertEquals(h1, new Person(q.remove(), false).hash()); |
|
89 |
} |
|
90 |
} |
|
91 |
|
|
92 |
private void check(String s, String expectedFullName) { |
|
93 |
check(s, expectedFullName, false); |
|
94 |
} |
|
95 |
|
|
96 |
private void check(String s, String expectedFullName, boolean aggressive) { |
|
97 |
Person p = new Person(s, aggressive); |
|
98 |
|
|
99 |
System.out.println("original: " + p.getOriginal()); |
|
100 |
System.out.println("accurate: " + p.isAccurate()); |
|
101 |
System.out.println("normalised: '" + p.getNormalisedFullname() + "'"); |
|
102 |
if (p.isAccurate()) { |
|
103 |
System.out.println("name: " + p.getNormalisedFirstName()); |
|
104 |
System.out.println("surname: " + p.getNormalisedSurname()); |
|
105 |
} |
|
106 |
System.out.println("hash: " + p.hash()); |
|
107 |
System.out.println(""); |
|
108 |
assertEquals(expectedFullName, p.getNormalisedFullname()); |
|
109 |
} |
|
110 |
|
|
111 |
} |
|
0 | 112 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertTrue; |
|
5 |
|
|
6 |
import java.util.Set; |
|
7 |
|
|
8 |
import org.junit.Test; |
|
9 |
|
|
10 |
public class PersonComparatorUtilsNGramsTest { |
|
11 |
|
|
12 |
@Test |
|
13 |
public void testNormaizePerson_1() { |
|
14 |
verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini"); |
|
15 |
} |
|
16 |
|
|
17 |
@Test |
|
18 |
public void testNormaizePerson_2() { |
|
19 |
verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini"); |
|
20 |
} |
|
21 |
|
|
22 |
@Test |
|
23 |
public void testNormaizePerson_3() { |
|
24 |
verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini"); |
|
25 |
} |
|
26 |
|
|
27 |
@Test |
|
28 |
public void testNormaizePerson_4() { |
|
29 |
verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini"); |
|
30 |
} |
|
31 |
|
|
32 |
@Test |
|
33 |
public void testNormaizePerson_5() { |
|
34 |
verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini"); |
|
35 |
} |
|
36 |
|
|
37 |
@Test |
|
38 |
public void testNormaizePerson_6() { |
|
39 |
verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini"); |
|
40 |
} |
|
41 |
|
|
42 |
@Test |
|
43 |
public void testNormaizePerson_7() { |
|
44 |
verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini"); |
|
45 |
} |
|
46 |
|
|
47 |
@Test |
|
48 |
public void testNormaizePerson_8() { |
|
49 |
verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini"); |
|
50 |
} |
|
51 |
|
|
52 |
@Test |
|
53 |
public void testNormaizePerson_9() { |
|
54 |
verifyGetNgramsForPerson("Artini, M", 1, "m_artini"); |
|
55 |
} |
|
56 |
|
|
57 |
@Test |
|
58 |
public void testNormaizePerson_10() { |
|
59 |
verifyGetNgramsForPerson("Artini, M.", 1, "m_artini"); |
|
60 |
} |
|
61 |
|
|
62 |
@Test |
|
63 |
public void testNormaizePerson_11() { |
|
64 |
verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini"); |
|
65 |
} |
|
66 |
|
|
67 |
@Test |
|
68 |
public void testNormaizePerson_12() { |
|
69 |
verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini"); |
|
70 |
} |
|
71 |
|
|
72 |
@Test |
|
73 |
public void testNormaizePerson_13() { |
|
74 |
verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig"); |
|
75 |
} |
|
76 |
|
|
77 |
@Test |
|
78 |
public void testNormaizePerson_14() { |
|
79 |
verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig"); |
|
80 |
} |
|
81 |
|
|
82 |
@Test |
|
83 |
public void testNormaizePerson_15() { |
|
84 |
verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini"); |
|
85 |
} |
|
86 |
|
|
87 |
@Test |
|
88 |
public void testNormaizePerson_16() { |
|
89 |
verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini"); |
|
90 |
} |
|
91 |
|
|
92 |
@Test |
|
93 |
public void testNormaizePerson_17() { |
|
94 |
verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0); |
|
95 |
} |
|
96 |
|
|
97 |
@Test |
|
98 |
public void testNormaizePerson_18() { |
|
99 |
verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico"); |
|
100 |
} |
|
101 |
|
|
102 |
@Test |
|
103 |
public void testNormaizePerson_19() { |
|
104 |
verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith"); |
|
105 |
} |
|
106 |
|
|
107 |
@Test |
|
108 |
public void testNormaizePerson_20() { |
|
109 |
verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa"); |
|
110 |
} |
|
111 |
|
|
112 |
@Test |
|
113 |
public void testNormaizePerson_21() { |
|
114 |
verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6); |
|
115 |
} |
|
116 |
|
|
117 |
private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) { |
|
118 |
Set<String> list = PersonComparatorUtils.getNgramsForPerson(name); |
|
119 |
System.out.println(list); |
|
120 |
assertEquals(expectedSize, list.size()); |
|
121 |
for (String s : expectedTokens) { |
|
122 |
assertTrue(list.contains(s)); |
|
123 |
} |
|
124 |
} |
|
125 |
|
|
126 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/config/ConfigTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
|
|
5 |
import eu.dnetlib.pace.AbstractPaceTest; |
|
6 |
import org.junit.Test; |
|
7 |
|
|
8 |
import static org.junit.Assert.assertNotNull; |
|
9 |
|
|
10 |
public class ConfigTest extends AbstractPaceTest { |
|
11 |
|
|
12 |
@Test |
|
13 |
public void test() throws IOException { |
|
14 |
final DedupConfig cfg = DedupConfig.load(readFromClasspath("result.pace.conf.json")); |
|
15 |
|
|
16 |
assertNotNull(cfg); |
|
17 |
|
|
18 |
System.out.println(cfg.toString()); |
|
19 |
} |
|
20 |
|
|
21 |
@Test |
|
22 |
public void test2() throws IOException { |
|
23 |
final DedupConfig cfg = DedupConfig.load(readFromClasspath("person.pace.conf.json")); |
|
24 |
|
|
25 |
assertNotNull(cfg); |
|
26 |
|
|
27 |
System.out.println(cfg.toString()); |
|
28 |
} |
|
29 |
|
|
30 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import org.junit.Before; |
|
4 |
import org.junit.Test; |
|
5 |
|
|
6 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
7 |
|
|
8 |
public class DistanceAlgoTest extends AbstractPaceFunctions { |
|
9 |
|
|
10 |
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; |
|
11 |
|
|
12 |
@Before |
|
13 |
public void setup() { |
|
14 |
System.out.println("****************************************************************"); |
|
15 |
System.out.println("Test String : " + TEST_STRING); |
|
16 |
} |
|
17 |
|
|
18 |
@Test |
|
19 |
public void testGetNumbers() { |
|
20 |
System.out.println("Numbers : " + getNumbers(TEST_STRING)); |
|
21 |
} |
|
22 |
|
|
23 |
@Test |
|
24 |
public void testRemoveSymbols() { |
|
25 |
System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); |
|
26 |
} |
|
27 |
|
|
28 |
@Test |
|
29 |
public void testFixAliases() { |
|
30 |
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); |
|
31 |
} |
|
32 |
|
|
33 |
@Test |
|
34 |
public void testCleanup() { |
|
35 |
System.out.println("cleaned up : " + cleanup(TEST_STRING)); |
|
36 |
} |
|
37 |
|
|
38 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.StringWriter; |
|
5 |
|
|
6 |
import org.apache.commons.io.IOUtils; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.config.Type; |
|
9 |
import eu.dnetlib.pace.model.Field; |
|
10 |
import eu.dnetlib.pace.model.FieldValueImpl; |
|
11 |
|
|
12 |
public abstract class AbstractPaceTest { |
|
13 |
|
|
14 |
protected String readFromClasspath(final String filename) { |
|
15 |
final StringWriter sw = new StringWriter(); |
|
16 |
try { |
|
17 |
IOUtils.copy(getClass().getResourceAsStream(filename), sw); |
|
18 |
return sw.toString(); |
|
19 |
} catch (final IOException e) { |
|
20 |
throw new RuntimeException("cannot load resource from classpath: " + filename); |
|
21 |
} |
|
22 |
} |
|
23 |
|
|
24 |
protected Field title(final String s) { |
|
25 |
return new FieldValueImpl(Type.String, "title", s); |
|
26 |
} |
|
27 |
|
|
28 |
protected Field person(final String s) { |
|
29 |
return new FieldValueImpl(Type.JSON, "person", s); |
|
30 |
} |
|
31 |
|
|
32 |
} |
|
0 | 33 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json | ||
---|---|---|
1 |
{"metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}},"coauthor": [{"id": "30|od________88::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od________88::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od________88::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}},{"id": "30|od_______908::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": 
"30|od________88::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": "30|od_______908::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od________88::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od________88::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od_______908::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od_______908::46acd9f206c2559f13b9119f8c5aef4c","anchorId": "30|dedup_wf_001::06a55cf2c97156d48ec49bcaf4bddcaf","metadata": {"firstname": {"value": "Stephen P."},"secondnames": [{"value": 
"Goff"}],"fullname": {"value": "Goff, Stephen P."}}},{"id": "30|od________88::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od_______908::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od_______908::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}}],"mergedperson": [{"id": "30|od_______908::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}},{"id": "30|od________88::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}}],"anchor": true} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json | ||
---|---|---|
1 |
{ |
|
2 |
"wf" : { |
|
3 |
"threshold" : "0.99", |
|
4 |
"run" : "001", |
|
5 |
"entityType" : "result", |
|
6 |
"orderField" : "title", |
|
7 |
"queueMaxSize" : "2000", |
|
8 |
"groupMaxSize" : "10", |
|
9 |
"slidingWindowSize" : "200", |
|
10 |
"rootBuilder" : [ "result" ], |
|
11 |
"includeChildren" : "true" |
|
12 |
}, |
|
13 |
"pace" : { |
|
14 |
"clustering" : [ |
|
15 |
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, |
|
16 |
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, |
|
17 |
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } |
|
18 |
], |
|
19 |
"strictConditions" : [ |
|
20 |
{ "name" : "exactMatch", "fields" : [ "pid" ] } |
|
21 |
], |
|
22 |
"conditions" : [ |
|
23 |
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, |
|
24 |
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }, |
|
25 |
{ "name" : "sizeMatch", "fields" : [ "authors" ] } |
|
26 |
], |
|
27 |
"model" : [ |
|
28 |
{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, |
|
29 |
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, |
|
30 |
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , |
|
31 |
{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }, |
|
32 |
{ |
|
33 |
"name": "anchors", |
|
34 |
"algo": "PersonCoAnchorsDistance", |
|
35 |
"type": "JSON", |
|
36 |
"weight": "0.0", |
|
37 |
"ignoreMissing": "true", |
|
38 |
"path": "person", |
|
39 |
"params": { |
|
40 |
"common.anchors": "1" |
|
41 |
} |
|
42 |
}, |
|
43 |
{ |
|
44 |
"name": "coauthor", |
|
45 |
"algo": "PersonCoAuthorSurnamesDistance", |
|
46 |
"type": "JSON", |
|
47 |
"weight": "0.0", |
|
48 |
"ignoreMissing": "true", |
|
49 |
"path": "person", |
|
50 |
"params": { |
|
51 |
"common.surnames": "2" |
|
52 |
} |
|
53 |
}, |
|
54 |
{ |
|
55 |
"name": "person", |
|
56 |
"algo": "PersonDistance", |
|
57 |
"type": "JSON", |
|
58 |
"weight": "0.0", |
|
59 |
"ignoreMissing": "true", |
|
60 |
"path": "person", |
|
61 |
"params": { |
|
62 |
"common.surnames": "2" |
|
63 |
} |
|
64 |
} |
|
65 |
], |
|
66 |
"blacklists" : { |
|
67 |
"title" : [ |
|
68 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
69 |
"^(Kiri Karl Morgensternile).*$", |
|
70 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
71 |
"^(\\[Eksliibris Aleksandr).*$", |
|
72 |
"^(Eksliibris Aleksandr).*$", |
|
73 |
"^(Kiri A\\. de Vignolles).*$", |
|
74 |
"^(2 kirja Karl Morgensternile).*$", |
|
75 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
76 |
"^(Kiri tundmatule).*$", |
|
77 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
78 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
79 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
80 |
"^(WHP Cruise Summary Information of section).*$", |
|
81 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
82 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
83 |
] } |
|
84 |
} |
|
85 |
|
|
86 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt | ||
---|---|---|
1 |
^(Corpus Oral Dialectal \(COD\)\.).*$ |
|
2 |
^(Kiri Karl Morgensternile).*$ |
|
3 |
^(\[Eksliibris Aleksandr).*\]$ |
|
4 |
^(Kiri A\. de Vignolles).*$ |
|
5 |
^(2 kirja Karl Morgensternile).*$ |
|
6 |
^(Pirita kloostri idaosa arheoloogilised).*$ |
|
7 |
^(Kiri tundmatule).*$ |
|
8 |
^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$ |
|
9 |
^(Eksliibris Nikolai Birukovile).*$ |
|
10 |
^(Eksliibris Nikolai Issakovile).*$ |
|
11 |
^(\[Eksliibris Aleksandr).*$ |
|
12 |
^(WHP Cruise Summary Information of section).*$ |
|
13 |
^(Measurement of the top quark\-pair production cross section with ATLAS in pp collisions at).*$ |
|
14 |
^(Measurement of the spin\-dependent structure function).* |
|
15 |
^(lorem ipsum).* |
|
0 | 16 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/config/person.pace.conf.json | ||
---|---|---|
1 |
{ |
|
2 |
"wf": { |
|
3 |
"threshold": "0.98", |
|
4 |
"dedupRun": "001", |
|
5 |
"entityType": "person", |
|
6 |
"orderField": "fullname", |
|
7 |
"queueMaxSize": "2000", |
|
8 |
"groupMaxSize": "10", |
|
9 |
"slidingWindowSize": "200", |
|
10 |
"rootBuilder": [ |
|
11 |
"person" |
|
12 |
], |
|
13 |
"includeChildren": "true" |
|
14 |
}, |
|
15 |
"pace": { |
|
16 |
"clustering": [ |
|
17 |
{ |
|
18 |
"name": "personclustering", |
|
19 |
"fields": [ |
|
20 |
"person" |
|
21 |
], |
|
22 |
"params": {} |
|
23 |
} |
|
24 |
], |
|
25 |
"model": [ |
|
26 |
{ |
|
27 |
"name": "person", |
|
28 |
"algo": "PersonDistance", |
|
29 |
"type": "JSON", |
|
30 |
"weight": "1.0", |
|
31 |
"ignoreMissing": "false", |
|
32 |
"path": "person", |
|
33 |
"params": { |
|
34 |
"common.surnames": "2" |
|
35 |
} |
|
36 |
}, |
|
37 |
{ |
|
38 |
"name": "fullname", |
|
39 |
"algo": "Null", |
|
40 |
"type": "String", |
|
41 |
"weight": "0", |
|
42 |
"ignoreMissing": "true", |
|
43 |
"path": "person/metadata/fullname/value" |
|
44 |
} |
|
45 |
], |
|
46 |
"blacklists": { |
|
47 |
"lastname": [ |
|
48 |
"(?i)^wang$", |
|
49 |
"(?i)^~wang$", |
|
50 |
"(?i)^zhang$", |
|
51 |
"(?i)^zhou$", |
|
52 |
"(?i)^zhao$", |
|
53 |
"(?i)^li$", |
|
54 |
"(?i)^~li$", |
|
55 |
"(?i)^liu$", |
|
56 |
"(?i)^chen$", |
|
57 |
"(?i)^yang$", |
|
58 |
"(?i)^kim$", |
|
59 |
"(?i)^xu$", |
|
60 |
"(?i)^huang$", |
|
61 |
"(?i)^sun$", |
|
62 |
"(?i)^lee$", |
|
63 |
"(?i)^ma$", |
|
64 |
"(?i)^kim$", |
|
65 |
"(?i)^hu$", |
|
66 |
"(?i)^wu$", |
|
67 |
"(?i)^zhu$", |
|
68 |
"(?i)^lu$" |
|
69 |
] |
|
70 |
} |
|
71 |
} |
|
72 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
|
|
9 |
public class NgramPairs extends Ngrams { |
|
10 |
|
|
11 |
public NgramPairs(Map<String, Integer> params) { |
|
12 |
super(params); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
protected Collection<String> doApply(String s) { |
|
17 |
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); |
|
18 |
} |
|
19 |
|
|
20 |
protected Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) { |
|
21 |
Collection<String> res = Lists.newArrayList(); |
|
22 |
int j = 0; |
|
23 |
for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) { |
|
24 |
if (++j >= ngrams.size()) { |
|
25 |
break; |
|
26 |
} |
|
27 |
res.add(ngrams.get(i) + ngrams.get(j)); |
|
28 |
//System.out.println("-- " + concatNgrams); |
|
29 |
} |
|
30 |
return res; |
|
31 |
} |
|
32 |
|
|
33 |
} |
|
0 | 34 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import org.apache.commons.lang.RandomStringUtils; |
|
8 |
import org.apache.commons.lang.StringUtils; |
|
9 |
|
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { |
|
13 |
|
|
14 |
public SpaceTrimmingFieldValue(final Map<String, Integer> params) { |
|
15 |
super(params); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
protected Collection<String> doApply(final String s) { |
|
20 |
final List<String> res = Lists.newArrayList(); |
|
21 |
|
|
22 |
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); |
|
23 |
|
|
24 |
return res; |
|
25 |
} |
|
26 |
|
|
27 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.Set; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.model.FieldList; |
|
9 |
import eu.dnetlib.pace.model.FieldValue; |
|
10 |
import org.apache.commons.lang.StringUtils; |
|
11 |
|
|
12 |
import com.google.common.base.Splitter; |
|
13 |
import com.google.common.collect.Iterables; |
|
14 |
import com.google.common.collect.Sets; |
|
15 |
|
|
16 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
17 |
import eu.dnetlib.pace.model.Field; |
|
18 |
import eu.dnetlib.pace.model.gt.Author; |
|
19 |
import eu.dnetlib.pace.model.gt.GTAuthor; |
|
20 |
|
|
21 |
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { |
|
22 |
|
|
23 |
private Map<String, Integer> params; |
|
24 |
|
|
25 |
private static final int MAX_TOKENS = 5; |
|
26 |
|
|
27 |
public PersonClustering(final Map<String, Integer> params) { |
|
28 |
this.params = params; |
|
29 |
} |
|
30 |
|
|
31 |
@Override |
|
32 |
public Collection<String> apply(final List<Field> fields) { |
|
33 |
final Set<String> hashes = Sets.newHashSet(); |
|
34 |
|
|
35 |
for (final Field f : fields) { |
|
36 |
|
|
37 |
final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue()); |
|
38 |
|
|
39 |
final Author a = gta.getAuthor(); |
|
40 |
if (a.isWellFormed()) { |
|
41 |
hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase()); |
|
42 |
} else { |
|
43 |
for (final String token1 : tokens(a.getFullname())) { |
|
44 |
for (final String token2 : tokens(a.getFullname())) { |
|
45 |
if (!token1.equals(token2)) { |
|
46 |
hashes.add(firstLC(token1) + token2); |
|
47 |
} |
|
48 |
} |
|
49 |
} |
|
50 |
} |
|
51 |
} |
|
52 |
|
|
53 |
return hashes; |
|
54 |
} |
|
55 |
|
|
56 |
private String firstLC(final String s) { |
|
57 |
return StringUtils.substring(s, 0, 1).toLowerCase(); |
|
58 |
} |
|
59 |
|
|
60 |
private Iterable<String> tokens(final String s) { |
|
61 |
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), MAX_TOKENS); |
|
62 |
} |
|
63 |
|
|
64 |
@Override |
|
65 |
public Map<String, Integer> getParams() { |
|
66 |
return params; |
|
67 |
} |
|
68 |
|
|
69 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.Map.Entry; |
|
7 |
import java.util.Set; |
|
8 |
|
|
9 |
import com.google.common.collect.Iterables; |
|
10 |
import com.google.common.collect.Lists; |
|
11 |
import com.google.common.collect.Maps; |
|
12 |
|
|
13 |
import eu.dnetlib.pace.config.Config; |
|
14 |
import eu.dnetlib.pace.model.Document; |
|
15 |
import eu.dnetlib.pace.model.Field; |
|
16 |
import eu.dnetlib.pace.model.FieldListImpl; |
|
17 |
import eu.dnetlib.pace.model.MapDocument; |
|
18 |
|
|
19 |
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { |
|
20 |
|
|
21 |
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf, final Map<String, List<String>> blacklists) { |
|
22 |
|
|
23 |
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists); |
|
24 |
return combine(filtered, conf); |
|
25 |
} |
|
26 |
|
|
27 |
private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) { |
|
28 |
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap()); |
|
29 |
if (blacklists != null) { |
|
30 |
for (final Entry<String, Field> e : filtered.entrySet()) { |
|
31 |
|
|
32 |
final FieldListImpl fl = new FieldListImpl(); |
|
33 |
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); |
|
34 |
filtered.put(e.getKey(), fl); |
|
35 |
} |
|
36 |
} |
|
37 |
return new MapDocument(a.getIdentifier(), filtered); |
|
38 |
} |
|
39 |
|
|
40 |
/** |
|
41 |
* Tries to match the fields in the regex blacklist. |
|
42 |
* |
|
43 |
* @param fieldName |
|
44 |
* @param value |
|
45 |
* @return true if the field matches, false otherwise |
|
46 |
*/ |
|
47 |
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) { |
|
48 |
if (blacklists.containsKey(fieldName)) { |
|
49 |
for (final String regex : blacklists.get(fieldName)) { |
|
50 |
if (value.matches(regex)) return true; |
|
51 |
} |
|
52 |
} |
|
53 |
return false; |
|
54 |
} |
|
55 |
} |
|
0 | 56 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Sets; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
10 |
import eu.dnetlib.pace.model.Field; |
|
11 |
|
|
12 |
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { |
|
13 |
|
|
14 |
protected Map<String, Integer> params; |
|
15 |
|
|
16 |
public AbstractClusteringFunction(final Map<String, Integer> params) { |
|
17 |
this.params = params; |
|
18 |
} |
|
19 |
|
|
20 |
protected abstract Collection<String> doApply(String s); |
|
21 |
|
|
22 |
@Override |
|
23 |
public Collection<String> apply(List<Field> fields) { |
|
24 |
Collection<String> c = Sets.newLinkedHashSet(); |
|
25 |
for(Field f : fields) { |
|
26 |
c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist)); |
|
27 |
} |
|
28 |
return c; |
|
29 |
} |
|
30 |
|
|
31 |
public Map<String, Integer> getParams() { |
|
32 |
return params; |
|
33 |
} |
|
34 |
|
|
35 |
protected Integer param(String name) { |
|
36 |
return params.get(name); |
|
37 |
} |
|
38 |
} |
|
0 | 39 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.model.Person; |
|
10 |
|
|
11 |
public class PersonHash extends AbstractClusteringFunction { |
|
12 |
|
|
13 |
private boolean DEFAULT_AGGRESSIVE = false; |
|
14 |
|
|
15 |
public PersonHash(final Map<String, Integer> params) { |
|
16 |
super(params); |
|
17 |
} |
|
18 |
|
|
19 |
@Override |
|
20 |
protected Collection<String> doApply(final String s) { |
|
21 |
final List<String> res = Lists.newArrayList(); |
|
22 |
|
|
23 |
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); |
|
24 |
|
|
25 |
res.add(new Person(s, aggressive).hash()); |
|
26 |
|
|
27 |
return res; |
|
28 |
} |
|
29 |
|
|
30 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/Clustering.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
/**
 * Names of the available clustering functions.
 * The declaration order is kept stable because it defines the ordinals.
 */
public enum Clustering {
    acronyms,
    ngrams,
    ngrampairs,
    sortedngrampairs,
    suffixprefix,
    spacetrimmingfieldvalue,
    immutablefieldvalue,
    personhash,
    personclustering,
    lowercase
}
|
0 | 6 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
|
|
9 |
public class ImmutableFieldValue extends AbstractClusteringFunction { |
|
10 |
|
|
11 |
public ImmutableFieldValue(final Map<String, Integer> params) { |
|
12 |
super(params); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
protected Collection<String> doApply(final String s) { |
|
17 |
final List<String> res = Lists.newArrayList(); |
|
18 |
|
|
19 |
res.add(s); |
|
20 |
|
|
21 |
return res; |
|
22 |
} |
|
23 |
|
|
24 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
|
|
7 |
import com.google.common.collect.Sets; |
|
8 |
|
|
9 |
public class SuffixPrefix extends AbstractClusteringFunction { |
|
10 |
|
|
11 |
public SuffixPrefix(Map<String, Integer> params) { |
|
12 |
super(params); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
protected Collection<String> doApply(String s) { |
|
17 |
return suffixPrefix(s, param("len"), param("max")); |
|
18 |
} |
|
19 |
|
|
20 |
private Collection<String> suffixPrefix(String s, int len, int max) { |
|
21 |
final Set<String> bigrams = Sets.newLinkedHashSet(); |
|
22 |
int i = 0; |
|
23 |
while (++i < s.length() && bigrams.size() < max) { |
|
24 |
int j = s.indexOf(" ", i); |
|
25 |
|
|
26 |
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length(); |
|
27 |
|
|
28 |
if (j - len > 0) { |
|
29 |
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); |
|
30 |
if (bigram.length() >= 4) { |
|
31 |
bigrams.add(bigram); |
|
32 |
} |
|
33 |
} |
|
34 |
} |
|
35 |
return bigrams; |
|
36 |
} |
|
37 |
|
|
38 |
} |
|
0 | 39 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Collections; |
|
5 |
import java.util.List; |
|
6 |
import java.util.Map; |
|
7 |
|
|
8 |
import com.google.common.base.Joiner; |
|
9 |
import com.google.common.base.Splitter; |
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
public class SortedNgramPairs extends NgramPairs { |
|
13 |
|
|
14 |
public SortedNgramPairs(Map<String, Integer> params) { |
|
15 |
super(params); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
protected Collection<String> doApply(String s) { |
|
20 |
|
|
21 |
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s)); |
|
22 |
|
|
23 |
Collections.sort(tokens); |
|
24 |
|
|
25 |
return ngramPairs(Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), param("max")); |
|
26 |
} |
|
27 |
|
|
28 |
} |
|
0 | 29 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
import com.google.common.collect.Sets; |
|
9 |
import eu.dnetlib.pace.model.Field; |
|
10 |
import org.apache.commons.lang.StringUtils; |
|
11 |
|
|
12 |
public class LowercaseClustering extends AbstractClusteringFunction { |
|
13 |
|
|
14 |
public LowercaseClustering(final Map<String, Integer> params) { |
|
15 |
super(params); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
public Collection<String> apply(List<Field> fields) { |
|
20 |
Collection<String> c = Sets.newLinkedHashSet(); |
|
21 |
for(Field f : fields) { |
|
22 |
c.addAll(doApply(f.stringValue())); |
|
23 |
} |
|
24 |
return c; |
|
25 |
} |
|
26 |
|
|
27 |
@Override |
|
28 |
protected Collection<String> doApply(final String s) { |
|
29 |
if(StringUtils.isBlank(s)) { |
|
30 |
return Lists.newArrayList(); |
|
31 |
} |
|
32 |
return Lists.newArrayList(s.toLowerCase().trim()); |
|
33 |
} |
|
34 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Set; |
|
4 |
|
|
5 |
import org.apache.commons.lang.StringUtils; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
8 |
|
|
9 |
public class NGramUtils extends AbstractPaceFunctions { |
|
10 |
|
|
11 |
private static final int SIZE = 100; |
|
12 |
|
|
13 |
private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); |
|
14 |
|
|
15 |
public static String cleanupForOrdering(String s) { |
|
16 |
NGramUtils utils = new NGramUtils(); |
|
17 |
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", ""); |
|
18 |
} |
|
19 |
|
|
20 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.model.Field; |
|
8 |
|
|
9 |
public interface ClusteringFunction { |
|
10 |
|
|
11 |
public Collection<String> apply(List<Field> fields); |
|
12 |
|
|
13 |
public Map<String, Integer> getParams(); |
|
14 |
|
|
15 |
} |
|
0 | 16 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
public class RandomClusteringFunction extends AbstractClusteringFunction { |
|
7 |
|
|
8 |
public RandomClusteringFunction(Map<String, Integer> params) { |
|
9 |
super(params); |
|
10 |
} |
|
11 |
|
|
12 |
@Override |
|
13 |
protected Collection<String> doApply(String s) { |
|
14 |
// TODO Auto-generated method stub |
|
15 |
return null; |
|
16 |
} |
|
17 |
|
|
18 |
} |
|
0 | 19 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.LinkedHashSet; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
public class Ngrams extends AbstractClusteringFunction { |
|
9 |
|
|
10 |
public Ngrams(Map<String, Integer> params) { |
|
11 |
super(params); |
|
12 |
} |
|
13 |
|
|
14 |
@Override |
|
15 |
protected Collection<String> doApply(String s) { |
|
16 |
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); |
|
17 |
} |
|
18 |
|
|
19 |
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) { |
|
20 |
|
|
21 |
final Collection<String> ngrams = new LinkedHashSet<String>(); |
|
22 |
final StringTokenizer st = new StringTokenizer(s); |
|
23 |
|
|
24 |
while (st.hasMoreTokens()) { |
|
25 |
final String token = st.nextToken(); |
|
26 |
if (!token.isEmpty()) { |
|
27 |
|
|
28 |
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) { |
|
29 |
String ngram = (token + " ").substring(i, ngramLen + i).trim(); |
|
30 |
if (ngrams.size() >= max) { |
|
31 |
return ngrams; |
|
32 |
} |
|
33 |
if (ngram.length() >= minNgramLen) { |
|
34 |
ngrams.add(ngram); |
|
35 |
} |
|
36 |
} |
|
37 |
} |
|
38 |
} |
|
39 |
//System.out.println(ngrams + " n: " + ngrams.size()); |
|
40 |
return ngrams; |
|
41 |
} |
|
42 |
|
|
43 |
} |
|
0 | 44 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import com.google.common.base.Predicate; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.model.Field; |
|
9 |
|
|
10 |
public class FieldFilter implements Predicate<Field> { |
|
11 |
|
|
12 |
private Map<String, List<String>> blacklists; |
|
13 |
|
|
14 |
private String filedName; |
|
15 |
|
|
16 |
public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) { |
|
17 |
this.filedName = fieldName; |
|
18 |
this.blacklists = blacklists; |
|
19 |
} |
|
20 |
|
|
21 |
@Override |
|
22 |
public boolean apply(final Field f) { |
|
23 |
return !regexMatches(filedName, f.stringValue(), blacklists); |
|
24 |
} |
|
25 |
|
|
26 |
/** |
|
27 |
* Tries to match the fields in the regex blacklist. |
|
28 |
* |
|
29 |
* @param fieldName |
|
30 |
* @param value |
|
31 |
* @return true if the field matches, false otherwise |
|
32 |
*/ |
|
33 |
protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) { |
|
34 |
if (blacklists.containsKey(fieldName)) { |
|
35 |
final Iterable<String> regexes = blacklists.get(fieldName); |
|
36 |
for (final String regex : regexes) { |
|
37 |
if (value.matches(regex)) return true; |
|
38 |
} |
|
39 |
} |
|
40 |
return false; |
|
41 |
} |
|
42 |
} |
|
0 | 43 |
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
import com.google.common.collect.Sets; |
|
9 |
|
|
10 |
public class Acronyms extends AbstractClusteringFunction { |
|
11 |
|
|
12 |
public Acronyms(Map<String, Integer> params) { |
|
13 |
super(params); |
|
14 |
} |
|
15 |
|
|
16 |
@Override |
|
17 |
protected Collection<String> doApply(String s) { |
|
18 |
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); |
|
19 |
} |
|
20 |
|
|
21 |
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { |
|
22 |
|
|
23 |
final Set<String> acronyms = Sets.newLinkedHashSet(); |
|
24 |
|
|
25 |
for (int i = 0; i < maxAcronyms; i++) { |
|
26 |
|
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-pace-core-2.5.2