Revision 36172
Added by Alessia Bardi over 9 years ago
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/deploy.info | ||
---|---|---|
1 |
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-pace-core/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-pace-core"} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Map; |
|
4 |
|
|
5 |
import org.junit.Before; |
|
6 |
import org.junit.Test; |
|
7 |
|
|
8 |
import com.google.common.collect.Lists; |
|
9 |
import com.google.common.collect.Maps; |
|
10 |
|
|
11 |
import eu.dnetlib.pace.AbstractPaceTest; |
|
12 |
|
|
13 |
public class ClusteringFunctionTest extends AbstractPaceTest { |
|
14 |
|
|
15 |
private Map<String, Integer> params; |
|
16 |
|
|
17 |
@Before |
|
18 |
public void setUp() throws Exception { |
|
19 |
params = Maps.newHashMap(); |
|
20 |
} |
|
21 |
|
|
22 |
@Test |
|
23 |
public void testNgram() { |
|
24 |
params.put("ngramLen", 2); |
|
25 |
params.put("max", 8); |
|
26 |
params.put("maxPerToken", 1); |
|
27 |
params.put("minNgramLen", 3); |
|
28 |
|
|
29 |
final ClusteringFunction ngram = new Ngrams(params); |
|
30 |
|
|
31 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
32 |
System.out.println(s); |
|
33 |
System.out.println(ngram.apply(Lists.newArrayList(title(s)))); |
|
34 |
} |
|
35 |
|
|
36 |
@Test |
|
37 |
public void testNgramPairs() { |
|
38 |
params.put("ngramLen", 2); |
|
39 |
params.put("max", 4); |
|
40 |
|
|
41 |
final ClusteringFunction np = new NgramPairs(params); |
|
42 |
|
|
43 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
44 |
System.out.println(s); |
|
45 |
System.out.println(np.apply(Lists.newArrayList(title(s)))); |
|
46 |
} |
|
47 |
|
|
48 |
@Test |
|
49 |
public void testAcronym() { |
|
50 |
params.put("max", 4); |
|
51 |
params.put("minLen", 1); |
|
52 |
params.put("maxLen", 3); |
|
53 |
|
|
54 |
final ClusteringFunction acro = new Acronyms(params); |
|
55 |
|
|
56 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
57 |
System.out.println(s); |
|
58 |
System.out.println(acro.apply(Lists.newArrayList(title(s)))); |
|
59 |
} |
|
60 |
|
|
61 |
@Test |
|
62 |
public void testSuffixPrefix() { |
|
63 |
params.put("len", 2); |
|
64 |
params.put("max", 3); |
|
65 |
|
|
66 |
final ClusteringFunction sp = new SuffixPrefix(params); |
|
67 |
|
|
68 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
69 |
System.out.println(s); |
|
70 |
System.out.println(sp.apply(Lists.newArrayList(title(s)))); |
|
71 |
} |
|
72 |
|
|
73 |
@Test |
|
74 |
public void testFieldValue() { |
|
75 |
final ClusteringFunction sp = new SpaceTrimmingFieldValue(params); |
|
76 |
|
|
77 |
final String s = "Search for the Standard Model Higgs Boson"; |
|
78 |
System.out.println(s); |
|
79 |
System.out.println(sp.apply(Lists.newArrayList(title(s)))); |
|
80 |
} |
|
81 |
|
|
82 |
} |
|
0 | 83 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertFalse; |
|
4 |
import static org.junit.Assert.assertTrue; |
|
5 |
|
|
6 |
import org.junit.Test; |
|
7 |
|
|
8 |
public class PersonComparatorUtilsSimilarityTest { |
|
9 |
|
|
10 |
@Test |
|
11 |
public void testSimilarity_0() { |
|
12 |
assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini")); |
|
13 |
} |
|
14 |
|
|
15 |
@Test |
|
16 |
public void testSimilarity_1() { |
|
17 |
assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele")); |
|
18 |
} |
|
19 |
|
|
20 |
@Test |
|
21 |
public void testSimilarity_2() { |
|
22 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele")); |
|
23 |
} |
|
24 |
|
|
25 |
@Test |
|
26 |
public void testSimilarity_3() { |
|
27 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele")); |
|
28 |
} |
|
29 |
|
|
30 |
@Test |
|
31 |
public void testSimilarity_4() { |
|
32 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G.")); |
|
33 |
} |
|
34 |
|
|
35 |
@Test |
|
36 |
public void testSimilarity_5() { |
|
37 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M. (sig.)", "Artini, Michele")); |
|
38 |
} |
|
39 |
|
|
40 |
@Test |
|
41 |
public void testSimilarity_6() { |
|
42 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G.")); |
|
43 |
} |
|
44 |
|
|
45 |
@Test |
|
46 |
public void testSimilarity_7() { |
|
47 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A.")); |
|
48 |
} |
|
49 |
|
|
50 |
@Test |
|
51 |
public void testSimilarity_8() { |
|
52 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe")); |
|
53 |
} |
|
54 |
|
|
55 |
@Test |
|
56 |
public void testSimilarity_9() { |
|
57 |
assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele")); |
|
58 |
} |
|
59 |
|
|
60 |
@Test |
|
61 |
public void testSimilarity_10() { |
|
62 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni")); |
|
63 |
} |
|
64 |
|
|
65 |
@Test |
|
66 |
public void testSimilarity_11() { |
|
67 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G.")); |
|
68 |
} |
|
69 |
|
|
70 |
@Test |
|
71 |
public void testSimilarity_12() { |
|
72 |
assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele")); |
|
73 |
} |
|
74 |
|
|
75 |
@Test |
|
76 |
public void testSimilarity_13() { |
|
77 |
assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele")); |
|
78 |
} |
|
79 |
|
|
80 |
@Test |
|
81 |
public void testSimilarity_14() { |
|
82 |
assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini")); |
|
83 |
} |
|
84 |
|
|
85 |
@Test |
|
86 |
public void testSimilarity_15() { |
|
87 |
assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI")); |
|
88 |
} |
|
89 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/model/PersonTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
|
|
5 |
import java.text.Normalizer; |
|
6 |
import java.util.Queue; |
|
7 |
|
|
8 |
import org.junit.Test; |
|
9 |
|
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
public class PersonTest { |
|
13 |
|
|
14 |
@Test |
|
15 |
public void test_1() { |
|
16 |
check("Atzori, Claudio", "Atzori, Claudio"); |
|
17 |
} |
|
18 |
|
|
19 |
@Test |
|
20 |
public void test_2() { |
|
21 |
check("Atzori, Claudio A.", "Atzori, Claudio A."); |
|
22 |
} |
|
23 |
|
|
24 |
@Test |
|
25 |
public void test_3() { |
|
26 |
check("Claudio ATZORI", "Atzori, Claudio"); |
|
27 |
} |
|
28 |
|
|
29 |
@Test |
|
30 |
public void test_4() { |
|
31 |
check("ATZORI, Claudio", "Atzori, Claudio"); |
|
32 |
} |
|
33 |
|
|
34 |
@Test |
|
35 |
public void test_5() { |
|
36 |
check("Claudio Atzori", "Claudio Atzori"); |
|
37 |
} |
|
38 |
|
|
39 |
@Test |
|
40 |
public void test_6() { |
|
41 |
check(" Manghi , Paolo", "Manghi, Paolo"); |
|
42 |
} |
|
43 |
|
|
44 |
@Test |
|
45 |
public void test_7() { |
|
46 |
check("ATZORI, CLAUDIO", "Atzori, Claudio"); |
|
47 |
} |
|
48 |
|
|
49 |
@Test |
|
50 |
public void test_8() { |
|
51 |
check("ATZORI, CLAUDIO A", "Atzori, Claudio A."); |
|
52 |
} |
|
53 |
|
|
54 |
@Test |
|
55 |
public void test_9() { |
|
56 |
check("Bølviken, B.", "Bølviken, B."); |
|
57 |
} |
|
58 |
|
|
59 |
@Test |
|
60 |
public void test_10() { |
|
61 |
check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B."); |
|
62 |
} |
|
63 |
|
|
64 |
@Test |
|
65 |
public void test_11() { |
|
66 |
check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true); |
|
67 |
} |
|
68 |
|
|
69 |
@Test |
|
70 |
public void test_12() { |
|
71 |
check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false); |
|
72 |
} |
|
73 |
|
|
74 |
@Test |
|
75 |
public void test_13() { |
|
76 |
check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false); |
|
77 |
} |
|
78 |
|
|
79 |
@Test |
|
80 |
public void test_hashes() { |
|
81 |
checkHash(" Claudio ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio"); |
|
82 |
} |
|
83 |
|
|
84 |
private void checkHash(String... ss) { |
|
85 |
Queue<String> q = Lists.newLinkedList(Lists.newArrayList(ss)); |
|
86 |
String h1 = new Person(q.remove(), false).hash(); |
|
87 |
while (!q.isEmpty()) { |
|
88 |
assertEquals(h1, new Person(q.remove(), false).hash()); |
|
89 |
} |
|
90 |
} |
|
91 |
|
|
92 |
private void check(String s, String expectedFullName) { |
|
93 |
check(s, expectedFullName, false); |
|
94 |
} |
|
95 |
|
|
96 |
private void check(String s, String expectedFullName, boolean aggressive) { |
|
97 |
Person p = new Person(s, aggressive); |
|
98 |
|
|
99 |
System.out.println("original: " + p.getOriginal()); |
|
100 |
System.out.println("accurate: " + p.isAccurate()); |
|
101 |
System.out.println("normalised: '" + p.getNormalisedFullname() + "'"); |
|
102 |
if (p.isAccurate()) { |
|
103 |
System.out.println("name: " + p.getNormalisedFirstName()); |
|
104 |
System.out.println("surname: " + p.getNormalisedSurname()); |
|
105 |
} |
|
106 |
System.out.println("hash: " + p.hash()); |
|
107 |
System.out.println(""); |
|
108 |
assertEquals(expectedFullName, p.getNormalisedFullname()); |
|
109 |
} |
|
110 |
|
|
111 |
} |
|
0 | 112 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertTrue; |
|
5 |
|
|
6 |
import java.util.Set; |
|
7 |
|
|
8 |
import org.junit.Test; |
|
9 |
|
|
10 |
public class PersonComparatorUtilsNGramsTest { |
|
11 |
|
|
12 |
@Test |
|
13 |
public void testNormaizePerson_1() { |
|
14 |
verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini"); |
|
15 |
} |
|
16 |
|
|
17 |
@Test |
|
18 |
public void testNormaizePerson_2() { |
|
19 |
verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini"); |
|
20 |
} |
|
21 |
|
|
22 |
@Test |
|
23 |
public void testNormaizePerson_3() { |
|
24 |
verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini"); |
|
25 |
} |
|
26 |
|
|
27 |
@Test |
|
28 |
public void testNormaizePerson_4() { |
|
29 |
verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini"); |
|
30 |
} |
|
31 |
|
|
32 |
@Test |
|
33 |
public void testNormaizePerson_5() { |
|
34 |
verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini"); |
|
35 |
} |
|
36 |
|
|
37 |
@Test |
|
38 |
public void testNormaizePerson_6() { |
|
39 |
verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini"); |
|
40 |
} |
|
41 |
|
|
42 |
@Test |
|
43 |
public void testNormaizePerson_7() { |
|
44 |
verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini"); |
|
45 |
} |
|
46 |
|
|
47 |
@Test |
|
48 |
public void testNormaizePerson_8() { |
|
49 |
verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini"); |
|
50 |
} |
|
51 |
|
|
52 |
@Test |
|
53 |
public void testNormaizePerson_9() { |
|
54 |
verifyGetNgramsForPerson("Artini, M", 1, "m_artini"); |
|
55 |
} |
|
56 |
|
|
57 |
@Test |
|
58 |
public void testNormaizePerson_10() { |
|
59 |
verifyGetNgramsForPerson("Artini, M.", 1, "m_artini"); |
|
60 |
} |
|
61 |
|
|
62 |
@Test |
|
63 |
public void testNormaizePerson_11() { |
|
64 |
verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini"); |
|
65 |
} |
|
66 |
|
|
67 |
@Test |
|
68 |
public void testNormaizePerson_12() { |
|
69 |
verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini"); |
|
70 |
} |
|
71 |
|
|
72 |
@Test |
|
73 |
public void testNormaizePerson_13() { |
|
74 |
verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig"); |
|
75 |
} |
|
76 |
|
|
77 |
@Test |
|
78 |
public void testNormaizePerson_14() { |
|
79 |
verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig"); |
|
80 |
} |
|
81 |
|
|
82 |
@Test |
|
83 |
public void testNormaizePerson_15() { |
|
84 |
verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini"); |
|
85 |
} |
|
86 |
|
|
87 |
@Test |
|
88 |
public void testNormaizePerson_16() { |
|
89 |
verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini"); |
|
90 |
} |
|
91 |
|
|
92 |
@Test |
|
93 |
public void testNormaizePerson_17() { |
|
94 |
verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0); |
|
95 |
} |
|
96 |
|
|
97 |
@Test |
|
98 |
public void testNormaizePerson_18() { |
|
99 |
verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico"); |
|
100 |
} |
|
101 |
|
|
102 |
@Test |
|
103 |
public void testNormaizePerson_19() { |
|
104 |
verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith"); |
|
105 |
} |
|
106 |
|
|
107 |
@Test |
|
108 |
public void testNormaizePerson_20() { |
|
109 |
verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa"); |
|
110 |
} |
|
111 |
|
|
112 |
@Test |
|
113 |
public void testNormaizePerson_21() { |
|
114 |
verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6); |
|
115 |
} |
|
116 |
|
|
117 |
private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) { |
|
118 |
Set<String> list = PersonComparatorUtils.getNgramsForPerson(name); |
|
119 |
System.out.println(list); |
|
120 |
assertEquals(expectedSize, list.size()); |
|
121 |
for (String s : expectedTokens) { |
|
122 |
assertTrue(list.contains(s)); |
|
123 |
} |
|
124 |
} |
|
125 |
|
|
126 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/config/ConfigTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertFalse; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
import static org.junit.Assert.assertTrue; |
|
6 |
|
|
7 |
import java.util.List; |
|
8 |
import java.util.Map; |
|
9 |
|
|
10 |
import org.junit.Test; |
|
11 |
|
|
12 |
import eu.dnetlib.pace.model.ClusteringDef; |
|
13 |
import eu.dnetlib.pace.model.CondDef; |
|
14 |
import eu.dnetlib.pace.model.FieldDef; |
|
15 |
|
|
16 |
public class ConfigTest { |
|
17 |
|
|
18 |
public static String cfgFull = "pace.conf { " + "clustering { " + "acronyms { fields = [title], params = { max = 1, minLen = 2, maxLen = 4} }, " |
|
19 |
+ "ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} }, " + "suffixprefix { fields = [title], params = { max = 1, len = 3 } } }, " |
|
20 |
+ |
|
21 |
|
|
22 |
"conditions { " + "yearMatch { fields = [dateofacceptance] }, " + "titleVersionMatch { fields = [title] } }," + |
|
23 |
|
|
24 |
"model { " + "title/value { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false }, " |
|
25 |
+ "dateofacceptance { algo = Null, type = String, weight = 0.0, ignoreMissing = true } }, " + |
|
26 |
|
|
27 |
"blacklists = {" + "title = [" + "\"^(Corpus Oral Dialectal \\\\(COD\\\\).).*$\", " + "\"^(Kiri Karl Morgensternile).*$\", " |
|
28 |
+ "\"^(Kiri A. de Vignolles).*$\", " + "\"^(\\\\[Eksliibris Aleksandr).*\\\\]$\", " + "\"^(\\\\[Eksliibris Aleksandr).*$\", " |
|
29 |
+ "\"^(Eksliibris Aleksandr).*$\"" + "] }" + "}"; |
|
30 |
|
|
31 |
public static String cfgOrganizations = "pace.conf { " + "clustering { " + "ngrampairs { fields = [legalname], params = { max = 1, ngramLen = 3} }, " |
|
32 |
+ "suffixprefix { fields = [legalname], params = { max = 1, len = 3 } } }, " + "model { " |
|
33 |
+ "legalname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false }, " |
|
34 |
+ "legalshortname { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true } } } "; |
|
35 |
|
|
36 |
private Config config; |
|
37 |
|
|
38 |
@Test |
|
39 |
public void testFull() { |
|
40 |
config = DynConf.load(cfgFull); |
|
41 |
|
|
42 |
assertNotNull(config); |
|
43 |
|
|
44 |
checkFields(config.fields()); |
|
45 |
checkConditions(config.conditions()); |
|
46 |
checkClustering(config.clusterings()); |
|
47 |
checkBlacklists(config.blacklists()); |
|
48 |
} |
|
49 |
|
|
50 |
@Test |
|
51 |
public void testOrganizations() { |
|
52 |
config = DynConf.load(cfgOrganizations); |
|
53 |
|
|
54 |
assertNotNull(config); |
|
55 |
|
|
56 |
checkFields(config.fields()); |
|
57 |
checkClustering(config.clusterings()); |
|
58 |
|
|
59 |
assertNotNull(config.blacklists()); |
|
60 |
assertTrue(config.blacklists().isEmpty()); |
|
61 |
} |
|
62 |
|
|
63 |
private void checkBlacklists(final Map<String, Iterable<String>> b) { |
|
64 |
assertNotNull(b); |
|
65 |
assertFalse(b.isEmpty()); |
|
66 |
|
|
67 |
System.out.println("blacklists:\n" + b); |
|
68 |
|
|
69 |
assertTrue(tryMatch(b.get("title"), "Corpus Oral Dialectal (COD). Barcelona")); |
|
70 |
assertTrue(tryMatch(b.get("title"), "Kiri Karl Morgensternile")); |
|
71 |
assertTrue(tryMatch(b.get("title"), "[Eksliibris Aleksandr Fadejevile]")); |
|
72 |
assertTrue(tryMatch(b.get("title"), "Eksliibris Aleksandr Baldinile")); |
|
73 |
assertTrue(tryMatch(b.get("title"), "Kiri A. de Vignolles'ile, Potsdam")); |
|
74 |
assertTrue(tryMatch(b.get("title"), "Kiri A. de Vignolles`ile, Brandenburg")); |
|
75 |
} |
|
76 |
|
|
77 |
private boolean tryMatch(final Iterable<String> set, final String string) { |
|
78 |
for (String regex : set) { |
|
79 |
if (string.matches(regex)) { return true; } |
|
80 |
} |
|
81 |
return false; |
|
82 |
} |
|
83 |
|
|
84 |
private void checkClustering(final List<ClusteringDef> clusterings) { |
|
85 |
assertNotNull(clusterings); |
|
86 |
|
|
87 |
for (ClusteringDef cd : clusterings) { |
|
88 |
assertNotNull(cd); |
|
89 |
|
|
90 |
assertNotNull(cd.getClusteringFunction()); |
|
91 |
assertNotNull(cd.getClusteringFunction().getParams()); |
|
92 |
|
|
93 |
assertNotNull(cd.getFields()); |
|
94 |
assertFalse(cd.getFields().isEmpty()); |
|
95 |
} |
|
96 |
|
|
97 |
System.out.println("clustering: " + clusterings); |
|
98 |
} |
|
99 |
|
|
100 |
private void checkConditions(final List<CondDef> conds) { |
|
101 |
assertNotNull(conds); |
|
102 |
|
|
103 |
for (CondDef cd : conds) { |
|
104 |
assertNotNull(cd); |
|
105 |
|
|
106 |
assertNotNull(cd.getConditionAlgo()); |
|
107 |
} |
|
108 |
|
|
109 |
System.out.println("conditions: " + conds); |
|
110 |
} |
|
111 |
|
|
112 |
private void checkFields(final List<FieldDef> fields) { |
|
113 |
|
|
114 |
assertNotNull(fields); |
|
115 |
|
|
116 |
for (FieldDef fd : fields) { |
|
117 |
assertNotNull(fd); |
|
118 |
|
|
119 |
assertNotNull(fd.getName()); |
|
120 |
assertTrue(fd.getName().length() > 0); |
|
121 |
|
|
122 |
assertNotNull(fd.getAlgo()); |
|
123 |
|
|
124 |
System.out.println(fd); |
|
125 |
} |
|
126 |
} |
|
127 |
|
|
128 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import org.junit.Before; |
|
4 |
import org.junit.Test; |
|
5 |
|
|
6 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
7 |
|
|
8 |
public class DistanceAlgoTest extends AbstractPaceFunctions { |
|
9 |
|
|
10 |
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; |
|
11 |
|
|
12 |
@Before |
|
13 |
public void setup() { |
|
14 |
System.out.println("****************************************************************"); |
|
15 |
System.out.println("Test String : " + TEST_STRING); |
|
16 |
} |
|
17 |
|
|
18 |
@Test |
|
19 |
public void testGetNumbers() { |
|
20 |
System.out.println("Numbers : " + getNumbers(TEST_STRING)); |
|
21 |
} |
|
22 |
|
|
23 |
@Test |
|
24 |
public void testRemoveSymbols() { |
|
25 |
System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); |
|
26 |
} |
|
27 |
|
|
28 |
@Test |
|
29 |
public void testFixAliases() { |
|
30 |
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); |
|
31 |
} |
|
32 |
|
|
33 |
@Test |
|
34 |
public void testCleanup() { |
|
35 |
System.out.println("cleaned up : " + cleanup(TEST_STRING)); |
|
36 |
} |
|
37 |
|
|
38 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.StringWriter; |
|
5 |
|
|
6 |
import org.apache.commons.io.IOUtils; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.config.Config; |
|
9 |
import eu.dnetlib.pace.config.DynConf; |
|
10 |
import eu.dnetlib.pace.config.Type; |
|
11 |
import eu.dnetlib.pace.model.Field; |
|
12 |
import eu.dnetlib.pace.model.FieldValueImpl; |
|
13 |
|
|
14 |
public abstract class AbstractPaceTest { |
|
15 |
|
|
16 |
protected Config getResultConf() { |
|
17 |
return DynConf.load(readFromClasspath("/eu/dnetlib/pace/config/result.pace.conf")); |
|
18 |
} |
|
19 |
|
|
20 |
protected Config getOrganizationConf() { |
|
21 |
return DynConf.load(readFromClasspath("/eu/dnetlib/pace/config/organization.pace.conf")); |
|
22 |
} |
|
23 |
|
|
24 |
private String readFromClasspath(final String filename) { |
|
25 |
StringWriter sw = new StringWriter(); |
|
26 |
try { |
|
27 |
IOUtils.copy(getClass().getResourceAsStream(filename), sw); |
|
28 |
return sw.toString(); |
|
29 |
} catch (IOException e) { |
|
30 |
throw new RuntimeException("cannot load resource from classpath: " + filename); |
|
31 |
} |
|
32 |
} |
|
33 |
|
|
34 |
protected Field title(final String s) { |
|
35 |
return new FieldValueImpl(Type.String, "title", s); |
|
36 |
} |
|
37 |
|
|
38 |
} |
|
0 | 39 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/java/eu/dnetlib/pace/util/DedupConfigTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.util; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertFalse; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
import static org.junit.Assert.assertTrue; |
|
6 |
|
|
7 |
import java.util.HashSet; |
|
8 |
|
|
9 |
import org.junit.Test; |
|
10 |
|
|
11 |
import com.google.common.collect.Lists; |
|
12 |
import com.google.common.collect.Sets; |
|
13 |
|
|
14 |
public class DedupConfigTest { |
|
15 |
|
|
16 |
@Test |
|
17 |
public void testCfg1() { |
|
18 |
final String s = |
|
19 |
"dedup.conf { " + |
|
20 |
"id = '01', " + |
|
21 |
"threshold = 0.99, " + |
|
22 |
"run = '001', " + |
|
23 |
"entity.type = organization, " + |
|
24 |
"order.field = legalname, " + |
|
25 |
"ngram.fields = [legalname], " + |
|
26 |
"queue.max.size = 2000, " + |
|
27 |
"group.max.size = 10, " + |
|
28 |
"sliding.window.size = 200, " + |
|
29 |
"rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy], " + |
|
30 |
"skiplist = [od_______908,od________18]," + |
|
31 |
"include.children = true }"; |
|
32 |
|
|
33 |
final DedupConfig cfg = DedupConfigLoader.load(s); |
|
34 |
|
|
35 |
assertNotNull(cfg); |
|
36 |
assertTrue(cfg.getConfigurationId().equals("01")); |
|
37 |
assertTrue(cfg.getThreshold() == 0.99); |
|
38 |
assertTrue(cfg.getDedupRun().equals("001")); |
|
39 |
assertTrue(cfg.getEntityType().equals("organization")); |
|
40 |
assertTrue(cfg.getOrderField().equals("legalname")); |
|
41 |
assertTrue(Lists.newArrayList("organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy") |
|
42 |
.equals(cfg.getRootBuilderFamilies())); |
|
43 |
assertTrue(Sets.newHashSet("od_______908", "od________18").equals(cfg.getSkipList())); |
|
44 |
assertTrue(cfg.getSlidingWindowSize() == 200); |
|
45 |
assertTrue(cfg.getGroupMaxSize() == 10); |
|
46 |
assertTrue(cfg.getQueueMaxSize() == 2000); |
|
47 |
assertTrue(cfg.isIncludeChildren()); |
|
48 |
} |
|
49 |
|
|
50 |
@Test |
|
51 |
public void testCfg2() { |
|
52 |
final String s = |
|
53 |
"dedup.conf { " + |
|
54 |
"id = '01', " + |
|
55 |
"threshold = 0.99, " + |
|
56 |
"run = '001', " + |
|
57 |
"entity.type = organization, " + |
|
58 |
"order.field = legalname, " + |
|
59 |
"ngram.fields = [legalname], " + |
|
60 |
"queue.max.size = 2000, " + |
|
61 |
"group.max.size = 10, " + |
|
62 |
"sliding.window.size = 200, " + |
|
63 |
"rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy], " + |
|
64 |
"include.children = false }"; |
|
65 |
|
|
66 |
final DedupConfig cfg = DedupConfigLoader.load(s); |
|
67 |
|
|
68 |
assertNotNull(cfg); |
|
69 |
assertTrue(cfg.getConfigurationId().equals("01")); |
|
70 |
assertTrue(cfg.getThreshold() == 0.99); |
|
71 |
assertTrue(cfg.getEntityType().equals("organization")); |
|
72 |
assertTrue(cfg.getOrderField().equals("legalname")); |
|
73 |
assertTrue(Lists.newArrayList("organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy") |
|
74 |
.equals(cfg.getRootBuilderFamilies())); |
|
75 |
assertTrue(new HashSet<String>().equals(cfg.getSkipList())); |
|
76 |
assertTrue(cfg.getSlidingWindowSize() == 200); |
|
77 |
assertTrue(cfg.getGroupMaxSize() == 10); |
|
78 |
assertTrue(cfg.getQueueMaxSize() == 2000); |
|
79 |
assertFalse(cfg.isIncludeChildren()); |
|
80 |
} |
|
81 |
|
|
82 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/resources/eu/dnetlib/pace/config/organization.pace.conf | ||
---|---|---|
1 |
pace.conf { |
|
2 |
clustering { |
|
3 |
fieldvalue { fields = [legalshortname], params = { } } |
|
4 |
}, |
|
5 |
conditions { }, |
|
6 |
model { |
|
7 |
legalname { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = organization/metadata/legalname/value } |
|
8 |
|
|
9 |
} |
|
10 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt | ||
---|---|---|
1 |
^(Corpus Oral Dialectal \(COD\)\.).*$ |
|
2 |
^(Kiri Karl Morgensternile).*$ |
|
3 |
^(\[Eksliibris Aleksandr).*\]$ |
|
4 |
^(Kiri A\. de Vignolles).*$ |
|
5 |
^(2 kirja Karl Morgensternile).*$ |
|
6 |
^(Pirita kloostri idaosa arheoloogilised).*$ |
|
7 |
^(Kiri tundmatule).*$ |
|
8 |
^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$ |
|
9 |
^(Eksliibris Nikolai Birukovile).*$ |
|
10 |
^(Eksliibris Nikolai Issakovile).*$ |
|
11 |
^(\[Eksliibris Aleksandr).*$ |
|
12 |
^(WHP Cruise Summary Information of section).*$ |
|
13 |
^(Measurement of the top quark\-pair production cross section with ATLAS in pp collisions at).*$ |
|
14 |
^(Measurement of the spin\-dependent structure function).* |
|
15 |
^(lorem ipsum).* |
|
0 | 16 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/test/resources/eu/dnetlib/pace/config/result.pace.conf | ||
---|---|---|
1 |
pace.conf { |
|
2 |
clustering { |
|
3 |
acronyms { fields = [title], params = { max = 1, minLen = 2, maxLen = 4} }, |
|
4 |
ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} }, |
|
5 |
suffixprefix { fields = [title], params = { max = 1, len = 3 } } |
|
6 |
}, |
|
7 |
strictconditions { |
|
8 |
exactMatch { fields = [pid] } |
|
9 |
}, |
|
10 |
conditions { |
|
11 |
yearMatch { fields = [dateofacceptance] }, |
|
12 |
titleVersionMatch { fields = [title] }, |
|
13 |
sizeMatch { fields = [authors] } |
|
14 |
}, |
|
15 |
model { |
|
16 |
pid { algo = ExactMatch, type = String, weight = 0.0, ignoreMissing = true, path = pid/value, overrideMatch = true }, |
|
17 |
title { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = result/metadata/title/value }, |
|
18 |
dateofacceptance { algo = Null, type = String, weight = 0.0, ignoreMissing = true, path = result/metadata/dateofacceptance/value } , |
|
19 |
authors { algo = Null, type = List, weight = 0.0, ignoreMissing = true, path = result/author/metadata/fullname/value } |
|
20 |
}, |
|
21 |
blacklists = { |
|
22 |
title = [ |
|
23 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
24 |
"^(Kiri Karl Morgensternile).*$", |
|
25 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
26 |
"^(\\[Eksliibris Aleksandr).*$", |
|
27 |
"^(Eksliibris Aleksandr).*$", |
|
28 |
"^(Kiri A\\. de Vignolles).*$", |
|
29 |
"^(2 kirja Karl Morgensternile).*$", |
|
30 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
31 |
"^(Kiri tundmatule).*$", |
|
32 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
33 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
34 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
35 |
"^(WHP Cruise Summary Information of section).*$", |
|
36 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
37 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
38 |
] } |
|
39 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/Clustering.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
/**
 * Names of the available clustering functions.
 */
public enum Clustering {
	acronyms,
	ngrams,
	ngrampairs,
	suffixprefix,
	spacetrimmingfieldvalue
}
|
0 | 6 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
|
|
7 |
import com.google.common.collect.Sets; |
|
8 |
|
|
9 |
public class SuffixPrefix extends AbstractClusteringFunction { |
|
10 |
|
|
11 |
public SuffixPrefix(Map<String, Integer> params) { |
|
12 |
super(params); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
protected Collection<String> doApply(String s) { |
|
17 |
return suffixPrefix(s, param("len"), param("max")); |
|
18 |
} |
|
19 |
|
|
20 |
private Collection<String> suffixPrefix(String s, int len, int max) { |
|
21 |
final Set<String> bigrams = Sets.newLinkedHashSet(); |
|
22 |
int i = 0; |
|
23 |
while (++i < s.length() && bigrams.size() < max) { |
|
24 |
int j = s.indexOf(" ", i); |
|
25 |
|
|
26 |
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length(); |
|
27 |
|
|
28 |
if (j - len > 0) { |
|
29 |
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); |
|
30 |
if (bigram.length() >= 4) { |
|
31 |
bigrams.add(bigram); |
|
32 |
} |
|
33 |
} |
|
34 |
} |
|
35 |
return bigrams; |
|
36 |
} |
|
37 |
|
|
38 |
} |
|
0 | 39 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
|
|
9 |
public class NgramPairs extends Ngrams { |
|
10 |
|
|
11 |
public NgramPairs(Map<String, Integer> params) { |
|
12 |
super(params); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
protected Collection<String> doApply(String s) { |
|
17 |
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); |
|
18 |
} |
|
19 |
|
|
20 |
private Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) { |
|
21 |
Collection<String> res = Lists.newArrayList(); |
|
22 |
int j = 0; |
|
23 |
for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) { |
|
24 |
if (++j >= ngrams.size()) { |
|
25 |
break; |
|
26 |
} |
|
27 |
res.add(ngrams.get(i) + ngrams.get(j)); |
|
28 |
//System.out.println("-- " + concatNgrams); |
|
29 |
} |
|
30 |
return res; |
|
31 |
} |
|
32 |
|
|
33 |
} |
|
0 | 34 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import org.apache.commons.lang.RandomStringUtils; |
|
8 |
import org.apache.commons.lang.StringUtils; |
|
9 |
|
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { |
|
13 |
|
|
14 |
public SpaceTrimmingFieldValue(final Map<String, Integer> params) { |
|
15 |
super(params); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
protected Collection<String> doApply(final String s) { |
|
20 |
final List<String> res = Lists.newArrayList(); |
|
21 |
|
|
22 |
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); |
|
23 |
|
|
24 |
return res; |
|
25 |
} |
|
26 |
|
|
27 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.model.Field; |
|
8 |
|
|
9 |
public interface ClusteringFunction { |
|
10 |
|
|
11 |
public Collection<String> apply(List<Field> fields); |
|
12 |
|
|
13 |
public Map<String, Integer> getParams(); |
|
14 |
|
|
15 |
} |
|
0 | 16 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Set; |
|
4 |
|
|
5 |
import org.apache.commons.lang.StringUtils; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
8 |
|
|
9 |
public class NGramUtils extends AbstractPaceFunctions { |
|
10 |
|
|
11 |
private static final int SIZE = 100; |
|
12 |
|
|
13 |
private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); |
|
14 |
|
|
15 |
public static String cleanupForOrdering(String s) { |
|
16 |
NGramUtils utils = new NGramUtils(); |
|
17 |
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", ""); |
|
18 |
} |
|
19 |
|
|
20 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
public class RandomClusteringFunction extends AbstractClusteringFunction { |
|
7 |
|
|
8 |
public RandomClusteringFunction(Map<String, Integer> params) { |
|
9 |
super(params); |
|
10 |
} |
|
11 |
|
|
12 |
@Override |
|
13 |
protected Collection<String> doApply(String s) { |
|
14 |
// TODO Auto-generated method stub |
|
15 |
return null; |
|
16 |
} |
|
17 |
|
|
18 |
} |
|
0 | 19 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Map.Entry; |
|
6 |
import java.util.Set; |
|
7 |
|
|
8 |
import com.google.common.collect.Iterables; |
|
9 |
import com.google.common.collect.Lists; |
|
10 |
import com.google.common.collect.Maps; |
|
11 |
|
|
12 |
import eu.dnetlib.pace.config.Config; |
|
13 |
import eu.dnetlib.pace.model.Document; |
|
14 |
import eu.dnetlib.pace.model.FieldListImpl; |
|
15 |
import eu.dnetlib.pace.model.MapDocument; |
|
16 |
|
|
17 |
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { |
|
18 |
|
|
19 |
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf, final Map<String, Iterable<String>> blacklists) { |
|
20 |
|
|
21 |
Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists); |
|
22 |
return combine(filtered, conf); |
|
23 |
} |
|
24 |
|
|
25 |
private MapDocument filter(final MapDocument a, final Map<String, Iterable<String>> blacklists) { |
|
26 |
final Map<String, FieldListImpl> filtered = Maps.newHashMap(a.getFieldMap()); |
|
27 |
if (blacklists != null) { |
|
28 |
for (final Entry<String, FieldListImpl> e : filtered.entrySet()) { |
|
29 |
|
|
30 |
FieldListImpl fl = new FieldListImpl(); |
|
31 |
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); |
|
32 |
filtered.put(e.getKey(), fl); |
|
33 |
} |
|
34 |
} |
|
35 |
return new MapDocument(a.getIdentifier(), filtered); |
|
36 |
} |
|
37 |
|
|
38 |
/** |
|
39 |
* Tries to match the fields in the regex blacklist. |
|
40 |
* |
|
41 |
* @param fieldName |
|
42 |
* @param value |
|
43 |
* @return true if the field matches, false otherwise |
|
44 |
*/ |
|
45 |
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) { |
|
46 |
if (blacklists.containsKey(fieldName)) { |
|
47 |
for (final String regex : blacklists.get(fieldName)) { |
|
48 |
if (value.matches(regex)) { return true; } |
|
49 |
} |
|
50 |
} |
|
51 |
return false; |
|
52 |
} |
|
53 |
} |
|
0 | 54 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.LinkedHashSet; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
public class Ngrams extends AbstractClusteringFunction { |
|
9 |
|
|
10 |
public Ngrams(Map<String, Integer> params) { |
|
11 |
super(params); |
|
12 |
} |
|
13 |
|
|
14 |
@Override |
|
15 |
protected Collection<String> doApply(String s) { |
|
16 |
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); |
|
17 |
} |
|
18 |
|
|
19 |
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) { |
|
20 |
|
|
21 |
final Collection<String> ngrams = new LinkedHashSet<String>(); |
|
22 |
final StringTokenizer st = new StringTokenizer(s); |
|
23 |
|
|
24 |
while (st.hasMoreTokens()) { |
|
25 |
final String token = st.nextToken(); |
|
26 |
if (!token.isEmpty()) { |
|
27 |
|
|
28 |
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) { |
|
29 |
String ngram = (token + " ").substring(i, ngramLen + i).trim(); |
|
30 |
if (ngrams.size() >= max) { |
|
31 |
return ngrams; |
|
32 |
} |
|
33 |
if (ngram.length() >= minNgramLen) { |
|
34 |
ngrams.add(ngram); |
|
35 |
} |
|
36 |
} |
|
37 |
} |
|
38 |
} |
|
39 |
//System.out.println(ngrams + " n: " + ngrams.size()); |
|
40 |
return ngrams; |
|
41 |
} |
|
42 |
|
|
43 |
} |
|
0 | 44 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Sets; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
10 |
import eu.dnetlib.pace.model.Field; |
|
11 |
|
|
12 |
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { |
|
13 |
|
|
14 |
protected Map<String, Integer> params; |
|
15 |
|
|
16 |
public AbstractClusteringFunction(final Map<String, Integer> params) { |
|
17 |
this.params = params; |
|
18 |
} |
|
19 |
|
|
20 |
protected abstract Collection<String> doApply(String s); |
|
21 |
|
|
22 |
@Override |
|
23 |
public Collection<String> apply(List<Field> fields) { |
|
24 |
Collection<String> c = Sets.newLinkedHashSet(); |
|
25 |
for(Field f : fields) { |
|
26 |
c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist)); |
|
27 |
} |
|
28 |
return c; |
|
29 |
} |
|
30 |
|
|
31 |
public Map<String, Integer> getParams() { |
|
32 |
return params; |
|
33 |
} |
|
34 |
|
|
35 |
protected Integer param(String name) { |
|
36 |
return params.get(name); |
|
37 |
} |
|
38 |
} |
|
0 | 39 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Map; |
|
4 |
|
|
5 |
import com.google.common.base.Predicate; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.model.Field; |
|
8 |
|
|
9 |
public class FieldFilter implements Predicate<Field> { |
|
10 |
|
|
11 |
private Map<String, Iterable<String>> blacklists; |
|
12 |
|
|
13 |
private String filedName; |
|
14 |
|
|
15 |
public FieldFilter(final String fieldName, final Map<String, Iterable<String>> blacklists) { |
|
16 |
this.filedName = fieldName; |
|
17 |
this.blacklists = blacklists; |
|
18 |
} |
|
19 |
|
|
20 |
@Override |
|
21 |
public boolean apply(final Field f) { |
|
22 |
return !regexMatches(filedName, f.stringValue(), blacklists); |
|
23 |
} |
|
24 |
|
|
25 |
/** |
|
26 |
* Tries to match the fields in the regex blacklist. |
|
27 |
* |
|
28 |
* @param fieldName |
|
29 |
* @param value |
|
30 |
* @return true if the field matches, false otherwise |
|
31 |
*/ |
|
32 |
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Iterable<String>> blacklists) { |
|
33 |
if (blacklists.containsKey(fieldName)) { |
|
34 |
final Iterable<String> regexes = blacklists.get(fieldName); |
|
35 |
for (final String regex : regexes) { |
|
36 |
if (value.matches(regex)) { return true; } |
|
37 |
} |
|
38 |
} |
|
39 |
return false; |
|
40 |
} |
|
41 |
} |
|
0 | 42 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
import com.google.common.collect.Sets; |
|
9 |
|
|
10 |
public class Acronyms extends AbstractClusteringFunction { |
|
11 |
|
|
12 |
public Acronyms(Map<String, Integer> params) { |
|
13 |
super(params); |
|
14 |
} |
|
15 |
|
|
16 |
@Override |
|
17 |
protected Collection<String> doApply(String s) { |
|
18 |
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); |
|
19 |
} |
|
20 |
|
|
21 |
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { |
|
22 |
|
|
23 |
final Set<String> acronyms = Sets.newLinkedHashSet(); |
|
24 |
|
|
25 |
for (int i = 0; i < maxAcronyms; i++) { |
|
26 |
|
|
27 |
final StringTokenizer st = new StringTokenizer(s); |
|
28 |
final StringBuilder sb = new StringBuilder(); |
|
29 |
|
|
30 |
while (st.hasMoreTokens()) { |
|
31 |
final String token = st.nextToken(); |
|
32 |
if (sb.length() > maxLen) { |
|
33 |
break; |
|
34 |
} |
|
35 |
if (token.length() > 1 && i < token.length()) { |
|
36 |
sb.append(token.charAt(i)); |
|
37 |
} |
|
38 |
} |
|
39 |
String acronym = sb.toString(); |
|
40 |
if (acronym.length() > minLen) { |
|
41 |
acronyms.add(acronym); |
|
42 |
} |
|
43 |
} |
|
44 |
return acronyms; |
|
45 |
} |
|
46 |
|
|
47 |
} |
|
0 | 48 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
|
|
6 |
import com.google.common.collect.Sets; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.config.Config; |
|
9 |
import eu.dnetlib.pace.model.ClusteringDef; |
|
10 |
import eu.dnetlib.pace.model.Document; |
|
11 |
|
|
12 |
public class ClusteringCombiner { |
|
13 |
|
|
14 |
public static Collection<String> combine(Document a, Config conf) { |
|
15 |
return new ClusteringCombiner().doCombine(a, conf.clusterings()); |
|
16 |
} |
|
17 |
|
|
18 |
private Collection<String> doCombine(Document a, List<ClusteringDef> defs) { |
|
19 |
final Collection<String> res = Sets.newLinkedHashSet(); |
|
20 |
for(ClusteringDef cd : defs) { |
|
21 |
for(String fieldName : cd .getFields()) { |
|
22 |
res.addAll(cd.getClusteringFunction().apply(a.values(fieldName))); |
|
23 |
} |
|
24 |
} |
|
25 |
return res; |
|
26 |
} |
|
27 |
|
|
28 |
} |
|
0 | 29 |
modules/dnet-pace-core/tags/dnet-pace-core-1.3.0/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Iterator; |
|
5 |
import java.util.List; |
|
6 |
import java.util.ListIterator; |
|
7 |
|
|
8 |
import com.google.common.base.Function; |
|
9 |
import com.google.common.base.Joiner; |
|
10 |
import com.google.common.collect.Iterables; |
|
11 |
import com.google.common.collect.Lists; |
|
12 |
|
|
13 |
import eu.dnetlib.pace.config.Type; |
|
14 |
|
|
15 |
/** |
|
16 |
* The Class FieldListImpl. |
|
17 |
*/ |
|
18 |
public class FieldListImpl extends AbstractField implements FieldList { |
|
19 |
|
|
20 |
/** The fields. */ |
|
21 |
private List<Field> fields; |
|
22 |
|
|
23 |
/** |
|
24 |
* Instantiates a new field list impl. |
|
25 |
*/ |
|
26 |
public FieldListImpl() { |
|
27 |
fields = Lists.newArrayList(); |
|
28 |
} |
|
29 |
|
|
30 |
/** |
|
31 |
* Instantiates a new field list impl. |
|
32 |
* |
|
33 |
* @param name |
|
34 |
* the name |
|
35 |
*/ |
|
36 |
public FieldListImpl(final String name) { |
|
37 |
super(Type.List, name); |
|
38 |
fields = Lists.newArrayList(); |
|
39 |
} |
|
40 |
|
|
41 |
/* |
|
42 |
* (non-Javadoc) |
|
43 |
* |
|
44 |
* @see java.util.List#add(java.lang.Object) |
|
45 |
*/ |
|
46 |
@Override |
|
47 |
public boolean add(final Field f) { |
|
48 |
return fields.add(f); |
|
49 |
} |
|
50 |
|
|
51 |
/* |
|
52 |
* (non-Javadoc) |
|
53 |
* |
|
54 |
* @see java.util.List#add(int, java.lang.Object) |
|
55 |
*/ |
|
56 |
@Override |
|
57 |
public void add(final int i, final Field f) { |
|
58 |
fields.add(i, f); |
|
59 |
} |
|
60 |
|
|
61 |
/* |
|
62 |
* (non-Javadoc) |
|
63 |
* |
|
64 |
* @see java.util.List#addAll(java.util.Collection) |
|
65 |
*/ |
|
66 |
@Override |
|
67 |
public boolean addAll(final Collection<? extends Field> f) { |
|
68 |
return fields.addAll(f); |
|
69 |
} |
|
70 |
|
|
71 |
/* |
|
72 |
* (non-Javadoc) |
|
73 |
* |
|
74 |
* @see java.util.List#addAll(int, java.util.Collection) |
|
75 |
*/ |
|
76 |
@Override |
|
77 |
public boolean addAll(final int i, final Collection<? extends Field> f) { |
|
78 |
return fields.addAll(i, f); |
|
79 |
} |
|
80 |
|
|
81 |
/* |
|
82 |
* (non-Javadoc) |
|
83 |
* |
|
84 |
* @see java.util.List#clear() |
|
85 |
*/ |
|
86 |
@Override |
|
87 |
public void clear() { |
|
88 |
fields.clear(); |
|
89 |
} |
|
90 |
|
|
91 |
/* |
|
92 |
* (non-Javadoc) |
|
93 |
* |
|
94 |
* @see java.util.List#contains(java.lang.Object) |
|
95 |
*/ |
|
96 |
@Override |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-pace-core-1.3.0