Project

General

Profile

« Previous | Next » 

Revision 49442

[maven-release-plugin] copy for tag dnet-pace-core-2.5.2

View differences:

modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-pace-core/trunk/", "deploy_repository": "dnet45-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots", "name": "dnet-pace-core"}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Map;
4

  
5
import com.google.common.collect.Lists;
6
import com.google.common.collect.Maps;
7
import eu.dnetlib.pace.AbstractPaceTest;
8
import org.junit.Before;
9
import org.junit.Test;
10

  
11
public class ClusteringFunctionTest extends AbstractPaceTest {
12

  
13
	private Map<String, Integer> params;
14

  
15
	@Before
16
	public void setUp() throws Exception {
17
		params = Maps.newHashMap();
18
	}
19

  
20
	@Test
21
	public void testNgram() {
22
		params.put("ngramLen", 3);
23
		params.put("max", 8);
24
		params.put("maxPerToken", 2);
25
		params.put("minNgramLen", 1);
26

  
27
		final ClusteringFunction ngram = new Ngrams(params);
28

  
29
		final String s = "Search for the Standard Model Higgs Boson";
30
		System.out.println(s);
31
		System.out.println(ngram.apply(Lists.newArrayList(title(s))));
32
	}
33

  
34
	@Test
35
	public void testNgramPairs() {
36
		params.put("ngramLen", 3);
37
		params.put("max", 3);
38

  
39
		final ClusteringFunction np = new NgramPairs(params);
40

  
41
		final String s = "Search for the Standard Model Higgs Boson";
42
		System.out.println(s);
43
		System.out.println(np.apply(Lists.newArrayList(title(s))));
44
	}
45

  
46
	@Test
47
	public void testSortedNgramPairs() {
48
		params.put("ngramLen", 3);
49
		params.put("max", 1);
50

  
51
		final ClusteringFunction np = new SortedNgramPairs(params);
52

  
53
		final String s1 = "University of Pisa";
54
		System.out.println(s1);
55
		System.out.println(np.apply(Lists.newArrayList(title(s1))));
56

  
57
		final String s2 = "Pisa University";
58
		System.out.println(s2);
59
		System.out.println(np.apply(Lists.newArrayList(title(s2))));
60
	}
61

  
62
	@Test
63
	public void testAcronym() {
64
		params.put("max", 4);
65
		params.put("minLen", 1);
66
		params.put("maxLen", 3);
67

  
68
		final ClusteringFunction acro = new Acronyms(params);
69

  
70
		final String s = "Search for the Standard Model Higgs Boson";
71
		System.out.println(s);
72
		System.out.println(acro.apply(Lists.newArrayList(title(s))));
73
	}
74

  
75
	@Test
76
	public void testSuffixPrefix() {
77
		params.put("len", 3);
78
		params.put("max", 4);
79

  
80
		final ClusteringFunction sp = new SuffixPrefix(params);
81

  
82
		final String s = "Search for the Standard Model Higgs Boson";
83
		System.out.println(s);
84
		System.out.println(sp.apply(Lists.newArrayList(title(s))));
85
	}
86

  
87
	@Test
88
	public void testFieldValue() {
89

  
90
		params.put("randomLength", 5);
91

  
92
		final ClusteringFunction sp = new SpaceTrimmingFieldValue(params);
93

  
94
		final String s = "Search for the Standard Model Higgs Boson";
95
		System.out.println(s);
96
		System.out.println(sp.apply(Lists.newArrayList(title(s))));
97
	}
98

  
99
	@Test
100
	public void testPersonClustering2() {
101
		final ClusteringFunction cf = new PersonClustering(params);
102

  
103
		final String s = readFromClasspath("gt.author.json");
104
		System.out.println(s);
105
		System.out.println(cf.apply(Lists.newArrayList(person(s))));
106
	}
107

  
108
}
0 109

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import static org.junit.Assert.assertFalse;
4
import static org.junit.Assert.assertTrue;
5

  
6
import org.junit.Test;
7

  
8
public class PersonComparatorUtilsSimilarityTest {
9

  
10
	@Test
11
	public void testSimilarity_0() {
12
		assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini"));
13
	}
14

  
15
	@Test
16
	public void testSimilarity_1() {
17
		assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele"));
18
	}
19

  
20
	@Test
21
	public void testSimilarity_2() {
22
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele"));
23
	}
24

  
25
	@Test
26
	public void testSimilarity_3() {
27
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele"));
28
	}
29

  
30
	@Test
31
	public void testSimilarity_4() {
32
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G."));
33
	}
34

  
35
	@Test
36
	public void testSimilarity_5() {
37
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M. (sig.)", "Artini, Michele"));
38
	}
39

  
40
	@Test
41
	public void testSimilarity_6() {
42
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G."));
43
	}
44

  
45
	@Test
46
	public void testSimilarity_7() {
47
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A."));
48
	}
49

  
50
	@Test
51
	public void testSimilarity_8() {
52
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe"));
53
	}
54

  
55
	@Test
56
	public void testSimilarity_9() {
57
		assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele"));
58
	}
59

  
60
	@Test
61
	public void testSimilarity_10() {
62
		assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni"));
63
	}
64

  
65
	@Test
66
	public void testSimilarity_11() {
67
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G."));
68
	}
69

  
70
	@Test
71
	public void testSimilarity_12() {
72
		assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele"));
73
	}
74

  
75
	@Test
76
	public void testSimilarity_13() {
77
		assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele"));
78
	}
79

  
80
	@Test
81
	public void testSimilarity_14() {
82
		assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini"));
83
	}
84

  
85
	@Test
86
	public void testSimilarity_15() {
87
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI"));
88
	}
89
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/model/PersonTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import static org.junit.Assert.assertEquals;
4

  
5
import java.text.Normalizer;
6
import java.util.Queue;
7

  
8
import org.junit.Test;
9

  
10
import com.google.common.collect.Lists;
11

  
12
public class PersonTest {
13

  
14
	@Test
15
	public void test_1() {
16
		check("Atzori, Claudio", "Atzori, Claudio");
17
	}
18

  
19
	@Test
20
	public void test_2() {
21
		check("Atzori, Claudio A.", "Atzori, Claudio A.");
22
	}
23

  
24
	@Test
25
	public void test_3() {
26
		check("Claudio ATZORI", "Atzori, Claudio");
27
	}
28

  
29
	@Test
30
	public void test_4() {
31
		check("ATZORI, Claudio", "Atzori, Claudio");
32
	}
33

  
34
	@Test
35
	public void test_5() {
36
		check("Claudio Atzori", "Claudio Atzori");
37
	}
38

  
39
	@Test
40
	public void test_6() {
41
		check(" Manghi ,  Paolo", "Manghi, Paolo");
42
	}
43

  
44
	@Test
45
	public void test_7() {
46
		check("ATZORI, CLAUDIO", "Atzori, Claudio");
47
	}
48

  
49
	@Test
50
	public void test_8() {
51
		check("ATZORI, CLAUDIO A", "Atzori, Claudio A.");
52
	}
53

  
54
	@Test
55
	public void test_9() {
56
		check("Bølviken, B.", "Bølviken, B.");
57
	}
58

  
59
	@Test
60
	public void test_10() {
61
		check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B.");
62
	}
63

  
64
	@Test
65
	public void test_11() {
66
		check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true);
67
	}
68

  
69
	@Test
70
	public void test_12() {
71
		check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false);
72
	}
73

  
74
	@Test
75
	public void test_13() {
76
		check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false);
77
	}
78

  
79
	@Test
80
	public void test_hashes() {
81
		checkHash(" Claudio  ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio");
82
	}
83

  
84
	private void checkHash(String... ss) {
85
		Queue<String> q = Lists.newLinkedList(Lists.newArrayList(ss));
86
		String h1 = new Person(q.remove(), false).hash();
87
		while (!q.isEmpty()) {
88
			assertEquals(h1, new Person(q.remove(), false).hash());
89
		}
90
	}
91

  
92
	private void check(String s, String expectedFullName) {
93
		check(s, expectedFullName, false);
94
	}
95

  
96
	private void check(String s, String expectedFullName, boolean aggressive) {
97
		Person p = new Person(s, aggressive);
98

  
99
		System.out.println("original:   " + p.getOriginal());
100
		System.out.println("accurate:   " + p.isAccurate());
101
		System.out.println("normalised: '" + p.getNormalisedFullname() + "'");
102
		if (p.isAccurate()) {
103
			System.out.println("name:     " + p.getNormalisedFirstName());
104
			System.out.println("surname:  " + p.getNormalisedSurname());
105
		}
106
		System.out.println("hash: " + p.hash());
107
		System.out.println("");
108
		assertEquals(expectedFullName, p.getNormalisedFullname());
109
	}
110

  
111
}
0 112

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import static org.junit.Assert.assertEquals;
4
import static org.junit.Assert.assertTrue;
5

  
6
import java.util.Set;
7

  
8
import org.junit.Test;
9

  
10
public class PersonComparatorUtilsNGramsTest {
11

  
12
	@Test
13
	public void testNormaizePerson_1() {
14
		verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini");
15
	}
16

  
17
	@Test
18
	public void testNormaizePerson_2() {
19
		verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini");
20
	}
21

  
22
	@Test
23
	public void testNormaizePerson_3() {
24
		verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini");
25
	}
26

  
27
	@Test
28
	public void testNormaizePerson_4() {
29
		verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini");
30
	}
31

  
32
	@Test
33
	public void testNormaizePerson_5() {
34
		verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini");
35
	}
36

  
37
	@Test
38
	public void testNormaizePerson_6() {
39
		verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini");
40
	}
41

  
42
	@Test
43
	public void testNormaizePerson_7() {
44
		verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini");
45
	}
46

  
47
	@Test
48
	public void testNormaizePerson_8() {
49
		verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini");
50
	}
51

  
52
	@Test
53
	public void testNormaizePerson_9() {
54
		verifyGetNgramsForPerson("Artini, M", 1, "m_artini");
55
	}
56

  
57
	@Test
58
	public void testNormaizePerson_10() {
59
		verifyGetNgramsForPerson("Artini, M.", 1, "m_artini");
60
	}
61

  
62
	@Test
63
	public void testNormaizePerson_11() {
64
		verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini");
65
	}
66

  
67
	@Test
68
	public void testNormaizePerson_12() {
69
		verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini");
70
	}
71

  
72
	@Test
73
	public void testNormaizePerson_13() {
74
		verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig");
75
	}
76

  
77
	@Test
78
	public void testNormaizePerson_14() {
79
		verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig");
80
	}
81

  
82
	@Test
83
	public void testNormaizePerson_15() {
84
		verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini");
85
	}
86

  
87
	@Test
88
	public void testNormaizePerson_16() {
89
		verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini");
90
	}
91

  
92
	@Test
93
	public void testNormaizePerson_17() {
94
		verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0);
95
	}
96

  
97
	@Test
98
	public void testNormaizePerson_18() {
99
		verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico");
100
	}
101

  
102
	@Test
103
	public void testNormaizePerson_19() {
104
		verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith");
105
	}
106

  
107
	@Test
108
	public void testNormaizePerson_20() {
109
		verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa");
110
	}
111

  
112
	@Test
113
	public void testNormaizePerson_21() {
114
		verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6);
115
	}
116

  
117
	private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) {
118
		Set<String> list = PersonComparatorUtils.getNgramsForPerson(name);
119
		System.out.println(list);
120
		assertEquals(expectedSize, list.size());
121
		for (String s : expectedTokens) {
122
			assertTrue(list.contains(s));
123
		}
124
	}
125

  
126
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
1
package eu.dnetlib.pace.config;
2

  
3
import java.io.IOException;
4

  
5
import eu.dnetlib.pace.AbstractPaceTest;
6
import org.junit.Test;
7

  
8
import static org.junit.Assert.assertNotNull;
9

  
10
public class ConfigTest extends AbstractPaceTest {
11

  
12
	@Test
13
	public void test() throws IOException {
14
		final DedupConfig cfg = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
15

  
16
		assertNotNull(cfg);
17

  
18
		System.out.println(cfg.toString());
19
	}
20

  
21
	@Test
22
	public void test2() throws IOException {
23
		final DedupConfig cfg = DedupConfig.load(readFromClasspath("person.pace.conf.json"));
24

  
25
		assertNotNull(cfg);
26

  
27
		System.out.println(cfg.toString());
28
	}
29

  
30
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java
1
package eu.dnetlib.pace.distance;
2

  
3
import org.junit.Before;
4
import org.junit.Test;
5

  
6
import eu.dnetlib.pace.common.AbstractPaceFunctions;
7

  
8
public class DistanceAlgoTest extends AbstractPaceFunctions {
9

  
10
	private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
11

  
12
	@Before
13
	public void setup() {
14
		System.out.println("****************************************************************");
15
		System.out.println("Test String    : " + TEST_STRING);
16
	}
17

  
18
	@Test
19
	public void testGetNumbers() {
20
		System.out.println("Numbers        : " + getNumbers(TEST_STRING));
21
	}
22

  
23
	@Test
24
	public void testRemoveSymbols() {
25
		System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
26
	}
27

  
28
	@Test
29
	public void testFixAliases() {
30
		System.out.println("Fixed aliases  : " + fixAliases(TEST_STRING));
31
	}
32

  
33
	@Test
34
	public void testCleanup() {
35
		System.out.println("cleaned up     : " + cleanup(TEST_STRING));
36
	}
37

  
38
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
1
package eu.dnetlib.pace;
2

  
3
import java.io.IOException;
4
import java.io.StringWriter;
5

  
6
import org.apache.commons.io.IOUtils;
7

  
8
import eu.dnetlib.pace.config.Type;
9
import eu.dnetlib.pace.model.Field;
10
import eu.dnetlib.pace.model.FieldValueImpl;
11

  
12
public abstract class AbstractPaceTest {
13

  
14
	protected String readFromClasspath(final String filename) {
15
		final StringWriter sw = new StringWriter();
16
		try {
17
			IOUtils.copy(getClass().getResourceAsStream(filename), sw);
18
			return sw.toString();
19
		} catch (final IOException e) {
20
			throw new RuntimeException("cannot load resource from classpath: " + filename);
21
		}
22
	}
23

  
24
	protected Field title(final String s) {
25
		return new FieldValueImpl(Type.String, "title", s);
26
	}
27

  
28
	protected Field person(final String s) {
29
		return new FieldValueImpl(Type.JSON, "person", s);
30
	}
31

  
32
}
0 33

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json
1
{"metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}},"coauthor": [{"id": "30|od________88::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od________88::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od________88::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}},{"id": "30|od_______908::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": 
"30|od________88::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": "30|od_______908::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od________88::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od________88::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od_______908::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od_______908::46acd9f206c2559f13b9119f8c5aef4c","anchorId": "30|dedup_wf_001::06a55cf2c97156d48ec49bcaf4bddcaf","metadata": {"firstname": {"value": "Stephen P."},"secondnames": [{"value": 
"Goff"}],"fullname": {"value": "Goff, Stephen P."}}},{"id": "30|od________88::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od_______908::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od_______908::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}}],"mergedperson": [{"id": "30|od_______908::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}},{"id": "30|od________88::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}}],"anchor": true}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json
1
{ 
2
	"wf" : { 
3
        "threshold" : "0.99", 
4
        "run" : "001", 
5
        "entityType" : "result", 
6
        "orderField" : "title", 
7
        "queueMaxSize" : "2000",
8
        "groupMaxSize" : "10",
9
        "slidingWindowSize" : "200",
10
        "rootBuilder" : [ "result" ],
11
        "includeChildren" : "true" 
12
    },
13
	"pace" : {		
14
		"clustering" : [
15
			{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
16
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
17
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } 
18
		],		
19
		"strictConditions" : [
20
  			{ "name" : "exactMatch", "fields" : [ "pid" ] }
21
  		], 
22
  		"conditions" : [ 
23
  			{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
24
  			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
25
  			{ "name" : "sizeMatch", "fields" : [ "authors" ] } 
26
  		],		
27
		"model" : [
28
			{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, 	
29
			{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
30
			{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
31
			{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" },
32
		  {
33
			"name": "anchors",
34
			"algo": "PersonCoAnchorsDistance",
35
			"type": "JSON",
36
			"weight": "0.0",
37
			"ignoreMissing": "true",
38
			"path": "person",
39
			"params": {
40
			  "common.anchors": "1"
41
			}
42
		  },
43
		  {
44
			"name": "coauthor",
45
			"algo": "PersonCoAuthorSurnamesDistance",
46
			"type": "JSON",
47
			"weight": "0.0",
48
			"ignoreMissing": "true",
49
			"path": "person",
50
			"params": {
51
			  "common.surnames": "2"
52
			}
53
		  },
54
		  {
55
			"name": "person",
56
			"algo": "PersonDistance",
57
			"type": "JSON",
58
			"weight": "0.0",
59
			"ignoreMissing": "true",
60
			"path": "person",
61
			"params": {
62
			  "common.surnames": "2"
63
			}
64
		  }
65
		],
66
		"blacklists" : {
67
			"title" : [
68
				"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
69
				"^(Kiri Karl Morgensternile).*$",
70
				"^(\\[Eksliibris Aleksandr).*\\]$",
71
				"^(\\[Eksliibris Aleksandr).*$",
72
				"^(Eksliibris Aleksandr).*$",
73
				"^(Kiri A\\. de Vignolles).*$",
74
				"^(2 kirja Karl Morgensternile).*$",
75
				"^(Pirita kloostri idaosa arheoloogilised).*$",
76
				"^(Kiri tundmatule).*$",
77
				"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
78
				"^(Eksliibris Nikolai Birukovile).*$",
79
				"^(Eksliibris Nikolai Issakovile).*$",
80
				"^(WHP Cruise Summary Information of section).*$",
81
				"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
82
				"^(Measurement of the spin\\-dependent structure function).*"
83
			] } 		
84
	}
85

  
86
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt
1
^(Corpus Oral Dialectal \(COD\)\.).*$
2
^(Kiri Karl Morgensternile).*$
3
^(\[Eksliibris Aleksandr).*\]$
4
^(Kiri A\. de Vignolles).*$
5
^(2 kirja Karl Morgensternile).*$
6
^(Pirita kloostri idaosa arheoloogilised).*$
7
^(Kiri tundmatule).*$
8
^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$
9
^(Eksliibris Nikolai Birukovile).*$
10
^(Eksliibris Nikolai Issakovile).*$
11
^(\[Eksliibris Aleksandr).*$
12
^(WHP Cruise Summary Information of section).*$
13
^(Measurement of the top quark\-pair production cross section with ATLAS in pp collisions at).*$
14
^(Measurement of the spin\-dependent structure function).*
15
^(lorem ipsum).*
0 16

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/test/resources/eu/dnetlib/pace/config/person.pace.conf.json
1
{
2
  "wf": {
3
	"threshold": "0.98",
4
	"dedupRun": "001",
5
	"entityType": "person",
6
	"orderField": "fullname",
7
	"queueMaxSize": "2000",
8
	"groupMaxSize": "10",
9
	"slidingWindowSize": "200",
10
	"rootBuilder": [
11
	  "person"
12
	],
13
	"includeChildren": "true"
14
  },
15
  "pace": {
16
	"clustering": [
17
	  {
18
		"name": "personclustering",
19
		"fields": [
20
		  "person"
21
		],
22
		"params": {}
23
	  }
24
	],
25
	"model": [
26
	  {
27
		"name": "person",
28
		"algo": "PersonDistance",
29
		"type": "JSON",
30
		"weight": "1.0",
31
		"ignoreMissing": "false",
32
		"path": "person",
33
		"params": {
34
		  "common.surnames": "2"
35
		}
36
	  },
37
	  {
38
		"name": "fullname",
39
		"algo": "Null",
40
		"type": "String",
41
		"weight": "0",
42
		"ignoreMissing": "true",
43
		"path": "person/metadata/fullname/value"
44
	  }
45
	],
46
	"blacklists": {
47
	  "lastname": [
48
		"(?i)^wang$",
49
		"(?i)^~wang$",
50
		"(?i)^zhang$",
51
		"(?i)^zhou$",
52
		"(?i)^zhao$",
53
		"(?i)^li$",
54
		"(?i)^~li$",
55
		"(?i)^liu$",
56
		"(?i)^chen$",
57
		"(?i)^yang$",
58
		"(?i)^kim$",
59
		"(?i)^xu$",
60
		"(?i)^huang$",
61
		"(?i)^sun$",
62
		"(?i)^lee$",
63
		"(?i)^ma$",
64
		"(?i)^kim$",
65
		"(?i)^hu$",
66
		"(?i)^wu$",
67
		"(?i)^zhu$",
68
		"(?i)^lu$"
69
	  ]
70
	}
71
  }
72
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Lists;
8

  
9
public class NgramPairs extends Ngrams {
10

  
11
	public NgramPairs(Map<String, Integer> params) {
12
		super(params);
13
	}
14
	
15
	@Override
16
	protected Collection<String> doApply(String s) {
17
		return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
18
	}
19

  
20
	protected Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) {
21
		Collection<String> res = Lists.newArrayList();
22
		int j = 0;
23
		for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) {
24
			if (++j >= ngrams.size()) {
25
				break;
26
			}
27
			res.add(ngrams.get(i) + ngrams.get(j));
28
			//System.out.println("-- " + concatNgrams);
29
		}
30
		return res;
31
	}
32

  
33
}
0 34

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import org.apache.commons.lang.RandomStringUtils;
8
import org.apache.commons.lang.StringUtils;
9

  
10
import com.google.common.collect.Lists;
11

  
12
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
13

  
14
	public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
15
		super(params);
16
	}
17

  
18
	@Override
19
	protected Collection<String> doApply(final String s) {
20
		final List<String> res = Lists.newArrayList();
21

  
22
		res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
23

  
24
		return res;
25
	}
26

  
27
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Set;
7

  
8
import eu.dnetlib.pace.model.FieldList;
9
import eu.dnetlib.pace.model.FieldValue;
10
import org.apache.commons.lang.StringUtils;
11

  
12
import com.google.common.base.Splitter;
13
import com.google.common.collect.Iterables;
14
import com.google.common.collect.Sets;
15

  
16
import eu.dnetlib.pace.common.AbstractPaceFunctions;
17
import eu.dnetlib.pace.model.Field;
18
import eu.dnetlib.pace.model.gt.Author;
19
import eu.dnetlib.pace.model.gt.GTAuthor;
20

  
21
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
22

  
23
	private Map<String, Integer> params;
24

  
25
	private static final int MAX_TOKENS = 5;
26

  
27
	public PersonClustering(final Map<String, Integer> params) {
28
		this.params = params;
29
	}
30

  
31
	@Override
32
	public Collection<String> apply(final List<Field> fields) {
33
		final Set<String> hashes = Sets.newHashSet();
34

  
35
		for (final Field f : fields) {
36

  
37
			final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue());
38

  
39
			final Author a = gta.getAuthor();
40
			if (a.isWellFormed()) {
41
				hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase());
42
			} else {
43
				for (final String token1 : tokens(a.getFullname())) {
44
					for (final String token2 : tokens(a.getFullname())) {
45
						if (!token1.equals(token2)) {
46
							hashes.add(firstLC(token1) + token2);
47
						}
48
					}
49
				}
50
			}
51
		}
52

  
53
		return hashes;
54
	}
55

  
56
	private String firstLC(final String s) {
57
		return StringUtils.substring(s, 0, 1).toLowerCase();
58
	}
59

  
60
	private Iterable<String> tokens(final String s) {
61
		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), MAX_TOKENS);
62
	}
63

  
64
	@Override
65
	public Map<String, Integer> getParams() {
66
		return params;
67
	}
68

  
69
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Map.Entry;
7
import java.util.Set;
8

  
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Lists;
11
import com.google.common.collect.Maps;
12

  
13
import eu.dnetlib.pace.config.Config;
14
import eu.dnetlib.pace.model.Document;
15
import eu.dnetlib.pace.model.Field;
16
import eu.dnetlib.pace.model.FieldListImpl;
17
import eu.dnetlib.pace.model.MapDocument;
18

  
19
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
20

  
21
	public static Collection<String> filterAndCombine(final MapDocument a, final Config conf, final Map<String, List<String>> blacklists) {
22

  
23
		final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists);
24
		return combine(filtered, conf);
25
	}
26

  
27
	private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
28
		final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
29
		if (blacklists != null) {
30
			for (final Entry<String, Field> e : filtered.entrySet()) {
31

  
32
				final FieldListImpl fl = new FieldListImpl();
33
				fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
34
				filtered.put(e.getKey(), fl);
35
			}
36
		}
37
		return new MapDocument(a.getIdentifier(), filtered);
38
	}
39

  
40
	/**
41
	 * Tries to match the fields in the regex blacklist.
42
	 *
43
	 * @param fieldName
44
	 * @param value
45
	 * @return true if the field matches, false otherwise
46
	 */
47
	protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
48
		if (blacklists.containsKey(fieldName)) {
49
			for (final String regex : blacklists.get(fieldName)) {
50
				if (value.matches(regex)) return true;
51
			}
52
		}
53
		return false;
54
	}
55
}
0 56

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Sets;
8

  
9
import eu.dnetlib.pace.common.AbstractPaceFunctions;
10
import eu.dnetlib.pace.model.Field;
11

  
12
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
13

  
14
	protected Map<String, Integer> params;
15
	
16
	public AbstractClusteringFunction(final Map<String, Integer> params) {
17
		this.params = params;
18
	}
19
	
20
	protected abstract Collection<String> doApply(String s);
21
	
22
	@Override
23
	public Collection<String> apply(List<Field> fields) {
24
		Collection<String> c = Sets.newLinkedHashSet();
25
		for(Field f : fields) {
26
			c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist));
27
		}
28
		return c;
29
	}
30

  
31
	public Map<String, Integer> getParams() {
32
		return params;
33
	}
34
	
35
	protected Integer param(String name) {
36
		return params.get(name);
37
	}
38
}
0 39

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Lists;
8

  
9
import eu.dnetlib.pace.model.Person;
10

  
11
public class PersonHash extends AbstractClusteringFunction {
12

  
13
	private boolean DEFAULT_AGGRESSIVE = false;
14

  
15
	public PersonHash(final Map<String, Integer> params) {
16
		super(params);
17
	}
18

  
19
	@Override
20
	protected Collection<String> doApply(final String s) {
21
		final List<String> res = Lists.newArrayList();
22

  
23
		final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
24

  
25
		res.add(new Person(s, aggressive).hash());
26

  
27
		return res;
28
	}
29

  
30
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
1
package eu.dnetlib.pace.clustering;
2

  
3
/**
 * Names of the known clustering strategies.
 *
 * NOTE(review): each constant presumably selects one of the ClusteringFunction implementations of this package;
 * confirm against the code that resolves them. The declaration order is kept unchanged.
 */
public enum Clustering {
	acronyms,
	ngrams,
	ngrampairs,
	sortedngrampairs,
	suffixprefix,
	spacetrimmingfieldvalue,
	immutablefieldvalue,
	personhash,
	personclustering,
	lowercase
}
0 6

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Lists;
8

  
9
public class ImmutableFieldValue extends AbstractClusteringFunction {
10

  
11
	public ImmutableFieldValue(final Map<String, Integer> params) {
12
		super(params);
13
	}
14

  
15
	@Override
16
	protected Collection<String> doApply(final String s) {
17
		final List<String> res = Lists.newArrayList();
18

  
19
		res.add(s);
20

  
21
		return res;
22
	}
23

  
24
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5
import java.util.Set;
6

  
7
import com.google.common.collect.Sets;
8

  
9
public class SuffixPrefix extends AbstractClusteringFunction {
10

  
11
	public SuffixPrefix(Map<String, Integer> params) {
12
		super(params);
13
	}
14

  
15
	@Override
16
	protected Collection<String> doApply(String s) {
17
		return suffixPrefix(s, param("len"), param("max"));
18
	}
19
	
20
	private Collection<String> suffixPrefix(String s, int len, int max) {
21
		final Set<String> bigrams = Sets.newLinkedHashSet();
22
		int i = 0;
23
		while (++i < s.length() && bigrams.size() < max) {
24
			int j = s.indexOf(" ", i);
25

  
26
			int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
27

  
28
			if (j - len > 0) {
29
				String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
30
				if (bigram.length() >= 4) {
31
					bigrams.add(bigram);
32
				}
33
			}
34
		}
35
		return bigrams;
36
	}
37

  
38
}
0 39

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Collections;
5
import java.util.List;
6
import java.util.Map;
7

  
8
import com.google.common.base.Joiner;
9
import com.google.common.base.Splitter;
10
import com.google.common.collect.Lists;
11

  
12
public class SortedNgramPairs extends NgramPairs {
13

  
14
	public SortedNgramPairs(Map<String, Integer> params) {
15
		super(params);
16
	}
17

  
18
	@Override
19
	protected Collection<String> doApply(String s) {
20

  
21
		final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
22

  
23
		Collections.sort(tokens);
24

  
25
		return ngramPairs(Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
26
	}
27

  
28
}
0 29

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Lists;
8
import com.google.common.collect.Sets;
9
import eu.dnetlib.pace.model.Field;
10
import org.apache.commons.lang.StringUtils;
11

  
12
public class LowercaseClustering extends AbstractClusteringFunction {
13

  
14
	public LowercaseClustering(final Map<String, Integer> params) {
15
		super(params);
16
	}
17

  
18
	@Override
19
	public Collection<String> apply(List<Field> fields) {
20
		Collection<String> c = Sets.newLinkedHashSet();
21
		for(Field f : fields) {
22
			c.addAll(doApply(f.stringValue()));
23
		}
24
		return c;
25
	}
26

  
27
	@Override
28
	protected Collection<String> doApply(final String s) {
29
		if(StringUtils.isBlank(s)) {
30
			return Lists.newArrayList();
31
		}
32
		return Lists.newArrayList(s.toLowerCase().trim());
33
	}
34
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Set;
4

  
5
import org.apache.commons.lang.StringUtils;
6

  
7
import eu.dnetlib.pace.common.AbstractPaceFunctions;
8

  
9
public class NGramUtils extends AbstractPaceFunctions {
10

  
11
	private static final int SIZE = 100;
12

  
13
	private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
14

  
15
	public static String cleanupForOrdering(String s) {
16
		NGramUtils utils = new NGramUtils();
17
		return (utils.filterStopWords(utils.normalize(s), stopwords) +  StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
18
	}
19

  
20
}
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import eu.dnetlib.pace.model.Field;
8

  
9
public interface ClusteringFunction {
10
	
11
	public Collection<String> apply(List<Field> fields); 
12
	
13
	public Map<String, Integer> getParams();
14

  
15
}
0 16

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5

  
6
public class RandomClusteringFunction extends AbstractClusteringFunction {
7

  
8
	public RandomClusteringFunction(Map<String, Integer> params) {
9
		super(params);
10
	}
11

  
12
	@Override
13
	protected Collection<String> doApply(String s) {
14
		// TODO Auto-generated method stub
15
		return null;
16
	}
17

  
18
}
0 19

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.LinkedHashSet;
5
import java.util.Map;
6
import java.util.StringTokenizer;
7

  
8
public class Ngrams extends AbstractClusteringFunction {
9

  
10
	public Ngrams(Map<String, Integer> params) {
11
		super(params);
12
	}
13
	
14
	@Override
15
	protected Collection<String> doApply(String s) {
16
		return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
17
	}
18

  
19
	protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
20

  
21
		final Collection<String> ngrams = new LinkedHashSet<String>();
22
		final StringTokenizer st = new StringTokenizer(s);
23

  
24
		while (st.hasMoreTokens()) {
25
			final String token = st.nextToken();
26
			if (!token.isEmpty()) {
27

  
28
				for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
29
					String ngram = (token + "    ").substring(i, ngramLen + i).trim();
30
					if (ngrams.size() >= max) {
31
						return ngrams;
32
					}
33
					if (ngram.length() >= minNgramLen) {
34
						ngrams.add(ngram);
35
					}
36
				}
37
			}
38
		}
39
		//System.out.println(ngrams + " n: " + ngrams.size());
40
		return ngrams;
41
	}
42

  
43
}
0 44

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import com.google.common.base.Predicate;
7

  
8
import eu.dnetlib.pace.model.Field;
9

  
10
public class FieldFilter implements Predicate<Field> {
11

  
12
	private Map<String, List<String>> blacklists;
13

  
14
	private String filedName;
15

  
16
	public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
17
		this.filedName = fieldName;
18
		this.blacklists = blacklists;
19
	}
20

  
21
	@Override
22
	public boolean apply(final Field f) {
23
		return !regexMatches(filedName, f.stringValue(), blacklists);
24
	}
25

  
26
	/**
27
	 * Tries to match the fields in the regex blacklist.
28
	 *
29
	 * @param fieldName
30
	 * @param value
31
	 * @return true if the field matches, false otherwise
32
	 */
33
	protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
34
		if (blacklists.containsKey(fieldName)) {
35
			final Iterable<String> regexes = blacklists.get(fieldName);
36
			for (final String regex : regexes) {
37
				if (value.matches(regex)) return true;
38
			}
39
		}
40
		return false;
41
	}
42
}
0 43

  
modules/dnet-pace-core/tags/dnet-pace-core-2.5.2/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5
import java.util.Set;
6
import java.util.StringTokenizer;
7

  
8
import com.google.common.collect.Sets;
9

  
10
public class Acronyms extends AbstractClusteringFunction {
11

  
12
	public Acronyms(Map<String, Integer> params) {
13
		super(params);
14
	}
15

  
16
	@Override
17
	protected Collection<String> doApply(String s) {
18
		return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
19
	}
20
	
21
	private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
22
		
23
		final Set<String> acronyms = Sets.newLinkedHashSet();
24
		
25
		for (int i = 0; i < maxAcronyms; i++) {
26
			
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff