Project

General

Profile

« Previous | Next » 

Revision 37202

[maven-release-plugin] copy for tag dnet-pace-core-2.1.0

View differences:

modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-pace-core/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-pace-core"}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Map;
4

  
5
import org.junit.Before;
6
import org.junit.Test;
7

  
8
import com.google.common.collect.Lists;
9
import com.google.common.collect.Maps;
10

  
11
import eu.dnetlib.pace.AbstractPaceTest;
12

  
13
public class ClusteringFunctionTest extends AbstractPaceTest {
14

  
15
	private Map<String, Integer> params;
16

  
17
	@Before
18
	public void setUp() throws Exception {
19
		params = Maps.newHashMap();
20
	}
21

  
22
	@Test
23
	public void testNgram() {
24
		params.put("ngramLen", 2);
25
		params.put("max", 8);
26
		params.put("maxPerToken", 1);
27
		params.put("minNgramLen", 3);
28

  
29
		final ClusteringFunction ngram = new Ngrams(params);
30

  
31
		final String s = "Search for the Standard Model Higgs Boson";
32
		System.out.println(s);
33
		System.out.println(ngram.apply(Lists.newArrayList(title(s))));
34
	}
35

  
36
	@Test
37
	public void testNgramPairs() {
38
		params.put("ngramLen", 2);
39
		params.put("max", 4);
40

  
41
		final ClusteringFunction np = new NgramPairs(params);
42

  
43
		final String s = "Search for the Standard Model Higgs Boson";
44
		System.out.println(s);
45
		System.out.println(np.apply(Lists.newArrayList(title(s))));
46
	}
47

  
48
	@Test
49
	public void testAcronym() {
50
		params.put("max", 4);
51
		params.put("minLen", 1);
52
		params.put("maxLen", 3);
53

  
54
		final ClusteringFunction acro = new Acronyms(params);
55

  
56
		final String s = "Search for the Standard Model Higgs Boson";
57
		System.out.println(s);
58
		System.out.println(acro.apply(Lists.newArrayList(title(s))));
59
	}
60

  
61
	@Test
62
	public void testSuffixPrefix() {
63
		params.put("len", 2);
64
		params.put("max", 3);
65

  
66
		final ClusteringFunction sp = new SuffixPrefix(params);
67

  
68
		final String s = "Search for the Standard Model Higgs Boson";
69
		System.out.println(s);
70
		System.out.println(sp.apply(Lists.newArrayList(title(s))));
71
	}
72

  
73
	@Test
74
	public void testFieldValue() {
75
		final ClusteringFunction sp = new SpaceTrimmingFieldValue(params);
76

  
77
		final String s = "Search for the Standard Model Higgs Boson";
78
		System.out.println(s);
79
		System.out.println(sp.apply(Lists.newArrayList(title(s))));
80
	}
81

  
82
}
0 83

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import static org.junit.Assert.assertFalse;
4
import static org.junit.Assert.assertTrue;
5

  
6
import org.junit.Test;
7

  
8
public class PersonComparatorUtilsSimilarityTest {
9

  
10
	@Test
11
	public void testSimilarity_0() {
12
		assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini"));
13
	}
14

  
15
	@Test
16
	public void testSimilarity_1() {
17
		assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele"));
18
	}
19

  
20
	@Test
21
	public void testSimilarity_2() {
22
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele"));
23
	}
24

  
25
	@Test
26
	public void testSimilarity_3() {
27
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele"));
28
	}
29

  
30
	@Test
31
	public void testSimilarity_4() {
32
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G."));
33
	}
34

  
35
	@Test
36
	public void testSimilarity_5() {
37
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M. (sig.)", "Artini, Michele"));
38
	}
39

  
40
	@Test
41
	public void testSimilarity_6() {
42
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G."));
43
	}
44

  
45
	@Test
46
	public void testSimilarity_7() {
47
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A."));
48
	}
49

  
50
	@Test
51
	public void testSimilarity_8() {
52
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe"));
53
	}
54

  
55
	@Test
56
	public void testSimilarity_9() {
57
		assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele"));
58
	}
59

  
60
	@Test
61
	public void testSimilarity_10() {
62
		assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni"));
63
	}
64

  
65
	@Test
66
	public void testSimilarity_11() {
67
		assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G."));
68
	}
69

  
70
	@Test
71
	public void testSimilarity_12() {
72
		assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele"));
73
	}
74

  
75
	@Test
76
	public void testSimilarity_13() {
77
		assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele"));
78
	}
79

  
80
	@Test
81
	public void testSimilarity_14() {
82
		assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini"));
83
	}
84

  
85
	@Test
86
	public void testSimilarity_15() {
87
		assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI"));
88
	}
89
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/model/PersonTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import static org.junit.Assert.assertEquals;
4

  
5
import java.text.Normalizer;
6
import java.util.Queue;
7

  
8
import org.junit.Test;
9

  
10
import com.google.common.collect.Lists;
11

  
12
public class PersonTest {
13

  
14
	@Test
15
	public void test_1() {
16
		check("Atzori, Claudio", "Atzori, Claudio");
17
	}
18

  
19
	@Test
20
	public void test_2() {
21
		check("Atzori, Claudio A.", "Atzori, Claudio A.");
22
	}
23

  
24
	@Test
25
	public void test_3() {
26
		check("Claudio ATZORI", "Atzori, Claudio");
27
	}
28

  
29
	@Test
30
	public void test_4() {
31
		check("ATZORI, Claudio", "Atzori, Claudio");
32
	}
33

  
34
	@Test
35
	public void test_5() {
36
		check("Claudio Atzori", "Claudio Atzori");
37
	}
38

  
39
	@Test
40
	public void test_6() {
41
		check(" Manghi ,  Paolo", "Manghi, Paolo");
42
	}
43

  
44
	@Test
45
	public void test_7() {
46
		check("ATZORI, CLAUDIO", "Atzori, Claudio");
47
	}
48

  
49
	@Test
50
	public void test_8() {
51
		check("ATZORI, CLAUDIO A", "Atzori, Claudio A.");
52
	}
53

  
54
	@Test
55
	public void test_9() {
56
		check("Bølviken, B.", "Bølviken, B.");
57
	}
58

  
59
	@Test
60
	public void test_10() {
61
		check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B.");
62
	}
63

  
64
	@Test
65
	public void test_11() {
66
		check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true);
67
	}
68

  
69
	@Test
70
	public void test_12() {
71
		check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false);
72
	}
73

  
74
	@Test
75
	public void test_13() {
76
		check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false);
77
	}
78

  
79
	@Test
80
	public void test_hashes() {
81
		checkHash(" Claudio  ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio");
82
	}
83

  
84
	private void checkHash(String... ss) {
85
		Queue<String> q = Lists.newLinkedList(Lists.newArrayList(ss));
86
		String h1 = new Person(q.remove(), false).hash();
87
		while (!q.isEmpty()) {
88
			assertEquals(h1, new Person(q.remove(), false).hash());
89
		}
90
	}
91

  
92
	private void check(String s, String expectedFullName) {
93
		check(s, expectedFullName, false);
94
	}
95

  
96
	private void check(String s, String expectedFullName, boolean aggressive) {
97
		Person p = new Person(s, aggressive);
98

  
99
		System.out.println("original:   " + p.getOriginal());
100
		System.out.println("accurate:   " + p.isAccurate());
101
		System.out.println("normalised: '" + p.getNormalisedFullname() + "'");
102
		if (p.isAccurate()) {
103
			System.out.println("name:     " + p.getNormalisedFirstName());
104
			System.out.println("surname:  " + p.getNormalisedSurname());
105
		}
106
		System.out.println("hash: " + p.hash());
107
		System.out.println("");
108
		assertEquals(expectedFullName, p.getNormalisedFullname());
109
	}
110

  
111
}
0 112

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java
1
package eu.dnetlib.pace.model;
2

  
3
import static org.junit.Assert.assertEquals;
4
import static org.junit.Assert.assertTrue;
5

  
6
import java.util.Set;
7

  
8
import org.junit.Test;
9

  
10
public class PersonComparatorUtilsNGramsTest {
11

  
12
	@Test
13
	public void testNormaizePerson_1() {
14
		verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini");
15
	}
16

  
17
	@Test
18
	public void testNormaizePerson_2() {
19
		verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini");
20
	}
21

  
22
	@Test
23
	public void testNormaizePerson_3() {
24
		verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini");
25
	}
26

  
27
	@Test
28
	public void testNormaizePerson_4() {
29
		verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini");
30
	}
31

  
32
	@Test
33
	public void testNormaizePerson_5() {
34
		verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini");
35
	}
36

  
37
	@Test
38
	public void testNormaizePerson_6() {
39
		verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini");
40
	}
41

  
42
	@Test
43
	public void testNormaizePerson_7() {
44
		verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini");
45
	}
46

  
47
	@Test
48
	public void testNormaizePerson_8() {
49
		verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini");
50
	}
51

  
52
	@Test
53
	public void testNormaizePerson_9() {
54
		verifyGetNgramsForPerson("Artini, M", 1, "m_artini");
55
	}
56

  
57
	@Test
58
	public void testNormaizePerson_10() {
59
		verifyGetNgramsForPerson("Artini, M.", 1, "m_artini");
60
	}
61

  
62
	@Test
63
	public void testNormaizePerson_11() {
64
		verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini");
65
	}
66

  
67
	@Test
68
	public void testNormaizePerson_12() {
69
		verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini");
70
	}
71

  
72
	@Test
73
	public void testNormaizePerson_13() {
74
		verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig");
75
	}
76

  
77
	@Test
78
	public void testNormaizePerson_14() {
79
		verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig");
80
	}
81

  
82
	@Test
83
	public void testNormaizePerson_15() {
84
		verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini");
85
	}
86

  
87
	@Test
88
	public void testNormaizePerson_16() {
89
		verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini");
90
	}
91

  
92
	@Test
93
	public void testNormaizePerson_17() {
94
		verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0);
95
	}
96

  
97
	@Test
98
	public void testNormaizePerson_18() {
99
		verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico");
100
	}
101

  
102
	@Test
103
	public void testNormaizePerson_19() {
104
		verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith");
105
	}
106

  
107
	@Test
108
	public void testNormaizePerson_20() {
109
		verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa");
110
	}
111

  
112
	@Test
113
	public void testNormaizePerson_21() {
114
		verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6);
115
	}
116

  
117
	private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) {
118
		Set<String> list = PersonComparatorUtils.getNgramsForPerson(name);
119
		System.out.println(list);
120
		assertEquals(expectedSize, list.size());
121
		for (String s : expectedTokens) {
122
			assertTrue(list.contains(s));
123
		}
124
	}
125

  
126
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
1
package eu.dnetlib.pace.config;
2

  
3
import static org.junit.Assert.assertNotNull;
4

  
5
import java.io.IOException;
6

  
7
import org.junit.Test;
8

  
9
import eu.dnetlib.pace.AbstractPaceTest;
10

  
11
public class ConfigTest extends AbstractPaceTest {
12

  
13
	@Test
14
	public void test() throws IOException {
15
		final DedupConfig cfg = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
16

  
17
		assertNotNull(cfg);
18

  
19
		System.out.println(cfg);
20
	}
21

  
22
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java
1
package eu.dnetlib.pace.distance;
2

  
3
import org.junit.Before;
4
import org.junit.Test;
5

  
6
import eu.dnetlib.pace.common.AbstractPaceFunctions;
7

  
8
public class DistanceAlgoTest extends AbstractPaceFunctions {
9

  
10
	private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
11

  
12
	@Before
13
	public void setup() {
14
		System.out.println("****************************************************************");
15
		System.out.println("Test String    : " + TEST_STRING);
16
	}
17

  
18
	@Test
19
	public void testGetNumbers() {
20
		System.out.println("Numbers        : " + getNumbers(TEST_STRING));
21
	}
22

  
23
	@Test
24
	public void testRemoveSymbols() {
25
		System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
26
	}
27

  
28
	@Test
29
	public void testFixAliases() {
30
		System.out.println("Fixed aliases  : " + fixAliases(TEST_STRING));
31
	}
32

  
33
	@Test
34
	public void testCleanup() {
35
		System.out.println("cleaned up     : " + cleanup(TEST_STRING));
36
	}
37

  
38
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
1
package eu.dnetlib.pace;
2

  
3
import java.io.IOException;
4
import java.io.StringWriter;
5

  
6
import org.apache.commons.io.IOUtils;
7

  
8
import eu.dnetlib.pace.config.Type;
9
import eu.dnetlib.pace.model.Field;
10
import eu.dnetlib.pace.model.FieldValueImpl;
11

  
12
public abstract class AbstractPaceTest {
13

  
14
	protected String readFromClasspath(final String filename) {
15
		final StringWriter sw = new StringWriter();
16
		try {
17
			IOUtils.copy(getClass().getResourceAsStream(filename), sw);
18
			return sw.toString();
19
		} catch (final IOException e) {
20
			throw new RuntimeException("cannot load resource from classpath: " + filename);
21
		}
22
	}
23

  
24
	protected Field title(final String s) {
25
		return new FieldValueImpl(Type.String, "title", s);
26
	}
27

  
28
}
0 29

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json
1
{ 
2
	"wf" : { 
3
        "threshold" : "0.99", 
4
        "run" : "001", 
5
        "entityType" : "result", 
6
        "orderField" : "title", 
7
        "queueMaxSize" : "2000",
8
        "groupMaxSize" : "10",
9
        "slidingWindowSize" : "200",
10
        "rootBuilder" : [ "result" ],
11
        "includeChildren" : "true" 
12
    },
13
	"pace" : {		
14
		"clustering" : [
15
			{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
16
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
17
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } 
18
		],		
19
		"strictConditions" : [
20
  			{ "name" : "exactMatch", "fields" : [ "pid" ] }
21
  		], 
22
  		"conditions" : [ 
23
  			{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
24
  			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
25
  			{ "name" : "sizeMatch", "fields" : [ "authors" ] } 
26
  		],		
27
		"model" : [
28
			{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid/value", "overrideMatch" : "true" }, 	
29
			{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title/value" },
30
			{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
31
			{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } 		
32
		],
33
		"blacklists" : {
34
			"title" : [
35
				"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
36
				"^(Kiri Karl Morgensternile).*$",
37
				"^(\\[Eksliibris Aleksandr).*\\]$",
38
				"^(\\[Eksliibris Aleksandr).*$",
39
				"^(Eksliibris Aleksandr).*$",
40
				"^(Kiri A\\. de Vignolles).*$",
41
				"^(2 kirja Karl Morgensternile).*$",
42
				"^(Pirita kloostri idaosa arheoloogilised).*$",
43
				"^(Kiri tundmatule).*$",
44
				"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
45
				"^(Eksliibris Nikolai Birukovile).*$",
46
				"^(Eksliibris Nikolai Issakovile).*$",
47
				"^(WHP Cruise Summary Information of section).*$",
48
				"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
49
				"^(Measurement of the spin\\-dependent structure function).*"
50
			] } 		
51
	}
52

  
53
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt
1
^(Corpus Oral Dialectal \(COD\)\.).*$
2
^(Kiri Karl Morgensternile).*$
3
^(\[Eksliibris Aleksandr).*\]$
4
^(Kiri A\. de Vignolles).*$
5
^(2 kirja Karl Morgensternile).*$
6
^(Pirita kloostri idaosa arheoloogilised).*$
7
^(Kiri tundmatule).*$
8
^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$
9
^(Eksliibris Nikolai Birukovile).*$
10
^(Eksliibris Nikolai Issakovile).*$
11
^(\[Eksliibris Aleksandr).*$
12
^(WHP Cruise Summary Information of section).*$
13
^(Measurement of the top quark\-pair production cross section with ATLAS in pp collisions at).*$
14
^(Measurement of the spin\-dependent structure function).*
15
^(lorem ipsum).*
0 16

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
1
package eu.dnetlib.pace.clustering;
2

  
3
/**
 * Names of the available clustering functions.
 */
public enum Clustering {
	acronyms,
	ngrams,
	ngrampairs,
	suffixprefix,
	spacetrimmingfieldvalue
}
0 6

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5
import java.util.Set;
6

  
7
import com.google.common.collect.Sets;
8

  
9
public class SuffixPrefix extends AbstractClusteringFunction {
10

  
11
	public SuffixPrefix(Map<String, Integer> params) {
12
		super(params);
13
	}
14

  
15
	@Override
16
	protected Collection<String> doApply(String s) {
17
		return suffixPrefix(s, param("len"), param("max"));
18
	}
19
	
20
	private Collection<String> suffixPrefix(String s, int len, int max) {
21
		final Set<String> bigrams = Sets.newLinkedHashSet();
22
		int i = 0;
23
		while (++i < s.length() && bigrams.size() < max) {
24
			int j = s.indexOf(" ", i);
25

  
26
			int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
27

  
28
			if (j - len > 0) {
29
				String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
30
				if (bigram.length() >= 4) {
31
					bigrams.add(bigram);
32
				}
33
			}
34
		}
35
		return bigrams;
36
	}
37

  
38
}
0 39

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Lists;
8

  
9
public class NgramPairs extends Ngrams {
10

  
11
	public NgramPairs(Map<String, Integer> params) {
12
		super(params);
13
	}
14
	
15
	@Override
16
	protected Collection<String> doApply(String s) {
17
		return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
18
	}	
19
	
20
	private Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) {
21
		Collection<String> res = Lists.newArrayList();
22
		int j = 0;
23
		for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) {
24
			if (++j >= ngrams.size()) {
25
				break;
26
			}
27
			res.add(ngrams.get(i) + ngrams.get(j));
28
			//System.out.println("-- " + concatNgrams);
29
		}
30
		return res;
31
	}
32

  
33
}
0 34

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import org.apache.commons.lang.RandomStringUtils;
8
import org.apache.commons.lang.StringUtils;
9

  
10
import com.google.common.collect.Lists;
11

  
12
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
13

  
14
	public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
15
		super(params);
16
	}
17

  
18
	@Override
19
	protected Collection<String> doApply(final String s) {
20
		final List<String> res = Lists.newArrayList();
21

  
22
		res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
23

  
24
		return res;
25
	}
26

  
27
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import eu.dnetlib.pace.model.Field;
8

  
9
public interface ClusteringFunction {
10
	
11
	public Collection<String> apply(List<Field> fields); 
12
	
13
	public Map<String, Integer> getParams();
14

  
15
}
0 16

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Set;
4

  
5
import org.apache.commons.lang.StringUtils;
6

  
7
import eu.dnetlib.pace.common.AbstractPaceFunctions;
8

  
9
public class NGramUtils extends AbstractPaceFunctions {
10

  
11
	private static final int SIZE = 100;
12

  
13
	private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
14

  
15
	public static String cleanupForOrdering(String s) {
16
		NGramUtils utils = new NGramUtils();
17
		return (utils.filterStopWords(utils.normalize(s), stopwords) +  StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
18
	}
19

  
20
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5

  
6
public class RandomClusteringFunction extends AbstractClusteringFunction {
7

  
8
	public RandomClusteringFunction(Map<String, Integer> params) {
9
		super(params);
10
	}
11

  
12
	@Override
13
	protected Collection<String> doApply(String s) {
14
		// TODO Auto-generated method stub
15
		return null;
16
	}
17

  
18
}
0 19

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Map.Entry;
7
import java.util.Set;
8

  
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Lists;
11
import com.google.common.collect.Maps;
12

  
13
import eu.dnetlib.pace.config.Config;
14
import eu.dnetlib.pace.model.Document;
15
import eu.dnetlib.pace.model.FieldListImpl;
16
import eu.dnetlib.pace.model.MapDocument;
17

  
18
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
19

  
20
	public static Collection<String> filterAndCombine(final MapDocument a, final Config conf, final Map<String, List<String>> blacklists) {
21

  
22
		final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists);
23
		return combine(filtered, conf);
24
	}
25

  
26
	private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
27
		final Map<String, FieldListImpl> filtered = Maps.newHashMap(a.getFieldMap());
28
		if (blacklists != null) {
29
			for (final Entry<String, FieldListImpl> e : filtered.entrySet()) {
30

  
31
				final FieldListImpl fl = new FieldListImpl();
32
				fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
33
				filtered.put(e.getKey(), fl);
34
			}
35
		}
36
		return new MapDocument(a.getIdentifier(), filtered);
37
	}
38

  
39
	/**
40
	 * Tries to match the fields in the regex blacklist.
41
	 *
42
	 * @param fieldName
43
	 * @param value
44
	 * @return true if the field matches, false otherwise
45
	 */
46
	protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
47
		if (blacklists.containsKey(fieldName)) {
48
			for (final String regex : blacklists.get(fieldName)) {
49
				if (value.matches(regex)) return true;
50
			}
51
		}
52
		return false;
53
	}
54
}
0 55

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.LinkedHashSet;
5
import java.util.Map;
6
import java.util.StringTokenizer;
7

  
8
public class Ngrams extends AbstractClusteringFunction {
9

  
10
	public Ngrams(Map<String, Integer> params) {
11
		super(params);
12
	}
13
	
14
	@Override
15
	protected Collection<String> doApply(String s) {
16
		return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
17
	}
18

  
19
	protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
20

  
21
		final Collection<String> ngrams = new LinkedHashSet<String>();
22
		final StringTokenizer st = new StringTokenizer(s);
23

  
24
		while (st.hasMoreTokens()) {
25
			final String token = st.nextToken();
26
			if (!token.isEmpty()) {
27

  
28
				for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
29
					String ngram = (token + "    ").substring(i, ngramLen + i).trim();
30
					if (ngrams.size() >= max) {
31
						return ngrams;
32
					}
33
					if (ngram.length() >= minNgramLen) {
34
						ngrams.add(ngram);
35
					}
36
				}
37
			}
38
		}
39
		//System.out.println(ngrams + " n: " + ngrams.size());
40
		return ngrams;
41
	}
42

  
43
}
0 44

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Sets;
8

  
9
import eu.dnetlib.pace.common.AbstractPaceFunctions;
10
import eu.dnetlib.pace.model.Field;
11

  
12
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
13

  
14
	protected Map<String, Integer> params;
15
	
16
	public AbstractClusteringFunction(final Map<String, Integer> params) {
17
		this.params = params;
18
	}
19
	
20
	protected abstract Collection<String> doApply(String s);
21
	
22
	@Override
23
	public Collection<String> apply(List<Field> fields) {
24
		Collection<String> c = Sets.newLinkedHashSet();
25
		for(Field f : fields) {
26
			c.addAll(filterBlacklisted(doApply(filterStopWords(normalize(f.stringValue()), stopwords)), ngramBlacklist));
27
		}
28
		return c;
29
	}
30

  
31
	public Map<String, Integer> getParams() {
32
		return params;
33
	}
34
	
35
	protected Integer param(String name) {
36
		return params.get(name);
37
	}
38
}
0 39

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import com.google.common.base.Predicate;
7

  
8
import eu.dnetlib.pace.model.Field;
9

  
10
public class FieldFilter implements Predicate<Field> {
11

  
12
	private Map<String, List<String>> blacklists;
13

  
14
	private String filedName;
15

  
16
	public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
17
		this.filedName = fieldName;
18
		this.blacklists = blacklists;
19
	}
20

  
21
	@Override
22
	public boolean apply(final Field f) {
23
		return !regexMatches(filedName, f.stringValue(), blacklists);
24
	}
25

  
26
	/**
27
	 * Tries to match the fields in the regex blacklist.
28
	 *
29
	 * @param fieldName
30
	 * @param value
31
	 * @return true if the field matches, false otherwise
32
	 */
33
	protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
34
		if (blacklists.containsKey(fieldName)) {
35
			final Iterable<String> regexes = blacklists.get(fieldName);
36
			for (final String regex : regexes) {
37
				if (value.matches(regex)) return true;
38
			}
39
		}
40
		return false;
41
	}
42
}
0 43

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5
import java.util.Set;
6
import java.util.StringTokenizer;
7

  
8
import com.google.common.collect.Sets;
9

  
10
public class Acronyms extends AbstractClusteringFunction {
11

  
12
	public Acronyms(Map<String, Integer> params) {
13
		super(params);
14
	}
15

  
16
	@Override
17
	protected Collection<String> doApply(String s) {
18
		return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
19
	}
20
	
21
	private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
22
		
23
		final Set<String> acronyms = Sets.newLinkedHashSet();
24
		
25
		for (int i = 0; i < maxAcronyms; i++) {
26
			
27
			final StringTokenizer st = new StringTokenizer(s);
28
			final StringBuilder sb = new StringBuilder();
29
			
30
			while (st.hasMoreTokens()) {
31
				final String token = st.nextToken();
32
				if (sb.length() > maxLen) {
33
					break;
34
				}
35
				if (token.length() > 1 && i < token.length()) {
36
					sb.append(token.charAt(i));
37
				}
38
			}
39
			String acronym = sb.toString();
40
			if (acronym.length() > minLen) {
41
				acronyms.add(acronym);
42
			}
43
		}
44
		return acronyms;
45
	}
46

  
47
}
0 48

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5

  
6
import com.google.common.collect.Sets;
7

  
8
import eu.dnetlib.pace.config.Config;
9
import eu.dnetlib.pace.model.ClusteringDef;
10
import eu.dnetlib.pace.model.Document;
11
import eu.dnetlib.pace.model.FieldList;
12

  
13
public class ClusteringCombiner {
14

  
15
	public static Collection<String> combine(final Document a, final Config conf) {
16
		return new ClusteringCombiner().doCombine(a, conf.clusterings());
17
	}
18

  
19
	private Collection<String> doCombine(final Document a, final List<ClusteringDef> defs) {
20
		final Collection<String> res = Sets.newLinkedHashSet();
21
		for (final ClusteringDef cd : defs) {
22
			for (final String fieldName : cd.getFields()) {
23
				final FieldList values = a.values(fieldName);
24
				res.addAll(cd.getClusteringFunction().apply(values));
25
			}
26
		}
27
		return res;
28
	}
29

  
30
}
0 31

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.0/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Collection;
4
import java.util.Iterator;
5
import java.util.List;
6
import java.util.ListIterator;
7

  
8
import com.google.common.base.Function;
9
import com.google.common.base.Joiner;
10
import com.google.common.collect.Iterables;
11
import com.google.common.collect.Lists;
12

  
13
import eu.dnetlib.pace.config.Type;
14

  
15
/**
16
 * The Class FieldListImpl.
17
 */
18
public class FieldListImpl extends AbstractField implements FieldList {
19

  
20
	/** The fields. */
21
	private List<Field> fields;
22

  
23
	/**
	 * Creates an empty field list backed by a new array list.
	 * <p>
	 * NOTE(review): unlike {@link #FieldListImpl(String)}, no type or name is passed to the superclass here; what
	 * the superclass default constructor sets is not visible from this file.
	 */
	public FieldListImpl() {
		this.fields = Lists.newArrayList();
	}
29

  
30
	/**
	 * Creates an empty, named field list of type {@code Type.List}, backed by a new array list.
	 * 
	 * @param name
	 *            the field name passed to the superclass
	 */
	public FieldListImpl(final String name) {
		super(Type.List, name);
		this.fields = Lists.newArrayList();
	}
40

  
41
	/*
42
	 * (non-Javadoc)
43
	 * 
44
	 * @see java.util.List#add(java.lang.Object)
45
	 */
46
	@Override
47
	public boolean add(final Field f) {
48
		return fields.add(f);
49
	}
50

  
51
	/*
52
	 * (non-Javadoc)
53
	 * 
54
	 * @see java.util.List#add(int, java.lang.Object)
55
	 */
56
	@Override
57
	public void add(final int i, final Field f) {
58
		fields.add(i, f);
59
	}
60

  
61
	/*
62
	 * (non-Javadoc)
63
	 * 
64
	 * @see java.util.List#addAll(java.util.Collection)
65
	 */
66
	@Override
67
	public boolean addAll(final Collection<? extends Field> f) {
68
		return fields.addAll(f);
69
	}
70

  
71
	/*
72
	 * (non-Javadoc)
73
	 * 
74
	 * @see java.util.List#addAll(int, java.util.Collection)
75
	 */
76
	@Override
77
	public boolean addAll(final int i, final Collection<? extends Field> f) {
78
		return fields.addAll(i, f);
79
	}
80

  
81
	/*
82
	 * (non-Javadoc)
83
	 * 
84
	 * @see java.util.List#clear()
85
	 */
86
	@Override
87
	public void clear() {
88
		fields.clear();
89
	}
90

  
91
	/*
92
	 * (non-Javadoc)
93
	 * 
94
	 * @see java.util.List#contains(java.lang.Object)
95
	 */
96
	@Override
97
	public boolean contains(final Object o) {
98
		return fields.contains(o);
99
	}
100

  
101
	/*
102
	 * (non-Javadoc)
103
	 * 
104
	 * @see java.util.List#containsAll(java.util.Collection)
105
	 */
106
	@Override
107
	public boolean containsAll(final Collection<?> f) {
108
		return fields.containsAll(f);
109
	}
110

  
111
	/*
112
	 * (non-Javadoc)
113
	 * 
114
	 * @see java.util.List#get(int)
115
	 */
116
	@Override
117
	public Field get(final int i) {
118
		return fields.get(i);
119
	}
120

  
121
	/*
122
	 * (non-Javadoc)
123
	 * 
124
	 * @see java.util.List#indexOf(java.lang.Object)
125
	 */
126
	@Override
127
	public int indexOf(final Object o) {
128
		return fields.indexOf(o);
129
	}
130

  
131
	/*
132
	 * (non-Javadoc)
133
	 * 
134
	 * @see eu.dnetlib.pace.model.Field#isEmpty()
135
	 */
136
	@Override
137
	public boolean isEmpty() {
138
		return fields.isEmpty();
139
	}
140

  
141
	/*
142
	 * (non-Javadoc)
143
	 * 
144
	 * @see java.lang.Iterable#iterator()
145
	 */
146
	@Override
147
	public Iterator<Field> iterator() {
148
		return fields.iterator();
149
	}
150

  
151
	/*
152
	 * (non-Javadoc)
153
	 * 
154
	 * @see java.util.List#lastIndexOf(java.lang.Object)
155
	 */
156
	@Override
157
	public int lastIndexOf(final Object o) {
158
		return fields.lastIndexOf(o);
159
	}
160

  
161
	/*
162
	 * (non-Javadoc)
163
	 * 
164
	 * @see java.util.List#listIterator()
165
	 */
166
	@Override
167
	public ListIterator<Field> listIterator() {
168
		return fields.listIterator();
169
	}
170

  
171
	/*
172
	 * (non-Javadoc)
173
	 * 
174
	 * @see java.util.List#listIterator(int)
175
	 */
176
	@Override
177
	public ListIterator<Field> listIterator(final int i) {
178
		return fields.listIterator(i);
179
	}
180

  
181
	/*
182
	 * (non-Javadoc)
183
	 * 
184
	 * @see java.util.List#remove(java.lang.Object)
185
	 */
186
	@Override
187
	public boolean remove(final Object o) {
188
		return fields.remove(o);
189
	}
190

  
191
	/*
192
	 * (non-Javadoc)
193
	 * 
194
	 * @see java.util.List#remove(int)
195
	 */
196
	@Override
197
	public Field remove(final int i) {
198
		return fields.remove(i);
199
	}
200

  
201
	/*
202
	 * (non-Javadoc)
203
	 * 
204
	 * @see java.util.List#removeAll(java.util.Collection)
205
	 */
206
	@Override
207
	public boolean removeAll(final Collection<?> f) {
208
		return fields.removeAll(f);
209
	}
210

  
211
	/*
212
	 * (non-Javadoc)
213
	 * 
214
	 * @see java.util.List#retainAll(java.util.Collection)
215
	 */
216
	@Override
217
	public boolean retainAll(final Collection<?> f) {
218
		return fields.retainAll(f);
219
	}
220

  
221
	/*
222
	 * (non-Javadoc)
223
	 * 
224
	 * @see java.util.List#set(int, java.lang.Object)
225
	 */
226
	@Override
227
	public Field set(final int i, final Field f) {
228
		return fields.set(i, f);
229
	}
230

  
231
	/*
232
	 * (non-Javadoc)
233
	 * 
234
	 * @see java.util.List#size()
235
	 */
236
	@Override
237
	public int size() {
238
		return fields.size();
239
	}
240

  
241
	/*
242
	 * (non-Javadoc)
243
	 * 
244
	 * @see java.util.List#subList(int, int)
245
	 */
246
	@Override
247
	public List<Field> subList(final int from, final int to) {
248
		return fields.subList(from, to);
249
	}
250

  
251
	/*
252
	 * (non-Javadoc)
253
	 * 
254
	 * @see java.util.List#toArray()
255
	 */
256
	@Override
257
	public Object[] toArray() {
258
		return fields.toArray();
259
	}
260

  
261
	/*
262
	 * (non-Javadoc)
263
	 * 
264
	 * @see java.util.List#toArray(java.lang.Object[])
265
	 */
266
	@Override
267
	public <T> T[] toArray(final T[] t) {
268
		return fields.toArray(t);
269
	}
270

  
271
	/*
272
	 * (non-Javadoc)
273
	 * 
274
	 * @see eu.dnetlib.pace.model.Field#stringValue()
275
	 */
276
	@Override
277
	public String stringValue() {
278
		return Joiner.on(" ").join(stringList());
279
	}
280

  
281
	/*
282
	 * (non-Javadoc)
283
	 * 
284
	 * @see eu.dnetlib.pace.model.FieldList#stringList()
285
	 */
286
	@Override
287
	public List<String> stringList() {
288
		return Lists.newArrayList(Iterables.transform(fields, new Function<Field, String>() {
289

  
290
			@Override
291
			public String apply(final Field f) {
292
				return f.stringValue();
293
			}
294
		}));
295
	}
296

  
297
	@Override
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff