Project

General

Profile

« Previous | Next » 

Revision 37509

[maven-release-plugin] copy for tag dnet-pace-core-2.1.2

View differences:

modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/LevensteinDate.java
1
package eu.dnetlib.pace.distance;
2

  
3

  
4
public class LevensteinDate extends Levenstein {
5

  
6

  
7
	public LevensteinDate(double w) {
8
		super(w);
9
	}
10

  
11
	
12
	@Override
13
	public double distance(String a, String b) {
14

  
15
		return 1.0;
16
	}
17
	
18

  
19
	
20
	@Override
21
	public double getWeight() {
22
		return super.weight;
23
	}
24

  
25
}
0 26

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/JaroWinklerTitle.java
1
package eu.dnetlib.pace.distance;
2

  
3
import com.wcohen.ss.AbstractStringDistance;
4

  
5
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
6
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
7

  
8
	public JaroWinklerTitle(double weight) {
9
		super(weight, new com.wcohen.ss.JaroWinkler());
10
	}
11

  
12
	protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
13
		super(weight, ssalgo);
14
	}
15
	
16
	@Override
17
	public double distance(String a, String b) {
18
		String ca = cleanup(a);
19
		String cb = cleanup(b);
20

  
21
		boolean check = checkNumbers(ca, cb);
22
		return check ? 0.5 : normalize(ssalgo.score(ca, cb));
23
	}	
24

  
25
	@Override
26
	public double getWeight() {
27
		return super.weight;
28
	}
29

  
30
	@Override
31
	protected double normalize(double d) {
32
		return d;
33
	}
34

  
35
}
0 36

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/Level2JaroWinklerTitle.java
1
package eu.dnetlib.pace.distance;
2

  
3
import com.wcohen.ss.AbstractStringDistance;
4

  
5
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
6

  
7
	public Level2JaroWinklerTitle(final double w) {
8
		super(w, new com.wcohen.ss.Level2JaroWinkler());
9
	}
10

  
11
	protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
12
		super(w, ssalgo);
13
	}
14

  
15
	@Override
16
	public double distance(final String a, final String b) {
17
		final String ca = cleanup(a);
18
		final String cb = cleanup(b);
19

  
20
		final boolean check = checkNumbers(ca, cb);
21

  
22
		if (check) return 0.5;
23

  
24
		final String cca = finalCleanup(ca);
25
		final String ccb = finalCleanup(cb);
26

  
27
		return ssalgo.score(cca, ccb);
28
	}
29

  
30
	@Override
31
	public double getWeight() {
32
		return super.weight;
33
	}
34

  
35
	@Override
36
	protected double normalize(final double d) {
37
		return d;
38
	}
39

  
40
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java
1
package eu.dnetlib.pace.distance;
2

  
3
import eu.dnetlib.pace.model.Field;
4

  
5
/**
6
 * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
7
 * objects.
8
 */
9
public interface DistanceAlgo {
10

  
11
	public abstract double distance(Field a, Field b);
12

  
13
	public double getWeight();
14

  
15
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
1
package eu.dnetlib.pace.distance;
2

  
3
import java.util.List;
4

  
5
import eu.dnetlib.pace.condition.ConditionAlgo;
6
import eu.dnetlib.pace.model.Document;
7
import eu.dnetlib.pace.model.Field;
8
import eu.dnetlib.pace.model.FieldDef;
9

  
10
/**
11
 * The distance between two documents is given by the weighted mean of the field distances
12
 */
13
public class DistanceScorer {
14

  
15
	private List<FieldDef> fields;
16

  
17
	private List<ConditionAlgo> strictConditions;
18

  
19
	private List<ConditionAlgo> conditions;
20

  
21
	public DistanceScorer(final List<FieldDef> fields, final List<ConditionAlgo> strictConditions, final List<ConditionAlgo> conditions) {
22
		this.fields = fields;
23
		this.conditions = conditions;
24
		this.strictConditions = strictConditions;
25
	}
26

  
27
	public double distance(final Document a, final Document b) {
28

  
29
		double w = sumWeights(fields);
30
		double sum = 0.0;
31
		final int cond = verify(a, b, strictConditions, true);
32

  
33
		if (cond > 0) return 1.0;
34
		if (cond < 0) return 0.0;
35

  
36
		if (verify(a, b, conditions, true) >= 0) {
37
			for (final FieldDef fd : fields) {
38
				final double d = fieldDistance(a, b, fd);
39

  
40
				if (d > 0) {
41
					sum += d;
42
				} else {
43
					w -= fd.getWeight();
44
				}
45
			}
46
			return w == 0 ? 0 : sum / w;
47
		}
48
		return 0.0;
49
	}
50

  
51
	private int verify(final Document a, final Document b, final List<ConditionAlgo> conditions, final boolean strict) {
52
		int cond = 0;
53

  
54
		for (final ConditionAlgo cd : conditions) {
55
			final int verify = cd.verify(a, b);
56
			if (strict && (verify < 0)) return -1;
57
			cond += verify;
58
		}
59
		return cond;
60
	}
61

  
62
	private double fieldDistance(final Document a, final Document b, final FieldDef fd) {
63
		final double w = fd.getWeight();
64
		if ((w == 0)) return 0.0; // optimization for 0 weight
65
		else {
66
			final Field va = getValue(a, fd);
67
			final Field vb = getValue(b, fd);
68

  
69
			if (va.isEmpty() || vb.isEmpty()) {
70
				if (fd.isIgnoreMissing()) return -1;
71
				else return w;
72
			} else {
73

  
74
				if (va.getType().equals(vb.getType())) {
75
					final double d = fd.getDistanceAlgo().distance(va, vb);
76
					return w * d;
77
				}
78
				throw new IllegalArgumentException("Types are differents type");
79
			}
80
		}
81
	}
82

  
83
	private Field getValue(final Document d, final FieldDef fd) {
84
		return d.values(fd.getName());
85
	}
86

  
87
	private double sumWeights(final List<FieldDef> fields) {
88
		double sum = 0.0;
89
		for (final FieldDef fd : fields) {
90
			sum += fd.getWeight();
91
		}
92
		return sum;
93
	}
94

  
95
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/Distance.java
1
package eu.dnetlib.pace.distance;
2

  
3
import eu.dnetlib.pace.config.Config;
4

  
5
public interface Distance<A> {
6

  
7
	public double between(A a, A b, Config config);
8
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/SortedSecondStringDistanceAlgo.java
1
package eu.dnetlib.pace.distance;
2

  
3
import java.util.Collections;
4
import java.util.List;
5

  
6
import com.google.common.collect.Lists;
7
import com.wcohen.ss.AbstractStringDistance;
8

  
9
import eu.dnetlib.pace.model.Field;
10
import eu.dnetlib.pace.model.FieldList;
11

  
12
/**
13
 * For the rest of the fields delegate the distance measure to the second string library.
14
 */
15
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
16

  
17
	/**
18
	 * Instantiates a new sorted second string distance algo.
19
	 * 
20
	 * @param weight
21
	 *            the weight
22
	 * @param ssalgo
23
	 *            the ssalgo
24
	 */
25
	protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
26
		super(weight, ssalgo);
27
	}
28

  
29
	/*
30
	 * (non-Javadoc)
31
	 * 
32
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
33
	 */
34
	@Override
35
	protected List<String> toList(final Field list) {
36
		FieldList fl = (FieldList) list;
37
		List<String> values = Lists.newArrayList(fl.stringList());
38
		Collections.sort(values);
39
		return values;
40
	}
41

  
42
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import com.google.common.collect.Lists;
8

  
9
import eu.dnetlib.pace.model.Person;
10

  
11
public class PersonHash extends AbstractClusteringFunction {
12

  
13
	private boolean DEFAULT_AGGRESSIVE = false;
14

  
15
	public PersonHash(final Map<String, Integer> params) {
16
		super(params);
17
	}
18

  
19
	@Override
20
	protected Collection<String> doApply(final String s) {
21
		final List<String> res = Lists.newArrayList();
22

  
23
		final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
24

  
25
		res.add(new Person(s, aggressive).hash());
26

  
27
		return res;
28
	}
29

  
30
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/SubStringLevenstein.java
1
package eu.dnetlib.pace.distance;
2

  
3
import org.apache.commons.lang.StringUtils;
4

  
5
import com.wcohen.ss.AbstractStringDistance;
6

  
7
import eu.dnetlib.pace.config.Type;
8
import eu.dnetlib.pace.model.Field;
9

  
10
/**
11
 * The Class SubStringLevenstein.
12
 */
13
public class SubStringLevenstein extends SecondStringDistanceAlgo {
14

  
15
	/** The limit. */
16
	protected int limit;
17

  
18
	/**
19
	 * Instantiates a new sub string levenstein.
20
	 * 
21
	 * @param w
22
	 *            the w
23
	 */
24
	public SubStringLevenstein(final double w) {
25
		super(w, new com.wcohen.ss.Levenstein());
26
	}
27

  
28
	/**
29
	 * Instantiates a new sub string levenstein.
30
	 * 
31
	 * @param w
32
	 *            the w
33
	 * @param limit
34
	 *            the limit
35
	 */
36
	public SubStringLevenstein(final double w, final int limit) {
37
		super(w, new com.wcohen.ss.Levenstein());
38
		this.limit = limit;
39
	}
40

  
41
	/**
42
	 * Instantiates a new sub string levenstein.
43
	 * 
44
	 * @param w
45
	 *            the w
46
	 * @param limit
47
	 *            the limit
48
	 * @param ssalgo
49
	 *            the ssalgo
50
	 */
51
	protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
52
		super(w, ssalgo);
53
		this.limit = limit;
54
	}
55

  
56
	/*
57
	 * (non-Javadoc)
58
	 * 
59
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
60
	 */
61
	@Override
62
	public double distance(final Field a, final Field b) {
63
		if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
64
			return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
65

  
66
		throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
67
	}
68

  
69
	/*
70
	 * (non-Javadoc)
71
	 * 
72
	 * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
73
	 */
74
	@Override
75
	public double getWeight() {
76
		return super.weight;
77
	}
78

  
79
	/*
80
	 * (non-Javadoc)
81
	 * 
82
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
83
	 */
84
	@Override
85
	protected double normalize(final double d) {
86
		return 1 / Math.pow(Math.abs(d) + 1, 0.1);
87
	}
88

  
89
}
0 90

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/SortedLevel2JaroWinkler.java
1
package eu.dnetlib.pace.distance;
2

  
3
import com.wcohen.ss.AbstractStringDistance;
4

  
5
/**
6
 * The Class SortedJaroWinkler.
7
 */
8
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
9

  
10
	/**
11
	 * Instantiates a new sorted jaro winkler.
12
	 * 
13
	 * @param weight
14
	 *            the weight
15
	 */
16
	public SortedLevel2JaroWinkler(final double weight) {
17
		super(weight, new com.wcohen.ss.Level2JaroWinkler());
18
	}
19

  
20
	/**
21
	 * Instantiates a new sorted jaro winkler.
22
	 * 
23
	 * @param weight
24
	 *            the weight
25
	 * @param ssalgo
26
	 *            the ssalgo
27
	 */
28
	protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
29
		super(weight, ssalgo);
30
	}
31

  
32
	/*
33
	 * (non-Javadoc)
34
	 * 
35
	 * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
36
	 */
37
	@Override
38
	public double getWeight() {
39
		return super.weight;
40
	}
41

  
42
	/*
43
	 * (non-Javadoc)
44
	 * 
45
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
46
	 */
47
	@Override
48
	protected double normalize(final double d) {
49
		return d;
50
	}
51

  
52
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import com.google.common.base.Predicate;
7

  
8
import eu.dnetlib.pace.model.Field;
9

  
10
public class FieldFilter implements Predicate<Field> {
11

  
12
	private Map<String, List<String>> blacklists;
13

  
14
	private String filedName;
15

  
16
	public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
17
		this.filedName = fieldName;
18
		this.blacklists = blacklists;
19
	}
20

  
21
	@Override
22
	public boolean apply(final Field f) {
23
		return !regexMatches(filedName, f.stringValue(), blacklists);
24
	}
25

  
26
	/**
27
	 * Tries to match the fields in the regex blacklist.
28
	 *
29
	 * @param fieldName
30
	 * @param value
31
	 * @return true if the field matches, false otherwise
32
	 */
33
	protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
34
		if (blacklists.containsKey(fieldName)) {
35
			final Iterable<String> regexes = blacklists.get(fieldName);
36
			for (final String regex : regexes) {
37
				if (value.matches(regex)) return true;
38
			}
39
		}
40
		return false;
41
	}
42
}
0 43

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/YearLevenstein.java
1
package eu.dnetlib.pace.distance;
2

  
3

  
4
public class YearLevenstein extends SubStringLevenstein {
5

  
6
	public YearLevenstein(double w) {
7
		super(w);
8
	}
9

  
10
	public YearLevenstein(double w, int limit) {
11
		super(w, limit);	
12
	}
13
	
14
	@Override
15
	public double distance(String a, String b) {
16
		boolean check = checkLength(a) && checkLength(b);
17
		if (check) {
18
			if (a.equals(b)) {
19
				return 1.0;
20
			} else {
21
				return 0.5;
22
			}
23
		} else {
24
			return 1.0;
25
		}
26
	}
27
	
28
	protected boolean checkLength(String s) {
29
		return getNumbers(s).length() == limit;
30
	}
31
	
32
	@Override
33
	public double getWeight() {
34
		return super.weight;
35
	}
36

  
37
}
0 38

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/NullDistanceAlgo.java
1
package eu.dnetlib.pace.distance;
2

  
3
import eu.dnetlib.pace.model.Field;
4

  
5
/**
6
 * Not all fields of a document need to partecipate in the distance measure. We model those fields as having a
7
 * NullDistanceAlgo.
8
 */
9
public class NullDistanceAlgo implements DistanceAlgo {
10

  
11
	@Override
12
	public double distance(Field a, Field b) {
13
		return 0.0;
14
	}
15

  
16
	@Override
17
	public double getWeight() {
18
		return 0.0;
19
	}
20

  
21
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/Level2Levenstein.java
1
package eu.dnetlib.pace.distance;
2

  
3
import com.wcohen.ss.AbstractStringDistance;
4

  
5
public class Level2Levenstein extends SecondStringDistanceAlgo {
6

  
7
	public Level2Levenstein(double w) {
8
		super(w, new com.wcohen.ss.Level2Levenstein());
9
	}
10

  
11
	protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
12
		super(w, ssalgo);
13
	}
14

  
15
	@Override
16
	public double getWeight() {
17
		return super.weight;
18
	}
19

  
20
	@Override
21
	protected double normalize(double d) {
22
		return 1 / Math.pow(Math.abs(d) + 1, 0.1);
23
	}
24

  
25
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-hadoop-parent</artifactId>
6
		<version>1.0.0</version>
7
		<relativePath />
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-pace-core</artifactId>
12
	<packaging>jar</packaging>
13
	<version>2.1.2</version>
14
	<scm>
15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-pace-core/tags/dnet-pace-core-2.1.2</developerConnection>
16
	</scm>
17
	<dependencies>
18
		<dependency>
19
			<groupId>edu.cmu</groupId>
20
			<artifactId>secondstring</artifactId>
21
			<version>1.0.0</version>
22
		</dependency>
23
		<dependency>
24
			<groupId>com.google.guava</groupId>
25
			<artifactId>guava</artifactId>
26
			<version>${google.guava.version}</version>
27
		</dependency>
28
		<dependency>
29
			<groupId>com.google.code.gson</groupId>
30
			<artifactId>gson</artifactId>
31
			<version>${google.gson.version}</version>
32
		</dependency>
33
		<dependency>
34
			<groupId>commons-lang</groupId>
35
			<artifactId>commons-lang</artifactId>
36
			<version>${commons.lang.version}</version>
37
		</dependency>
38
		<dependency>
39
			<groupId>commons-io</groupId>
40
			<artifactId>commons-io</artifactId>
41
			<version>${commons.io.version}</version>
42
		</dependency>
43
		<dependency>
44
			<groupId>commons-collections</groupId>
45
			<artifactId>commons-collections</artifactId>
46
			<version>${commons.collections.version}</version>
47
		</dependency>
48
		<dependency>
49
			<groupId>org.antlr</groupId>
50
			<artifactId>stringtemplate</artifactId>
51
			<version>3.2</version>
52
		</dependency>	
53
		<dependency>
54
			<groupId>junit</groupId>
55
			<artifactId>junit</artifactId>
56
			<version>${junit.version}</version>
57
			<scope>test</scope>
58
		</dependency>
59
	</dependencies>
60
</project>
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
1
package eu.dnetlib.pace.clustering;
2

  
3
/**
 * Names of the available clustering functions. The constants are lower case; declaration order is kept stable.
 */
public enum Clustering {
	acronyms,
	ngrams,
	ngrampairs,
	suffixprefix,
	spacetrimmingfieldvalue,
	immutablefieldvalue,
	personhash
}
0 6

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Map.Entry;
7
import java.util.Set;
8

  
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Lists;
11
import com.google.common.collect.Maps;
12

  
13
import eu.dnetlib.pace.config.Config;
14
import eu.dnetlib.pace.model.Document;
15
import eu.dnetlib.pace.model.FieldListImpl;
16
import eu.dnetlib.pace.model.MapDocument;
17

  
18
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
19

  
20
	public static Collection<String> filterAndCombine(final MapDocument a, final Config conf, final Map<String, List<String>> blacklists) {
21

  
22
		final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists);
23
		return combine(filtered, conf);
24
	}
25

  
26
	private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
27
		final Map<String, FieldListImpl> filtered = Maps.newHashMap(a.getFieldMap());
28
		if (blacklists != null) {
29
			for (final Entry<String, FieldListImpl> e : filtered.entrySet()) {
30

  
31
				final FieldListImpl fl = new FieldListImpl();
32
				fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
33
				filtered.put(e.getKey(), fl);
34
			}
35
		}
36
		return new MapDocument(a.getIdentifier(), filtered);
37
	}
38

  
39
	/**
40
	 * Tries to match the fields in the regex blacklist.
41
	 *
42
	 * @param fieldName
43
	 * @param value
44
	 * @return true if the field matches, false otherwise
45
	 */
46
	protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
47
		if (blacklists.containsKey(fieldName)) {
48
			for (final String regex : blacklists.get(fieldName)) {
49
				if (value.matches(regex)) return true;
50
			}
51
		}
52
		return false;
53
	}
54
}
0 55

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/FieldDef.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.List;
4

  
5
import com.google.common.base.Splitter;
6
import com.google.common.collect.Lists;
7
import com.google.gson.Gson;
8

  
9
import eu.dnetlib.pace.config.Algo;
10
import eu.dnetlib.pace.config.Type;
11
import eu.dnetlib.pace.distance.AlwaysMatch;
12
import eu.dnetlib.pace.distance.DistanceAlgo;
13
import eu.dnetlib.pace.distance.ExactMatch;
14
import eu.dnetlib.pace.distance.JaroWinkler;
15
import eu.dnetlib.pace.distance.JaroWinklerTitle;
16
import eu.dnetlib.pace.distance.Level2JaroWinkler;
17
import eu.dnetlib.pace.distance.Level2JaroWinklerTitle;
18
import eu.dnetlib.pace.distance.Level2Levenstein;
19
import eu.dnetlib.pace.distance.Levenstein;
20
import eu.dnetlib.pace.distance.LevensteinTitle;
21
import eu.dnetlib.pace.distance.NullDistanceAlgo;
22
import eu.dnetlib.pace.distance.SortedJaroWinkler;
23
import eu.dnetlib.pace.distance.SortedLevel2JaroWinkler;
24
import eu.dnetlib.pace.distance.SubStringLevenstein;
25
import eu.dnetlib.pace.distance.YearLevenstein;
26

  
27
/**
28
 * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
29
 */
30
public class FieldDef {
31

  
32
	public final static String PATH_SEPARATOR = "/";
33

  
34
	private Algo algo;
35

  
36
	private String name;
37

  
38
	private String path;
39

  
40
	private boolean ignoreMissing;
41

  
42
	private Type type;
43

  
44
	private boolean overrideMatch;
45

  
46
	private double weight;
47

  
48
	private int limit = -1;
49

  
50
	public FieldDef() {}
51

  
52
	// def apply(s: String): Field[A]
53
	public Field apply(final Type type, final String s) {
54
		switch (type) {
55
		case Int:
56
			return new FieldValueImpl(type, name, Integer.parseInt(s));
57
		case String:
58
			return new FieldValueImpl(type, name, s);
59
		case List:
60
			return new FieldListImpl(name);
61
		default:
62
			throw new IllegalArgumentException("Casting not implemented for type " + type);
63
		}
64
	}
65

  
66
	public String getName() {
67
		return name;
68
	}
69

  
70
	public String getPath() {
71
		return path;
72
	}
73

  
74
	public List<String> getPathList() {
75
		return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
76
	}
77

  
78
	public DistanceAlgo getDistanceAlgo() {
79
		switch (getAlgo()) {
80
		case JaroWinkler:
81
			return new JaroWinkler(getWeight());
82
		case JaroWinklerTitle:
83
			return new JaroWinklerTitle(getWeight());
84
		case Level2JaroWinkler:
85
			return new Level2JaroWinkler(getWeight());
86
		case Level2JaroWinklerTitle:
87
			return new Level2JaroWinklerTitle(getWeight());
88
		case Level2Levenstein:
89
			return new Level2Levenstein(getWeight());
90
		case Levenstein:
91
			return new Levenstein(getWeight());
92
		case LevensteinTitle:
93
			return new LevensteinTitle(getWeight());
94
		case SubStringLevenstein:
95
			return new SubStringLevenstein(getWeight(), getLimit());
96
		case YearLevenstein:
97
			return new YearLevenstein(getWeight(), getLimit());
98
		case SortedJaroWinkler:
99
			return new SortedJaroWinkler(getWeight());
100
		case SortedLevel2JaroWinkler:
101
			return new SortedLevel2JaroWinkler(getWeight());
102
		case ExactMatch:
103
			return new ExactMatch(getWeight());
104
		case AlwaysMatch:
105
			return new AlwaysMatch(getWeight());
106
		case Null:
107
			return new NullDistanceAlgo();
108
		default:
109
			return new NullDistanceAlgo();
110
		}
111
	}
112

  
113
	public boolean isIgnoreMissing() {
114
		return ignoreMissing;
115
	}
116

  
117
	public Type getType() {
118
		return type;
119
	}
120

  
121
	public void setType(final Type type) {
122
		this.type = type;
123
	}
124

  
125
	public boolean isOverrideMatch() {
126
		return overrideMatch;
127
	}
128

  
129
	public void setOverrideMatch(final boolean overrideMatch) {
130
		this.overrideMatch = overrideMatch;
131
	}
132

  
133
	@Override
134
	public String toString() {
135
		return new Gson().toJson(this);
136
	}
137

  
138
	public double getWeight() {
139
		return weight;
140
	}
141

  
142
	public void setWeight(final double weight) {
143
		this.weight = weight;
144
	}
145

  
146
	public Algo getAlgo() {
147
		return algo;
148
	}
149

  
150
	public void setAlgo(final Algo algo) {
151
		this.algo = algo;
152
	}
153

  
154
	public int getLimit() {
155
		return limit;
156
	}
157

  
158
	public void setLimit(final int limit) {
159
		this.limit = limit;
160
	}
161

  
162
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6

  
7
import org.apache.commons.lang.RandomStringUtils;
8
import org.apache.commons.lang.StringUtils;
9

  
10
import com.google.common.collect.Lists;
11

  
12
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
13

  
14
	public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
15
		super(params);
16
	}
17

  
18
	@Override
19
	protected Collection<String> doApply(final String s) {
20
		final List<String> res = Lists.newArrayList();
21

  
22
		res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
23

  
24
		return res;
25
	}
26

  
27
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import com.google.gson.Gson;
7

  
8
import eu.dnetlib.pace.clustering.Acronyms;
9
import eu.dnetlib.pace.clustering.Clustering;
10
import eu.dnetlib.pace.clustering.ClusteringFunction;
11
import eu.dnetlib.pace.clustering.ImmutableFieldValue;
12
import eu.dnetlib.pace.clustering.NgramPairs;
13
import eu.dnetlib.pace.clustering.Ngrams;
14
import eu.dnetlib.pace.clustering.PersonHash;
15
import eu.dnetlib.pace.clustering.RandomClusteringFunction;
16
import eu.dnetlib.pace.clustering.SpaceTrimmingFieldValue;
17
import eu.dnetlib.pace.clustering.SuffixPrefix;
18

  
19
public class ClusteringDef {
20

  
21
	private Clustering name;
22

  
23
	private List<String> fields;
24

  
25
	private Map<String, Integer> params;
26

  
27
	public ClusteringDef() {}
28

  
29
	public Clustering getName() {
30
		return name;
31
	}
32

  
33
	public void setName(final Clustering name) {
34
		this.name = name;
35
	}
36

  
37
	public ClusteringFunction getClusteringFunction() {
38
		switch (getName()) {
39
		case acronyms:
40
			return new Acronyms(getParams());
41
		case ngrams:
42
			return new Ngrams(getParams());
43
		case ngrampairs:
44
			return new NgramPairs(getParams());
45
		case suffixprefix:
46
			return new SuffixPrefix(getParams());
47
		case spacetrimmingfieldvalue:
48
			return new SpaceTrimmingFieldValue(getParams());
49
		case immutablefieldvalue:
50
			return new ImmutableFieldValue(getParams());
51
		case personhash:
52
			return new PersonHash(getParams());
53
		default:
54
			return new RandomClusteringFunction(getParams());
55
		}
56
	}
57

  
58
	public List<String> getFields() {
59
		return fields;
60
	}
61

  
62
	public void setFields(final List<String> fields) {
63
		this.fields = fields;
64
	}
65

  
66
	public Map<String, Integer> getParams() {
67
		return params;
68
	}
69

  
70
	public void setParams(final Map<String, Integer> params) {
71
		this.params = params;
72
	}
73

  
74
	@Override
75
	public String toString() {
76
		return new Gson().toJson(this);
77
	}
78

  
79
}
0 80

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.LinkedHashSet;
5
import java.util.Map;
6
import java.util.StringTokenizer;
7

  
8
public class Ngrams extends AbstractClusteringFunction {
9

  
10
	public Ngrams(Map<String, Integer> params) {
11
		super(params);
12
	}
13
	
14
	@Override
15
	protected Collection<String> doApply(String s) {
16
		return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
17
	}
18

  
19
	protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
20

  
21
		final Collection<String> ngrams = new LinkedHashSet<String>();
22
		final StringTokenizer st = new StringTokenizer(s);
23

  
24
		while (st.hasMoreTokens()) {
25
			final String token = st.nextToken();
26
			if (!token.isEmpty()) {
27

  
28
				for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
29
					String ngram = (token + "    ").substring(i, ngramLen + i).trim();
30
					if (ngrams.size() >= max) {
31
						return ngrams;
32
					}
33
					if (ngram.length() >= minNgramLen) {
34
						ngrams.add(ngram);
35
					}
36
				}
37
			}
38
		}
39
		//System.out.println(ngrams + " n: " + ngrams.size());
40
		return ngrams;
41
	}
42

  
43
}
0 44

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Collection;
4
import java.util.Map;
5
import java.util.Set;
6
import java.util.StringTokenizer;
7

  
8
import com.google.common.collect.Sets;
9

  
10
public class Acronyms extends AbstractClusteringFunction {
11

  
12
	public Acronyms(Map<String, Integer> params) {
13
		super(params);
14
	}
15

  
16
	@Override
17
	protected Collection<String> doApply(String s) {
18
		return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
19
	}
20
	
21
	private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
22
		
23
		final Set<String> acronyms = Sets.newLinkedHashSet();
24
		
25
		for (int i = 0; i < maxAcronyms; i++) {
26
			
27
			final StringTokenizer st = new StringTokenizer(s);
28
			final StringBuilder sb = new StringBuilder();
29
			
30
			while (st.hasMoreTokens()) {
31
				final String token = st.nextToken();
32
				if (sb.length() > maxLen) {
33
					break;
34
				}
35
				if (token.length() > 1 && i < token.length()) {
36
					sb.append(token.charAt(i));
37
				}
38
			}
39
			String acronym = sb.toString();
40
			if (acronym.length() > minLen) {
41
				acronyms.add(acronym);
42
			}
43
		}
44
		return acronyms;
45
	}
46

  
47
}
0 48

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/CondDef.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.List;
4

  
5
import com.google.gson.Gson;
6

  
7
import eu.dnetlib.pace.condition.AlwaysTrueCondition;
8
import eu.dnetlib.pace.condition.ConditionAlgo;
9
import eu.dnetlib.pace.condition.DoiExactMatch;
10
import eu.dnetlib.pace.condition.ExactMatch;
11
import eu.dnetlib.pace.condition.SizeMatch;
12
import eu.dnetlib.pace.condition.TitleVersionMatch;
13
import eu.dnetlib.pace.condition.YearMatch;
14
import eu.dnetlib.pace.config.Cond;
15

  
16
public class CondDef {
17

  
18
	private Cond name;
19

  
20
	private List<String> fields;
21

  
22
	public CondDef() {}
23

  
24
	public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
25
		switch (getName()) {
26
		case yearMatch:
27
			return new YearMatch(fields);
28
		case titleVersionMatch:
29
			return new TitleVersionMatch(fields);
30
		case sizeMatch:
31
			return new SizeMatch(fields);
32
		case exactMatch:
33
			return new ExactMatch(fields);
34
		case doiExactMatch:
35
			return new DoiExactMatch(fields);
36
		default:
37
			return new AlwaysTrueCondition(fields);
38
		}
39
	}
40

  
41
	public Cond getName() {
42
		return name;
43
	}
44

  
45
	public void setName(final Cond name) {
46
		this.name = name;
47
	}
48

  
49
	public List<String> getFields() {
50
		return fields;
51
	}
52

  
53
	public void setFields(final List<String> fields) {
54
		this.fields = fields;
55
	}
56

  
57
	@Override
58
	public String toString() {
59
		return new Gson().toJson(this);
60
	}
61

  
62
}
0 63

  
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
1
package eu.dnetlib.pace.clustering;
2

  
3
import java.util.Set;
4

  
5
import org.apache.commons.lang.StringUtils;
6

  
7
import eu.dnetlib.pace.common.AbstractPaceFunctions;
8

  
9
public class NGramUtils extends AbstractPaceFunctions {
10

  
11
	private static final int SIZE = 100;
12

  
13
	private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
14

  
15
	public static String cleanupForOrdering(String s) {
16
		NGramUtils utils = new NGramUtils();
17
		return (utils.filterStopWords(utils.normalize(s), stopwords) +  StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
18
	}
19

  
20
}
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Collection;
4
import java.util.Iterator;
5
import java.util.List;
6
import java.util.ListIterator;
7

  
8
import com.google.common.base.Function;
9
import com.google.common.base.Joiner;
10
import com.google.common.collect.Iterables;
11
import com.google.common.collect.Lists;
12

  
13
import eu.dnetlib.pace.config.Type;
14

  
15
/**
16
 * The Class FieldListImpl.
17
 */
18
public class FieldListImpl extends AbstractField implements FieldList {
19

  
20
	/** The fields. */
21
	private List<Field> fields;
22

  
23
	/**
24
	 * Instantiates a new field list impl.
25
	 */
26
	public FieldListImpl() {
27
		fields = Lists.newArrayList();
28
	}
29

  
30
	/**
31
	 * Instantiates a new field list impl.
32
	 * 
33
	 * @param name
34
	 *            the name
35
	 */
36
	public FieldListImpl(final String name) {
37
		super(Type.List, name);
38
		fields = Lists.newArrayList();
39
	}
40

  
41
	/*
42
	 * (non-Javadoc)
43
	 * 
44
	 * @see java.util.List#add(java.lang.Object)
45
	 */
46
	@Override
47
	public boolean add(final Field f) {
48
		return fields.add(f);
49
	}
50

  
51
	/*
52
	 * (non-Javadoc)
53
	 * 
54
	 * @see java.util.List#add(int, java.lang.Object)
55
	 */
56
	@Override
57
	public void add(final int i, final Field f) {
58
		fields.add(i, f);
59
	}
60

  
61
	/*
62
	 * (non-Javadoc)
63
	 * 
64
	 * @see java.util.List#addAll(java.util.Collection)
65
	 */
66
	@Override
67
	public boolean addAll(final Collection<? extends Field> f) {
68
		return fields.addAll(f);
69
	}
70

  
71
	/*
72
	 * (non-Javadoc)
73
	 * 
74
	 * @see java.util.List#addAll(int, java.util.Collection)
75
	 */
76
	@Override
77
	public boolean addAll(final int i, final Collection<? extends Field> f) {
78
		return fields.addAll(i, f);
79
	}
80

  
81
	/*
82
	 * (non-Javadoc)
83
	 * 
84
	 * @see java.util.List#clear()
85
	 */
86
	@Override
87
	public void clear() {
88
		fields.clear();
89
	}
90

  
91
	/*
92
	 * (non-Javadoc)
93
	 * 
94
	 * @see java.util.List#contains(java.lang.Object)
95
	 */
96
	@Override
97
	public boolean contains(final Object o) {
98
		return fields.contains(o);
99
	}
100

  
101
	/*
102
	 * (non-Javadoc)
103
	 * 
104
	 * @see java.util.List#containsAll(java.util.Collection)
105
	 */
106
	@Override
107
	public boolean containsAll(final Collection<?> f) {
108
		return fields.containsAll(f);
109
	}
110

  
111
	/*
112
	 * (non-Javadoc)
113
	 * 
114
	 * @see java.util.List#get(int)
115
	 */
116
	@Override
117
	public Field get(final int i) {
118
		return fields.get(i);
119
	}
120

  
121
	/*
122
	 * (non-Javadoc)
123
	 * 
124
	 * @see java.util.List#indexOf(java.lang.Object)
125
	 */
126
	@Override
127
	public int indexOf(final Object o) {
128
		return fields.indexOf(o);
129
	}
130

  
131
	/*
132
	 * (non-Javadoc)
133
	 * 
134
	 * @see eu.dnetlib.pace.model.Field#isEmpty()
135
	 */
136
	@Override
137
	public boolean isEmpty() {
138
		return fields.isEmpty();
139
	}
140

  
141
	/*
142
	 * (non-Javadoc)
143
	 * 
144
	 * @see java.lang.Iterable#iterator()
145
	 */
146
	@Override
147
	public Iterator<Field> iterator() {
148
		return fields.iterator();
149
	}
150

  
151
	/*
152
	 * (non-Javadoc)
153
	 * 
154
	 * @see java.util.List#lastIndexOf(java.lang.Object)
155
	 */
156
	@Override
157
	public int lastIndexOf(final Object o) {
158
		return fields.lastIndexOf(o);
159
	}
160

  
161
	/*
162
	 * (non-Javadoc)
163
	 * 
164
	 * @see java.util.List#listIterator()
165
	 */
166
	@Override
167
	public ListIterator<Field> listIterator() {
168
		return fields.listIterator();
169
	}
170

  
171
	/*
172
	 * (non-Javadoc)
173
	 * 
174
	 * @see java.util.List#listIterator(int)
175
	 */
176
	@Override
177
	public ListIterator<Field> listIterator(final int i) {
178
		return fields.listIterator(i);
179
	}
180

  
181
	/*
182
	 * (non-Javadoc)
183
	 * 
184
	 * @see java.util.List#remove(java.lang.Object)
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff