Project

General

Profile

« Previous | Next » 

Revision 36615

[maven-release-plugin] copy for tag dnet-pace-core-2.0.0

View differences:

modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import com.google.gson.Gson;
7

  
8
import eu.dnetlib.pace.clustering.Acronyms;
9
import eu.dnetlib.pace.clustering.Clustering;
10
import eu.dnetlib.pace.clustering.ClusteringFunction;
11
import eu.dnetlib.pace.clustering.NgramPairs;
12
import eu.dnetlib.pace.clustering.Ngrams;
13
import eu.dnetlib.pace.clustering.RandomClusteringFunction;
14
import eu.dnetlib.pace.clustering.SpaceTrimmingFieldValue;
15
import eu.dnetlib.pace.clustering.SuffixPrefix;
16

  
17
public class ClusteringDef {
18

  
19
	private Clustering name;
20

  
21
	private List<String> fields;
22

  
23
	private Map<String, Integer> params;
24

  
25
	public ClusteringDef() {}
26

  
27
	public Clustering getName() {
28
		return name;
29
	}
30

  
31
	public void setName(final Clustering name) {
32
		this.name = name;
33
	}
34

  
35
	public ClusteringFunction getClusteringFunction() {
36
		switch (getName()) {
37
		case acronyms:
38
			return new Acronyms(getParams());
39
		case ngrams:
40
			return new Ngrams(getParams());
41
		case ngrampairs:
42
			return new NgramPairs(getParams());
43
		case suffixprefix:
44
			return new SuffixPrefix(getParams());
45
		case spacetrimmingfieldvalue:
46
			return new SpaceTrimmingFieldValue(getParams());
47
		default:
48
			return new RandomClusteringFunction(getParams());
49
		}
50
	}
51

  
52
	public List<String> getFields() {
53
		return fields;
54
	}
55

  
56
	public void setFields(final List<String> fields) {
57
		this.fields = fields;
58
	}
59

  
60
	public Map<String, Integer> getParams() {
61
		return params;
62
	}
63

  
64
	public void setParams(final Map<String, Integer> params) {
65
		this.params = params;
66
	}
67

  
68
	@Override
69
	public String toString() {
70
		return new Gson().toJson(this);
71
	}
72

  
73
}
0 74

  
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Iterator;
4
import java.util.List;
5

  
6
import org.apache.commons.collections.iterators.SingletonIterator;
7

  
8
import eu.dnetlib.pace.config.Type;
9

  
10
/**
11
 * The Class FieldValueImpl.
12
 */
13
public class FieldValueImpl extends AbstractField implements FieldValue {
14

  
15
	/** The value. */
16
	private Object value = null;
17

  
18
	/**
19
	 * Instantiates a new field value impl.
20
	 */
21
	public FieldValueImpl() {}
22

  
23
	/**
24
	 * Instantiates a new field value impl.
25
	 * 
26
	 * @param type
27
	 *            the type
28
	 * @param name
29
	 *            the name
30
	 * @param value
31
	 *            the value
32
	 */
33
	public FieldValueImpl(final Type type, final String name, final Object value) {
34
		super(type, name);
35
		this.value = value;
36
	}
37

  
38
	/*
39
	 * (non-Javadoc)
40
	 * 
41
	 * @see eu.dnetlib.pace.model.Field#isEmpty()
42
	 */
43
	@Override
44
	public boolean isEmpty() {
45
		if (value == null) return false;
46

  
47
		switch (type) {
48
		case String:
49
			return value.toString().isEmpty();
50
		case List:
51
			List<?> list = (List<?>) value;
52
			return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
53
		default:
54
			return true;
55
		}
56
	}
57

  
58
	/*
59
	 * (non-Javadoc)
60
	 * 
61
	 * @see eu.dnetlib.pace.model.FieldValue#getValue()
62
	 */
63
	@Override
64
	public Object getValue() {
65
		return value;
66
	}
67

  
68
	/*
69
	 * (non-Javadoc)
70
	 * 
71
	 * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object)
72
	 */
73
	@Override
74
	public void setValue(final Object value) {
75
		this.value = value;
76
	}
77

  
78
	/*
79
	 * (non-Javadoc)
80
	 * 
81
	 * @see eu.dnetlib.pace.model.Field#stringValue()
82
	 */
83
	@Override
84
	// @SuppressWarnings("unchecked")
85
	public String stringValue() {
86
		return String.valueOf(getValue());
87
		// switch (getType()) {
88
		//
89
		// case Int:
90
		// return String.valueOf(getValue());
91
		// case List:
92
		// return Joiner.on(" ").join((List<String>) getValue());
93
		// case String:
94
		// return (String) getValue();
95
		// default:
96
		// throw new IllegalArgumentException("Unknown type: " + getType().toString());
97
		// }
98
	}
99

  
100
	/*
101
	 * (non-Javadoc)
102
	 * 
103
	 * @see java.lang.Iterable#iterator()
104
	 */
105
	@Override
106
	@SuppressWarnings("unchecked")
107
	public Iterator<Field> iterator() {
108
		return new SingletonIterator(this);
109
	}
110

  
111
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/FieldValue.java
1
package eu.dnetlib.pace.model;
2

  
3
/**
4
 * The Interface FieldValue.
5
 */
6
public interface FieldValue extends Field {
7

  
8
	/**
9
	 * Gets the value.
10
	 * 
11
	 * @return the value
12
	 */
13
	public Object getValue();
14

  
15
	/**
16
	 * Sets the value.
17
	 * 
18
	 * @param value
19
	 *            the new value
20
	 */
21
	public void setValue(final Object value);
22

  
23
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.lang.reflect.Type;
4

  
5
import com.google.gson.Gson;
6
import com.google.gson.GsonBuilder;
7
import com.google.gson.InstanceCreator;
8
import com.google.gson.JsonDeserializationContext;
9
import com.google.gson.JsonDeserializer;
10
import com.google.gson.JsonElement;
11
import com.google.gson.JsonParseException;
12

  
13
/**
14
 * The Class MapDocumentSerializer.
15
 */
16
public class MapDocumentSerializer implements InstanceCreator<MapDocument> {
17

  
18
	@Override
19
	public MapDocument createInstance(final Type type) {
20
		return new MapDocument();
21
	}
22

  
23
	/**
24
	 * Decode.
25
	 *
26
	 * @param bytes
27
	 *            the bytes
28
	 * @return the map document
29
	 */
30
	public static MapDocument decode(final byte[] bytes) {
31

  
32
		GsonBuilder gson = new GsonBuilder();
33

  
34
		gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() {
35

  
36
			@Override
37
			public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
38
				FieldListImpl fl = new FieldListImpl();
39
				if (json.isJsonObject()) {
40
					String name = json.getAsJsonObject().get("name").getAsString();
41
					String type = json.getAsJsonObject().get("type").getAsString();
42
					String value = json.getAsJsonObject().get("value").getAsString();
43
					fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
44
				}
45
				return fl;
46
			}
47
		});
48

  
49
		return gson.create().fromJson(new String(bytes), MapDocument.class);
50
	}
51

  
52
	/**
53
	 * To string.
54
	 *
55
	 * @param doc
56
	 *            the doc
57
	 * @return the string
58
	 */
59
	public static String toString(final MapDocument doc) {
60
		return new Gson().toJson(doc);
61
	}
62

  
63
	/**
64
	 * To byte array.
65
	 *
66
	 * @param doc
67
	 *            the doc
68
	 * @return the byte[]
69
	 */
70
	public static byte[] toByteArray(final MapDocument doc) {
71
		return toString(doc).getBytes();
72
	}
73

  
74
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/Person.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.text.Normalizer;
4
import java.util.List;
5
import java.util.Set;
6

  
7
import com.google.common.base.Joiner;
8
import com.google.common.base.Splitter;
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Lists;
11
import com.google.common.hash.Hashing;
12

  
13
import eu.dnetlib.pace.clustering.NGramUtils;
14
import eu.dnetlib.pace.util.Capitalise;
15
import eu.dnetlib.pace.util.DotAbbreviations;
16

  
17
public class Person {
18
	private List<String> name = Lists.newArrayList();
19
	private List<String> surname = Lists.newArrayList();
20
	private List<String> fullname = Lists.newArrayList();
21
	private final String original;
22

  
23
	private static Set<String> particles = null;
24

  
25
	public Person(String s, boolean aggressive) {
26
		original = s;
27
		s = Normalizer.normalize(s, Normalizer.Form.NFD);
28
		s = s.replaceAll("\\(.+\\)", "");
29
		s = s.replaceAll("\\[.+\\]", "");
30
		s = s.replaceAll("\\{.+\\}", "");
31
		s = s.replaceAll("\\s+-\\s+", "-");
32
		s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
33
		s = s.replaceAll("\\d", " ");
34
		s = s.replaceAll("\\n", " ");
35
		s = s.replaceAll("\\.", " ");
36
		s = s.replaceAll("\\s+", " ");
37

  
38
		if (aggressive) {
39
			s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
40
			//s = s.replaceAll("[\\W&&[^,-]]", "");
41
		}
42
		
43
		if (s.contains(",")) {
44
			String[] arr = s.split(",");
45
			if (arr.length == 1) {
46
				fullname = splitTerms(arr[0]);
47
			} else if (arr.length > 1) {
48
				surname = splitTerms(arr[0]);
49
				name = splitTerms(arr[1]);
50
				fullname.addAll(surname);
51
				fullname.addAll(name);
52
			}
53
		} else {
54
			fullname = splitTerms(s);
55

  
56
			int lastInitialPosition = fullname.size();
57
			boolean hasSurnameInUpperCase = false;
58

  
59
			for (int i = 0; i < fullname.size(); i++) {
60
				String term = fullname.get(i);
61
				if (term.length() == 1) {
62
					lastInitialPosition = i;
63
				} else if (term.equals(term.toUpperCase())) {
64
					hasSurnameInUpperCase = true;
65
				}
66
			}
67

  
68
			if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
69
				name = fullname.subList(0, lastInitialPosition + 1);
70
				surname = fullname.subList(lastInitialPosition + 1, fullname.size());
71
			} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
72
				for (String term : fullname) {
73
					if (term.length() > 1 && term.equals(term.toUpperCase())) {
74
						surname.add(term);
75
					} else {
76
						name.add(term);
77
					}
78
				}
79
			}
80
		}
81
	}
82

  
83
	private List<String> splitTerms(String s) {
84
		if (particles == null) {
85
			particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
86
		}
87

  
88
		List<String> list = Lists.newArrayList();
89
		for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
90
			if (!particles.contains(part.toLowerCase())) {
91
				list.add(part);
92
			}
93
		}
94
		return list;
95
	}
96

  
97
	public List<String> getName() {
98
		return name;
99
	}
100

  
101
	public List<String> getSurname() {
102
		return surname;
103
	}
104

  
105
	public List<String> getFullname() {
106
		return fullname;
107
	}
108

  
109
	public String getOriginal() {
110
		return original;
111
	}
112

  
113
	public String hash() {
114
		return Hashing.murmur3_128().hashString(getNormalisedFullname()).toString();
115
	}
116

  
117
	public String getNormalisedFirstName() {
118
		return Joiner.on(" ").join(getCapitalFirstnames());
119
	}
120

  
121
	public String getNormalisedSurname() {
122
		return Joiner.on(" ").join(getCapitalSurname());
123
	}
124

  
125
	public String getNormalisedFullname() {
126
		return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
127
	}
128

  
129
	public List<String> getCapitalFirstnames() {
130
		return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
131
	}
132

  
133
	public List<String> getCapitalSurname() {
134
		return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
135
	}
136

  
137
	public List<String> getNameWithAbbreviations() {
138
		return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
139
	}
140

  
141
	public boolean isAccurate() {
142
		return (name != null && surname != null && !name.isEmpty() && !surname.isEmpty());
143
	}
144
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/MapDocument.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Map;
4
import java.util.Set;
5

  
6
import com.google.common.collect.Iterables;
7
import com.google.common.collect.Lists;
8
import com.google.common.collect.Maps;
9

  
10
/**
11
 * The Class MapDocument.
12
 */
13
public class MapDocument implements Document {
14

  
15
	/** The identifier. */
16
	private String identifier;
17

  
18
	/** The field map. */
19
	private Map<String, FieldListImpl> fieldMap;
20

  
21
	/**
22
	 * Instantiates a new map document.
23
	 */
24
	public MapDocument() {
25
		identifier = null;
26
		fieldMap = Maps.newHashMap();
27
	}
28

  
29
	/**
30
	 * Instantiates a new map document.
31
	 *
32
	 * @param identifier
33
	 *            the identifier
34
	 * @param fieldMap
35
	 *            the field map
36
	 */
37
	public MapDocument(final String identifier, final Map<String, FieldListImpl> fieldMap) {
38
		this.setIdentifier(identifier);
39
		this.fieldMap = fieldMap;
40
	}
41

  
42
	/**
43
	 * Instantiates a new map document.
44
	 *
45
	 * @param identifier
46
	 *            the identifier
47
	 * @param data
48
	 *            the data
49
	 */
50
	public MapDocument(final String identifier, final byte[] data) {
51
		MapDocument doc = MapDocumentSerializer.decode(data);
52

  
53
		this.fieldMap = doc.fieldMap;
54
		this.identifier = doc.identifier;
55
	}
56

  
57
	/*
58
	 * (non-Javadoc)
59
	 *
60
	 * @see eu.dnetlib.pace.model.document.Document#fields()
61
	 */
62
	@Override
63
	public Iterable<Field> fields() {
64
		return Lists.newArrayList(Iterables.concat(fieldMap.values()));
65
	}
66

  
67
	/*
68
	 * (non-Javadoc)
69
	 *
70
	 * @see eu.dnetlib.pace.model.document.Document#values(java.lang.String)
71
	 */
72
	@Override
73
	public FieldList values(final String name) {
74
		return fieldMap.get(name);
75
	}
76

  
77
	/*
78
	 * (non-Javadoc)
79
	 *
80
	 * @see eu.dnetlib.pace.model.document.Document#fieldNames()
81
	 */
82
	@Override
83
	public Set<String> fieldNames() {
84
		return fieldMap.keySet();
85
	}
86

  
87
	/*
88
	 * (non-Javadoc)
89
	 *
90
	 * @see java.lang.Object#toString()
91
	 */
92
	@Override
93
	public String toString() {
94
		return MapDocumentSerializer.toString(this);
95
		// return String.format("Document(%s)", fieldMap.toString());
96
	}
97

  
98
	/**
99
	 * To byte array.
100
	 *
101
	 * @return the byte[]
102
	 */
103
	public byte[] toByteArray() {
104
		return MapDocumentSerializer.toByteArray(this);
105
	}
106

  
107
	/*
108
	 * (non-Javadoc)
109
	 *
110
	 * @see eu.dnetlib.pace.model.document.Document#getIdentifier()
111
	 */
112
	@Override
113
	public String getIdentifier() {
114
		return identifier;
115
	}
116

  
117
	/**
118
	 * Sets the identifier.
119
	 *
120
	 * @param identifier
121
	 *            the new identifier
122
	 */
123
	public void setIdentifier(final String identifier) {
124
		this.identifier = identifier;
125
	}
126

  
127
	/**
128
	 * Gets the field map.
129
	 *
130
	 * @return the field map
131
	 */
132
	public Map<String, FieldListImpl> getFieldMap() {
133
		return fieldMap;
134
	}
135

  
136
	/**
137
	 * Sets the field map.
138
	 *
139
	 * @param fieldMap
140
	 *            the field map
141
	 */
142
	public void setFieldMap(final Map<String, FieldListImpl> fieldMap) {
143
		this.fieldMap = fieldMap;
144
	}
145

  
146
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/Config.java
1
package eu.dnetlib.pace.config;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import eu.dnetlib.pace.condition.ConditionAlgo;
7
import eu.dnetlib.pace.model.ClusteringDef;
8
import eu.dnetlib.pace.model.FieldDef;
9

  
10
/**
11
 * Interface for PACE configuration bean.
12
 *
13
 * @author claudio
14
 */
15
public interface Config {
16

  
17
	/**
18
	 * Field configuration definitions.
19
	 *
20
	 * @return the list of definitions
21
	 */
22
	public List<FieldDef> model();
23

  
24
	/**
25
	 * Strict Pre-Condition definitions.
26
	 *
27
	 * @return the list of conditions
28
	 */
29
	public List<ConditionAlgo> strictConditions();
30

  
31
	/**
32
	 * Pre-Condition definitions.
33
	 *
34
	 * @return the list of conditions
35
	 */
36
	public List<ConditionAlgo> conditions();
37

  
38
	/**
39
	 * Clusterings.
40
	 *
41
	 * @return the list
42
	 */
43
	public List<ClusteringDef> clusterings();
44

  
45
	/**
46
	 * Blacklists.
47
	 *
48
	 * @return the map
49
	 */
50
	public Map<String, List<String>> blacklists();
51

  
52
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/FieldList.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.List;
4

  
5
/**
6
 * The Interface FieldList.
7
 */
8
public interface FieldList extends List<Field>, Field {
9

  
10
	/**
11
	 * String list.
12
	 * 
13
	 * @return the list
14
	 */
15
	public List<String> stringList();
16

  
17
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/Algo.java
1
package eu.dnetlib.pace.config;
2

  
3
/**
 * Identifiers of the available distance algorithms.
 */
public enum Algo {

	/** Jaro-Winkler. */
	JaroWinkler,

	/** Jaro-Winkler, title variant. */
	JaroWinklerTitle,

	/** Levenstein. */
	Levenstein,

	/** Level2 Jaro-Winkler. */
	Level2JaroWinkler,

	/** Level2 Levenstein. */
	Level2Levenstein,

	/** Sub-string Levenstein. */
	SubStringLevenstein,

	/** Year Levenstein. */
	YearLevenstein,

	/** Sorted Jaro-Winkler. */
	SortedJaroWinkler,

	/** Sorted Level2 Jaro-Winkler. */
	SortedLevel2JaroWinkler,

	/** The null algorithm. */
	Null
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/DocumentBuilder.java
1
package eu.dnetlib.pace.model;
2

  
3
import java.util.Map;
4

  
5
/**
6
 * The Class DocumentBuilder.
7
 */
8
public class DocumentBuilder {
9

  
10
	/**
11
	 * New instance.
12
	 *
13
	 * @param identifier
14
	 *            the identifier
15
	 * @param fieldMap
16
	 *            the field map
17
	 * @return the map document
18
	 */
19
	public static MapDocument newInstance(final String identifier, final Map<String, FieldListImpl> fieldMap) {
20
		return new MapDocument(identifier, fieldMap);
21
	}
22

  
23
	/**
24
	 * New instance.
25
	 *
26
	 * @param identifier
27
	 *            the identifier
28
	 * @param fieldMap
29
	 *            the field map
30
	 * @return the map document
31
	 */
32
	public static MapDocument newInstance(final String identifier, final byte[] fieldMap) {
33
		return new MapDocument(identifier, fieldMap);
34
	}
35

  
36
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
1
package eu.dnetlib.pace.distance;
2

  
3
import java.util.List;
4

  
5
import eu.dnetlib.pace.condition.ConditionAlgo;
6
import eu.dnetlib.pace.model.Document;
7
import eu.dnetlib.pace.model.Field;
8
import eu.dnetlib.pace.model.FieldDef;
9

  
10
/**
11
 * The distance between two documents is given by the weighted mean of the field distances
12
 */
13
public class DistanceScorer {
14

  
15
	private List<FieldDef> fields;
16

  
17
	private List<ConditionAlgo> strictConditions;
18

  
19
	private List<ConditionAlgo> conditions;
20

  
21
	public DistanceScorer(final List<FieldDef> fields, final List<ConditionAlgo> strictConditions, final List<ConditionAlgo> conditions) {
22
		this.fields = fields;
23
		this.conditions = conditions;
24
		this.strictConditions = strictConditions;
25
	}
26

  
27
	public double distance(final Document a, final Document b) {
28

  
29
		double w = sumWeights(fields);
30
		double sum = 0.0;
31
		final int cond = verify(a, b, strictConditions, true);
32

  
33
		if (cond > 0) return 1.0;
34
		if (cond < 0) return 0.0;
35

  
36
		if (verify(a, b, conditions, true) >= 0) {
37
			for (final FieldDef fd : fields) {
38
				final double d = fieldDistance(a, b, fd);
39

  
40
				if (d > 0) {
41
					sum += d;
42
				} else {
43
					w -= fd.getWeight();
44
				}
45
			}
46
			return w == 0 ? 0 : sum / w;
47
		}
48
		return 0.0;
49
	}
50

  
51
	private int verify(final Document a, final Document b, final List<ConditionAlgo> conditions, final boolean strict) {
52
		int cond = 0;
53

  
54
		for (final ConditionAlgo cd : conditions) {
55
			final int verify = cd.verify(a, b);
56
			if (strict && (verify < 0)) return -1;
57
			cond += verify;
58
		}
59
		return cond;
60
	}
61

  
62
	private double fieldDistance(final Document a, final Document b, final FieldDef fd) {
63
		final double w = fd.getWeight();
64
		if ((w == 0)) return 0.0; // optimization for 0 weight
65
		else {
66
			final Field va = getValue(a, fd);
67
			final Field vb = getValue(b, fd);
68

  
69
			if (va.isEmpty() || vb.isEmpty()) {
70
				if (fd.isIgnoreMissing()) return -1;
71
				else return w;
72
			} else {
73

  
74
				if (va.getType().equals(vb.getType())) {
75
					final double d = fd.getDistanceAlgo().distance(va, vb);
76
					return w * d;
77
				}
78
				throw new IllegalArgumentException("Types are differents type");
79
			}
80
		}
81
	}
82

  
83
	private Field getValue(final Document d, final FieldDef fd) {
84
		return d.values(fd.getName());
85
	}
86

  
87
	private double sumWeights(final List<FieldDef> fields) {
88
		double sum = 0.0;
89
		for (final FieldDef fd : fields) {
90
			sum += fd.getWeight();
91
		}
92
		return sum;
93
	}
94

  
95
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/WfConfig.java
1
package eu.dnetlib.pace.config;
2

  
3
import java.util.HashSet;
4
import java.util.List;
5
import java.util.Set;
6

  
7
import com.google.common.collect.Lists;
8
import com.google.common.collect.Sets;
9
import com.google.gson.GsonBuilder;
10

  
11
public class WfConfig {
12

  
13
	/**
14
	 * Entity type.
15
	 */
16
	private String entityType = "";
17

  
18
	/**
19
	 * Field name used to sort the values in the reducer phase.
20
	 */
21
	private String orderField = "";
22

  
23
	/**
24
	 * Column Families involved in the relations redirection.
25
	 */
26
	private List<String> rootBuilder = Lists.newArrayList();
27

  
28
	/**
29
	 * Set of datasource namespace prefixes that won't be deduplicated.
30
	 */
31
	private Set<String> skipList = Sets.newHashSet();
32

  
33
	/**
34
	 * Subprefix used to build the root id, allows multiple dedup runs.
35
	 */
36
	private String dedupRun = "";
37

  
38
	/**
39
	 * Similarity threshold.
40
	 */
41
	private double threshold = 0;
42

  
43
	/** The queue max size. */
44
	private int queueMaxSize = 2000;
45

  
46
	/** The group max size. */
47
	private int groupMaxSize;
48

  
49
	/** The sliding window size. */
50
	private int slidingWindowSize;
51

  
52
	/** The configuration id. */
53
	private String configurationId;
54

  
55
	/** The include children. */
56
	private boolean includeChildren;
57

  
58
	/** Default maximum number of allowed children. */
59
	private final static int MAX_CHILDREN = 50;
60

  
61
	/** Maximum number of allowed children. */
62
	private int maxChildren = MAX_CHILDREN;
63

  
64
	public WfConfig() {}
65

  
66
	/**
67
	 * Instantiates a new dedup config.
68
	 *
69
	 * @param entityType
70
	 *            the entity type
71
	 * @param orderField
72
	 *            the order field
73
	 * @param rootBuilder
74
	 *            the root builder families
75
	 * @param dedupRun
76
	 *            the dedup run
77
	 * @param configurationId
78
	 *            the configuration identifier
79
	 * @param threshold
80
	 *            the threshold
81
	 * @param skipList
82
	 *            the skip list
83
	 * @param queueMaxSize
84
	 *            the queue max size
85
	 * @param groupMaxSize
86
	 *            the group max size
87
	 * @param slidingWindowSize
88
	 *            the sliding window size
89
	 * @param includeChildren
90
	 *            allows the children to be included in the representative records or not.
91
	 */
92
	public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
93
			final double threshold,
94
			final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) {
95
		super();
96
		this.entityType = entityType;
97
		this.orderField = orderField;
98
		this.rootBuilder = rootBuilder;
99
		this.dedupRun = cleanupStringNumber(dedupRun);
100
		this.threshold = threshold;
101
		this.skipList = skipList;
102
		this.queueMaxSize = queueMaxSize;
103
		this.groupMaxSize = groupMaxSize;
104
		this.slidingWindowSize = slidingWindowSize;
105
		this.includeChildren = includeChildren;
106
	}
107

  
108
	/**
109
	 * Cleanup string number.
110
	 *
111
	 * @param s
112
	 *            the s
113
	 * @return the string
114
	 */
115
	private String cleanupStringNumber(final String s) {
116
		return s.contains("'") ? s.replaceAll("'", "") : s;
117
	}
118

  
119
	public String getEntityType() {
120
		return entityType;
121
	}
122

  
123
	public void setEntityType(final String entityType) {
124
		this.entityType = entityType;
125
	}
126

  
127
	public String getOrderField() {
128
		return orderField;
129
	}
130

  
131
	public void setOrderField(final String orderField) {
132
		this.orderField = orderField;
133
	}
134

  
135
	public List<String> getRootBuilder() {
136
		return rootBuilder;
137
	}
138

  
139
	public void setRootBuilder(final List<String> rootBuilder) {
140
		this.rootBuilder = rootBuilder;
141
	}
142

  
143
	public Set<String> getSkipList() {
144
		return skipList != null ? skipList : new HashSet<String>();
145
	}
146

  
147
	public void setSkipList(final Set<String> skipList) {
148
		this.skipList = skipList;
149
	}
150

  
151
	public String getDedupRun() {
152
		return dedupRun;
153
	}
154

  
155
	public void setDedupRun(final String dedupRun) {
156
		this.dedupRun = dedupRun;
157
	}
158

  
159
	public double getThreshold() {
160
		return threshold;
161
	}
162

  
163
	public void setThreshold(final double threshold) {
164
		this.threshold = threshold;
165
	}
166

  
167
	public int getQueueMaxSize() {
168
		return queueMaxSize;
169
	}
170

  
171
	public void setQueueMaxSize(final int queueMaxSize) {
172
		this.queueMaxSize = queueMaxSize;
173
	}
174

  
175
	public int getGroupMaxSize() {
176
		return groupMaxSize;
177
	}
178

  
179
	public void setGroupMaxSize(final int groupMaxSize) {
180
		this.groupMaxSize = groupMaxSize;
181
	}
182

  
183
	public int getSlidingWindowSize() {
184
		return slidingWindowSize;
185
	}
186

  
187
	public void setSlidingWindowSize(final int slidingWindowSize) {
188
		this.slidingWindowSize = slidingWindowSize;
189
	}
190

  
191
	public String getConfigurationId() {
192
		return configurationId;
193
	}
194

  
195
	public void setConfigurationId(final String configurationId) {
196
		this.configurationId = configurationId;
197
	}
198

  
199
	public boolean isIncludeChildren() {
200
		return includeChildren;
201
	}
202

  
203
	public void setIncludeChildren(final boolean includeChildren) {
204
		this.includeChildren = includeChildren;
205
	}
206

  
207
	public int getMaxChildren() {
208
		return maxChildren;
209
	}
210

  
211
	public void setMaxChildren(final int maxChildren) {
212
		this.maxChildren = maxChildren;
213
	}
214

  
215
	/*
216
	 * (non-Javadoc)
217
	 *
218
	 * @see java.lang.Object#toString()
219
	 */
220
	@Override
221
	public String toString() {
222
		return new GsonBuilder().setPrettyPrinting().create().toJson(this);
223
	}
224

  
225
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/SortedJaroWinkler.java
1
package eu.dnetlib.pace.distance;
2

  
3
import com.wcohen.ss.AbstractStringDistance;
4

  
5
/**
6
 * The Class SortedJaroWinkler.
7
 */
8
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
9

  
10
	/**
11
	 * Instantiates a new sorted jaro winkler.
12
	 * 
13
	 * @param weight
14
	 *            the weight
15
	 */
16
	public SortedJaroWinkler(final double weight) {
17
		super(weight, new com.wcohen.ss.JaroWinkler());
18
	}
19

  
20
	/**
21
	 * Instantiates a new sorted jaro winkler.
22
	 * 
23
	 * @param weight
24
	 *            the weight
25
	 * @param ssalgo
26
	 *            the ssalgo
27
	 */
28
	protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
29
		super(weight, ssalgo);
30
	}
31

  
32
	/*
33
	 * (non-Javadoc)
34
	 * 
35
	 * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
36
	 */
37
	@Override
38
	public double getWeight() {
39
		return super.weight;
40
	}
41

  
42
	/*
43
	 * (non-Javadoc)
44
	 * 
45
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
46
	 */
47
	@Override
48
	protected double normalize(final double d) {
49
		return d;
50
	}
51

  
52
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
1
package eu.dnetlib.pace.config;
2

  
3
import java.util.List;
4
import java.util.Map;
5

  
6
import org.apache.commons.collections.CollectionUtils;
7

  
8
import com.google.common.base.Predicate;
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Lists;
11

  
12
import eu.dnetlib.pace.condition.ConditionAlgo;
13
import eu.dnetlib.pace.model.ClusteringDef;
14
import eu.dnetlib.pace.model.CondDef;
15
import eu.dnetlib.pace.model.FieldDef;
16

  
17
public class PaceConfig {
18

  
19
	private List<FieldDef> model;
20
	private List<CondDef> strictConditions;
21
	private List<CondDef> conditions;
22
	private List<ClusteringDef> clustering;
23
	private Map<String, List<String>> blacklists;
24

  
25
	public PaceConfig() {}
26

  
27
	public List<FieldDef> getModel() {
28
		return model;
29
	}
30

  
31
	public void setModel(final List<FieldDef> fields) {
32
		this.model = fields;
33
	}
34

  
35
	public List<CondDef> getStrictConditions() {
36
		return strictConditions;
37
	}
38

  
39
	public void setStrictConditions(final List<CondDef> strictConditions) {
40
		this.strictConditions = strictConditions;
41
	}
42

  
43
	public List<CondDef> getConditions() {
44
		return conditions;
45
	}
46

  
47
	public List<ConditionAlgo> getConditionAlgos() {
48
		return asConditionAlgos(getConditions());
49
	}
50

  
51
	public List<ConditionAlgo> getStrictConditionAlgos() {
52
		return asConditionAlgos(getStrictConditions());
53
	}
54

  
55
	public void setConditions(final List<CondDef> conditions) {
56
		this.conditions = conditions;
57
	}
58

  
59
	public List<ClusteringDef> getClustering() {
60
		return clustering;
61
	}
62

  
63
	public void setClustering(final List<ClusteringDef> clustering) {
64
		this.clustering = clustering;
65
	}
66

  
67
	public Map<String, List<String>> getBlacklists() {
68
		return blacklists;
69
	}
70

  
71
	public void setBlacklists(final Map<String, List<String>> blacklists) {
72
		this.blacklists = blacklists;
73
	}
74

  
75
	// helper
76

  
77
	private List<ConditionAlgo> asConditionAlgos(final List<CondDef> defs) {
78
		final List<ConditionAlgo> algos = Lists.newArrayList();
79
		if (CollectionUtils.isEmpty(defs)) return algos;
80
		for (final CondDef cd : defs) {
81
			final List<FieldDef> fields = Lists.newArrayList(Iterables.filter(getModel(), new Predicate<FieldDef>() {
82

  
83
				@Override
84
				public boolean apply(final FieldDef fd) {
85

  
86
					return cd.getFields().contains(fd.getName());
87
				}
88
			}));
89
			algos.add(cd.getConditionAlgo(fields));
90
		}
91
		return algos;
92
	}
93

  
94
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/LevensteinDate.java
1
package eu.dnetlib.pace.distance;
2

  
3

  
4
/**
 * {@link Levenstein} subclass whose string distance is a constant.
 * <p>
 * NOTE(review): the class name suggests a date-aware comparison, but {@link #distance(String, String)} currently ignores its input and
 * always yields 1.0 - confirm whether this is a placeholder.
 */
public class LevensteinDate extends Levenstein {
5

  
6

  
7
	/**
	 * Creates an instance, forwarding {@code w} to the {@link Levenstein} constructor.
	 *
	 * @param w
	 *            value passed to the superclass; presumably the weight later returned by {@link #getWeight()} - confirm
	 */
	public LevensteinDate(double w) {
8
		super(w);
9
	}
10

  
11
	
12
	/**
	 * Returns a fixed distance regardless of the arguments.
	 *
	 * @param a
	 *            ignored
	 * @param b
	 *            ignored
	 * @return always {@code 1.0}
	 */
	@Override
13
	public double distance(String a, String b) {
14

  
15
		return 1.0;
16
	}
17
	
18

  
19
	
20
	/**
	 * Returns the {@code weight} field inherited from the superclass.
	 *
	 * @return the inherited weight
	 */
	@Override
21
	public double getWeight() {
22
		return super.weight;
23
	}
24

  
25
}
0 26

  
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/SubStringLevenstein.java
1
package eu.dnetlib.pace.distance;
2

  
3
import org.apache.commons.lang.StringUtils;
4

  
5
import com.wcohen.ss.AbstractStringDistance;
6

  
7
import eu.dnetlib.pace.config.Type;
8
import eu.dnetlib.pace.model.Field;
9

  
10
/**
11
 * The Class SubStringLevenstein.
12
 */
13
public class SubStringLevenstein extends SecondStringDistanceAlgo {
14

  
15
	/** The limit. */
16
	protected int limit;
17

  
18
	/**
19
	 * Instantiates a new sub string levenstein.
20
	 * 
21
	 * @param w
22
	 *            the w
23
	 */
24
	public SubStringLevenstein(final double w) {
25
		super(w, new com.wcohen.ss.Levenstein());
26
	}
27

  
28
	/**
29
	 * Instantiates a new sub string levenstein.
30
	 * 
31
	 * @param w
32
	 *            the w
33
	 * @param limit
34
	 *            the limit
35
	 */
36
	public SubStringLevenstein(final double w, final int limit) {
37
		super(w, new com.wcohen.ss.Levenstein());
38
		this.limit = limit;
39
	}
40

  
41
	/**
42
	 * Instantiates a new sub string levenstein.
43
	 * 
44
	 * @param w
45
	 *            the w
46
	 * @param limit
47
	 *            the limit
48
	 * @param ssalgo
49
	 *            the ssalgo
50
	 */
51
	protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
52
		super(w, ssalgo);
53
		this.limit = limit;
54
	}
55

  
56
	/*
57
	 * (non-Javadoc)
58
	 * 
59
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
60
	 */
61
	@Override
62
	public double distance(final Field a, final Field b) {
63
		if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
64
			return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
65

  
66
		throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
67
	}
68

  
69
	/*
70
	 * (non-Javadoc)
71
	 * 
72
	 * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
73
	 */
74
	@Override
75
	public double getWeight() {
76
		return super.weight;
77
	}
78

  
79
	/*
80
	 * (non-Javadoc)
81
	 * 
82
	 * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
83
	 */
84
	@Override
85
	protected double normalize(final double d) {
86
		return 1 / Math.pow(Math.abs(d) + 1, 0.1);
87
	}
88

  
89
}
0 90

  
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-hadoop-parent</artifactId>
6
		<version>1.0.0</version>
7
		<relativePath />
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-pace-core</artifactId>
12
	<packaging>jar</packaging>
13
	<version>2.0.0</version>
14
	<scm>
15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-pace-core/tags/dnet-pace-core-2.0.0</developerConnection>
16
	</scm>
17
	<dependencies>
18
		<dependency>
19
			<groupId>edu.cmu</groupId>
20
			<artifactId>secondstring</artifactId>
21
			<version>1.0.0</version>
22
		</dependency>
23
		<dependency>
24
			<groupId>com.google.guava</groupId>
25
			<artifactId>guava</artifactId>
26
			<version>${google.guava.version}</version>
27
		</dependency>
28
		<dependency>
29
			<groupId>com.google.code.gson</groupId>
30
			<artifactId>gson</artifactId>
31
			<version>${google.gson.version}</version>
32
		</dependency>
33
		<dependency>
34
			<groupId>commons-lang</groupId>
35
			<artifactId>commons-lang</artifactId>
36
			<version>${commons.lang.version}</version>
37
		</dependency>
38
		<dependency>
39
			<groupId>commons-io</groupId>
40
			<artifactId>commons-io</artifactId>
41
			<version>${commons.io.version}</version>
42
		</dependency>
43
		<dependency>
44
			<groupId>commons-collections</groupId>
45
			<artifactId>commons-collections</artifactId>
46
			<version>${commons.collections.version}</version>
47
		</dependency>
48
		<dependency>
49
			<groupId>org.antlr</groupId>
50
			<artifactId>stringtemplate</artifactId>
51
			<version>3.2</version>
52
		</dependency>	
53
		<dependency>
54
			<groupId>junit</groupId>
55
			<artifactId>junit</artifactId>
56
			<version>${junit.version}</version>
57
			<scope>test</scope>
58
		</dependency>
59
	</dependencies>
60
</project>
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java
1
package eu.dnetlib.pace.distance;
2

  
3
import eu.dnetlib.pace.model.Document;
4

  
5
/**
 * {@link AbstractDistance} specialization for inputs that already are {@link Document} instances, so no conversion is needed.
 */
public class PaceDocumentDistance extends AbstractDistance<Document> {
6

  
7
	/**
	 * Identity conversion: the input is already a {@link Document}.
	 *
	 * @param a
	 *            the document to convert
	 * @return {@code a} itself, unchanged
	 */
	@Override
8
	protected Document toDocument(Document a) {
9
		return a;
10
	}
11

  
12
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
1
package eu.dnetlib.pace.config;
2

  
3
import static org.junit.Assert.assertNotNull;
4

  
5
import java.io.IOException;
6

  
7
import org.junit.Test;
8

  
9
import eu.dnetlib.pace.AbstractPaceTest;
10

  
11
/**
 * Smoke test for loading a {@link DedupConfig} from a JSON resource.
 */
public class ConfigTest extends AbstractPaceTest {
12

  
13
	/**
	 * Loads {@code result.pace.conf.json} through {@code DedupConfig.load} and checks that a non-null configuration is produced.
	 *
	 * @throws IOException
	 *             declared by the method; presumably raised when the resource cannot be read - confirm against
	 *             {@code readFromClasspath}
	 */
	@Test
14
	public void test() throws IOException {
15
		// readFromClasspath is not defined in this class; presumably inherited from AbstractPaceTest
		final DedupConfig cfg = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
16

  
17
		assertNotNull(cfg);
18

  
19
		// dump the parsed configuration for manual inspection
		System.out.println(cfg);
20
	}
21

  
22
}
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff