Revision 36615
Added by Claudio Atzori over 9 years ago
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import com.google.gson.Gson; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.clustering.Acronyms; |
|
9 |
import eu.dnetlib.pace.clustering.Clustering; |
|
10 |
import eu.dnetlib.pace.clustering.ClusteringFunction; |
|
11 |
import eu.dnetlib.pace.clustering.NgramPairs; |
|
12 |
import eu.dnetlib.pace.clustering.Ngrams; |
|
13 |
import eu.dnetlib.pace.clustering.RandomClusteringFunction; |
|
14 |
import eu.dnetlib.pace.clustering.SpaceTrimmingFieldValue; |
|
15 |
import eu.dnetlib.pace.clustering.SuffixPrefix; |
|
16 |
|
|
17 |
public class ClusteringDef { |
|
18 |
|
|
19 |
private Clustering name; |
|
20 |
|
|
21 |
private List<String> fields; |
|
22 |
|
|
23 |
private Map<String, Integer> params; |
|
24 |
|
|
25 |
public ClusteringDef() {} |
|
26 |
|
|
27 |
public Clustering getName() { |
|
28 |
return name; |
|
29 |
} |
|
30 |
|
|
31 |
public void setName(final Clustering name) { |
|
32 |
this.name = name; |
|
33 |
} |
|
34 |
|
|
35 |
public ClusteringFunction getClusteringFunction() { |
|
36 |
switch (getName()) { |
|
37 |
case acronyms: |
|
38 |
return new Acronyms(getParams()); |
|
39 |
case ngrams: |
|
40 |
return new Ngrams(getParams()); |
|
41 |
case ngrampairs: |
|
42 |
return new NgramPairs(getParams()); |
|
43 |
case suffixprefix: |
|
44 |
return new SuffixPrefix(getParams()); |
|
45 |
case spacetrimmingfieldvalue: |
|
46 |
return new SpaceTrimmingFieldValue(getParams()); |
|
47 |
default: |
|
48 |
return new RandomClusteringFunction(getParams()); |
|
49 |
} |
|
50 |
} |
|
51 |
|
|
52 |
public List<String> getFields() { |
|
53 |
return fields; |
|
54 |
} |
|
55 |
|
|
56 |
public void setFields(final List<String> fields) { |
|
57 |
this.fields = fields; |
|
58 |
} |
|
59 |
|
|
60 |
public Map<String, Integer> getParams() { |
|
61 |
return params; |
|
62 |
} |
|
63 |
|
|
64 |
public void setParams(final Map<String, Integer> params) { |
|
65 |
this.params = params; |
|
66 |
} |
|
67 |
|
|
68 |
@Override |
|
69 |
public String toString() { |
|
70 |
return new Gson().toJson(this); |
|
71 |
} |
|
72 |
|
|
73 |
} |
|
0 | 74 |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
import java.util.List; |
|
5 |
|
|
6 |
import org.apache.commons.collections.iterators.SingletonIterator; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.config.Type; |
|
9 |
|
|
10 |
/** |
|
11 |
* The Class FieldValueImpl. |
|
12 |
*/ |
|
13 |
public class FieldValueImpl extends AbstractField implements FieldValue { |
|
14 |
|
|
15 |
/** The value. */ |
|
16 |
private Object value = null; |
|
17 |
|
|
18 |
/** |
|
19 |
* Instantiates a new field value impl. |
|
20 |
*/ |
|
21 |
public FieldValueImpl() {} |
|
22 |
|
|
23 |
/** |
|
24 |
* Instantiates a new field value impl. |
|
25 |
* |
|
26 |
* @param type |
|
27 |
* the type |
|
28 |
* @param name |
|
29 |
* the name |
|
30 |
* @param value |
|
31 |
* the value |
|
32 |
*/ |
|
33 |
public FieldValueImpl(final Type type, final String name, final Object value) { |
|
34 |
super(type, name); |
|
35 |
this.value = value; |
|
36 |
} |
|
37 |
|
|
38 |
/* |
|
39 |
* (non-Javadoc) |
|
40 |
* |
|
41 |
* @see eu.dnetlib.pace.model.Field#isEmpty() |
|
42 |
*/ |
|
43 |
@Override |
|
44 |
public boolean isEmpty() { |
|
45 |
if (value == null) return false; |
|
46 |
|
|
47 |
switch (type) { |
|
48 |
case String: |
|
49 |
return value.toString().isEmpty(); |
|
50 |
case List: |
|
51 |
List<?> list = (List<?>) value; |
|
52 |
return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty(); |
|
53 |
default: |
|
54 |
return true; |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
/* |
|
59 |
* (non-Javadoc) |
|
60 |
* |
|
61 |
* @see eu.dnetlib.pace.model.FieldValue#getValue() |
|
62 |
*/ |
|
63 |
@Override |
|
64 |
public Object getValue() { |
|
65 |
return value; |
|
66 |
} |
|
67 |
|
|
68 |
/* |
|
69 |
* (non-Javadoc) |
|
70 |
* |
|
71 |
* @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object) |
|
72 |
*/ |
|
73 |
@Override |
|
74 |
public void setValue(final Object value) { |
|
75 |
this.value = value; |
|
76 |
} |
|
77 |
|
|
78 |
/* |
|
79 |
* (non-Javadoc) |
|
80 |
* |
|
81 |
* @see eu.dnetlib.pace.model.Field#stringValue() |
|
82 |
*/ |
|
83 |
@Override |
|
84 |
// @SuppressWarnings("unchecked") |
|
85 |
public String stringValue() { |
|
86 |
return String.valueOf(getValue()); |
|
87 |
// switch (getType()) { |
|
88 |
// |
|
89 |
// case Int: |
|
90 |
// return String.valueOf(getValue()); |
|
91 |
// case List: |
|
92 |
// return Joiner.on(" ").join((List<String>) getValue()); |
|
93 |
// case String: |
|
94 |
// return (String) getValue(); |
|
95 |
// default: |
|
96 |
// throw new IllegalArgumentException("Unknown type: " + getType().toString()); |
|
97 |
// } |
|
98 |
} |
|
99 |
|
|
100 |
/* |
|
101 |
* (non-Javadoc) |
|
102 |
* |
|
103 |
* @see java.lang.Iterable#iterator() |
|
104 |
*/ |
|
105 |
@Override |
|
106 |
@SuppressWarnings("unchecked") |
|
107 |
public Iterator<Field> iterator() { |
|
108 |
return new SingletonIterator(this); |
|
109 |
} |
|
110 |
|
|
111 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/FieldValue.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
/** |
|
4 |
* The Interface FieldValue. |
|
5 |
*/ |
|
6 |
public interface FieldValue extends Field { |
|
7 |
|
|
8 |
/** |
|
9 |
* Gets the value. |
|
10 |
* |
|
11 |
* @return the value |
|
12 |
*/ |
|
13 |
public Object getValue(); |
|
14 |
|
|
15 |
/** |
|
16 |
* Sets the value. |
|
17 |
* |
|
18 |
* @param value |
|
19 |
* the new value |
|
20 |
*/ |
|
21 |
public void setValue(final Object value); |
|
22 |
|
|
23 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.lang.reflect.Type; |
|
4 |
|
|
5 |
import com.google.gson.Gson; |
|
6 |
import com.google.gson.GsonBuilder; |
|
7 |
import com.google.gson.InstanceCreator; |
|
8 |
import com.google.gson.JsonDeserializationContext; |
|
9 |
import com.google.gson.JsonDeserializer; |
|
10 |
import com.google.gson.JsonElement; |
|
11 |
import com.google.gson.JsonParseException; |
|
12 |
|
|
13 |
/** |
|
14 |
* The Class MapDocumentSerializer. |
|
15 |
*/ |
|
16 |
public class MapDocumentSerializer implements InstanceCreator<MapDocument> { |
|
17 |
|
|
18 |
@Override |
|
19 |
public MapDocument createInstance(final Type type) { |
|
20 |
return new MapDocument(); |
|
21 |
} |
|
22 |
|
|
23 |
/** |
|
24 |
* Decode. |
|
25 |
* |
|
26 |
* @param bytes |
|
27 |
* the bytes |
|
28 |
* @return the map document |
|
29 |
*/ |
|
30 |
public static MapDocument decode(final byte[] bytes) { |
|
31 |
|
|
32 |
GsonBuilder gson = new GsonBuilder(); |
|
33 |
|
|
34 |
gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() { |
|
35 |
|
|
36 |
@Override |
|
37 |
public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { |
|
38 |
FieldListImpl fl = new FieldListImpl(); |
|
39 |
if (json.isJsonObject()) { |
|
40 |
String name = json.getAsJsonObject().get("name").getAsString(); |
|
41 |
String type = json.getAsJsonObject().get("type").getAsString(); |
|
42 |
String value = json.getAsJsonObject().get("value").getAsString(); |
|
43 |
fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value)); |
|
44 |
} |
|
45 |
return fl; |
|
46 |
} |
|
47 |
}); |
|
48 |
|
|
49 |
return gson.create().fromJson(new String(bytes), MapDocument.class); |
|
50 |
} |
|
51 |
|
|
52 |
/** |
|
53 |
* To string. |
|
54 |
* |
|
55 |
* @param doc |
|
56 |
* the doc |
|
57 |
* @return the string |
|
58 |
*/ |
|
59 |
public static String toString(final MapDocument doc) { |
|
60 |
return new Gson().toJson(doc); |
|
61 |
} |
|
62 |
|
|
63 |
/** |
|
64 |
* To byte array. |
|
65 |
* |
|
66 |
* @param doc |
|
67 |
* the doc |
|
68 |
* @return the byte[] |
|
69 |
*/ |
|
70 |
public static byte[] toByteArray(final MapDocument doc) { |
|
71 |
return toString(doc).getBytes(); |
|
72 |
} |
|
73 |
|
|
74 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/Person.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.text.Normalizer; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Set; |
|
6 |
|
|
7 |
import com.google.common.base.Joiner; |
|
8 |
import com.google.common.base.Splitter; |
|
9 |
import com.google.common.collect.Iterables; |
|
10 |
import com.google.common.collect.Lists; |
|
11 |
import com.google.common.hash.Hashing; |
|
12 |
|
|
13 |
import eu.dnetlib.pace.clustering.NGramUtils; |
|
14 |
import eu.dnetlib.pace.util.Capitalise; |
|
15 |
import eu.dnetlib.pace.util.DotAbbreviations; |
|
16 |
|
|
17 |
public class Person { |
|
18 |
private List<String> name = Lists.newArrayList(); |
|
19 |
private List<String> surname = Lists.newArrayList(); |
|
20 |
private List<String> fullname = Lists.newArrayList(); |
|
21 |
private final String original; |
|
22 |
|
|
23 |
private static Set<String> particles = null; |
|
24 |
|
|
25 |
public Person(String s, boolean aggressive) { |
|
26 |
original = s; |
|
27 |
s = Normalizer.normalize(s, Normalizer.Form.NFD); |
|
28 |
s = s.replaceAll("\\(.+\\)", ""); |
|
29 |
s = s.replaceAll("\\[.+\\]", ""); |
|
30 |
s = s.replaceAll("\\{.+\\}", ""); |
|
31 |
s = s.replaceAll("\\s+-\\s+", "-"); |
|
32 |
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); |
|
33 |
s = s.replaceAll("\\d", " "); |
|
34 |
s = s.replaceAll("\\n", " "); |
|
35 |
s = s.replaceAll("\\.", " "); |
|
36 |
s = s.replaceAll("\\s+", " "); |
|
37 |
|
|
38 |
if (aggressive) { |
|
39 |
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); |
|
40 |
//s = s.replaceAll("[\\W&&[^,-]]", ""); |
|
41 |
} |
|
42 |
|
|
43 |
if (s.contains(",")) { |
|
44 |
String[] arr = s.split(","); |
|
45 |
if (arr.length == 1) { |
|
46 |
fullname = splitTerms(arr[0]); |
|
47 |
} else if (arr.length > 1) { |
|
48 |
surname = splitTerms(arr[0]); |
|
49 |
name = splitTerms(arr[1]); |
|
50 |
fullname.addAll(surname); |
|
51 |
fullname.addAll(name); |
|
52 |
} |
|
53 |
} else { |
|
54 |
fullname = splitTerms(s); |
|
55 |
|
|
56 |
int lastInitialPosition = fullname.size(); |
|
57 |
boolean hasSurnameInUpperCase = false; |
|
58 |
|
|
59 |
for (int i = 0; i < fullname.size(); i++) { |
|
60 |
String term = fullname.get(i); |
|
61 |
if (term.length() == 1) { |
|
62 |
lastInitialPosition = i; |
|
63 |
} else if (term.equals(term.toUpperCase())) { |
|
64 |
hasSurnameInUpperCase = true; |
|
65 |
} |
|
66 |
} |
|
67 |
|
|
68 |
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini |
|
69 |
name = fullname.subList(0, lastInitialPosition + 1); |
|
70 |
surname = fullname.subList(lastInitialPosition + 1, fullname.size()); |
|
71 |
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI |
|
72 |
for (String term : fullname) { |
|
73 |
if (term.length() > 1 && term.equals(term.toUpperCase())) { |
|
74 |
surname.add(term); |
|
75 |
} else { |
|
76 |
name.add(term); |
|
77 |
} |
|
78 |
} |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
82 |
|
|
83 |
private List<String> splitTerms(String s) { |
|
84 |
if (particles == null) { |
|
85 |
particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); |
|
86 |
} |
|
87 |
|
|
88 |
List<String> list = Lists.newArrayList(); |
|
89 |
for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { |
|
90 |
if (!particles.contains(part.toLowerCase())) { |
|
91 |
list.add(part); |
|
92 |
} |
|
93 |
} |
|
94 |
return list; |
|
95 |
} |
|
96 |
|
|
97 |
public List<String> getName() { |
|
98 |
return name; |
|
99 |
} |
|
100 |
|
|
101 |
public List<String> getSurname() { |
|
102 |
return surname; |
|
103 |
} |
|
104 |
|
|
105 |
public List<String> getFullname() { |
|
106 |
return fullname; |
|
107 |
} |
|
108 |
|
|
109 |
public String getOriginal() { |
|
110 |
return original; |
|
111 |
} |
|
112 |
|
|
113 |
public String hash() { |
|
114 |
return Hashing.murmur3_128().hashString(getNormalisedFullname()).toString(); |
|
115 |
} |
|
116 |
|
|
117 |
public String getNormalisedFirstName() { |
|
118 |
return Joiner.on(" ").join(getCapitalFirstnames()); |
|
119 |
} |
|
120 |
|
|
121 |
public String getNormalisedSurname() { |
|
122 |
return Joiner.on(" ").join(getCapitalSurname()); |
|
123 |
} |
|
124 |
|
|
125 |
public String getNormalisedFullname() { |
|
126 |
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname); |
|
127 |
} |
|
128 |
|
|
129 |
public List<String> getCapitalFirstnames() { |
|
130 |
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise())); |
|
131 |
} |
|
132 |
|
|
133 |
public List<String> getCapitalSurname() { |
|
134 |
return Lists.newArrayList(Iterables.transform(surname, new Capitalise())); |
|
135 |
} |
|
136 |
|
|
137 |
public List<String> getNameWithAbbreviations() { |
|
138 |
return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations())); |
|
139 |
} |
|
140 |
|
|
141 |
public boolean isAccurate() { |
|
142 |
return (name != null && surname != null && !name.isEmpty() && !surname.isEmpty()); |
|
143 |
} |
|
144 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/MapDocument.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Map; |
|
4 |
import java.util.Set; |
|
5 |
|
|
6 |
import com.google.common.collect.Iterables; |
|
7 |
import com.google.common.collect.Lists; |
|
8 |
import com.google.common.collect.Maps; |
|
9 |
|
|
10 |
/** |
|
11 |
* The Class MapDocument. |
|
12 |
*/ |
|
13 |
public class MapDocument implements Document { |
|
14 |
|
|
15 |
/** The identifier. */ |
|
16 |
private String identifier; |
|
17 |
|
|
18 |
/** The field map. */ |
|
19 |
private Map<String, FieldListImpl> fieldMap; |
|
20 |
|
|
21 |
/** |
|
22 |
* Instantiates a new map document. |
|
23 |
*/ |
|
24 |
public MapDocument() { |
|
25 |
identifier = null; |
|
26 |
fieldMap = Maps.newHashMap(); |
|
27 |
} |
|
28 |
|
|
29 |
/** |
|
30 |
* Instantiates a new map document. |
|
31 |
* |
|
32 |
* @param identifier |
|
33 |
* the identifier |
|
34 |
* @param fieldMap |
|
35 |
* the field map |
|
36 |
*/ |
|
37 |
public MapDocument(final String identifier, final Map<String, FieldListImpl> fieldMap) { |
|
38 |
this.setIdentifier(identifier); |
|
39 |
this.fieldMap = fieldMap; |
|
40 |
} |
|
41 |
|
|
42 |
/** |
|
43 |
* Instantiates a new map document. |
|
44 |
* |
|
45 |
* @param identifier |
|
46 |
* the identifier |
|
47 |
* @param data |
|
48 |
* the data |
|
49 |
*/ |
|
50 |
public MapDocument(final String identifier, final byte[] data) { |
|
51 |
MapDocument doc = MapDocumentSerializer.decode(data); |
|
52 |
|
|
53 |
this.fieldMap = doc.fieldMap; |
|
54 |
this.identifier = doc.identifier; |
|
55 |
} |
|
56 |
|
|
57 |
/* |
|
58 |
* (non-Javadoc) |
|
59 |
* |
|
60 |
* @see eu.dnetlib.pace.model.document.Document#fields() |
|
61 |
*/ |
|
62 |
@Override |
|
63 |
public Iterable<Field> fields() { |
|
64 |
return Lists.newArrayList(Iterables.concat(fieldMap.values())); |
|
65 |
} |
|
66 |
|
|
67 |
/* |
|
68 |
* (non-Javadoc) |
|
69 |
* |
|
70 |
* @see eu.dnetlib.pace.model.document.Document#values(java.lang.String) |
|
71 |
*/ |
|
72 |
@Override |
|
73 |
public FieldList values(final String name) { |
|
74 |
return fieldMap.get(name); |
|
75 |
} |
|
76 |
|
|
77 |
/* |
|
78 |
* (non-Javadoc) |
|
79 |
* |
|
80 |
* @see eu.dnetlib.pace.model.document.Document#fieldNames() |
|
81 |
*/ |
|
82 |
@Override |
|
83 |
public Set<String> fieldNames() { |
|
84 |
return fieldMap.keySet(); |
|
85 |
} |
|
86 |
|
|
87 |
/* |
|
88 |
* (non-Javadoc) |
|
89 |
* |
|
90 |
* @see java.lang.Object#toString() |
|
91 |
*/ |
|
92 |
@Override |
|
93 |
public String toString() { |
|
94 |
return MapDocumentSerializer.toString(this); |
|
95 |
// return String.format("Document(%s)", fieldMap.toString()); |
|
96 |
} |
|
97 |
|
|
98 |
/** |
|
99 |
* To byte array. |
|
100 |
* |
|
101 |
* @return the byte[] |
|
102 |
*/ |
|
103 |
public byte[] toByteArray() { |
|
104 |
return MapDocumentSerializer.toByteArray(this); |
|
105 |
} |
|
106 |
|
|
107 |
/* |
|
108 |
* (non-Javadoc) |
|
109 |
* |
|
110 |
* @see eu.dnetlib.pace.model.document.Document#getIdentifier() |
|
111 |
*/ |
|
112 |
@Override |
|
113 |
public String getIdentifier() { |
|
114 |
return identifier; |
|
115 |
} |
|
116 |
|
|
117 |
/** |
|
118 |
* Sets the identifier. |
|
119 |
* |
|
120 |
* @param identifier |
|
121 |
* the new identifier |
|
122 |
*/ |
|
123 |
public void setIdentifier(final String identifier) { |
|
124 |
this.identifier = identifier; |
|
125 |
} |
|
126 |
|
|
127 |
/** |
|
128 |
* Gets the field map. |
|
129 |
* |
|
130 |
* @return the field map |
|
131 |
*/ |
|
132 |
public Map<String, FieldListImpl> getFieldMap() { |
|
133 |
return fieldMap; |
|
134 |
} |
|
135 |
|
|
136 |
/** |
|
137 |
* Sets the field map. |
|
138 |
* |
|
139 |
* @param fieldMap |
|
140 |
* the field map |
|
141 |
*/ |
|
142 |
public void setFieldMap(final Map<String, FieldListImpl> fieldMap) { |
|
143 |
this.fieldMap = fieldMap; |
|
144 |
} |
|
145 |
|
|
146 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/Config.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import eu.dnetlib.pace.condition.ConditionAlgo; |
|
7 |
import eu.dnetlib.pace.model.ClusteringDef; |
|
8 |
import eu.dnetlib.pace.model.FieldDef; |
|
9 |
|
|
10 |
/** |
|
11 |
* Interface for PACE configuration bean. |
|
12 |
* |
|
13 |
* @author claudio |
|
14 |
*/ |
|
15 |
public interface Config { |
|
16 |
|
|
17 |
/** |
|
18 |
* Field configuration definitions. |
|
19 |
* |
|
20 |
* @return the list of definitions |
|
21 |
*/ |
|
22 |
public List<FieldDef> model(); |
|
23 |
|
|
24 |
/** |
|
25 |
* Strict Pre-Condition definitions. |
|
26 |
* |
|
27 |
* @return the list of conditions |
|
28 |
*/ |
|
29 |
public List<ConditionAlgo> strictConditions(); |
|
30 |
|
|
31 |
/** |
|
32 |
* Pre-Condition definitions. |
|
33 |
* |
|
34 |
* @return the list of conditions |
|
35 |
*/ |
|
36 |
public List<ConditionAlgo> conditions(); |
|
37 |
|
|
38 |
/** |
|
39 |
* Clusterings. |
|
40 |
* |
|
41 |
* @return the list |
|
42 |
*/ |
|
43 |
public List<ClusteringDef> clusterings(); |
|
44 |
|
|
45 |
/** |
|
46 |
* Blacklists. |
|
47 |
* |
|
48 |
* @return the map |
|
49 |
*/ |
|
50 |
public Map<String, List<String>> blacklists(); |
|
51 |
|
|
52 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/FieldList.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
/** |
|
6 |
* The Interface FieldList. |
|
7 |
*/ |
|
8 |
public interface FieldList extends List<Field>, Field { |
|
9 |
|
|
10 |
/** |
|
11 |
* String list. |
|
12 |
* |
|
13 |
* @return the list |
|
14 |
*/ |
|
15 |
public List<String> stringList(); |
|
16 |
|
|
17 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/Algo.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
/**
 * Identifiers of the available distance algorithms.
 */
public enum Algo {

    /** Jaro-Winkler. */
    JaroWinkler,

    /** Jaro-Winkler, title variant. */
    JaroWinklerTitle,

    /** Levenstein. */
    Levenstein,

    /** Level 2 Jaro-Winkler. */
    Level2JaroWinkler,

    /** Level 2 Levenstein. */
    Level2Levenstein,

    /** Sub-string Levenstein. */
    SubStringLevenstein,

    /** Year Levenstein. */
    YearLevenstein,

    /** Sorted Jaro-Winkler. */
    SortedJaroWinkler,

    /** Sorted level 2 Jaro-Winkler. */
    SortedLevel2JaroWinkler,

    /** Null algorithm. */
    Null
}
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/model/DocumentBuilder.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Map; |
|
4 |
|
|
5 |
/** |
|
6 |
* The Class DocumentBuilder. |
|
7 |
*/ |
|
8 |
public class DocumentBuilder { |
|
9 |
|
|
10 |
/** |
|
11 |
* New instance. |
|
12 |
* |
|
13 |
* @param identifier |
|
14 |
* the identifier |
|
15 |
* @param fieldMap |
|
16 |
* the field map |
|
17 |
* @return the map document |
|
18 |
*/ |
|
19 |
public static MapDocument newInstance(final String identifier, final Map<String, FieldListImpl> fieldMap) { |
|
20 |
return new MapDocument(identifier, fieldMap); |
|
21 |
} |
|
22 |
|
|
23 |
/** |
|
24 |
* New instance. |
|
25 |
* |
|
26 |
* @param identifier |
|
27 |
* the identifier |
|
28 |
* @param fieldMap |
|
29 |
* the field map |
|
30 |
* @return the map document |
|
31 |
*/ |
|
32 |
public static MapDocument newInstance(final String identifier, final byte[] fieldMap) { |
|
33 |
return new MapDocument(identifier, fieldMap); |
|
34 |
} |
|
35 |
|
|
36 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import eu.dnetlib.pace.condition.ConditionAlgo; |
|
6 |
import eu.dnetlib.pace.model.Document; |
|
7 |
import eu.dnetlib.pace.model.Field; |
|
8 |
import eu.dnetlib.pace.model.FieldDef; |
|
9 |
|
|
10 |
/** |
|
11 |
* The distance between two documents is given by the weighted mean of the field distances |
|
12 |
*/ |
|
13 |
public class DistanceScorer { |
|
14 |
|
|
15 |
private List<FieldDef> fields; |
|
16 |
|
|
17 |
private List<ConditionAlgo> strictConditions; |
|
18 |
|
|
19 |
private List<ConditionAlgo> conditions; |
|
20 |
|
|
21 |
public DistanceScorer(final List<FieldDef> fields, final List<ConditionAlgo> strictConditions, final List<ConditionAlgo> conditions) { |
|
22 |
this.fields = fields; |
|
23 |
this.conditions = conditions; |
|
24 |
this.strictConditions = strictConditions; |
|
25 |
} |
|
26 |
|
|
27 |
public double distance(final Document a, final Document b) { |
|
28 |
|
|
29 |
double w = sumWeights(fields); |
|
30 |
double sum = 0.0; |
|
31 |
final int cond = verify(a, b, strictConditions, true); |
|
32 |
|
|
33 |
if (cond > 0) return 1.0; |
|
34 |
if (cond < 0) return 0.0; |
|
35 |
|
|
36 |
if (verify(a, b, conditions, true) >= 0) { |
|
37 |
for (final FieldDef fd : fields) { |
|
38 |
final double d = fieldDistance(a, b, fd); |
|
39 |
|
|
40 |
if (d > 0) { |
|
41 |
sum += d; |
|
42 |
} else { |
|
43 |
w -= fd.getWeight(); |
|
44 |
} |
|
45 |
} |
|
46 |
return w == 0 ? 0 : sum / w; |
|
47 |
} |
|
48 |
return 0.0; |
|
49 |
} |
|
50 |
|
|
51 |
private int verify(final Document a, final Document b, final List<ConditionAlgo> conditions, final boolean strict) { |
|
52 |
int cond = 0; |
|
53 |
|
|
54 |
for (final ConditionAlgo cd : conditions) { |
|
55 |
final int verify = cd.verify(a, b); |
|
56 |
if (strict && (verify < 0)) return -1; |
|
57 |
cond += verify; |
|
58 |
} |
|
59 |
return cond; |
|
60 |
} |
|
61 |
|
|
62 |
private double fieldDistance(final Document a, final Document b, final FieldDef fd) { |
|
63 |
final double w = fd.getWeight(); |
|
64 |
if ((w == 0)) return 0.0; // optimization for 0 weight |
|
65 |
else { |
|
66 |
final Field va = getValue(a, fd); |
|
67 |
final Field vb = getValue(b, fd); |
|
68 |
|
|
69 |
if (va.isEmpty() || vb.isEmpty()) { |
|
70 |
if (fd.isIgnoreMissing()) return -1; |
|
71 |
else return w; |
|
72 |
} else { |
|
73 |
|
|
74 |
if (va.getType().equals(vb.getType())) { |
|
75 |
final double d = fd.getDistanceAlgo().distance(va, vb); |
|
76 |
return w * d; |
|
77 |
} |
|
78 |
throw new IllegalArgumentException("Types are differents type"); |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
82 |
|
|
83 |
private Field getValue(final Document d, final FieldDef fd) { |
|
84 |
return d.values(fd.getName()); |
|
85 |
} |
|
86 |
|
|
87 |
private double sumWeights(final List<FieldDef> fields) { |
|
88 |
double sum = 0.0; |
|
89 |
for (final FieldDef fd : fields) { |
|
90 |
sum += fd.getWeight(); |
|
91 |
} |
|
92 |
return sum; |
|
93 |
} |
|
94 |
|
|
95 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/WfConfig.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
import java.util.HashSet; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Set; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
import com.google.common.collect.Sets; |
|
9 |
import com.google.gson.GsonBuilder; |
|
10 |
|
|
11 |
public class WfConfig { |
|
12 |
|
|
13 |
/** |
|
14 |
* Entity type. |
|
15 |
*/ |
|
16 |
private String entityType = ""; |
|
17 |
|
|
18 |
/** |
|
19 |
* Field name used to sort the values in the reducer phase. |
|
20 |
*/ |
|
21 |
private String orderField = ""; |
|
22 |
|
|
23 |
/** |
|
24 |
* Column Families involved in the relations redirection. |
|
25 |
*/ |
|
26 |
private List<String> rootBuilder = Lists.newArrayList(); |
|
27 |
|
|
28 |
/** |
|
29 |
* Set of datasource namespace prefixes that won't be deduplicated. |
|
30 |
*/ |
|
31 |
private Set<String> skipList = Sets.newHashSet(); |
|
32 |
|
|
33 |
/** |
|
34 |
* Subprefix used to build the root id, allows multiple dedup runs. |
|
35 |
*/ |
|
36 |
private String dedupRun = ""; |
|
37 |
|
|
38 |
/** |
|
39 |
* Similarity threshold. |
|
40 |
*/ |
|
41 |
private double threshold = 0; |
|
42 |
|
|
43 |
/** The queue max size. */ |
|
44 |
private int queueMaxSize = 2000; |
|
45 |
|
|
46 |
/** The group max size. */ |
|
47 |
private int groupMaxSize; |
|
48 |
|
|
49 |
/** The sliding window size. */ |
|
50 |
private int slidingWindowSize; |
|
51 |
|
|
52 |
/** The configuration id. */ |
|
53 |
private String configurationId; |
|
54 |
|
|
55 |
/** The include children. */ |
|
56 |
private boolean includeChildren; |
|
57 |
|
|
58 |
/** Default maximum number of allowed children. */ |
|
59 |
private final static int MAX_CHILDREN = 50; |
|
60 |
|
|
61 |
/** Maximum number of allowed children. */ |
|
62 |
private int maxChildren = MAX_CHILDREN; |
|
63 |
|
|
64 |
public WfConfig() {} |
|
65 |
|
|
66 |
/** |
|
67 |
* Instantiates a new dedup config. |
|
68 |
* |
|
69 |
* @param entityType |
|
70 |
* the entity type |
|
71 |
* @param orderField |
|
72 |
* the order field |
|
73 |
* @param rootBuilder |
|
74 |
* the root builder families |
|
75 |
* @param dedupRun |
|
76 |
* the dedup run |
|
77 |
* @param configurationId |
|
78 |
* the configuration identifier |
|
79 |
* @param threshold |
|
80 |
* the threshold |
|
81 |
* @param skipList |
|
82 |
* the skip list |
|
83 |
* @param queueMaxSize |
|
84 |
* the queue max size |
|
85 |
* @param groupMaxSize |
|
86 |
* the group max size |
|
87 |
* @param slidingWindowSize |
|
88 |
* the sliding window size |
|
89 |
* @param includeChildren |
|
90 |
* allows the children to be included in the representative records or not. |
|
91 |
*/ |
|
92 |
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun, |
|
93 |
final double threshold, |
|
94 |
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) { |
|
95 |
super(); |
|
96 |
this.entityType = entityType; |
|
97 |
this.orderField = orderField; |
|
98 |
this.rootBuilder = rootBuilder; |
|
99 |
this.dedupRun = cleanupStringNumber(dedupRun); |
|
100 |
this.threshold = threshold; |
|
101 |
this.skipList = skipList; |
|
102 |
this.queueMaxSize = queueMaxSize; |
|
103 |
this.groupMaxSize = groupMaxSize; |
|
104 |
this.slidingWindowSize = slidingWindowSize; |
|
105 |
this.includeChildren = includeChildren; |
|
106 |
} |
|
107 |
|
|
108 |
/** |
|
109 |
* Cleanup string number. |
|
110 |
* |
|
111 |
* @param s |
|
112 |
* the s |
|
113 |
* @return the string |
|
114 |
*/ |
|
115 |
private String cleanupStringNumber(final String s) { |
|
116 |
return s.contains("'") ? s.replaceAll("'", "") : s; |
|
117 |
} |
|
118 |
|
|
119 |
public String getEntityType() { |
|
120 |
return entityType; |
|
121 |
} |
|
122 |
|
|
123 |
public void setEntityType(final String entityType) { |
|
124 |
this.entityType = entityType; |
|
125 |
} |
|
126 |
|
|
127 |
public String getOrderField() { |
|
128 |
return orderField; |
|
129 |
} |
|
130 |
|
|
131 |
public void setOrderField(final String orderField) { |
|
132 |
this.orderField = orderField; |
|
133 |
} |
|
134 |
|
|
135 |
public List<String> getRootBuilder() { |
|
136 |
return rootBuilder; |
|
137 |
} |
|
138 |
|
|
139 |
public void setRootBuilder(final List<String> rootBuilder) { |
|
140 |
this.rootBuilder = rootBuilder; |
|
141 |
} |
|
142 |
|
|
143 |
public Set<String> getSkipList() { |
|
144 |
return skipList != null ? skipList : new HashSet<String>(); |
|
145 |
} |
|
146 |
|
|
147 |
public void setSkipList(final Set<String> skipList) { |
|
148 |
this.skipList = skipList; |
|
149 |
} |
|
150 |
|
|
151 |
public String getDedupRun() { |
|
152 |
return dedupRun; |
|
153 |
} |
|
154 |
|
|
155 |
public void setDedupRun(final String dedupRun) { |
|
156 |
this.dedupRun = dedupRun; |
|
157 |
} |
|
158 |
|
|
159 |
public double getThreshold() { |
|
160 |
return threshold; |
|
161 |
} |
|
162 |
|
|
163 |
public void setThreshold(final double threshold) { |
|
164 |
this.threshold = threshold; |
|
165 |
} |
|
166 |
|
|
167 |
public int getQueueMaxSize() { |
|
168 |
return queueMaxSize; |
|
169 |
} |
|
170 |
|
|
171 |
public void setQueueMaxSize(final int queueMaxSize) { |
|
172 |
this.queueMaxSize = queueMaxSize; |
|
173 |
} |
|
174 |
|
|
175 |
public int getGroupMaxSize() { |
|
176 |
return groupMaxSize; |
|
177 |
} |
|
178 |
|
|
179 |
public void setGroupMaxSize(final int groupMaxSize) { |
|
180 |
this.groupMaxSize = groupMaxSize; |
|
181 |
} |
|
182 |
|
|
183 |
public int getSlidingWindowSize() { |
|
184 |
return slidingWindowSize; |
|
185 |
} |
|
186 |
|
|
187 |
public void setSlidingWindowSize(final int slidingWindowSize) { |
|
188 |
this.slidingWindowSize = slidingWindowSize; |
|
189 |
} |
|
190 |
|
|
191 |
public String getConfigurationId() { |
|
192 |
return configurationId; |
|
193 |
} |
|
194 |
|
|
195 |
public void setConfigurationId(final String configurationId) { |
|
196 |
this.configurationId = configurationId; |
|
197 |
} |
|
198 |
|
|
199 |
public boolean isIncludeChildren() { |
|
200 |
return includeChildren; |
|
201 |
} |
|
202 |
|
|
203 |
public void setIncludeChildren(final boolean includeChildren) { |
|
204 |
this.includeChildren = includeChildren; |
|
205 |
} |
|
206 |
|
|
207 |
public int getMaxChildren() { |
|
208 |
return maxChildren; |
|
209 |
} |
|
210 |
|
|
211 |
public void setMaxChildren(final int maxChildren) { |
|
212 |
this.maxChildren = maxChildren; |
|
213 |
} |
|
214 |
|
|
215 |
/* |
|
216 |
* (non-Javadoc) |
|
217 |
* |
|
218 |
* @see java.lang.Object#toString() |
|
219 |
*/ |
|
220 |
@Override |
|
221 |
public String toString() { |
|
222 |
return new GsonBuilder().setPrettyPrinting().create().toJson(this); |
|
223 |
} |
|
224 |
|
|
225 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/SortedJaroWinkler.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import com.wcohen.ss.AbstractStringDistance; |
|
4 |
|
|
5 |
/** |
|
6 |
* The Class SortedJaroWinkler. |
|
7 |
*/ |
|
8 |
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { |
|
9 |
|
|
10 |
/** |
|
11 |
* Instantiates a new sorted jaro winkler. |
|
12 |
* |
|
13 |
* @param weight |
|
14 |
* the weight |
|
15 |
*/ |
|
16 |
public SortedJaroWinkler(final double weight) { |
|
17 |
super(weight, new com.wcohen.ss.JaroWinkler()); |
|
18 |
} |
|
19 |
|
|
20 |
/** |
|
21 |
* Instantiates a new sorted jaro winkler. |
|
22 |
* |
|
23 |
* @param weight |
|
24 |
* the weight |
|
25 |
* @param ssalgo |
|
26 |
* the ssalgo |
|
27 |
*/ |
|
28 |
protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) { |
|
29 |
super(weight, ssalgo); |
|
30 |
} |
|
31 |
|
|
32 |
/* |
|
33 |
* (non-Javadoc) |
|
34 |
* |
|
35 |
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() |
|
36 |
*/ |
|
37 |
@Override |
|
38 |
public double getWeight() { |
|
39 |
return super.weight; |
|
40 |
} |
|
41 |
|
|
42 |
/* |
|
43 |
* (non-Javadoc) |
|
44 |
* |
|
45 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) |
|
46 |
*/ |
|
47 |
@Override |
|
48 |
protected double normalize(final double d) { |
|
49 |
return d; |
|
50 |
} |
|
51 |
|
|
52 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/config/PaceConfig.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import org.apache.commons.collections.CollectionUtils; |
|
7 |
|
|
8 |
import com.google.common.base.Predicate; |
|
9 |
import com.google.common.collect.Iterables; |
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
import eu.dnetlib.pace.condition.ConditionAlgo; |
|
13 |
import eu.dnetlib.pace.model.ClusteringDef; |
|
14 |
import eu.dnetlib.pace.model.CondDef; |
|
15 |
import eu.dnetlib.pace.model.FieldDef; |
|
16 |
|
|
17 |
public class PaceConfig { |
|
18 |
|
|
19 |
private List<FieldDef> model; |
|
20 |
private List<CondDef> strictConditions; |
|
21 |
private List<CondDef> conditions; |
|
22 |
private List<ClusteringDef> clustering; |
|
23 |
private Map<String, List<String>> blacklists; |
|
24 |
|
|
25 |
public PaceConfig() {} |
|
26 |
|
|
27 |
public List<FieldDef> getModel() { |
|
28 |
return model; |
|
29 |
} |
|
30 |
|
|
31 |
public void setModel(final List<FieldDef> fields) { |
|
32 |
this.model = fields; |
|
33 |
} |
|
34 |
|
|
35 |
public List<CondDef> getStrictConditions() { |
|
36 |
return strictConditions; |
|
37 |
} |
|
38 |
|
|
39 |
public void setStrictConditions(final List<CondDef> strictConditions) { |
|
40 |
this.strictConditions = strictConditions; |
|
41 |
} |
|
42 |
|
|
43 |
public List<CondDef> getConditions() { |
|
44 |
return conditions; |
|
45 |
} |
|
46 |
|
|
47 |
public List<ConditionAlgo> getConditionAlgos() { |
|
48 |
return asConditionAlgos(getConditions()); |
|
49 |
} |
|
50 |
|
|
51 |
public List<ConditionAlgo> getStrictConditionAlgos() { |
|
52 |
return asConditionAlgos(getStrictConditions()); |
|
53 |
} |
|
54 |
|
|
55 |
public void setConditions(final List<CondDef> conditions) { |
|
56 |
this.conditions = conditions; |
|
57 |
} |
|
58 |
|
|
59 |
public List<ClusteringDef> getClustering() { |
|
60 |
return clustering; |
|
61 |
} |
|
62 |
|
|
63 |
public void setClustering(final List<ClusteringDef> clustering) { |
|
64 |
this.clustering = clustering; |
|
65 |
} |
|
66 |
|
|
67 |
public Map<String, List<String>> getBlacklists() { |
|
68 |
return blacklists; |
|
69 |
} |
|
70 |
|
|
71 |
public void setBlacklists(final Map<String, List<String>> blacklists) { |
|
72 |
this.blacklists = blacklists; |
|
73 |
} |
|
74 |
|
|
75 |
// helper |
|
76 |
|
|
77 |
private List<ConditionAlgo> asConditionAlgos(final List<CondDef> defs) { |
|
78 |
final List<ConditionAlgo> algos = Lists.newArrayList(); |
|
79 |
if (CollectionUtils.isEmpty(defs)) return algos; |
|
80 |
for (final CondDef cd : defs) { |
|
81 |
final List<FieldDef> fields = Lists.newArrayList(Iterables.filter(getModel(), new Predicate<FieldDef>() { |
|
82 |
|
|
83 |
@Override |
|
84 |
public boolean apply(final FieldDef fd) { |
|
85 |
|
|
86 |
return cd.getFields().contains(fd.getName()); |
|
87 |
} |
|
88 |
})); |
|
89 |
algos.add(cd.getConditionAlgo(fields)); |
|
90 |
} |
|
91 |
return algos; |
|
92 |
} |
|
93 |
|
|
94 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/LevensteinDate.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
|
|
4 |
public class LevensteinDate extends Levenstein { |
|
5 |
|
|
6 |
|
|
7 |
public LevensteinDate(double w) { |
|
8 |
super(w); |
|
9 |
} |
|
10 |
|
|
11 |
|
|
12 |
@Override |
|
13 |
public double distance(String a, String b) { |
|
14 |
|
|
15 |
return 1.0; |
|
16 |
} |
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
@Override |
|
21 |
public double getWeight() { |
|
22 |
return super.weight; |
|
23 |
} |
|
24 |
|
|
25 |
} |
|
0 | 26 |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/SubStringLevenstein.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import org.apache.commons.lang.StringUtils; |
|
4 |
|
|
5 |
import com.wcohen.ss.AbstractStringDistance; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.config.Type; |
|
8 |
import eu.dnetlib.pace.model.Field; |
|
9 |
|
|
10 |
/** |
|
11 |
* The Class SubStringLevenstein. |
|
12 |
*/ |
|
13 |
public class SubStringLevenstein extends SecondStringDistanceAlgo { |
|
14 |
|
|
15 |
/** The limit. */ |
|
16 |
protected int limit; |
|
17 |
|
|
18 |
/** |
|
19 |
* Instantiates a new sub string levenstein. |
|
20 |
* |
|
21 |
* @param w |
|
22 |
* the w |
|
23 |
*/ |
|
24 |
public SubStringLevenstein(final double w) { |
|
25 |
super(w, new com.wcohen.ss.Levenstein()); |
|
26 |
} |
|
27 |
|
|
28 |
/** |
|
29 |
* Instantiates a new sub string levenstein. |
|
30 |
* |
|
31 |
* @param w |
|
32 |
* the w |
|
33 |
* @param limit |
|
34 |
* the limit |
|
35 |
*/ |
|
36 |
public SubStringLevenstein(final double w, final int limit) { |
|
37 |
super(w, new com.wcohen.ss.Levenstein()); |
|
38 |
this.limit = limit; |
|
39 |
} |
|
40 |
|
|
41 |
/** |
|
42 |
* Instantiates a new sub string levenstein. |
|
43 |
* |
|
44 |
* @param w |
|
45 |
* the w |
|
46 |
* @param limit |
|
47 |
* the limit |
|
48 |
* @param ssalgo |
|
49 |
* the ssalgo |
|
50 |
*/ |
|
51 |
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { |
|
52 |
super(w, ssalgo); |
|
53 |
this.limit = limit; |
|
54 |
} |
|
55 |
|
|
56 |
/* |
|
57 |
* (non-Javadoc) |
|
58 |
* |
|
59 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) |
|
60 |
*/ |
|
61 |
@Override |
|
62 |
public double distance(final Field a, final Field b) { |
|
63 |
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) |
|
64 |
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); |
|
65 |
|
|
66 |
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); |
|
67 |
} |
|
68 |
|
|
69 |
/* |
|
70 |
* (non-Javadoc) |
|
71 |
* |
|
72 |
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() |
|
73 |
*/ |
|
74 |
@Override |
|
75 |
public double getWeight() { |
|
76 |
return super.weight; |
|
77 |
} |
|
78 |
|
|
79 |
/* |
|
80 |
* (non-Javadoc) |
|
81 |
* |
|
82 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) |
|
83 |
*/ |
|
84 |
@Override |
|
85 |
protected double normalize(final double d) { |
|
86 |
return 1 / Math.pow(Math.abs(d) + 1, 0.1); |
|
87 |
} |
|
88 |
|
|
89 |
} |
|
0 | 90 |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet-hadoop-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-pace-core</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>2.0.0</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-pace-core/tags/dnet-pace-core-2.0.0</developerConnection> |
|
16 |
</scm> |
|
17 |
<dependencies> |
|
18 |
<dependency> |
|
19 |
<groupId>edu.cmu</groupId> |
|
20 |
<artifactId>secondstring</artifactId> |
|
21 |
<version>1.0.0</version> |
|
22 |
</dependency> |
|
23 |
<dependency> |
|
24 |
<groupId>com.google.guava</groupId> |
|
25 |
<artifactId>guava</artifactId> |
|
26 |
<version>${google.guava.version}</version> |
|
27 |
</dependency> |
|
28 |
<dependency> |
|
29 |
<groupId>com.google.code.gson</groupId> |
|
30 |
<artifactId>gson</artifactId> |
|
31 |
<version>${google.gson.version}</version> |
|
32 |
</dependency> |
|
33 |
<dependency> |
|
34 |
<groupId>commons-lang</groupId> |
|
35 |
<artifactId>commons-lang</artifactId> |
|
36 |
<version>${commons.lang.version}</version> |
|
37 |
</dependency> |
|
38 |
<dependency> |
|
39 |
<groupId>commons-io</groupId> |
|
40 |
<artifactId>commons-io</artifactId> |
|
41 |
<version>${commons.io.version}</version> |
|
42 |
</dependency> |
|
43 |
<dependency> |
|
44 |
<groupId>commons-collections</groupId> |
|
45 |
<artifactId>commons-collections</artifactId> |
|
46 |
<version>${commons.collections.version}</version> |
|
47 |
</dependency> |
|
48 |
<dependency> |
|
49 |
<groupId>org.antlr</groupId> |
|
50 |
<artifactId>stringtemplate</artifactId> |
|
51 |
<version>3.2</version> |
|
52 |
</dependency> |
|
53 |
<dependency> |
|
54 |
<groupId>junit</groupId> |
|
55 |
<artifactId>junit</artifactId> |
|
56 |
<version>${junit.version}</version> |
|
57 |
<scope>test</scope> |
|
58 |
</dependency> |
|
59 |
</dependencies> |
|
60 |
</project> |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import eu.dnetlib.pace.model.Document; |
|
4 |
|
|
5 |
public class PaceDocumentDistance extends AbstractDistance<Document> { |
|
6 |
|
|
7 |
@Override |
|
8 |
protected Document toDocument(Document a) { |
|
9 |
return a; |
|
10 |
} |
|
11 |
|
|
12 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.0.0/src/test/java/eu/dnetlib/pace/config/ConfigTest.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.config; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertNotNull; |
|
4 |
|
|
5 |
import java.io.IOException; |
|
6 |
|
|
7 |
import org.junit.Test; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.AbstractPaceTest; |
|
10 |
|
|
11 |
public class ConfigTest extends AbstractPaceTest { |
|
12 |
|
|
13 |
@Test |
|
14 |
public void test() throws IOException { |
|
15 |
final DedupConfig cfg = DedupConfig.load(readFromClasspath("result.pace.conf.json")); |
|
16 |
|
|
17 |
assertNotNull(cfg); |
|
18 |
|
|
19 |
System.out.println(cfg); |
|
20 |
} |
|
21 |
|
|
22 |
} |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-pace-core-2.0.0