Revision 37509
Added by Claudio Atzori over 9 years ago
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/LevensteinDate.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
|
|
4 |
public class LevensteinDate extends Levenstein { |
|
5 |
|
|
6 |
|
|
7 |
public LevensteinDate(double w) { |
|
8 |
super(w); |
|
9 |
} |
|
10 |
|
|
11 |
|
|
12 |
@Override |
|
13 |
public double distance(String a, String b) { |
|
14 |
|
|
15 |
return 1.0; |
|
16 |
} |
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
@Override |
|
21 |
public double getWeight() { |
|
22 |
return super.weight; |
|
23 |
} |
|
24 |
|
|
25 |
} |
|
0 | 26 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/JaroWinklerTitle.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import com.wcohen.ss.AbstractStringDistance; |
|
4 |
|
|
5 |
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) |
|
6 |
public class JaroWinklerTitle extends SecondStringDistanceAlgo { |
|
7 |
|
|
8 |
public JaroWinklerTitle(double weight) { |
|
9 |
super(weight, new com.wcohen.ss.JaroWinkler()); |
|
10 |
} |
|
11 |
|
|
12 |
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { |
|
13 |
super(weight, ssalgo); |
|
14 |
} |
|
15 |
|
|
16 |
@Override |
|
17 |
public double distance(String a, String b) { |
|
18 |
String ca = cleanup(a); |
|
19 |
String cb = cleanup(b); |
|
20 |
|
|
21 |
boolean check = checkNumbers(ca, cb); |
|
22 |
return check ? 0.5 : normalize(ssalgo.score(ca, cb)); |
|
23 |
} |
|
24 |
|
|
25 |
@Override |
|
26 |
public double getWeight() { |
|
27 |
return super.weight; |
|
28 |
} |
|
29 |
|
|
30 |
@Override |
|
31 |
protected double normalize(double d) { |
|
32 |
return d; |
|
33 |
} |
|
34 |
|
|
35 |
} |
|
0 | 36 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/Level2JaroWinklerTitle.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import com.wcohen.ss.AbstractStringDistance; |
|
4 |
|
|
5 |
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { |
|
6 |
|
|
7 |
public Level2JaroWinklerTitle(final double w) { |
|
8 |
super(w, new com.wcohen.ss.Level2JaroWinkler()); |
|
9 |
} |
|
10 |
|
|
11 |
protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) { |
|
12 |
super(w, ssalgo); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
public double distance(final String a, final String b) { |
|
17 |
final String ca = cleanup(a); |
|
18 |
final String cb = cleanup(b); |
|
19 |
|
|
20 |
final boolean check = checkNumbers(ca, cb); |
|
21 |
|
|
22 |
if (check) return 0.5; |
|
23 |
|
|
24 |
final String cca = finalCleanup(ca); |
|
25 |
final String ccb = finalCleanup(cb); |
|
26 |
|
|
27 |
return ssalgo.score(cca, ccb); |
|
28 |
} |
|
29 |
|
|
30 |
@Override |
|
31 |
public double getWeight() { |
|
32 |
return super.weight; |
|
33 |
} |
|
34 |
|
|
35 |
@Override |
|
36 |
protected double normalize(final double d) { |
|
37 |
return d; |
|
38 |
} |
|
39 |
|
|
40 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import eu.dnetlib.pace.model.Field; |
|
4 |
|
|
5 |
/** |
|
6 |
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two |
|
7 |
* objects. |
|
8 |
*/ |
|
9 |
public interface DistanceAlgo { |
|
10 |
|
|
11 |
public abstract double distance(Field a, Field b); |
|
12 |
|
|
13 |
public double getWeight(); |
|
14 |
|
|
15 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import eu.dnetlib.pace.condition.ConditionAlgo; |
|
6 |
import eu.dnetlib.pace.model.Document; |
|
7 |
import eu.dnetlib.pace.model.Field; |
|
8 |
import eu.dnetlib.pace.model.FieldDef; |
|
9 |
|
|
10 |
/** |
|
11 |
* The distance between two documents is given by the weighted mean of the field distances |
|
12 |
*/ |
|
13 |
public class DistanceScorer { |
|
14 |
|
|
15 |
private List<FieldDef> fields; |
|
16 |
|
|
17 |
private List<ConditionAlgo> strictConditions; |
|
18 |
|
|
19 |
private List<ConditionAlgo> conditions; |
|
20 |
|
|
21 |
public DistanceScorer(final List<FieldDef> fields, final List<ConditionAlgo> strictConditions, final List<ConditionAlgo> conditions) { |
|
22 |
this.fields = fields; |
|
23 |
this.conditions = conditions; |
|
24 |
this.strictConditions = strictConditions; |
|
25 |
} |
|
26 |
|
|
27 |
public double distance(final Document a, final Document b) { |
|
28 |
|
|
29 |
double w = sumWeights(fields); |
|
30 |
double sum = 0.0; |
|
31 |
final int cond = verify(a, b, strictConditions, true); |
|
32 |
|
|
33 |
if (cond > 0) return 1.0; |
|
34 |
if (cond < 0) return 0.0; |
|
35 |
|
|
36 |
if (verify(a, b, conditions, true) >= 0) { |
|
37 |
for (final FieldDef fd : fields) { |
|
38 |
final double d = fieldDistance(a, b, fd); |
|
39 |
|
|
40 |
if (d > 0) { |
|
41 |
sum += d; |
|
42 |
} else { |
|
43 |
w -= fd.getWeight(); |
|
44 |
} |
|
45 |
} |
|
46 |
return w == 0 ? 0 : sum / w; |
|
47 |
} |
|
48 |
return 0.0; |
|
49 |
} |
|
50 |
|
|
51 |
private int verify(final Document a, final Document b, final List<ConditionAlgo> conditions, final boolean strict) { |
|
52 |
int cond = 0; |
|
53 |
|
|
54 |
for (final ConditionAlgo cd : conditions) { |
|
55 |
final int verify = cd.verify(a, b); |
|
56 |
if (strict && (verify < 0)) return -1; |
|
57 |
cond += verify; |
|
58 |
} |
|
59 |
return cond; |
|
60 |
} |
|
61 |
|
|
62 |
private double fieldDistance(final Document a, final Document b, final FieldDef fd) { |
|
63 |
final double w = fd.getWeight(); |
|
64 |
if ((w == 0)) return 0.0; // optimization for 0 weight |
|
65 |
else { |
|
66 |
final Field va = getValue(a, fd); |
|
67 |
final Field vb = getValue(b, fd); |
|
68 |
|
|
69 |
if (va.isEmpty() || vb.isEmpty()) { |
|
70 |
if (fd.isIgnoreMissing()) return -1; |
|
71 |
else return w; |
|
72 |
} else { |
|
73 |
|
|
74 |
if (va.getType().equals(vb.getType())) { |
|
75 |
final double d = fd.getDistanceAlgo().distance(va, vb); |
|
76 |
return w * d; |
|
77 |
} |
|
78 |
throw new IllegalArgumentException("Types are differents type"); |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
82 |
|
|
83 |
private Field getValue(final Document d, final FieldDef fd) { |
|
84 |
return d.values(fd.getName()); |
|
85 |
} |
|
86 |
|
|
87 |
private double sumWeights(final List<FieldDef> fields) { |
|
88 |
double sum = 0.0; |
|
89 |
for (final FieldDef fd : fields) { |
|
90 |
sum += fd.getWeight(); |
|
91 |
} |
|
92 |
return sum; |
|
93 |
} |
|
94 |
|
|
95 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/Distance.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import eu.dnetlib.pace.config.Config; |
|
4 |
|
|
5 |
public interface Distance<A> { |
|
6 |
|
|
7 |
public double between(A a, A b, Config config); |
|
8 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/SortedSecondStringDistanceAlgo.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import java.util.Collections; |
|
4 |
import java.util.List; |
|
5 |
|
|
6 |
import com.google.common.collect.Lists; |
|
7 |
import com.wcohen.ss.AbstractStringDistance; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.model.Field; |
|
10 |
import eu.dnetlib.pace.model.FieldList; |
|
11 |
|
|
12 |
/** |
|
13 |
* For the rest of the fields delegate the distance measure to the second string library. |
|
14 |
*/ |
|
15 |
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo { |
|
16 |
|
|
17 |
/** |
|
18 |
* Instantiates a new sorted second string distance algo. |
|
19 |
* |
|
20 |
* @param weight |
|
21 |
* the weight |
|
22 |
* @param ssalgo |
|
23 |
* the ssalgo |
|
24 |
*/ |
|
25 |
protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) { |
|
26 |
super(weight, ssalgo); |
|
27 |
} |
|
28 |
|
|
29 |
/* |
|
30 |
* (non-Javadoc) |
|
31 |
* |
|
32 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) |
|
33 |
*/ |
|
34 |
@Override |
|
35 |
protected List<String> toList(final Field list) { |
|
36 |
FieldList fl = (FieldList) list; |
|
37 |
List<String> values = Lists.newArrayList(fl.stringList()); |
|
38 |
Collections.sort(values); |
|
39 |
return values; |
|
40 |
} |
|
41 |
|
|
42 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import com.google.common.collect.Lists; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.model.Person; |
|
10 |
|
|
11 |
public class PersonHash extends AbstractClusteringFunction { |
|
12 |
|
|
13 |
private boolean DEFAULT_AGGRESSIVE = false; |
|
14 |
|
|
15 |
public PersonHash(final Map<String, Integer> params) { |
|
16 |
super(params); |
|
17 |
} |
|
18 |
|
|
19 |
@Override |
|
20 |
protected Collection<String> doApply(final String s) { |
|
21 |
final List<String> res = Lists.newArrayList(); |
|
22 |
|
|
23 |
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); |
|
24 |
|
|
25 |
res.add(new Person(s, aggressive).hash()); |
|
26 |
|
|
27 |
return res; |
|
28 |
} |
|
29 |
|
|
30 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/SubStringLevenstein.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import org.apache.commons.lang.StringUtils; |
|
4 |
|
|
5 |
import com.wcohen.ss.AbstractStringDistance; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.config.Type; |
|
8 |
import eu.dnetlib.pace.model.Field; |
|
9 |
|
|
10 |
/** |
|
11 |
* The Class SubStringLevenstein. |
|
12 |
*/ |
|
13 |
public class SubStringLevenstein extends SecondStringDistanceAlgo { |
|
14 |
|
|
15 |
/** The limit. */ |
|
16 |
protected int limit; |
|
17 |
|
|
18 |
/** |
|
19 |
* Instantiates a new sub string levenstein. |
|
20 |
* |
|
21 |
* @param w |
|
22 |
* the w |
|
23 |
*/ |
|
24 |
public SubStringLevenstein(final double w) { |
|
25 |
super(w, new com.wcohen.ss.Levenstein()); |
|
26 |
} |
|
27 |
|
|
28 |
/** |
|
29 |
* Instantiates a new sub string levenstein. |
|
30 |
* |
|
31 |
* @param w |
|
32 |
* the w |
|
33 |
* @param limit |
|
34 |
* the limit |
|
35 |
*/ |
|
36 |
public SubStringLevenstein(final double w, final int limit) { |
|
37 |
super(w, new com.wcohen.ss.Levenstein()); |
|
38 |
this.limit = limit; |
|
39 |
} |
|
40 |
|
|
41 |
/** |
|
42 |
* Instantiates a new sub string levenstein. |
|
43 |
* |
|
44 |
* @param w |
|
45 |
* the w |
|
46 |
* @param limit |
|
47 |
* the limit |
|
48 |
* @param ssalgo |
|
49 |
* the ssalgo |
|
50 |
*/ |
|
51 |
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { |
|
52 |
super(w, ssalgo); |
|
53 |
this.limit = limit; |
|
54 |
} |
|
55 |
|
|
56 |
/* |
|
57 |
* (non-Javadoc) |
|
58 |
* |
|
59 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) |
|
60 |
*/ |
|
61 |
@Override |
|
62 |
public double distance(final Field a, final Field b) { |
|
63 |
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) |
|
64 |
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); |
|
65 |
|
|
66 |
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); |
|
67 |
} |
|
68 |
|
|
69 |
/* |
|
70 |
* (non-Javadoc) |
|
71 |
* |
|
72 |
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() |
|
73 |
*/ |
|
74 |
@Override |
|
75 |
public double getWeight() { |
|
76 |
return super.weight; |
|
77 |
} |
|
78 |
|
|
79 |
/* |
|
80 |
* (non-Javadoc) |
|
81 |
* |
|
82 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) |
|
83 |
*/ |
|
84 |
@Override |
|
85 |
protected double normalize(final double d) { |
|
86 |
return 1 / Math.pow(Math.abs(d) + 1, 0.1); |
|
87 |
} |
|
88 |
|
|
89 |
} |
|
0 | 90 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/SortedLevel2JaroWinkler.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import com.wcohen.ss.AbstractStringDistance; |
|
4 |
|
|
5 |
/** |
|
6 |
* The Class SortedJaroWinkler. |
|
7 |
*/ |
|
8 |
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { |
|
9 |
|
|
10 |
/** |
|
11 |
* Instantiates a new sorted jaro winkler. |
|
12 |
* |
|
13 |
* @param weight |
|
14 |
* the weight |
|
15 |
*/ |
|
16 |
public SortedLevel2JaroWinkler(final double weight) { |
|
17 |
super(weight, new com.wcohen.ss.Level2JaroWinkler()); |
|
18 |
} |
|
19 |
|
|
20 |
/** |
|
21 |
* Instantiates a new sorted jaro winkler. |
|
22 |
* |
|
23 |
* @param weight |
|
24 |
* the weight |
|
25 |
* @param ssalgo |
|
26 |
* the ssalgo |
|
27 |
*/ |
|
28 |
protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) { |
|
29 |
super(weight, ssalgo); |
|
30 |
} |
|
31 |
|
|
32 |
/* |
|
33 |
* (non-Javadoc) |
|
34 |
* |
|
35 |
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() |
|
36 |
*/ |
|
37 |
@Override |
|
38 |
public double getWeight() { |
|
39 |
return super.weight; |
|
40 |
} |
|
41 |
|
|
42 |
/* |
|
43 |
* (non-Javadoc) |
|
44 |
* |
|
45 |
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) |
|
46 |
*/ |
|
47 |
@Override |
|
48 |
protected double normalize(final double d) { |
|
49 |
return d; |
|
50 |
} |
|
51 |
|
|
52 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import com.google.common.base.Predicate; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.model.Field; |
|
9 |
|
|
10 |
public class FieldFilter implements Predicate<Field> { |
|
11 |
|
|
12 |
private Map<String, List<String>> blacklists; |
|
13 |
|
|
14 |
private String filedName; |
|
15 |
|
|
16 |
public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) { |
|
17 |
this.filedName = fieldName; |
|
18 |
this.blacklists = blacklists; |
|
19 |
} |
|
20 |
|
|
21 |
@Override |
|
22 |
public boolean apply(final Field f) { |
|
23 |
return !regexMatches(filedName, f.stringValue(), blacklists); |
|
24 |
} |
|
25 |
|
|
26 |
/** |
|
27 |
* Tries to match the fields in the regex blacklist. |
|
28 |
* |
|
29 |
* @param fieldName |
|
30 |
* @param value |
|
31 |
* @return true if the field matches, false otherwise |
|
32 |
*/ |
|
33 |
protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) { |
|
34 |
if (blacklists.containsKey(fieldName)) { |
|
35 |
final Iterable<String> regexes = blacklists.get(fieldName); |
|
36 |
for (final String regex : regexes) { |
|
37 |
if (value.matches(regex)) return true; |
|
38 |
} |
|
39 |
} |
|
40 |
return false; |
|
41 |
} |
|
42 |
} |
|
0 | 43 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/YearLevenstein.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
|
|
4 |
public class YearLevenstein extends SubStringLevenstein { |
|
5 |
|
|
6 |
public YearLevenstein(double w) { |
|
7 |
super(w); |
|
8 |
} |
|
9 |
|
|
10 |
public YearLevenstein(double w, int limit) { |
|
11 |
super(w, limit); |
|
12 |
} |
|
13 |
|
|
14 |
@Override |
|
15 |
public double distance(String a, String b) { |
|
16 |
boolean check = checkLength(a) && checkLength(b); |
|
17 |
if (check) { |
|
18 |
if (a.equals(b)) { |
|
19 |
return 1.0; |
|
20 |
} else { |
|
21 |
return 0.5; |
|
22 |
} |
|
23 |
} else { |
|
24 |
return 1.0; |
|
25 |
} |
|
26 |
} |
|
27 |
|
|
28 |
protected boolean checkLength(String s) { |
|
29 |
return getNumbers(s).length() == limit; |
|
30 |
} |
|
31 |
|
|
32 |
@Override |
|
33 |
public double getWeight() { |
|
34 |
return super.weight; |
|
35 |
} |
|
36 |
|
|
37 |
} |
|
0 | 38 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/NullDistanceAlgo.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import eu.dnetlib.pace.model.Field; |
|
4 |
|
|
5 |
/** |
|
6 |
* Not all fields of a document need to partecipate in the distance measure. We model those fields as having a |
|
7 |
* NullDistanceAlgo. |
|
8 |
*/ |
|
9 |
public class NullDistanceAlgo implements DistanceAlgo { |
|
10 |
|
|
11 |
@Override |
|
12 |
public double distance(Field a, Field b) { |
|
13 |
return 0.0; |
|
14 |
} |
|
15 |
|
|
16 |
@Override |
|
17 |
public double getWeight() { |
|
18 |
return 0.0; |
|
19 |
} |
|
20 |
|
|
21 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/distance/Level2Levenstein.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.distance; |
|
2 |
|
|
3 |
import com.wcohen.ss.AbstractStringDistance; |
|
4 |
|
|
5 |
public class Level2Levenstein extends SecondStringDistanceAlgo { |
|
6 |
|
|
7 |
public Level2Levenstein(double w) { |
|
8 |
super(w, new com.wcohen.ss.Level2Levenstein()); |
|
9 |
} |
|
10 |
|
|
11 |
protected Level2Levenstein(double w, AbstractStringDistance ssalgo) { |
|
12 |
super(w, ssalgo); |
|
13 |
} |
|
14 |
|
|
15 |
@Override |
|
16 |
public double getWeight() { |
|
17 |
return super.weight; |
|
18 |
} |
|
19 |
|
|
20 |
@Override |
|
21 |
protected double normalize(double d) { |
|
22 |
return 1 / Math.pow(Math.abs(d) + 1, 0.1); |
|
23 |
} |
|
24 |
|
|
25 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet-hadoop-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-pace-core</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>2.1.2</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-pace-core/tags/dnet-pace-core-2.1.2</developerConnection> |
|
16 |
</scm> |
|
17 |
<dependencies> |
|
18 |
<dependency> |
|
19 |
<groupId>edu.cmu</groupId> |
|
20 |
<artifactId>secondstring</artifactId> |
|
21 |
<version>1.0.0</version> |
|
22 |
</dependency> |
|
23 |
<dependency> |
|
24 |
<groupId>com.google.guava</groupId> |
|
25 |
<artifactId>guava</artifactId> |
|
26 |
<version>${google.guava.version}</version> |
|
27 |
</dependency> |
|
28 |
<dependency> |
|
29 |
<groupId>com.google.code.gson</groupId> |
|
30 |
<artifactId>gson</artifactId> |
|
31 |
<version>${google.gson.version}</version> |
|
32 |
</dependency> |
|
33 |
<dependency> |
|
34 |
<groupId>commons-lang</groupId> |
|
35 |
<artifactId>commons-lang</artifactId> |
|
36 |
<version>${commons.lang.version}</version> |
|
37 |
</dependency> |
|
38 |
<dependency> |
|
39 |
<groupId>commons-io</groupId> |
|
40 |
<artifactId>commons-io</artifactId> |
|
41 |
<version>${commons.io.version}</version> |
|
42 |
</dependency> |
|
43 |
<dependency> |
|
44 |
<groupId>commons-collections</groupId> |
|
45 |
<artifactId>commons-collections</artifactId> |
|
46 |
<version>${commons.collections.version}</version> |
|
47 |
</dependency> |
|
48 |
<dependency> |
|
49 |
<groupId>org.antlr</groupId> |
|
50 |
<artifactId>stringtemplate</artifactId> |
|
51 |
<version>3.2</version> |
|
52 |
</dependency> |
|
53 |
<dependency> |
|
54 |
<groupId>junit</groupId> |
|
55 |
<artifactId>junit</artifactId> |
|
56 |
<version>${junit.version}</version> |
|
57 |
<scope>test</scope> |
|
58 |
</dependency> |
|
59 |
</dependencies> |
|
60 |
</project> |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/Clustering.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
/**
 * Names of the available clustering functions. The constants are lower case on purpose: they are
 * the literal identifiers used to select a function.
 */
public enum Clustering {
	acronyms,
	ngrams,
	ngrampairs,
	suffixprefix,
	spacetrimmingfieldvalue,
	immutablefieldvalue,
	personhash
}
|
0 | 6 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.Map.Entry; |
|
7 |
import java.util.Set; |
|
8 |
|
|
9 |
import com.google.common.collect.Iterables; |
|
10 |
import com.google.common.collect.Lists; |
|
11 |
import com.google.common.collect.Maps; |
|
12 |
|
|
13 |
import eu.dnetlib.pace.config.Config; |
|
14 |
import eu.dnetlib.pace.model.Document; |
|
15 |
import eu.dnetlib.pace.model.FieldListImpl; |
|
16 |
import eu.dnetlib.pace.model.MapDocument; |
|
17 |
|
|
18 |
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { |
|
19 |
|
|
20 |
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf, final Map<String, List<String>> blacklists) { |
|
21 |
|
|
22 |
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists); |
|
23 |
return combine(filtered, conf); |
|
24 |
} |
|
25 |
|
|
26 |
private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) { |
|
27 |
final Map<String, FieldListImpl> filtered = Maps.newHashMap(a.getFieldMap()); |
|
28 |
if (blacklists != null) { |
|
29 |
for (final Entry<String, FieldListImpl> e : filtered.entrySet()) { |
|
30 |
|
|
31 |
final FieldListImpl fl = new FieldListImpl(); |
|
32 |
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); |
|
33 |
filtered.put(e.getKey(), fl); |
|
34 |
} |
|
35 |
} |
|
36 |
return new MapDocument(a.getIdentifier(), filtered); |
|
37 |
} |
|
38 |
|
|
39 |
/** |
|
40 |
* Tries to match the fields in the regex blacklist. |
|
41 |
* |
|
42 |
* @param fieldName |
|
43 |
* @param value |
|
44 |
* @return true if the field matches, false otherwise |
|
45 |
*/ |
|
46 |
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) { |
|
47 |
if (blacklists.containsKey(fieldName)) { |
|
48 |
for (final String regex : blacklists.get(fieldName)) { |
|
49 |
if (value.matches(regex)) return true; |
|
50 |
} |
|
51 |
} |
|
52 |
return false; |
|
53 |
} |
|
54 |
} |
|
0 | 55 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/FieldDef.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import com.google.common.base.Splitter; |
|
6 |
import com.google.common.collect.Lists; |
|
7 |
import com.google.gson.Gson; |
|
8 |
|
|
9 |
import eu.dnetlib.pace.config.Algo; |
|
10 |
import eu.dnetlib.pace.config.Type; |
|
11 |
import eu.dnetlib.pace.distance.AlwaysMatch; |
|
12 |
import eu.dnetlib.pace.distance.DistanceAlgo; |
|
13 |
import eu.dnetlib.pace.distance.ExactMatch; |
|
14 |
import eu.dnetlib.pace.distance.JaroWinkler; |
|
15 |
import eu.dnetlib.pace.distance.JaroWinklerTitle; |
|
16 |
import eu.dnetlib.pace.distance.Level2JaroWinkler; |
|
17 |
import eu.dnetlib.pace.distance.Level2JaroWinklerTitle; |
|
18 |
import eu.dnetlib.pace.distance.Level2Levenstein; |
|
19 |
import eu.dnetlib.pace.distance.Levenstein; |
|
20 |
import eu.dnetlib.pace.distance.LevensteinTitle; |
|
21 |
import eu.dnetlib.pace.distance.NullDistanceAlgo; |
|
22 |
import eu.dnetlib.pace.distance.SortedJaroWinkler; |
|
23 |
import eu.dnetlib.pace.distance.SortedLevel2JaroWinkler; |
|
24 |
import eu.dnetlib.pace.distance.SubStringLevenstein; |
|
25 |
import eu.dnetlib.pace.distance.YearLevenstein; |
|
26 |
|
|
27 |
/** |
|
28 |
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. |
|
29 |
*/ |
|
30 |
public class FieldDef { |
|
31 |
|
|
32 |
public final static String PATH_SEPARATOR = "/"; |
|
33 |
|
|
34 |
private Algo algo; |
|
35 |
|
|
36 |
private String name; |
|
37 |
|
|
38 |
private String path; |
|
39 |
|
|
40 |
private boolean ignoreMissing; |
|
41 |
|
|
42 |
private Type type; |
|
43 |
|
|
44 |
private boolean overrideMatch; |
|
45 |
|
|
46 |
private double weight; |
|
47 |
|
|
48 |
private int limit = -1; |
|
49 |
|
|
50 |
public FieldDef() {} |
|
51 |
|
|
52 |
// def apply(s: String): Field[A] |
|
53 |
public Field apply(final Type type, final String s) { |
|
54 |
switch (type) { |
|
55 |
case Int: |
|
56 |
return new FieldValueImpl(type, name, Integer.parseInt(s)); |
|
57 |
case String: |
|
58 |
return new FieldValueImpl(type, name, s); |
|
59 |
case List: |
|
60 |
return new FieldListImpl(name); |
|
61 |
default: |
|
62 |
throw new IllegalArgumentException("Casting not implemented for type " + type); |
|
63 |
} |
|
64 |
} |
|
65 |
|
|
66 |
public String getName() { |
|
67 |
return name; |
|
68 |
} |
|
69 |
|
|
70 |
public String getPath() { |
|
71 |
return path; |
|
72 |
} |
|
73 |
|
|
74 |
public List<String> getPathList() { |
|
75 |
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); |
|
76 |
} |
|
77 |
|
|
78 |
public DistanceAlgo getDistanceAlgo() { |
|
79 |
switch (getAlgo()) { |
|
80 |
case JaroWinkler: |
|
81 |
return new JaroWinkler(getWeight()); |
|
82 |
case JaroWinklerTitle: |
|
83 |
return new JaroWinklerTitle(getWeight()); |
|
84 |
case Level2JaroWinkler: |
|
85 |
return new Level2JaroWinkler(getWeight()); |
|
86 |
case Level2JaroWinklerTitle: |
|
87 |
return new Level2JaroWinklerTitle(getWeight()); |
|
88 |
case Level2Levenstein: |
|
89 |
return new Level2Levenstein(getWeight()); |
|
90 |
case Levenstein: |
|
91 |
return new Levenstein(getWeight()); |
|
92 |
case LevensteinTitle: |
|
93 |
return new LevensteinTitle(getWeight()); |
|
94 |
case SubStringLevenstein: |
|
95 |
return new SubStringLevenstein(getWeight(), getLimit()); |
|
96 |
case YearLevenstein: |
|
97 |
return new YearLevenstein(getWeight(), getLimit()); |
|
98 |
case SortedJaroWinkler: |
|
99 |
return new SortedJaroWinkler(getWeight()); |
|
100 |
case SortedLevel2JaroWinkler: |
|
101 |
return new SortedLevel2JaroWinkler(getWeight()); |
|
102 |
case ExactMatch: |
|
103 |
return new ExactMatch(getWeight()); |
|
104 |
case AlwaysMatch: |
|
105 |
return new AlwaysMatch(getWeight()); |
|
106 |
case Null: |
|
107 |
return new NullDistanceAlgo(); |
|
108 |
default: |
|
109 |
return new NullDistanceAlgo(); |
|
110 |
} |
|
111 |
} |
|
112 |
|
|
113 |
public boolean isIgnoreMissing() { |
|
114 |
return ignoreMissing; |
|
115 |
} |
|
116 |
|
|
117 |
public Type getType() { |
|
118 |
return type; |
|
119 |
} |
|
120 |
|
|
121 |
public void setType(final Type type) { |
|
122 |
this.type = type; |
|
123 |
} |
|
124 |
|
|
125 |
public boolean isOverrideMatch() { |
|
126 |
return overrideMatch; |
|
127 |
} |
|
128 |
|
|
129 |
public void setOverrideMatch(final boolean overrideMatch) { |
|
130 |
this.overrideMatch = overrideMatch; |
|
131 |
} |
|
132 |
|
|
133 |
@Override |
|
134 |
public String toString() { |
|
135 |
return new Gson().toJson(this); |
|
136 |
} |
|
137 |
|
|
138 |
public double getWeight() { |
|
139 |
return weight; |
|
140 |
} |
|
141 |
|
|
142 |
public void setWeight(final double weight) { |
|
143 |
this.weight = weight; |
|
144 |
} |
|
145 |
|
|
146 |
public Algo getAlgo() { |
|
147 |
return algo; |
|
148 |
} |
|
149 |
|
|
150 |
public void setAlgo(final Algo algo) { |
|
151 |
this.algo = algo; |
|
152 |
} |
|
153 |
|
|
154 |
public int getLimit() { |
|
155 |
return limit; |
|
156 |
} |
|
157 |
|
|
158 |
public void setLimit(final int limit) { |
|
159 |
this.limit = limit; |
|
160 |
} |
|
161 |
|
|
162 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.List; |
|
5 |
import java.util.Map; |
|
6 |
|
|
7 |
import org.apache.commons.lang.RandomStringUtils; |
|
8 |
import org.apache.commons.lang.StringUtils; |
|
9 |
|
|
10 |
import com.google.common.collect.Lists; |
|
11 |
|
|
12 |
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { |
|
13 |
|
|
14 |
public SpaceTrimmingFieldValue(final Map<String, Integer> params) { |
|
15 |
super(params); |
|
16 |
} |
|
17 |
|
|
18 |
@Override |
|
19 |
protected Collection<String> doApply(final String s) { |
|
20 |
final List<String> res = Lists.newArrayList(); |
|
21 |
|
|
22 |
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); |
|
23 |
|
|
24 |
return res; |
|
25 |
} |
|
26 |
|
|
27 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import com.google.gson.Gson; |
|
7 |
|
|
8 |
import eu.dnetlib.pace.clustering.Acronyms; |
|
9 |
import eu.dnetlib.pace.clustering.Clustering; |
|
10 |
import eu.dnetlib.pace.clustering.ClusteringFunction; |
|
11 |
import eu.dnetlib.pace.clustering.ImmutableFieldValue; |
|
12 |
import eu.dnetlib.pace.clustering.NgramPairs; |
|
13 |
import eu.dnetlib.pace.clustering.Ngrams; |
|
14 |
import eu.dnetlib.pace.clustering.PersonHash; |
|
15 |
import eu.dnetlib.pace.clustering.RandomClusteringFunction; |
|
16 |
import eu.dnetlib.pace.clustering.SpaceTrimmingFieldValue; |
|
17 |
import eu.dnetlib.pace.clustering.SuffixPrefix; |
|
18 |
|
|
19 |
public class ClusteringDef { |
|
20 |
|
|
21 |
private Clustering name; |
|
22 |
|
|
23 |
private List<String> fields; |
|
24 |
|
|
25 |
private Map<String, Integer> params; |
|
26 |
|
|
27 |
public ClusteringDef() {} |
|
28 |
|
|
29 |
public Clustering getName() { |
|
30 |
return name; |
|
31 |
} |
|
32 |
|
|
33 |
public void setName(final Clustering name) { |
|
34 |
this.name = name; |
|
35 |
} |
|
36 |
|
|
37 |
public ClusteringFunction getClusteringFunction() { |
|
38 |
switch (getName()) { |
|
39 |
case acronyms: |
|
40 |
return new Acronyms(getParams()); |
|
41 |
case ngrams: |
|
42 |
return new Ngrams(getParams()); |
|
43 |
case ngrampairs: |
|
44 |
return new NgramPairs(getParams()); |
|
45 |
case suffixprefix: |
|
46 |
return new SuffixPrefix(getParams()); |
|
47 |
case spacetrimmingfieldvalue: |
|
48 |
return new SpaceTrimmingFieldValue(getParams()); |
|
49 |
case immutablefieldvalue: |
|
50 |
return new ImmutableFieldValue(getParams()); |
|
51 |
case personhash: |
|
52 |
return new PersonHash(getParams()); |
|
53 |
default: |
|
54 |
return new RandomClusteringFunction(getParams()); |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
public List<String> getFields() { |
|
59 |
return fields; |
|
60 |
} |
|
61 |
|
|
62 |
public void setFields(final List<String> fields) { |
|
63 |
this.fields = fields; |
|
64 |
} |
|
65 |
|
|
66 |
public Map<String, Integer> getParams() { |
|
67 |
return params; |
|
68 |
} |
|
69 |
|
|
70 |
public void setParams(final Map<String, Integer> params) { |
|
71 |
this.params = params; |
|
72 |
} |
|
73 |
|
|
74 |
@Override |
|
75 |
public String toString() { |
|
76 |
return new Gson().toJson(this); |
|
77 |
} |
|
78 |
|
|
79 |
} |
|
0 | 80 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.LinkedHashSet; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
public class Ngrams extends AbstractClusteringFunction { |
|
9 |
|
|
10 |
public Ngrams(Map<String, Integer> params) { |
|
11 |
super(params); |
|
12 |
} |
|
13 |
|
|
14 |
@Override |
|
15 |
protected Collection<String> doApply(String s) { |
|
16 |
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); |
|
17 |
} |
|
18 |
|
|
19 |
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) { |
|
20 |
|
|
21 |
final Collection<String> ngrams = new LinkedHashSet<String>(); |
|
22 |
final StringTokenizer st = new StringTokenizer(s); |
|
23 |
|
|
24 |
while (st.hasMoreTokens()) { |
|
25 |
final String token = st.nextToken(); |
|
26 |
if (!token.isEmpty()) { |
|
27 |
|
|
28 |
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) { |
|
29 |
String ngram = (token + " ").substring(i, ngramLen + i).trim(); |
|
30 |
if (ngrams.size() >= max) { |
|
31 |
return ngrams; |
|
32 |
} |
|
33 |
if (ngram.length() >= minNgramLen) { |
|
34 |
ngrams.add(ngram); |
|
35 |
} |
|
36 |
} |
|
37 |
} |
|
38 |
} |
|
39 |
//System.out.println(ngrams + " n: " + ngrams.size()); |
|
40 |
return ngrams; |
|
41 |
} |
|
42 |
|
|
43 |
} |
|
0 | 44 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
import java.util.StringTokenizer; |
|
7 |
|
|
8 |
import com.google.common.collect.Sets; |
|
9 |
|
|
10 |
public class Acronyms extends AbstractClusteringFunction { |
|
11 |
|
|
12 |
public Acronyms(Map<String, Integer> params) { |
|
13 |
super(params); |
|
14 |
} |
|
15 |
|
|
16 |
@Override |
|
17 |
protected Collection<String> doApply(String s) { |
|
18 |
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); |
|
19 |
} |
|
20 |
|
|
21 |
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { |
|
22 |
|
|
23 |
final Set<String> acronyms = Sets.newLinkedHashSet(); |
|
24 |
|
|
25 |
for (int i = 0; i < maxAcronyms; i++) { |
|
26 |
|
|
27 |
final StringTokenizer st = new StringTokenizer(s); |
|
28 |
final StringBuilder sb = new StringBuilder(); |
|
29 |
|
|
30 |
while (st.hasMoreTokens()) { |
|
31 |
final String token = st.nextToken(); |
|
32 |
if (sb.length() > maxLen) { |
|
33 |
break; |
|
34 |
} |
|
35 |
if (token.length() > 1 && i < token.length()) { |
|
36 |
sb.append(token.charAt(i)); |
|
37 |
} |
|
38 |
} |
|
39 |
String acronym = sb.toString(); |
|
40 |
if (acronym.length() > minLen) { |
|
41 |
acronyms.add(acronym); |
|
42 |
} |
|
43 |
} |
|
44 |
return acronyms; |
|
45 |
} |
|
46 |
|
|
47 |
} |
|
0 | 48 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/CondDef.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
|
|
5 |
import com.google.gson.Gson; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.condition.AlwaysTrueCondition; |
|
8 |
import eu.dnetlib.pace.condition.ConditionAlgo; |
|
9 |
import eu.dnetlib.pace.condition.DoiExactMatch; |
|
10 |
import eu.dnetlib.pace.condition.ExactMatch; |
|
11 |
import eu.dnetlib.pace.condition.SizeMatch; |
|
12 |
import eu.dnetlib.pace.condition.TitleVersionMatch; |
|
13 |
import eu.dnetlib.pace.condition.YearMatch; |
|
14 |
import eu.dnetlib.pace.config.Cond; |
|
15 |
|
|
16 |
public class CondDef { |
|
17 |
|
|
18 |
private Cond name; |
|
19 |
|
|
20 |
private List<String> fields; |
|
21 |
|
|
22 |
public CondDef() {} |
|
23 |
|
|
24 |
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) { |
|
25 |
switch (getName()) { |
|
26 |
case yearMatch: |
|
27 |
return new YearMatch(fields); |
|
28 |
case titleVersionMatch: |
|
29 |
return new TitleVersionMatch(fields); |
|
30 |
case sizeMatch: |
|
31 |
return new SizeMatch(fields); |
|
32 |
case exactMatch: |
|
33 |
return new ExactMatch(fields); |
|
34 |
case doiExactMatch: |
|
35 |
return new DoiExactMatch(fields); |
|
36 |
default: |
|
37 |
return new AlwaysTrueCondition(fields); |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
public Cond getName() { |
|
42 |
return name; |
|
43 |
} |
|
44 |
|
|
45 |
public void setName(final Cond name) { |
|
46 |
this.name = name; |
|
47 |
} |
|
48 |
|
|
49 |
public List<String> getFields() { |
|
50 |
return fields; |
|
51 |
} |
|
52 |
|
|
53 |
public void setFields(final List<String> fields) { |
|
54 |
this.fields = fields; |
|
55 |
} |
|
56 |
|
|
57 |
@Override |
|
58 |
public String toString() { |
|
59 |
return new Gson().toJson(this); |
|
60 |
} |
|
61 |
|
|
62 |
} |
|
0 | 63 |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.clustering; |
|
2 |
|
|
3 |
import java.util.Set; |
|
4 |
|
|
5 |
import org.apache.commons.lang.StringUtils; |
|
6 |
|
|
7 |
import eu.dnetlib.pace.common.AbstractPaceFunctions; |
|
8 |
|
|
9 |
public class NGramUtils extends AbstractPaceFunctions { |
|
10 |
|
|
11 |
private static final int SIZE = 100; |
|
12 |
|
|
13 |
private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); |
|
14 |
|
|
15 |
public static String cleanupForOrdering(String s) { |
|
16 |
NGramUtils utils = new NGramUtils(); |
|
17 |
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", ""); |
|
18 |
} |
|
19 |
|
|
20 |
} |
modules/dnet-pace-core/tags/dnet-pace-core-2.1.2/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java | ||
---|---|---|
1 |
package eu.dnetlib.pace.model; |
|
2 |
|
|
3 |
import java.util.Collection; |
|
4 |
import java.util.Iterator; |
|
5 |
import java.util.List; |
|
6 |
import java.util.ListIterator; |
|
7 |
|
|
8 |
import com.google.common.base.Function; |
|
9 |
import com.google.common.base.Joiner; |
|
10 |
import com.google.common.collect.Iterables; |
|
11 |
import com.google.common.collect.Lists; |
|
12 |
|
|
13 |
import eu.dnetlib.pace.config.Type; |
|
14 |
|
|
15 |
/** |
|
16 |
* The Class FieldListImpl. |
|
17 |
*/ |
|
18 |
public class FieldListImpl extends AbstractField implements FieldList { |
|
19 |
|
|
20 |
/** The fields. */ |
|
21 |
private List<Field> fields; |
|
22 |
|
|
23 |
/** |
|
24 |
* Instantiates a new field list impl. |
|
25 |
*/ |
|
26 |
public FieldListImpl() { |
|
27 |
fields = Lists.newArrayList(); |
|
28 |
} |
|
29 |
|
|
30 |
/** |
|
31 |
* Instantiates a new field list impl. |
|
32 |
* |
|
33 |
* @param name |
|
34 |
* the name |
|
35 |
*/ |
|
36 |
public FieldListImpl(final String name) { |
|
37 |
super(Type.List, name); |
|
38 |
fields = Lists.newArrayList(); |
|
39 |
} |
|
40 |
|
|
41 |
/* |
|
42 |
* (non-Javadoc) |
|
43 |
* |
|
44 |
* @see java.util.List#add(java.lang.Object) |
|
45 |
*/ |
|
46 |
@Override |
|
47 |
public boolean add(final Field f) { |
|
48 |
return fields.add(f); |
|
49 |
} |
|
50 |
|
|
51 |
/* |
|
52 |
* (non-Javadoc) |
|
53 |
* |
|
54 |
* @see java.util.List#add(int, java.lang.Object) |
|
55 |
*/ |
|
56 |
@Override |
|
57 |
public void add(final int i, final Field f) { |
|
58 |
fields.add(i, f); |
|
59 |
} |
|
60 |
|
|
61 |
/* |
|
62 |
* (non-Javadoc) |
|
63 |
* |
|
64 |
* @see java.util.List#addAll(java.util.Collection) |
|
65 |
*/ |
|
66 |
@Override |
|
67 |
public boolean addAll(final Collection<? extends Field> f) { |
|
68 |
return fields.addAll(f); |
|
69 |
} |
|
70 |
|
|
71 |
/* |
|
72 |
* (non-Javadoc) |
|
73 |
* |
|
74 |
* @see java.util.List#addAll(int, java.util.Collection) |
|
75 |
*/ |
|
76 |
@Override |
|
77 |
public boolean addAll(final int i, final Collection<? extends Field> f) { |
|
78 |
return fields.addAll(i, f); |
|
79 |
} |
|
80 |
|
|
81 |
/* |
|
82 |
* (non-Javadoc) |
|
83 |
* |
|
84 |
* @see java.util.List#clear() |
|
85 |
*/ |
|
86 |
@Override |
|
87 |
public void clear() { |
|
88 |
fields.clear(); |
|
89 |
} |
|
90 |
|
|
91 |
/* |
|
92 |
* (non-Javadoc) |
|
93 |
* |
|
94 |
* @see java.util.List#contains(java.lang.Object) |
|
95 |
*/ |
|
96 |
@Override |
|
97 |
public boolean contains(final Object o) { |
|
98 |
return fields.contains(o); |
|
99 |
} |
|
100 |
|
|
101 |
/* |
|
102 |
* (non-Javadoc) |
|
103 |
* |
|
104 |
* @see java.util.List#containsAll(java.util.Collection) |
|
105 |
*/ |
|
106 |
@Override |
|
107 |
public boolean containsAll(final Collection<?> f) { |
|
108 |
return fields.containsAll(f); |
|
109 |
} |
|
110 |
|
|
111 |
/* |
|
112 |
* (non-Javadoc) |
|
113 |
* |
|
114 |
* @see java.util.List#get(int) |
|
115 |
*/ |
|
116 |
@Override |
|
117 |
public Field get(final int i) { |
|
118 |
return fields.get(i); |
|
119 |
} |
|
120 |
|
|
121 |
/* |
|
122 |
* (non-Javadoc) |
|
123 |
* |
|
124 |
* @see java.util.List#indexOf(java.lang.Object) |
|
125 |
*/ |
|
126 |
@Override |
|
127 |
public int indexOf(final Object o) { |
|
128 |
return fields.indexOf(o); |
|
129 |
} |
|
130 |
|
|
131 |
/* |
|
132 |
* (non-Javadoc) |
|
133 |
* |
|
134 |
* @see eu.dnetlib.pace.model.Field#isEmpty() |
|
135 |
*/ |
|
136 |
@Override |
|
137 |
public boolean isEmpty() { |
|
138 |
return fields.isEmpty(); |
|
139 |
} |
|
140 |
|
|
141 |
/* |
|
142 |
* (non-Javadoc) |
|
143 |
* |
|
144 |
* @see java.lang.Iterable#iterator() |
|
145 |
*/ |
|
146 |
@Override |
|
147 |
public Iterator<Field> iterator() { |
|
148 |
return fields.iterator(); |
|
149 |
} |
|
150 |
|
|
151 |
/* |
|
152 |
* (non-Javadoc) |
|
153 |
* |
|
154 |
* @see java.util.List#lastIndexOf(java.lang.Object) |
|
155 |
*/ |
|
156 |
@Override |
|
157 |
public int lastIndexOf(final Object o) { |
|
158 |
return fields.lastIndexOf(o); |
|
159 |
} |
|
160 |
|
|
161 |
/* |
|
162 |
* (non-Javadoc) |
|
163 |
* |
|
164 |
* @see java.util.List#listIterator() |
|
165 |
*/ |
|
166 |
@Override |
|
167 |
public ListIterator<Field> listIterator() { |
|
168 |
return fields.listIterator(); |
|
169 |
} |
|
170 |
|
|
171 |
/* |
|
172 |
* (non-Javadoc) |
|
173 |
* |
|
174 |
* @see java.util.List#listIterator(int) |
|
175 |
*/ |
|
176 |
@Override |
|
177 |
public ListIterator<Field> listIterator(final int i) { |
|
178 |
return fields.listIterator(i); |
|
179 |
} |
|
180 |
|
|
181 |
/* |
|
182 |
* (non-Javadoc) |
|
183 |
* |
|
184 |
* @see java.util.List#remove(java.lang.Object) |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-pace-core-2.1.2