1
|
package eu.dnetlib.pace.distance;
|
2
|
|
3
|
import java.util.List;
|
4
|
|
5
|
import eu.dnetlib.pace.model.CondDef;
|
6
|
import eu.dnetlib.pace.model.Document;
|
7
|
import eu.dnetlib.pace.model.Field;
|
8
|
import eu.dnetlib.pace.model.FieldDef;
|
9
|
|
10
|
/**
|
11
|
* The distance between two documents is given by the weighted mean of the field distances
|
12
|
*/
|
13
|
public class DistanceScorer {
|
14
|
|
15
|
private List<FieldDef> fields;
|
16
|
|
17
|
private List<CondDef> strictConditions;
|
18
|
|
19
|
private List<CondDef> conditions;
|
20
|
|
21
|
public DistanceScorer(final List<FieldDef> fields, final List<CondDef> strictConditions, final List<CondDef> conditions) {
|
22
|
this.fields = fields;
|
23
|
this.conditions = conditions;
|
24
|
this.strictConditions = strictConditions;
|
25
|
}
|
26
|
|
27
|
public double distance(final Document a, final Document b) {
|
28
|
|
29
|
double w = sumWeights(fields);
|
30
|
double sum = 0.0;
|
31
|
int cond = verify(a, b, strictConditions, true);
|
32
|
|
33
|
if (cond > 0) return 1.0;
|
34
|
if (cond < 0) return 0.0;
|
35
|
|
36
|
if (verify(a, b, conditions, true) >= 0) {
|
37
|
for (FieldDef fd : fields) {
|
38
|
double d = fieldDistance(a, b, fd);
|
39
|
|
40
|
if (d > 0) {
|
41
|
sum += d;
|
42
|
} else {
|
43
|
w -= fd.getAlgo().getWeight();
|
44
|
}
|
45
|
}
|
46
|
return w == 0 ? 0 : sum / w;
|
47
|
}
|
48
|
return 0.0;
|
49
|
}
|
50
|
|
51
|
private int verify(final Document a, final Document b, final List<CondDef> conditions, final boolean strict) {
|
52
|
int cond = 0;
|
53
|
|
54
|
for (CondDef cd : conditions) {
|
55
|
int verify = cd.getConditionAlgo().verify(a, b);
|
56
|
if (strict && (verify < 0)) return -1;
|
57
|
cond += verify;
|
58
|
}
|
59
|
return cond;
|
60
|
}
|
61
|
|
62
|
private double fieldDistance(final Document a, final Document b, final FieldDef fd) {
|
63
|
final double w = fd.getAlgo().getWeight();
|
64
|
if ((w == 0)) return 0.0; // optimization for 0 weight
|
65
|
else {
|
66
|
Field va = getValue(a, fd);
|
67
|
Field vb = getValue(b, fd);
|
68
|
|
69
|
if (va.isEmpty() || vb.isEmpty()) {
|
70
|
if (fd.isIgnoreMissing()) return -1;
|
71
|
else return w;
|
72
|
} else {
|
73
|
|
74
|
if (va.getType().equals(vb.getType())) {
|
75
|
double d = fd.getAlgo().distance(va, vb);
|
76
|
return w * d;
|
77
|
}
|
78
|
throw new IllegalArgumentException("Types are differents type");
|
79
|
}
|
80
|
}
|
81
|
}
|
82
|
|
83
|
private Field getValue(final Document d, final FieldDef fd) {
|
84
|
return d.values(fd.getName());
|
85
|
}
|
86
|
|
87
|
private double sumWeights(final List<FieldDef> fields) {
|
88
|
double sum = 0.0;
|
89
|
for (FieldDef fd : fields) {
|
90
|
sum += fd.getAlgo().getWeight();
|
91
|
}
|
92
|
return sum;
|
93
|
}
|
94
|
|
95
|
}
|