1
|
package eu.dnetlib.pace.distance;
|
2
|
|
3
|
import java.util.Collection;
|
4
|
import java.util.List;
|
5
|
|
6
|
import eu.dnetlib.pace.condition.ConditionAlgo;
|
7
|
import eu.dnetlib.pace.config.Config;
|
8
|
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
9
|
import eu.dnetlib.pace.distance.eval.DistanceEval;
|
10
|
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
|
11
|
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
12
|
import eu.dnetlib.pace.model.Document;
|
13
|
import eu.dnetlib.pace.model.Field;
|
14
|
import eu.dnetlib.pace.model.FieldDef;
|
15
|
|
16
|
/**
|
17
|
* The distance between two documents is given by the weighted mean of the field distances
|
18
|
*/
|
19
|
public class DistanceScorer {
|
20
|
|
21
|
private Config config;
|
22
|
|
23
|
public DistanceScorer(final Config config) {
|
24
|
this.config = config;
|
25
|
}
|
26
|
|
27
|
public ScoreResult distance(final Document a, final Document b) {
|
28
|
final ScoreResult sr = new ScoreResult();
|
29
|
|
30
|
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
31
|
sr.setConditions(verify(a, b, config.conditions()));
|
32
|
|
33
|
final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
|
34
|
|
35
|
for (final FieldDef fd : config.model()) {
|
36
|
|
37
|
dMap.updateDistance(fieldDistance(a, b, fd));
|
38
|
}
|
39
|
sr.setDistances(dMap);
|
40
|
return sr;
|
41
|
}
|
42
|
|
43
|
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
44
|
final ConditionEvalMap res = new ConditionEvalMap();
|
45
|
|
46
|
for (final ConditionAlgo cd : conditions) {
|
47
|
final ConditionEvalMap map = cd.verify(a, b);
|
48
|
res.mergeFrom(map);
|
49
|
|
50
|
// commented out shortcuts
|
51
|
/*
|
52
|
if (map.anyNegative()) {
|
53
|
return res;
|
54
|
}
|
55
|
*/
|
56
|
|
57
|
//if (strict && (res < 0)) return -1;
|
58
|
//cond += verify;
|
59
|
}
|
60
|
return res;
|
61
|
}
|
62
|
|
63
|
private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
|
64
|
|
65
|
final double w = fd.getWeight();
|
66
|
final Field va = getValue(a, fd);
|
67
|
final Field vb = getValue(b, fd);
|
68
|
|
69
|
final DistanceEval de = new DistanceEval(fd, va, vb);
|
70
|
if ((w == 0)) return de; // optimization for 0 weight
|
71
|
else {
|
72
|
if (va.isEmpty() || vb.isEmpty()) {
|
73
|
if (fd.isIgnoreMissing()) {
|
74
|
de.setDistance(-1);
|
75
|
} else {
|
76
|
de.setDistance(w);
|
77
|
}
|
78
|
} else {
|
79
|
if (va.getType().equals(vb.getType())) {
|
80
|
de.setDistance(w * fd.getDistanceAlgo().distance(va, vb));
|
81
|
} else {
|
82
|
throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
83
|
}
|
84
|
}
|
85
|
return de;
|
86
|
}
|
87
|
}
|
88
|
|
89
|
private Field getValue(final Document d, final FieldDef fd) {
|
90
|
return d.values(fd.getName());
|
91
|
}
|
92
|
|
93
|
private double sumWeights(final Collection<FieldDef> fields) {
|
94
|
double sum = 0.0;
|
95
|
for (final FieldDef fd : fields) {
|
96
|
sum += fd.getWeight();
|
97
|
}
|
98
|
return sum;
|
99
|
}
|
100
|
|
101
|
}
|