1
|
package eu.dnetlib.data.mapreduce.hbase.broker;
|
2
|
|
3
|
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
|
4
|
|
5
|
import java.util.LinkedList;
|
6
|
import java.util.List;
|
7
|
import java.util.Queue;
|
8
|
import java.util.function.Predicate;
|
9
|
import java.util.stream.Collectors;
|
10
|
|
11
|
import org.apache.commons.lang3.StringUtils;
|
12
|
|
13
|
import com.google.common.collect.Lists;
|
14
|
|
15
|
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
16
|
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
|
17
|
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
|
18
|
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
|
19
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
20
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
21
|
import eu.dnetlib.miscutils.collections.Pair;
|
22
|
import eu.dnetlib.pace.distance.algo.JaroWinkler;
|
23
|
|
24
|
public class OrcidEventFactory {
|
25
|
|
26
|
private static final long MAX_AUTHORS = 50;
|
27
|
private static final Float t = 0.9f;
|
28
|
public static final String ORCID_TYPE_MARKER = "ORCID";
|
29
|
|
30
|
public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
|
31
|
return new OrcidEventFactory().processOrcid(current, other, trust);
|
32
|
}
|
33
|
|
34
|
public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
|
35
|
|
36
|
final List<EventWrapper> events = Lists.newArrayList();
|
37
|
|
38
|
final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
|
39
|
final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
|
40
|
|
41
|
while (!currAuthors.isEmpty()) {
|
42
|
final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
|
43
|
|
44
|
Pair<FieldTypeProtos.Author, Float> bestMatch = null;
|
45
|
for (final FieldTypeProtos.Author otherAuthor : otherAuthors) {
|
46
|
|
47
|
final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
|
48
|
if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
|
49
|
bestMatch = pair;
|
50
|
}
|
51
|
}
|
52
|
|
53
|
if (bestMatch != null && bestMatch.getValue() >= t) {
|
54
|
final float authorTrust = trust * bestMatch.getValue(); // adjust it?
|
55
|
events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
|
56
|
}
|
57
|
}
|
58
|
|
59
|
return events;
|
60
|
}
|
61
|
|
62
|
private LinkedList<FieldTypeProtos.Author> getAuthors(final Oaf oaf, final Predicate<FieldTypeProtos.Author> p) {
|
63
|
return authors(oaf).stream()
|
64
|
.filter(p)
|
65
|
.limit(MAX_AUTHORS)
|
66
|
.collect(Collectors.toCollection(LinkedList::new));
|
67
|
}
|
68
|
|
69
|
private Predicate<FieldTypeProtos.Author> anyIsORCID() {
|
70
|
return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
71
|
}
|
72
|
|
73
|
private Predicate<FieldTypeProtos.Author> noneIsORCID() {
|
74
|
return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
75
|
}
|
76
|
|
77
|
private EventWrapper doProcessOrcid(final Oaf current,
|
78
|
final Oaf other,
|
79
|
final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair,
|
80
|
final Topic topic,
|
81
|
final float trust) {
|
82
|
final Oaf.Builder prototype = Oaf.newBuilder(current);
|
83
|
|
84
|
for (final FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
|
85
|
if (a.getFullname().equals(pair.getKey().getFullname())) {
|
86
|
a.addAllPid(
|
87
|
pair.getValue().getPidList().stream()
|
88
|
.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
|
89
|
.collect(Collectors.toList()));
|
90
|
}
|
91
|
}
|
92
|
|
93
|
final Oaf oaf = prototype.build();
|
94
|
|
95
|
final OpenAireEventPayload payload =
|
96
|
HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
|
97
|
|
98
|
return EventWrapper.newInstance(
|
99
|
asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
|
100
|
payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
|
101
|
topic.getValue());
|
102
|
}
|
103
|
|
104
|
private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
|
105
|
final List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
|
106
|
if (authors == null) { return Lists.newLinkedList(); }
|
107
|
return authors;
|
108
|
}
|
109
|
|
110
|
private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
|
111
|
final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
|
112
|
if (a.hasSurname() && b.hasSurname()) {
|
113
|
return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b), null);
|
114
|
|
115
|
} else {
|
116
|
return (float) jaroWinkler.distance(a.getFullname(), b.getFullname(), null);
|
117
|
}
|
118
|
}
|
119
|
|
120
|
// returns the 1st letter of the author name + the author surname, all in lowercase
|
121
|
// e.g. "pmanghi"
|
122
|
private String getCanonicalName(final FieldTypeProtos.Author a) {
|
123
|
return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
|
124
|
}
|
125
|
|
126
|
}
|