1
|
package eu.dnetlib.data.mapreduce.hbase.broker;
|
2
|
|
3
|
import com.google.common.collect.Lists;
|
4
|
import com.google.common.collect.Maps;
|
5
|
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
6
|
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
|
7
|
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
|
8
|
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
|
9
|
|
10
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
11
|
import eu.dnetlib.data.proto.OafProtos;
|
12
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
13
|
import eu.dnetlib.miscutils.collections.Pair;
|
14
|
import eu.dnetlib.pace.distance.algo.JaroWinkler;
|
15
|
import org.apache.commons.lang3.StringUtils;
|
16
|
import org.dom4j.DocumentException;
|
17
|
|
18
|
import java.io.IOException;
|
19
|
import java.util.*;
|
20
|
import java.util.function.Predicate;
|
21
|
import java.util.stream.Collectors;
|
22
|
|
23
|
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
|
24
|
|
25
|
public class OrcidEventFactory {
|
26
|
|
27
|
private static final long MAX_AUTHORS = 50;
|
28
|
private static final Float t = 0.9f;
|
29
|
public static final String ORCID_TYPE_MARKER = "ORCID";
|
30
|
|
31
|
public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
|
32
|
return new OrcidEventFactory().processOrcid(current, other, trust);
|
33
|
}
|
34
|
|
35
|
public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
|
36
|
|
37
|
final List<EventWrapper> events = Lists.newArrayList();
|
38
|
|
39
|
final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
|
40
|
final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
|
41
|
|
42
|
while (!currAuthors.isEmpty()) {
|
43
|
final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
|
44
|
|
45
|
Pair<FieldTypeProtos.Author, Float> bestMatch = null;
|
46
|
for(FieldTypeProtos.Author otherAuthor : otherAuthors) {
|
47
|
|
48
|
final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
|
49
|
if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
|
50
|
bestMatch = pair;
|
51
|
}
|
52
|
}
|
53
|
|
54
|
if (bestMatch != null && bestMatch.getValue() >= t) {
|
55
|
float authorTrust = trust * bestMatch.getValue(); // adjust it?
|
56
|
events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
|
57
|
}
|
58
|
}
|
59
|
|
60
|
return events;
|
61
|
}
|
62
|
|
63
|
private LinkedList<FieldTypeProtos.Author> getAuthors(Oaf oaf, Predicate<FieldTypeProtos.Author> p) {
|
64
|
return authors(oaf).stream()
|
65
|
.filter(p)
|
66
|
.limit(MAX_AUTHORS)
|
67
|
.collect(Collectors.toCollection(LinkedList::new));
|
68
|
}
|
69
|
|
70
|
private Predicate<FieldTypeProtos.Author> anyIsORCID() {
|
71
|
return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
72
|
}
|
73
|
|
74
|
private Predicate<FieldTypeProtos.Author> noneIsORCID() {
|
75
|
return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
76
|
}
|
77
|
|
78
|
private EventWrapper doProcessOrcid(final Oaf current, final Oaf other, final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair, final Topic topic, final float trust) {
|
79
|
final Oaf.Builder prototype = Oaf.newBuilder(current);
|
80
|
|
81
|
for(FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
|
82
|
if (a.getFullname().equals(pair.getKey().getFullname())) {
|
83
|
a.addAllPid(
|
84
|
pair.getValue().getPidList().stream()
|
85
|
.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
|
86
|
.collect(Collectors.toList()));
|
87
|
}
|
88
|
}
|
89
|
|
90
|
final Oaf oaf = prototype.build();
|
91
|
|
92
|
final OpenAireEventPayload payload =
|
93
|
HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
|
94
|
|
95
|
return EventWrapper.newInstance(
|
96
|
asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
|
97
|
payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
|
98
|
topic.getValue());
|
99
|
}
|
100
|
|
101
|
private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
|
102
|
List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
|
103
|
if (authors == null) {
|
104
|
return Lists.newLinkedList();
|
105
|
}
|
106
|
return authors;
|
107
|
}
|
108
|
|
109
|
private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
|
110
|
final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
|
111
|
if (a.hasSurname() && b.hasSurname()) {
|
112
|
return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
|
113
|
|
114
|
} else {
|
115
|
return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
|
116
|
}
|
117
|
}
|
118
|
|
119
|
// returns the 1st letter of the author name + the author surname, all in lowercase
|
120
|
// e.g. "pmanghi"
|
121
|
private String getCanonicalName(FieldTypeProtos.Author a) {
|
122
|
return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
|
123
|
}
|
124
|
|
125
|
|
126
|
}
|