1 |
54765
|
claudio.at
|
package eu.dnetlib.data.mapreduce.hbase.broker;
|
2 |
|
|
|
3 |
57450
|
michele.ar
|
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
|
4 |
|
|
|
5 |
|
|
import java.util.LinkedList;
|
6 |
|
|
import java.util.List;
|
7 |
|
|
import java.util.Queue;
|
8 |
|
|
import java.util.function.Predicate;
|
9 |
|
|
import java.util.stream.Collectors;
|
10 |
|
|
|
11 |
|
|
import org.apache.commons.lang3.StringUtils;
|
12 |
|
|
|
13 |
54765
|
claudio.at
|
import com.google.common.collect.Lists;
|
14 |
57450
|
michele.ar
|
|
15 |
54765
|
claudio.at
|
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
16 |
|
|
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
|
17 |
|
|
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
|
18 |
|
|
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
|
19 |
|
|
import eu.dnetlib.data.proto.FieldTypeProtos;
|
20 |
|
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
21 |
|
|
import eu.dnetlib.miscutils.collections.Pair;
|
22 |
|
|
import eu.dnetlib.pace.distance.algo.JaroWinkler;
|
23 |
|
|
|
24 |
|
|
public class OrcidEventFactory {
|
25 |
|
|
|
26 |
57450
|
michele.ar
|
private static final long MAX_AUTHORS = 50;
|
27 |
|
|
private static final Float t = 0.9f;
|
28 |
|
|
public static final String ORCID_TYPE_MARKER = "ORCID";
|
29 |
54765
|
claudio.at
|
|
30 |
57450
|
michele.ar
|
public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
|
31 |
|
|
return new OrcidEventFactory().processOrcid(current, other, trust);
|
32 |
|
|
}
|
33 |
54765
|
claudio.at
|
|
34 |
57450
|
michele.ar
|
public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
|
35 |
54765
|
claudio.at
|
|
36 |
57450
|
michele.ar
|
final List<EventWrapper> events = Lists.newArrayList();
|
37 |
54765
|
claudio.at
|
|
38 |
57450
|
michele.ar
|
final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
|
39 |
|
|
final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
|
40 |
54765
|
claudio.at
|
|
41 |
57450
|
michele.ar
|
while (!currAuthors.isEmpty()) {
|
42 |
|
|
final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
|
43 |
54765
|
claudio.at
|
|
44 |
57450
|
michele.ar
|
Pair<FieldTypeProtos.Author, Float> bestMatch = null;
|
45 |
|
|
for (final FieldTypeProtos.Author otherAuthor : otherAuthors) {
|
46 |
54765
|
claudio.at
|
|
47 |
57450
|
michele.ar
|
final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
|
48 |
|
|
if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
|
49 |
|
|
bestMatch = pair;
|
50 |
|
|
}
|
51 |
|
|
}
|
52 |
54765
|
claudio.at
|
|
53 |
57450
|
michele.ar
|
if (bestMatch != null && bestMatch.getValue() >= t) {
|
54 |
|
|
final float authorTrust = trust * bestMatch.getValue(); // adjust it?
|
55 |
|
|
events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
|
56 |
|
|
}
|
57 |
|
|
}
|
58 |
54765
|
claudio.at
|
|
59 |
57450
|
michele.ar
|
return events;
|
60 |
|
|
}
|
61 |
54765
|
claudio.at
|
|
62 |
57450
|
michele.ar
|
private LinkedList<FieldTypeProtos.Author> getAuthors(final Oaf oaf, final Predicate<FieldTypeProtos.Author> p) {
|
63 |
|
|
return authors(oaf).stream()
|
64 |
|
|
.filter(p)
|
65 |
|
|
.limit(MAX_AUTHORS)
|
66 |
|
|
.collect(Collectors.toCollection(LinkedList::new));
|
67 |
|
|
}
|
68 |
54794
|
claudio.at
|
|
69 |
57450
|
michele.ar
|
private Predicate<FieldTypeProtos.Author> anyIsORCID() {
|
70 |
|
|
return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
71 |
|
|
}
|
72 |
54794
|
claudio.at
|
|
73 |
57450
|
michele.ar
|
private Predicate<FieldTypeProtos.Author> noneIsORCID() {
|
74 |
|
|
return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
75 |
|
|
}
|
76 |
54794
|
claudio.at
|
|
77 |
57450
|
michele.ar
|
private EventWrapper doProcessOrcid(final Oaf current,
|
78 |
|
|
final Oaf other,
|
79 |
|
|
final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair,
|
80 |
|
|
final Topic topic,
|
81 |
|
|
final float trust) {
|
82 |
|
|
final Oaf.Builder prototype = Oaf.newBuilder(current);
|
83 |
54765
|
claudio.at
|
|
84 |
57450
|
michele.ar
|
for (final FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
|
85 |
|
|
if (a.getFullname().equals(pair.getKey().getFullname())) {
|
86 |
|
|
a.addAllPid(
|
87 |
|
|
pair.getValue().getPidList().stream()
|
88 |
|
|
.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
|
89 |
|
|
.collect(Collectors.toList()));
|
90 |
|
|
}
|
91 |
|
|
}
|
92 |
54765
|
claudio.at
|
|
93 |
57450
|
michele.ar
|
final Oaf oaf = prototype.build();
|
94 |
54765
|
claudio.at
|
|
95 |
57450
|
michele.ar
|
final OpenAireEventPayload payload =
|
96 |
|
|
HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
|
97 |
54765
|
claudio.at
|
|
98 |
57450
|
michele.ar
|
return EventWrapper.newInstance(
|
99 |
|
|
asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
|
100 |
|
|
payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
|
101 |
|
|
topic.getValue());
|
102 |
|
|
}
|
103 |
54765
|
claudio.at
|
|
104 |
57450
|
michele.ar
|
private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
|
105 |
|
|
final List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
|
106 |
|
|
if (authors == null) { return Lists.newLinkedList(); }
|
107 |
|
|
return authors;
|
108 |
|
|
}
|
109 |
54765
|
claudio.at
|
|
110 |
57450
|
michele.ar
|
private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
|
111 |
|
|
final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
|
112 |
|
|
if (a.hasSurname() && b.hasSurname()) {
|
113 |
57632
|
claudio.at
|
return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
|
114 |
54765
|
claudio.at
|
|
115 |
57450
|
michele.ar
|
} else {
|
116 |
57632
|
claudio.at
|
return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
|
117 |
57450
|
michele.ar
|
}
|
118 |
|
|
}
|
119 |
54765
|
claudio.at
|
|
120 |
57450
|
michele.ar
|
// returns the 1st letter of the author name + the author surname, all in lowercase
|
121 |
|
|
// e.g. "pmanghi"
|
122 |
|
|
private String getCanonicalName(final FieldTypeProtos.Author a) {
|
123 |
|
|
return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
|
124 |
|
|
}
|
125 |
54765
|
claudio.at
|
|
126 |
|
|
}
|