Revision 57450
Added by Michele Artini over 4 years ago
OrcidEventFactory.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.broker; |
2 | 2 |
|
3 |
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent; |
|
4 |
|
|
5 |
import java.util.LinkedList; |
|
6 |
import java.util.List; |
|
7 |
import java.util.Queue; |
|
8 |
import java.util.function.Predicate; |
|
9 |
import java.util.stream.Collectors; |
|
10 |
|
|
11 |
import org.apache.commons.lang3.StringUtils; |
|
12 |
|
|
3 | 13 |
import com.google.common.collect.Lists; |
4 |
import com.google.common.collect.Maps; |
|
14 |
|
|
5 | 15 |
import eu.dnetlib.broker.objects.OpenAireEventPayload; |
6 | 16 |
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory; |
7 | 17 |
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory; |
8 | 18 |
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper; |
9 |
|
|
10 | 19 |
import eu.dnetlib.data.proto.FieldTypeProtos; |
11 |
import eu.dnetlib.data.proto.OafProtos; |
|
12 | 20 |
import eu.dnetlib.data.proto.OafProtos.Oaf; |
13 | 21 |
import eu.dnetlib.miscutils.collections.Pair; |
14 | 22 |
import eu.dnetlib.pace.distance.algo.JaroWinkler; |
15 |
import org.apache.commons.lang3.StringUtils; |
|
16 |
import org.dom4j.DocumentException; |
|
17 | 23 |
|
18 |
import java.io.IOException; |
|
19 |
import java.util.*; |
|
20 |
import java.util.function.Predicate; |
|
21 |
import java.util.stream.Collectors; |
|
22 |
|
|
23 |
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent; |
|
24 |
|
|
25 | 24 |
public class OrcidEventFactory { |
26 | 25 |
|
27 |
private static final long MAX_AUTHORS = 50;
|
|
28 |
private static final Float t = 0.9f;
|
|
29 |
public static final String ORCID_TYPE_MARKER = "ORCID";
|
|
26 |
private static final long MAX_AUTHORS = 50;
|
|
27 |
private static final Float t = 0.9f;
|
|
28 |
public static final String ORCID_TYPE_MARKER = "ORCID";
|
|
30 | 29 |
|
31 |
public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
|
|
32 |
return new OrcidEventFactory().processOrcid(current, other, trust);
|
|
33 |
}
|
|
30 |
public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
|
|
31 |
return new OrcidEventFactory().processOrcid(current, other, trust);
|
|
32 |
}
|
|
34 | 33 |
|
35 |
public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
|
|
34 |
public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
|
|
36 | 35 |
|
37 |
final List<EventWrapper> events = Lists.newArrayList();
|
|
36 |
final List<EventWrapper> events = Lists.newArrayList();
|
|
38 | 37 |
|
39 |
final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
|
|
40 |
final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
|
|
38 |
final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
|
|
39 |
final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
|
|
41 | 40 |
|
42 |
while (!currAuthors.isEmpty()) {
|
|
43 |
final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
|
|
41 |
while (!currAuthors.isEmpty()) {
|
|
42 |
final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
|
|
44 | 43 |
|
45 |
Pair<FieldTypeProtos.Author, Float> bestMatch = null;
|
|
46 |
for(FieldTypeProtos.Author otherAuthor : otherAuthors) {
|
|
44 |
Pair<FieldTypeProtos.Author, Float> bestMatch = null;
|
|
45 |
for (final FieldTypeProtos.Author otherAuthor : otherAuthors) {
|
|
47 | 46 |
|
48 |
final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
|
|
49 |
if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
|
|
50 |
bestMatch = pair;
|
|
51 |
}
|
|
52 |
}
|
|
47 |
final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
|
|
48 |
if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
|
|
49 |
bestMatch = pair;
|
|
50 |
}
|
|
51 |
}
|
|
53 | 52 |
|
54 |
if (bestMatch != null && bestMatch.getValue() >= t) {
|
|
55 |
float authorTrust = trust * bestMatch.getValue(); // adjust it?
|
|
56 |
events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
|
|
57 |
}
|
|
58 |
}
|
|
53 |
if (bestMatch != null && bestMatch.getValue() >= t) {
|
|
54 |
final float authorTrust = trust * bestMatch.getValue(); // adjust it?
|
|
55 |
events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
|
|
56 |
}
|
|
57 |
}
|
|
59 | 58 |
|
60 |
return events;
|
|
61 |
}
|
|
59 |
return events;
|
|
60 |
}
|
|
62 | 61 |
|
63 |
private LinkedList<FieldTypeProtos.Author> getAuthors(Oaf oaf, Predicate<FieldTypeProtos.Author> p) {
|
|
64 |
return authors(oaf).stream()
|
|
65 |
.filter(p)
|
|
66 |
.limit(MAX_AUTHORS)
|
|
67 |
.collect(Collectors.toCollection(LinkedList::new));
|
|
68 |
}
|
|
62 |
private LinkedList<FieldTypeProtos.Author> getAuthors(final Oaf oaf, final Predicate<FieldTypeProtos.Author> p) {
|
|
63 |
return authors(oaf).stream()
|
|
64 |
.filter(p)
|
|
65 |
.limit(MAX_AUTHORS)
|
|
66 |
.collect(Collectors.toCollection(LinkedList::new));
|
|
67 |
}
|
|
69 | 68 |
|
70 |
private Predicate<FieldTypeProtos.Author> anyIsORCID() {
|
|
71 |
return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
|
72 |
}
|
|
69 |
private Predicate<FieldTypeProtos.Author> anyIsORCID() {
|
|
70 |
return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
|
71 |
}
|
|
73 | 72 |
|
74 |
private Predicate<FieldTypeProtos.Author> noneIsORCID() {
|
|
75 |
return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
|
76 |
}
|
|
73 |
private Predicate<FieldTypeProtos.Author> noneIsORCID() {
|
|
74 |
return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
|
|
75 |
}
|
|
77 | 76 |
|
78 |
private EventWrapper doProcessOrcid(final Oaf current, final Oaf other, final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair, final Topic topic, final float trust) { |
|
79 |
final Oaf.Builder prototype = Oaf.newBuilder(current); |
|
77 |
private EventWrapper doProcessOrcid(final Oaf current, |
|
78 |
final Oaf other, |
|
79 |
final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair, |
|
80 |
final Topic topic, |
|
81 |
final float trust) { |
|
82 |
final Oaf.Builder prototype = Oaf.newBuilder(current); |
|
80 | 83 |
|
81 |
for(FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
|
|
82 |
if (a.getFullname().equals(pair.getKey().getFullname())) {
|
|
83 |
a.addAllPid(
|
|
84 |
pair.getValue().getPidList().stream()
|
|
85 |
.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
|
|
86 |
.collect(Collectors.toList()));
|
|
87 |
}
|
|
88 |
}
|
|
84 |
for (final FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
|
|
85 |
if (a.getFullname().equals(pair.getKey().getFullname())) {
|
|
86 |
a.addAllPid(
|
|
87 |
pair.getValue().getPidList().stream()
|
|
88 |
.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
|
|
89 |
.collect(Collectors.toList()));
|
|
90 |
}
|
|
91 |
}
|
|
89 | 92 |
|
90 |
final Oaf oaf = prototype.build();
|
|
93 |
final Oaf oaf = prototype.build();
|
|
91 | 94 |
|
92 |
final OpenAireEventPayload payload =
|
|
93 |
HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
|
|
95 |
final OpenAireEventPayload payload =
|
|
96 |
HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
|
|
94 | 97 |
|
95 |
return EventWrapper.newInstance(
|
|
96 |
asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
|
|
97 |
payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
|
|
98 |
topic.getValue());
|
|
99 |
}
|
|
98 |
return EventWrapper.newInstance(
|
|
99 |
asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
|
|
100 |
payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
|
|
101 |
topic.getValue());
|
|
102 |
}
|
|
100 | 103 |
|
101 |
private List<FieldTypeProtos.Author> authors(final Oaf oaf) { |
|
102 |
List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList(); |
|
103 |
if (authors == null) { |
|
104 |
return Lists.newLinkedList(); |
|
105 |
} |
|
106 |
return authors; |
|
107 |
} |
|
104 |
private List<FieldTypeProtos.Author> authors(final Oaf oaf) { |
|
105 |
final List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList(); |
|
106 |
if (authors == null) { return Lists.newLinkedList(); } |
|
107 |
return authors; |
|
108 |
} |
|
108 | 109 |
|
109 |
private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
|
|
110 |
final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
|
|
111 |
if (a.hasSurname() && b.hasSurname()) {
|
|
112 |
return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
|
|
110 |
private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
|
|
111 |
final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
|
|
112 |
if (a.hasSurname() && b.hasSurname()) {
|
|
113 |
return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b), null);
|
|
113 | 114 |
|
114 |
} else {
|
|
115 |
return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
|
|
116 |
}
|
|
117 |
}
|
|
115 |
} else {
|
|
116 |
return (float) jaroWinkler.distance(a.getFullname(), b.getFullname(), null);
|
|
117 |
}
|
|
118 |
}
|
|
118 | 119 |
|
119 |
// returns the 1st letter of the author name + the author surname, all in lowercase
|
|
120 |
// e.g. "pmanghi"
|
|
121 |
private String getCanonicalName(FieldTypeProtos.Author a) {
|
|
122 |
return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
|
|
123 |
}
|
|
120 |
// returns the 1st letter of the author name + the author surname, all in lowercase
|
|
121 |
// e.g. "pmanghi"
|
|
122 |
private String getCanonicalName(final FieldTypeProtos.Author a) {
|
|
123 |
return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
|
|
124 |
}
|
|
124 | 125 |
|
125 |
|
|
126 | 126 |
} |
Also available in: Unified diff
use of dnet-pace-core 3.0.15