Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.broker;
2

    
3
import com.google.common.collect.Lists;
4
import com.google.common.collect.Maps;
5
import eu.dnetlib.broker.objects.OpenAireEventPayload;
6
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
7
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
8
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
9

    
10
import eu.dnetlib.data.proto.FieldTypeProtos;
11
import eu.dnetlib.data.proto.OafProtos;
12
import eu.dnetlib.data.proto.OafProtos.Oaf;
13
import eu.dnetlib.miscutils.collections.Pair;
14
import eu.dnetlib.pace.distance.algo.JaroWinkler;
15
import org.apache.commons.lang3.StringUtils;
16
import org.dom4j.DocumentException;
17

    
18
import java.io.IOException;
19
import java.util.*;
20
import java.util.function.Predicate;
21
import java.util.stream.Collectors;
22

    
23
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
24

    
25
public class OrcidEventFactory {
26

    
27
    private static final long MAX_AUTHORS = 50;
28
    private static final Float t = 0.9f;
29
    public static final String ORCID_TYPE_MARKER = "ORCID";
30

    
31
    public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
32
        return new OrcidEventFactory().processOrcid(current, other, trust);
33
    }
34

    
35
    public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
36

    
37
        final List<EventWrapper> events = Lists.newArrayList();
38

    
39
        final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
40
        final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
41

    
42
        while (!currAuthors.isEmpty()) {
43
            final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
44

    
45
            Pair<FieldTypeProtos.Author, Float> bestMatch = null;
46
            for(FieldTypeProtos.Author otherAuthor : otherAuthors) {
47

    
48
                final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
49
                if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
50
                    bestMatch = pair;
51
                }
52
            }
53

    
54
            if (bestMatch != null && bestMatch.getValue() >= t) {
55
                float authorTrust = trust * bestMatch.getValue(); // adjust it?
56
                events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
57
            }
58
        }
59

    
60
        return events;
61
    }
62

    
63
    private LinkedList<FieldTypeProtos.Author> getAuthors(Oaf oaf, Predicate<FieldTypeProtos.Author> p) {
64
        return authors(oaf).stream()
65
                .filter(p)
66
                .limit(MAX_AUTHORS)
67
                .collect(Collectors.toCollection(LinkedList::new));
68
    }
69

    
70
    private Predicate<FieldTypeProtos.Author> anyIsORCID() {
71
        return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
72
    }
73

    
74
    private Predicate<FieldTypeProtos.Author> noneIsORCID() {
75
        return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
76
    }
77

    
78
    private EventWrapper doProcessOrcid(final Oaf current, final Oaf other, final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair, final Topic topic, final float trust) {
79
        final Oaf.Builder prototype = Oaf.newBuilder(current);
80

    
81
        for(FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
82
            if (a.getFullname().equals(pair.getKey().getFullname())) {
83
                a.addAllPid(
84
                        pair.getValue().getPidList().stream()
85
                        .filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
86
                        .collect(Collectors.toList()));
87
            }
88
        }
89

    
90
        final Oaf oaf = prototype.build();
91

    
92
        final OpenAireEventPayload payload =
93
                HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
94

    
95
        return EventWrapper.newInstance(
96
                asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
97
                payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
98
                topic.getValue());
99
    }
100

    
101
    private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
102
        List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
103
        if (authors == null) {
104
            return Lists.newLinkedList();
105
        }
106
        return authors;
107
    }
108

    
109
    private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
110
        final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
111
        if (a.hasSurname() && b.hasSurname()) {
112
            return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
113

    
114
        } else {
115
            return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
116
        }
117
    }
118

    
119
    // returns the 1st letter of the author name + the author surname, all in lowercase
120
    // e.g. "pmanghi"
121
    private String getCanonicalName(FieldTypeProtos.Author a) {
122
        return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
123
    }
124

    
125

    
126
}
(3-3/10)