Project

General

Profile

« Previous | Next » 

Revision 57450

use of dnet-pace-core 3.0.15

View differences:

OrcidEventFactory.java
1 1
package eu.dnetlib.data.mapreduce.hbase.broker;
2 2

  
3
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
4

  
5
import java.util.LinkedList;
6
import java.util.List;
7
import java.util.Queue;
8
import java.util.function.Predicate;
9
import java.util.stream.Collectors;
10

  
11
import org.apache.commons.lang3.StringUtils;
12

  
3 13
import com.google.common.collect.Lists;
4
import com.google.common.collect.Maps;
14

  
5 15
import eu.dnetlib.broker.objects.OpenAireEventPayload;
6 16
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
7 17
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
8 18
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
9

  
10 19
import eu.dnetlib.data.proto.FieldTypeProtos;
11
import eu.dnetlib.data.proto.OafProtos;
12 20
import eu.dnetlib.data.proto.OafProtos.Oaf;
13 21
import eu.dnetlib.miscutils.collections.Pair;
14 22
import eu.dnetlib.pace.distance.algo.JaroWinkler;
15
import org.apache.commons.lang3.StringUtils;
16
import org.dom4j.DocumentException;
17 23

  
18
import java.io.IOException;
19
import java.util.*;
20
import java.util.function.Predicate;
21
import java.util.stream.Collectors;
22

  
23
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
24

  
25 24
public class OrcidEventFactory {
26 25

  
27
    private static final long MAX_AUTHORS = 50;
28
    private static final Float t = 0.9f;
29
    public static final String ORCID_TYPE_MARKER = "ORCID";
26
	private static final long MAX_AUTHORS = 50;
27
	private static final Float t = 0.9f;
28
	public static final String ORCID_TYPE_MARKER = "ORCID";
30 29

  
31
    public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
32
        return new OrcidEventFactory().processOrcid(current, other, trust);
33
    }
30
	public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
31
		return new OrcidEventFactory().processOrcid(current, other, trust);
32
	}
34 33

  
35
    public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
34
	public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
36 35

  
37
        final List<EventWrapper> events = Lists.newArrayList();
36
		final List<EventWrapper> events = Lists.newArrayList();
38 37

  
39
        final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
40
        final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
38
		final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
39
		final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
41 40

  
42
        while (!currAuthors.isEmpty()) {
43
            final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
41
		while (!currAuthors.isEmpty()) {
42
			final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
44 43

  
45
            Pair<FieldTypeProtos.Author, Float> bestMatch = null;
46
            for(FieldTypeProtos.Author otherAuthor : otherAuthors) {
44
			Pair<FieldTypeProtos.Author, Float> bestMatch = null;
45
			for (final FieldTypeProtos.Author otherAuthor : otherAuthors) {
47 46

  
48
                final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
49
                if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
50
                    bestMatch = pair;
51
                }
52
            }
47
				final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
48
				if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
49
					bestMatch = pair;
50
				}
51
			}
53 52

  
54
            if (bestMatch != null && bestMatch.getValue() >= t) {
55
                float authorTrust = trust * bestMatch.getValue(); // adjust it?
56
                events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
57
            }
58
        }
53
			if (bestMatch != null && bestMatch.getValue() >= t) {
54
				final float authorTrust = trust * bestMatch.getValue(); // adjust it?
55
				events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
56
			}
57
		}
59 58

  
60
        return events;
61
    }
59
		return events;
60
	}
62 61

  
63
    private LinkedList<FieldTypeProtos.Author> getAuthors(Oaf oaf, Predicate<FieldTypeProtos.Author> p) {
64
        return authors(oaf).stream()
65
                .filter(p)
66
                .limit(MAX_AUTHORS)
67
                .collect(Collectors.toCollection(LinkedList::new));
68
    }
62
	private LinkedList<FieldTypeProtos.Author> getAuthors(final Oaf oaf, final Predicate<FieldTypeProtos.Author> p) {
63
		return authors(oaf).stream()
64
				.filter(p)
65
				.limit(MAX_AUTHORS)
66
				.collect(Collectors.toCollection(LinkedList::new));
67
	}
69 68

  
70
    private Predicate<FieldTypeProtos.Author> anyIsORCID() {
71
        return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
72
    }
69
	private Predicate<FieldTypeProtos.Author> anyIsORCID() {
70
		return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
71
	}
73 72

  
74
    private Predicate<FieldTypeProtos.Author> noneIsORCID() {
75
        return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
76
    }
73
	private Predicate<FieldTypeProtos.Author> noneIsORCID() {
74
		return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
75
	}
77 76

  
78
    private EventWrapper doProcessOrcid(final Oaf current, final Oaf other, final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair, final Topic topic, final float trust) {
79
        final Oaf.Builder prototype = Oaf.newBuilder(current);
77
	private EventWrapper doProcessOrcid(final Oaf current,
78
			final Oaf other,
79
			final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair,
80
			final Topic topic,
81
			final float trust) {
82
		final Oaf.Builder prototype = Oaf.newBuilder(current);
80 83

  
81
        for(FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
82
            if (a.getFullname().equals(pair.getKey().getFullname())) {
83
                a.addAllPid(
84
                        pair.getValue().getPidList().stream()
85
                        .filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
86
                        .collect(Collectors.toList()));
87
            }
88
        }
84
		for (final FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
85
			if (a.getFullname().equals(pair.getKey().getFullname())) {
86
				a.addAllPid(
87
						pair.getValue().getPidList().stream()
88
								.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
89
								.collect(Collectors.toList()));
90
			}
91
		}
89 92

  
90
        final Oaf oaf = prototype.build();
93
		final Oaf oaf = prototype.build();
91 94

  
92
        final OpenAireEventPayload payload =
93
                HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
95
		final OpenAireEventPayload payload =
96
				HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
94 97

  
95
        return EventWrapper.newInstance(
96
                asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
97
                payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
98
                topic.getValue());
99
    }
98
		return EventWrapper.newInstance(
99
				asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
100
				payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
101
				topic.getValue());
102
	}
100 103

  
101
    private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
102
        List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
103
        if (authors == null) {
104
            return Lists.newLinkedList();
105
        }
106
        return authors;
107
    }
104
	private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
105
		final List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
106
		if (authors == null) { return Lists.newLinkedList(); }
107
		return authors;
108
	}
108 109

  
109
    private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
110
        final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
111
        if (a.hasSurname() && b.hasSurname()) {
112
            return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
110
	private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
111
		final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
112
		if (a.hasSurname() && b.hasSurname()) {
113
			return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b), null);
113 114

  
114
        } else {
115
            return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
116
        }
117
    }
115
		} else {
116
			return (float) jaroWinkler.distance(a.getFullname(), b.getFullname(), null);
117
		}
118
	}
118 119

  
119
    // returns the 1st letter of the author name + the author surname, all in lowercase
120
    // e.g. "pmanghi"
121
    private String getCanonicalName(FieldTypeProtos.Author a) {
122
        return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
123
    }
120
	// returns the 1st letter of the author name + the author surname, all in lowercase
121
	// e.g. "pmanghi"
122
	private String getCanonicalName(final FieldTypeProtos.Author a) {
123
		return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
124
	}
124 125

  
125

  
126 126
}

Also available in: Unified diff