Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.broker;
2

    
3
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
4

    
5
import java.util.LinkedList;
6
import java.util.List;
7
import java.util.Queue;
8
import java.util.function.Predicate;
9
import java.util.stream.Collectors;
10

    
11
import org.apache.commons.lang3.StringUtils;
12

    
13
import com.google.common.collect.Lists;
14

    
15
import eu.dnetlib.broker.objects.OpenAireEventPayload;
16
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
17
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
18
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
19
import eu.dnetlib.data.proto.FieldTypeProtos;
20
import eu.dnetlib.data.proto.OafProtos.Oaf;
21
import eu.dnetlib.miscutils.collections.Pair;
22
import eu.dnetlib.pace.distance.algo.JaroWinkler;
23

    
24
public class OrcidEventFactory {
25

    
26
	private static final long MAX_AUTHORS = 50;
27
	private static final Float t = 0.9f;
28
	public static final String ORCID_TYPE_MARKER = "ORCID";
29

    
30
	public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
31
		return new OrcidEventFactory().processOrcid(current, other, trust);
32
	}
33

    
34
	public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
35

    
36
		final List<EventWrapper> events = Lists.newArrayList();
37

    
38
		final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
39
		final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
40

    
41
		while (!currAuthors.isEmpty()) {
42
			final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
43

    
44
			Pair<FieldTypeProtos.Author, Float> bestMatch = null;
45
			for (final FieldTypeProtos.Author otherAuthor : otherAuthors) {
46

    
47
				final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
48
				if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
49
					bestMatch = pair;
50
				}
51
			}
52

    
53
			if (bestMatch != null && bestMatch.getValue() >= t) {
54
				final float authorTrust = trust * bestMatch.getValue(); // adjust it?
55
				events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
56
			}
57
		}
58

    
59
		return events;
60
	}
61

    
62
	private LinkedList<FieldTypeProtos.Author> getAuthors(final Oaf oaf, final Predicate<FieldTypeProtos.Author> p) {
63
		return authors(oaf).stream()
64
				.filter(p)
65
				.limit(MAX_AUTHORS)
66
				.collect(Collectors.toCollection(LinkedList::new));
67
	}
68

    
69
	private Predicate<FieldTypeProtos.Author> anyIsORCID() {
70
		return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
71
	}
72

    
73
	private Predicate<FieldTypeProtos.Author> noneIsORCID() {
74
		return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
75
	}
76

    
77
	private EventWrapper doProcessOrcid(final Oaf current,
78
			final Oaf other,
79
			final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair,
80
			final Topic topic,
81
			final float trust) {
82
		final Oaf.Builder prototype = Oaf.newBuilder(current);
83

    
84
		for (final FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
85
			if (a.getFullname().equals(pair.getKey().getFullname())) {
86
				a.addAllPid(
87
						pair.getValue().getPidList().stream()
88
								.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
89
								.collect(Collectors.toList()));
90
			}
91
		}
92

    
93
		final Oaf oaf = prototype.build();
94

    
95
		final OpenAireEventPayload payload =
96
				HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
97

    
98
		return EventWrapper.newInstance(
99
				asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
100
				payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
101
				topic.getValue());
102
	}
103

    
104
	private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
105
		final List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
106
		if (authors == null) { return Lists.newLinkedList(); }
107
		return authors;
108
	}
109

    
110
	private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
111
		final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
112
		if (a.hasSurname() && b.hasSurname()) {
113
			return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
114

    
115
		} else {
116
			return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
117
		}
118
	}
119

    
120
	// returns the 1st letter of the author name + the author surname, all in lowercase
121
	// e.g. "pmanghi"
122
	private String getCanonicalName(final FieldTypeProtos.Author a) {
123
		return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
124
	}
125

    
126
}
(3-3/10)