Project

General

Profile

1 54765 claudio.at
package eu.dnetlib.data.mapreduce.hbase.broker;
2
3 57450 michele.ar
import static eu.dnetlib.data.mapreduce.hbase.broker.mapping.EventFactory.asEvent;
4
5
import java.util.LinkedList;
6
import java.util.List;
7
import java.util.Queue;
8
import java.util.function.Predicate;
9
import java.util.stream.Collectors;
10
11
import org.apache.commons.lang3.StringUtils;
12
13 54765 claudio.at
import com.google.common.collect.Lists;
14 57450 michele.ar
15 54765 claudio.at
import eu.dnetlib.broker.objects.OpenAireEventPayload;
16
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.HighlightFactory;
17
import eu.dnetlib.data.mapreduce.hbase.broker.mapping.OpenAireEventPayloadFactory;
18
import eu.dnetlib.data.mapreduce.hbase.broker.model.EventWrapper;
19
import eu.dnetlib.data.proto.FieldTypeProtos;
20
import eu.dnetlib.data.proto.OafProtos.Oaf;
21
import eu.dnetlib.miscutils.collections.Pair;
22
import eu.dnetlib.pace.distance.algo.JaroWinkler;
23
24
public class OrcidEventFactory {
25
26 57450 michele.ar
	private static final long MAX_AUTHORS = 50;
27
	private static final Float t = 0.9f;
28
	public static final String ORCID_TYPE_MARKER = "ORCID";
29 54765 claudio.at
30 57450 michele.ar
	public static List<EventWrapper> process(final Oaf current, final Oaf other, final float trust) {
31
		return new OrcidEventFactory().processOrcid(current, other, trust);
32
	}
33 54765 claudio.at
34 57450 michele.ar
	public List<EventWrapper> processOrcid(final Oaf current, final Oaf other, final float trust) {
35 54765 claudio.at
36 57450 michele.ar
		final List<EventWrapper> events = Lists.newArrayList();
37 54765 claudio.at
38 57450 michele.ar
		final Queue<FieldTypeProtos.Author> currAuthors = getAuthors(current, noneIsORCID());
39
		final Queue<FieldTypeProtos.Author> otherAuthors = getAuthors(other, anyIsORCID());
40 54765 claudio.at
41 57450 michele.ar
		while (!currAuthors.isEmpty()) {
42
			final FieldTypeProtos.Author currentAuthor = currAuthors.remove();
43 54765 claudio.at
44 57450 michele.ar
			Pair<FieldTypeProtos.Author, Float> bestMatch = null;
45
			for (final FieldTypeProtos.Author otherAuthor : otherAuthors) {
46 54765 claudio.at
47 57450 michele.ar
				final Pair<FieldTypeProtos.Author, Float> pair = new Pair<>(otherAuthor, distance(currentAuthor, otherAuthor));
48
				if (bestMatch == null || pair.getValue() > bestMatch.getValue()) {
49
					bestMatch = pair;
50
				}
51
			}
52 54765 claudio.at
53 57450 michele.ar
			if (bestMatch != null && bestMatch.getValue() >= t) {
54
				final float authorTrust = trust * bestMatch.getValue(); // adjust it?
55
				events.add(doProcessOrcid(current, other, new Pair<>(currentAuthor, bestMatch.getKey()), Topic.ENRICH_MISSING_AUTHOR_ORCID, authorTrust));
56
			}
57
		}
58 54765 claudio.at
59 57450 michele.ar
		return events;
60
	}
61 54765 claudio.at
62 57450 michele.ar
	private LinkedList<FieldTypeProtos.Author> getAuthors(final Oaf oaf, final Predicate<FieldTypeProtos.Author> p) {
63
		return authors(oaf).stream()
64
				.filter(p)
65
				.limit(MAX_AUTHORS)
66
				.collect(Collectors.toCollection(LinkedList::new));
67
	}
68 54794 claudio.at
69 57450 michele.ar
	private Predicate<FieldTypeProtos.Author> anyIsORCID() {
70
		return author -> author.getPidList().stream().anyMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
71
	}
72 54794 claudio.at
73 57450 michele.ar
	private Predicate<FieldTypeProtos.Author> noneIsORCID() {
74
		return author -> author.getPidList().stream().noneMatch(pid -> ORCID_TYPE_MARKER.equals(pid.getKey()));
75
	}
76 54794 claudio.at
77 57450 michele.ar
	private EventWrapper doProcessOrcid(final Oaf current,
78
			final Oaf other,
79
			final Pair<FieldTypeProtos.Author, FieldTypeProtos.Author> pair,
80
			final Topic topic,
81
			final float trust) {
82
		final Oaf.Builder prototype = Oaf.newBuilder(current);
83 54765 claudio.at
84 57450 michele.ar
		for (final FieldTypeProtos.Author.Builder a : prototype.getEntityBuilder().getResultBuilder().getMetadataBuilder().getAuthorBuilderList()) {
85
			if (a.getFullname().equals(pair.getKey().getFullname())) {
86
				a.addAllPid(
87
						pair.getValue().getPidList().stream()
88
								.filter(p -> ORCID_TYPE_MARKER.equals(p.getKey()))
89
								.collect(Collectors.toList()));
90
			}
91
		}
92 54765 claudio.at
93 57450 michele.ar
		final Oaf oaf = prototype.build();
94 54765 claudio.at
95 57450 michele.ar
		final OpenAireEventPayload payload =
96
				HighlightFactory.highlightEnrichOrcidAuthor(OpenAireEventPayloadFactory.fromOAF(oaf.getEntity(), other.getEntity(), trust), pair.getValue());
97 54765 claudio.at
98 57450 michele.ar
		return EventWrapper.newInstance(
99
				asEvent(oaf.getEntity(), topic, payload, other.getEntity(), trust),
100
				payload.getHighlight().getCreators().stream().filter(s -> StringUtils.contains(s, ORCID_TYPE_MARKER)).collect(Collectors.joining(", ")),
101
				topic.getValue());
102
	}
103 54765 claudio.at
104 57450 michele.ar
	private List<FieldTypeProtos.Author> authors(final Oaf oaf) {
105
		final List<FieldTypeProtos.Author> authors = oaf.getEntity().getResult().getMetadata().getAuthorList();
106
		if (authors == null) { return Lists.newLinkedList(); }
107
		return authors;
108
	}
109 54765 claudio.at
110 57450 michele.ar
	private Float distance(final FieldTypeProtos.Author a, final FieldTypeProtos.Author b) {
111
		final JaroWinkler jaroWinkler = new JaroWinkler(1.0);
112
		if (a.hasSurname() && b.hasSurname()) {
113 57632 claudio.at
			return (float) jaroWinkler.distance(getCanonicalName(a), getCanonicalName(b));
114 54765 claudio.at
115 57450 michele.ar
		} else {
116 57632 claudio.at
			return (float) jaroWinkler.distance(a.getFullname(), b.getFullname());
117 57450 michele.ar
		}
118
	}
119 54765 claudio.at
120 57450 michele.ar
	// returns the 1st letter of the author name + the author surname, all in lowercase
121
	// e.g. "pmanghi"
122
	private String getCanonicalName(final FieldTypeProtos.Author a) {
123
		return (StringUtils.substring(a.getName(), 0, 1) + a.getSurname()).toLowerCase();
124
	}
125 54765 claudio.at
126
}