1
|
package eu.dnetlib.data.mapreduce.util;
|
2
|
|
3
|
import java.util.List;
|
4
|
import java.util.Map;
|
5
|
import java.util.Set;
|
6
|
|
7
|
import com.google.common.base.Predicate;
|
8
|
import com.google.common.collect.Iterables;
|
9
|
import com.google.common.collect.Lists;
|
10
|
import com.google.common.collect.Maps;
|
11
|
import com.google.common.collect.Sets;
|
12
|
import com.google.protobuf.Descriptors.FieldDescriptor;
|
13
|
import com.google.protobuf.Message.Builder;
|
14
|
|
15
|
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
|
16
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
17
|
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
18
|
import eu.dnetlib.data.proto.KindProtos.Kind;
|
19
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
20
|
import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
21
|
import eu.dnetlib.data.proto.PersonProtos.Person;
|
22
|
import eu.dnetlib.data.proto.ResultProtos.Result;
|
23
|
import eu.dnetlib.data.proto.SpecialTrustProtos.SpecialTrust;
|
24
|
|
25
|
public class OafEntityMerger {
|
26
|
|
27
|
private final Predicate<StringField> skipEmptyStringField = new Predicate<StringField>() {
|
28
|
|
29
|
@Override
|
30
|
public boolean apply(StringField s) {
|
31
|
return s != null && s.getValue() != null && !s.getValue().isEmpty();
|
32
|
}
|
33
|
};
|
34
|
|
35
|
private final Predicate<String> skipEmptyString = new Predicate<String>() {
|
36
|
|
37
|
@Override
|
38
|
public boolean apply(String s) {
|
39
|
return s != null && !s.isEmpty();
|
40
|
}
|
41
|
};
|
42
|
|
43
|
public static Oaf.Builder merge(String id, Iterable<Oaf> entities) {
|
44
|
return new OafEntityMerger().mergeEntities(id, entities);
|
45
|
}
|
46
|
|
47
|
public static Oaf.Builder merge(Oaf.Builder builder) {
|
48
|
return new OafEntityMerger().doMergeEntities(builder);
|
49
|
}
|
50
|
|
51
|
public Oaf.Builder mergeEntities(String id, Iterable<Oaf> entities) {
|
52
|
|
53
|
Oaf.Builder builder = Oaf.newBuilder();
|
54
|
String trust = "0.0";
|
55
|
for (Oaf oaf : TrustOrdering.sort(entities)) {
|
56
|
// doublecheck we're dealing only with main entities
|
57
|
if (!oaf.getKind().equals(Kind.entity)) { throw new IllegalArgumentException("expected OafEntity!"); }
|
58
|
|
59
|
String currentTrust = oaf.getDataInfo().getTrust();
|
60
|
if (!currentTrust.equals(SpecialTrust.NEUTRAL.toString())) {
|
61
|
trust = currentTrust;
|
62
|
}
|
63
|
builder.mergeFrom(oaf);
|
64
|
}
|
65
|
|
66
|
builder = doMergeEntities(builder);
|
67
|
builder.getEntityBuilder().setId(id);
|
68
|
builder.getDataInfoBuilder().setInferred(true).setDeletedbyinference(false).setTrust(trust);
|
69
|
|
70
|
return builder;
|
71
|
}
|
72
|
|
73
|
public Oaf.Builder doMergeEntities(Oaf.Builder builder) {
|
74
|
|
75
|
switch (builder.getEntity().getType()) {
|
76
|
case datasource:
|
77
|
break;
|
78
|
case organization:
|
79
|
break;
|
80
|
case person:
|
81
|
Person.Metadata.Builder person = builder.getEntityBuilder().getPersonBuilder().getMetadataBuilder();
|
82
|
for (String field : Lists.newArrayList("secondnames")) {
|
83
|
setSingleString(person, field);
|
84
|
}
|
85
|
break;
|
86
|
case project:
|
87
|
break;
|
88
|
case result:
|
89
|
Result.Metadata.Builder result = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder();
|
90
|
setTitle(result);
|
91
|
|
92
|
// for (String field : Lists.newArrayList("subject", "relevantdate")) {
|
93
|
for (String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SUBJECT_FIELD_NUMBER,
|
94
|
Result.Metadata.RELEVANTDATE_FIELD_NUMBER)) {
|
95
|
setStructuredProperty(result, field);
|
96
|
}
|
97
|
for (String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.DESCRIPTION_FIELD_NUMBER)) {
|
98
|
setLongestStringField(result, field);
|
99
|
}
|
100
|
for (String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SOURCE_FIELD_NUMBER)) {
|
101
|
setUniqueStringField(result, field);
|
102
|
}
|
103
|
for (String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.COLLECTEDFROM_FIELD_NUMBER)) {
|
104
|
setKeyValues(builder.getEntityBuilder(), field);
|
105
|
}
|
106
|
for (String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.PID_FIELD_NUMBER)) {
|
107
|
setStructuredProperty(builder.getEntityBuilder(), field);
|
108
|
}
|
109
|
for (String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.ORIGINALID_FIELD_NUMBER)) {
|
110
|
setUniqueString(builder.getEntityBuilder(), field);
|
111
|
}
|
112
|
break;
|
113
|
default:
|
114
|
break;
|
115
|
}
|
116
|
return builder;
|
117
|
}
|
118
|
|
119
|
/**
|
120
|
* Helper method, avoid duplicated StructuredProperties in the given builder for the given fieldName
|
121
|
*
|
122
|
* @param builder
|
123
|
* @param fieldName
|
124
|
*/
|
125
|
@SuppressWarnings("unchecked")
|
126
|
private void setStructuredProperty(Builder builder, String fieldName) {
|
127
|
final Map<String, StructuredProperty> map = Maps.newHashMap();
|
128
|
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
|
129
|
final List<StructuredProperty> sps = (List<StructuredProperty>) builder.getField(fd);
|
130
|
|
131
|
if (sps != null && !sps.isEmpty()) {
|
132
|
for (StructuredProperty sp : sps) {
|
133
|
map.put(sp.getValue(), sp);
|
134
|
}
|
135
|
|
136
|
if (!map.isEmpty()) {
|
137
|
builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
|
138
|
}
|
139
|
}
|
140
|
}
|
141
|
|
142
|
/**
|
143
|
* Helper method, avoid duplicated KeyValues in the given builder for the given fieldName
|
144
|
*
|
145
|
* @param builder
|
146
|
* @param fieldName
|
147
|
*/
|
148
|
@SuppressWarnings("unchecked")
|
149
|
private void setKeyValues(Builder builder, String fieldName) {
|
150
|
final Map<String, KeyValue> map = Maps.newHashMap();
|
151
|
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
|
152
|
final List<KeyValue> kvs = (List<KeyValue>) builder.getField(fd);
|
153
|
|
154
|
if (kvs != null && !kvs.isEmpty()) {
|
155
|
for (KeyValue sp : kvs) {
|
156
|
map.put(sp.getKey(), sp);
|
157
|
}
|
158
|
|
159
|
if (!map.isEmpty()) {
|
160
|
builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
|
161
|
}
|
162
|
}
|
163
|
}
|
164
|
|
165
|
@SuppressWarnings("unchecked")
|
166
|
private void setSingleString(Builder builder, String fieldName) {
|
167
|
|
168
|
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
|
169
|
final List<StringField> field = (List<StringField>) builder.getField(fd);
|
170
|
if (field != null && !field.isEmpty()) {
|
171
|
final StringField s = (StringField) Iterables.getLast(Iterables.filter(field, skipEmptyStringField), "");
|
172
|
|
173
|
if (s != null && s.getValue() != null && !s.getValue().isEmpty()) {
|
174
|
builder.clearField(fd).setField(fd, Lists.newArrayList(s));
|
175
|
}
|
176
|
}
|
177
|
}
|
178
|
|
179
|
@SuppressWarnings("unchecked")
|
180
|
private void setLongestStringField(Builder builder, String fieldName) {
|
181
|
|
182
|
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
|
183
|
final List<StringField> field = (List<StringField>) builder.getField(fd);
|
184
|
|
185
|
if (field != null && !field.isEmpty()) {
|
186
|
StringField.Builder max = StringField.newBuilder().setValue("");
|
187
|
int maxLength = 0;
|
188
|
for (StringField sf : field) {
|
189
|
if (sf.getValue().length() > maxLength) {
|
190
|
maxLength = sf.getValue().length();
|
191
|
max.clear();
|
192
|
max.mergeFrom(sf);
|
193
|
}
|
194
|
}
|
195
|
|
196
|
builder.clearField(fd).setField(fd, Lists.newArrayList(max.build()));
|
197
|
}
|
198
|
}
|
199
|
|
200
|
@SuppressWarnings("unchecked")
|
201
|
private void setUniqueStringField(Builder builder, String fieldName) {
|
202
|
|
203
|
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
|
204
|
final List<StringField> field = (List<StringField>) builder.getField(fd);
|
205
|
final Map<String, StringField> map = Maps.newHashMap();
|
206
|
if (field != null && !field.isEmpty()) {
|
207
|
for (StringField s : Iterables.filter(field, skipEmptyStringField)) {
|
208
|
map.put(s.getValue(), s);
|
209
|
}
|
210
|
|
211
|
builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
|
212
|
}
|
213
|
}
|
214
|
|
215
|
@SuppressWarnings("unchecked")
|
216
|
private void setUniqueString(Builder builder, String fieldName) {
|
217
|
|
218
|
final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
|
219
|
final List<String> field = (List<String>) builder.getField(fd);
|
220
|
final Set<String> set = Sets.newHashSet();
|
221
|
if (field != null && !field.isEmpty()) {
|
222
|
for (String s : Iterables.filter(field, skipEmptyString)) {
|
223
|
set.add(s);
|
224
|
}
|
225
|
|
226
|
builder.clearField(fd).setField(fd, Lists.newArrayList(set));
|
227
|
}
|
228
|
}
|
229
|
|
230
|
private void setTitle(Result.Metadata.Builder metadata) {
|
231
|
Iterable<StructuredProperty> filtered = Iterables.filter(metadata.getTitleList(), OafUtils.mainTitleFilter());
|
232
|
|
233
|
if (!Iterables.isEmpty(filtered)) {
|
234
|
metadata.clearTitle().addTitle(Iterables.getLast(filtered));
|
235
|
}
|
236
|
}
|
237
|
|
238
|
}
|