Project

General

Profile

1
package eu.dnetlib.data.transform;
2

    
3
import java.util.Collection;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.Set;
7
import java.util.stream.Collectors;
8
import java.util.stream.Stream;
9

    
10
import com.google.common.base.Functions;
11
import com.google.common.base.Predicate;
12
import com.google.common.collect.Iterables;
13
import com.google.common.collect.Lists;
14
import com.google.common.collect.Maps;
15
import com.google.common.collect.Sets;
16
import com.google.protobuf.Descriptors.FieldDescriptor;
17
import com.google.protobuf.Message.Builder;
18
import eu.dnetlib.data.proto.FieldTypeProtos;
19
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
20
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
21
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
22
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
23
import eu.dnetlib.data.proto.KindProtos.Kind;
24
import eu.dnetlib.data.proto.OafProtos.Oaf;
25
import eu.dnetlib.data.proto.OafProtos.OafEntity;
26
import eu.dnetlib.data.proto.ResultProtos;
27
import eu.dnetlib.data.proto.ResultProtos.Result;
28
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
29
import eu.dnetlib.data.proto.SpecialTrustProtos.SpecialTrust;
30
import eu.dnetlib.data.proto.TypeProtos.Type;
31
import eu.dnetlib.pace.config.DedupConfig;
32
import org.apache.commons.lang.StringUtils;
33

    
34
public class OafEntityMerger {
35

    
36
	private static final String DEDUP_CLASSID = "sysimport:dedup";
37

    
38
	private static final String DNET_PROVENANCE_SCHEME = "dnet:provenanceActions";
39

    
40
	private final Predicate<StringField> skipEmptyStringField = s -> (s != null) && (s.getValue() != null) && !s.getValue().isEmpty();
41

    
42
	private final Predicate<String> skipEmptyString = s -> StringUtils.isNotBlank(s);
43

    
44
	public static Oaf.Builder merge(final String id, final Iterable<Oaf> entities) {
45
		return merge(null, id, entities);
46
	}
47

    
48
	public static Oaf.Builder merge(final DedupConfig dedupConf, final String id, final Iterable<Oaf> entities) {
49
		return new OafEntityMerger().mergeEntities(dedupConf, id, entities);
50
	}
51

    
52
	public static Oaf.Builder merge(final Oaf.Builder builder) {
53
		return new OafEntityMerger().doMergeEntities(builder);
54
	}
55

    
56
	public Oaf.Builder mergeEntities(final DedupConfig dedupConf, final String id, final Iterable<Oaf> entities) {
57

    
58
		Oaf.Builder builder = Oaf.newBuilder();
59
		String trust = "0.0";
60

    
61
		for (final Oaf oaf : TrustOrdering.sort(entities)) {
62
			// doublecheck we're dealing only with main entities
63
			if (!oaf.getKind().equals(Kind.entity)) throw new IllegalArgumentException("expected OafEntity!");
64

    
65
			final String currentTrust = oaf.getDataInfo().getTrust();
66
			if (!currentTrust.equals(SpecialTrust.NEUTRAL.toString())) {
67
				trust = currentTrust;
68
			}
69

    
70
			builder.mergeFrom(oaf);
71

    
72
			if (oaf.getEntity().getType().equals(Type.result)) {
73
				builder.getEntityBuilder().getResultBuilder().getMetadataBuilder().clearAuthor();
74
				builder.getEntityBuilder().getResultBuilder().getMetadataBuilder().addAllAuthor(oaf.getEntity().getResult().getMetadata().getAuthorList());
75
			}
76
		}
77

    
78
		builder = doMergeEntities(builder);
79
		builder.getEntityBuilder().setId(id);
80
		builder.getDataInfoBuilder()
81
				.setInvisible(false)
82
				.setInferred(true)
83
				.setDeletedbyinference(false)
84
				.setTrust(trust)
85
				.setInferenceprovenance(dedupConf != null ? dedupConf.getWf().getConfigurationId() : "")
86
				.setProvenanceaction(getProvenanceAction());
87

    
88
		if ((dedupConf != null) && dedupConf.getWf().isIncludeChildren()) {
89
			for (final Oaf oaf : Iterables.limit(entities, dedupConf.getWf().getMaxChildren())) {
90
				builder.getEntityBuilder().addChildren(oaf.getEntity());
91
			}
92
		}
93

    
94
		return builder;
95
	}
96

    
97
	private Qualifier.Builder getProvenanceAction() {
98
		return Qualifier.newBuilder().setClassid(DEDUP_CLASSID).setClassname(DEDUP_CLASSID).setSchemeid(DNET_PROVENANCE_SCHEME)
99
				.setSchemename(DNET_PROVENANCE_SCHEME);
100
	}
101

    
102
	public Oaf.Builder doMergeEntities(final Oaf.Builder builder) {
103

    
104
		for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.COLLECTEDFROM_FIELD_NUMBER)) {
105
			setKeyValues(builder.getEntityBuilder(), field);
106
		}
107
		for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.PID_FIELD_NUMBER)) {
108
			setStructuredProperty(builder.getEntityBuilder(), field);
109
		}
110
		for (final String field : OafUtils.getFieldNames(OafEntity.getDescriptor(), OafEntity.ORIGINALID_FIELD_NUMBER)) {
111
			setUniqueString(builder.getEntityBuilder(), field);
112
		}
113

    
114
		switch (builder.getEntity().getType()) {
115
		case datasource:
116
			break;
117
		case organization:
118
			break;
119
		case project:
120
			break;
121
		case result:
122
			final Result.Metadata.Builder result = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder();
123
			setTitle(result);
124
			mergeInstances(builder.getEntityBuilder().getResultBuilder());
125

    
126
			// for (String field : Lists.newArrayList("subject", "relevantdate")) {
127
			for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SUBJECT_FIELD_NUMBER,
128
					Result.Metadata.RELEVANTDATE_FIELD_NUMBER)) {
129
				setStructuredProperty(result, field);
130
			}
131
			for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.DESCRIPTION_FIELD_NUMBER)) {
132
				setLongestStringField(result, field);
133
			}
134
			for (final String field : OafUtils.getFieldNames(Result.Metadata.getDescriptor(), Result.Metadata.SOURCE_FIELD_NUMBER)) {
135
				setUniqueStringField(result, field);
136
			}
137

    
138
			mergeContexts(result);
139
			mergeCountries(result);
140

    
141
			break;
142
		default:
143
			break;
144
		}
145
		return builder;
146
	}
147

    
148
	private void mergeCountries(Result.Metadata.Builder result) {
149
		final Map<String, Qualifier.Builder> cMap = Maps.newHashMap();
150
		for(Qualifier country : result.getCountryList()) {
151
			if (!cMap.containsKey(country.getClassid())) {
152
				cMap.put(country.getClassid(), Qualifier.newBuilder(country));
153
			}
154
		}
155
		if (!cMap.isEmpty()) {
156
			result.clearCountry();
157
			for(Qualifier.Builder country : cMap.values()) {
158
				result.addCountry(country.build());
159
			}
160
		}
161
	}
162

    
163
	private void mergeContexts(Result.Metadata.Builder result) {
164
		final Map<String, Result.Context.Builder> cMap = Maps.newHashMap();
165
		for(Result.Context c : result.getContextList()) {
166
			if (!cMap.containsKey(c.getId())) {
167
				//TODO merge DataInfo
168
				cMap.put(c.getId(), Result.Context.newBuilder(c));
169
			}
170
		}
171
		if (!cMap.isEmpty()) {
172
			result.clearContext();
173
			for (Result.Context.Builder b : cMap.values()) {
174
				result.addContext(b.build());
175
			}
176
		}
177
	}
178

    
179
	private void mergeInstances(final Result.Builder builder) {
180
		final Map<String, Instance.Builder> map = Maps.newHashMap();
181
		for(Instance i : builder.getInstanceList()) {
182

    
183
			final String key = i.getHostedby().getKey() + i.getAccessright().getClassid() + i.getInstancetype().getClassid();
184
			if (!map.containsKey(key)) {
185
				map.put(key, Instance.newBuilder().mergeFrom(i));
186
				continue;
187
			}
188

    
189
			map.get(key).addAllUrl(i.getUrlList());
190
		}
191

    
192
		for(Instance.Builder i : map.values()) {
193
			final Set<String> urls = Sets.newHashSet();
194
			urls.addAll(i.getUrlList());
195
			i.clearUrl().addAllUrl(urls);
196
		}
197
		builder.clearInstance();
198
		builder.addAllInstance(Iterables.transform(map.values(), b -> b.build()));
199
	}
200

    
201
	/**
202
	 * Helper method, avoid duplicated StructuredProperties in the given builder for the given fieldName
203
	 *
204
	 * @param builder
205
	 * @param fieldName
206
	 */
207
	@SuppressWarnings("unchecked")
208
	private void setStructuredProperty(final Builder builder, final String fieldName) {
209
		final Map<String, StructuredProperty> map = Maps.newHashMap();
210
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
211
		final List<StructuredProperty> sps = (List<StructuredProperty>) builder.getField(fd);
212

    
213
		if ((sps != null) && !sps.isEmpty()) {
214
			for (final StructuredProperty sp : sps) {
215
				if (StringUtils.isNotBlank(sp.getValue())) {
216
					map.put(sp.getValue().toLowerCase(), sp);
217
				}
218
			}
219

    
220
			if (!map.isEmpty()) {
221
				builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
222
			}
223
		}
224
	}
225

    
226
	/**
227
	 * Helper method, avoid duplicated KeyValues in the given builder for the given fieldName
228
	 *
229
	 * @param builder
230
	 * @param fieldName
231
	 */
232
	@SuppressWarnings("unchecked")
233
	private void setKeyValues(final Builder builder, final String fieldName) {
234
		final Map<String, KeyValue> map = Maps.newHashMap();
235
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
236
		final List<KeyValue> kvs = (List<KeyValue>) builder.getField(fd);
237

    
238
		if ((kvs != null) && !kvs.isEmpty()) {
239
			for (final KeyValue sp : kvs) {
240
				map.put(sp.getKey(), sp);
241
			}
242

    
243
			if (!map.isEmpty()) {
244
				builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
245
			}
246
		}
247
	}
248

    
249
	@SuppressWarnings("unchecked")
250
	private void setSingleString(final Builder builder, final String fieldName) {
251

    
252
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
253
		final List<StringField> field = (List<StringField>) builder.getField(fd);
254
		if ((field != null) && !field.isEmpty()) {
255
			final StringField s = (StringField) Iterables.getLast(Iterables.filter(field, skipEmptyStringField), "");
256

    
257
			if ((s != null) && (s.getValue() != null) && !s.getValue().isEmpty()) {
258
				builder.clearField(fd).setField(fd, Lists.newArrayList(s));
259
			}
260
		}
261
	}
262

    
263
	@SuppressWarnings("unchecked")
264
	private void setLongestStringField(final Builder builder, final String fieldName) {
265

    
266
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
267
		final List<StringField> field = (List<StringField>) builder.getField(fd);
268

    
269
		if ((field != null) && !field.isEmpty()) {
270
			final StringField.Builder max = StringField.newBuilder().setValue("");
271
			int maxLength = 0;
272
			for (final StringField sf : field) {
273
				if (sf.getValue().length() > maxLength) {
274
					maxLength = sf.getValue().length();
275
					max.clear();
276
					max.mergeFrom(sf);
277
				}
278
			}
279

    
280
			builder.clearField(fd).setField(fd, Lists.newArrayList(max.build()));
281
		}
282
	}
283

    
284
	@SuppressWarnings("unchecked")
285
	private void setUniqueStringField(final Builder builder, final String fieldName) {
286

    
287
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
288
		final List<StringField> field = (List<StringField>) builder.getField(fd);
289
		final Map<String, StringField> map = Maps.newHashMap();
290
		if ((field != null) && !field.isEmpty()) {
291
			for (final StringField s : Iterables.filter(field, skipEmptyStringField)) {
292
				map.put(s.getValue(), s);
293
			}
294

    
295
			builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
296
		}
297
	}
298

    
299
	@SuppressWarnings("unchecked")
300
	private void setUniqueString(final Builder builder, final String fieldName) {
301

    
302
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
303
		final List<String> field = (List<String>) builder.getField(fd);
304
		final Set<String> set = Sets.newHashSet();
305
		if ((field != null) && !field.isEmpty()) {
306
			for (final String s : Iterables.filter(field, skipEmptyString)) {
307
				set.add(s);
308
			}
309

    
310
			builder.clearField(fd).setField(fd, Lists.newArrayList(set));
311
		}
312
	}
313

    
314
	private void setTitle(final Result.Metadata.Builder metadata) {
315
		final Iterable<StructuredProperty> filtered = Iterables.filter(metadata.getTitleList(), OafUtils.mainTitleFilter());
316

    
317
		if (!Iterables.isEmpty(filtered)) {
318
			metadata.clearTitle().addTitle(Iterables.getLast(filtered));
319
		}
320
	}
321

    
322
}
(2-2/6)