Project

General

Profile

1
package eu.dnetlib.data.transform;
2

    
3
import java.util.List;
4
import java.util.Map;
5
import java.util.Set;
6

    
7
import com.google.common.base.Predicate;
8
import com.google.common.collect.Iterables;
9
import com.google.common.collect.Lists;
10
import com.google.common.collect.Maps;
11
import com.google.common.collect.Sets;
12
import com.google.protobuf.Descriptors.FieldDescriptor;
13
import com.google.protobuf.Message.Builder;
14
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
15
import eu.dnetlib.data.proto.DNGFProtos.DNGFEntity;
16
import eu.dnetlib.data.proto.DatasetProtos.Dataset;
17
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
18
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
19
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
20
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
21
import eu.dnetlib.data.proto.KindProtos.Kind;
22
import eu.dnetlib.data.proto.PersonProtos.Person;
23
import eu.dnetlib.data.proto.PersonProtos.Person.CoAuthor;
24
import eu.dnetlib.data.proto.PersonProtos.Person.MergedPerson;
25
import eu.dnetlib.data.proto.PersonProtos.Person.Metadata;
26
import eu.dnetlib.data.proto.PublicationProtos.Publication;
27
import eu.dnetlib.data.proto.SpecialTrustProtos.SpecialTrust;
28
import eu.dnetlib.pace.config.DedupConfig;
29
import org.apache.commons.lang3.StringUtils;
30

    
31
public class DNGFEntityMerger {
32

    
33
	private static final String DEDUP_CLASSID = "sysimport:dedup";
34

    
35
	private static final String DNET_PROVENANCE_SCHEME = "dnet:provenanceActions";
36

    
37
	private final Predicate<StringField> skipEmptyStringField = s -> (s != null) && (s.getValue() != null) && !s.getValue().isEmpty();
38

    
39
	private final Predicate<String> skipEmptyString = s -> StringUtils.isNotBlank(s);
40

    
41
	public static DNGF.Builder merge(final String id, final Iterable<DNGF> entities) {
42
		return merge(null, id, entities);
43
	}
44

    
45
	public static DNGF.Builder merge(final DedupConfig dedupConf, final String id, final Iterable<DNGF> entities) {
46
		return new DNGFEntityMerger().mergeEntities(dedupConf, id, entities);
47
	}
48

    
49
	public static DNGF.Builder merge(final DNGF.Builder builder) {
50
		return new DNGFEntityMerger().doMergeEntities(builder);
51
	}
52

    
53
	public DNGF.Builder mergeEntities(final DedupConfig dedupConf, final String id, final Iterable<DNGF> entities) {
54

    
55
		DNGF.Builder builder = DNGF.newBuilder();
56
		String trust = "0.0";
57

    
58
		for (final DNGF dngf : TrustOrdering.sort(entities)) {
59
			// doublecheck we're dealing only with main entities
60
			if (!dngf.getKind().equals(Kind.entity)) throw new IllegalArgumentException("expected DNGFEntity!");
61

    
62
			final String currentTrust = dngf.getDataInfo().getTrust();
63
			if (!currentTrust.equals(SpecialTrust.NEUTRAL.toString())) {
64
				trust = currentTrust;
65
			}
66
			builder.mergeFrom(dngf);
67
		}
68

    
69
		builder = doMergeEntities(builder);
70
		builder.getEntityBuilder().setId(id);
71
		builder.getDataInfoBuilder()
72
				.setInferred(true)
73
				.setDeletedbyinference(false)
74
				.setTrust(trust)
75
				.setInferenceprovenance(dedupConf != null ? dedupConf.getWf().getConfigurationId() : "")
76
				.setProvenanceaction(getProvenanceAction());
77

    
78
		if ((dedupConf != null) && dedupConf.getWf().isIncludeChildren()) {
79
			for (final DNGF dngf : Iterables.limit(entities, dedupConf.getWf().getMaxChildren())) {
80
				builder.getEntityBuilder().addChildren(dngf.getEntity());
81
			}
82
		}
83

    
84
		return builder;
85
	}
86

    
87
	private Qualifier.Builder getProvenanceAction() {
88
		return Qualifier.newBuilder().setClassid(DEDUP_CLASSID).setClassname(DEDUP_CLASSID).setSchemeid(DNET_PROVENANCE_SCHEME)
89
				.setSchemename(DNET_PROVENANCE_SCHEME);
90
	}
91

    
92
	public DNGF.Builder doMergeEntities(final DNGF.Builder builder) {
93

    
94
		for (final String field : DNGFUtils.getFieldNames(DNGFEntity.getDescriptor(), DNGFEntity.COLLECTEDFROM_FIELD_NUMBER)) {
95
			setKeyValues(builder.getEntityBuilder(), field);
96
		}
97
		for (final String field : DNGFUtils.getFieldNames(DNGFEntity.getDescriptor(), DNGFEntity.PID_FIELD_NUMBER)) {
98
			setStructuredProperty(builder.getEntityBuilder(), field);
99
		}
100
		for (final String field : DNGFUtils.getFieldNames(DNGFEntity.getDescriptor(), DNGFEntity.ORIGINALID_FIELD_NUMBER)) {
101
			setUniqueString(builder.getEntityBuilder(), field);
102
		}
103

    
104
		switch (builder.getEntity().getType()) {
105
		case datasource:
106
			break;
107
		case organization:
108
			break;
109
		case person:
110
			final Person.Builder person = builder.getEntityBuilder().getPersonBuilder().setAnchor(true);
111

    
112
			for (final String field : DNGFUtils.getFieldNames(Person.Metadata.getDescriptor(), Metadata.SECONDNAMES_FIELD_NUMBER)) {
113
				setSingleString(person.getMetadataBuilder(), field);
114
			}
115

    
116
			final Map<String, MergedPerson> mergedMap = Maps.newHashMap();
117
			for(MergedPerson merged : person.getMergedpersonList()) {
118
				mergedMap.put(merged.getId(), merged);
119
			}
120
			person.clearMergedperson().addAllMergedperson(mergedMap.values());
121

    
122
			final Map<String, CoAuthor> coAuthorMap = Maps.newHashMap();
123
			for(CoAuthor coAuthor : person.getCoauthorList()) {
124
				coAuthorMap.put(coAuthor.getId(), coAuthor);
125
			}
126
			person.clearCoauthor().addAllCoauthor(coAuthorMap.values());
127

    
128
			break;
129
		case project:
130
			break;
131
		case publication:
132
			final Publication.Metadata.Builder pub = builder.getEntityBuilder().getPublicationBuilder().getMetadataBuilder();
133
			setTitle(pub);
134

    
135
			// for (String field : Lists.newArrayList("subject", "relevantdate")) {
136
			for (final String field : DNGFUtils.getFieldNames(Publication.Metadata.getDescriptor(), Publication.Metadata.SUBJECT_FIELD_NUMBER,
137
					Publication.Metadata.RELEVANTDATE_FIELD_NUMBER)) {
138
				setStructuredProperty(pub, field);
139
			}
140
			for (final String field : DNGFUtils.getFieldNames(Publication.Metadata.getDescriptor(), Publication.Metadata.DESCRIPTION_FIELD_NUMBER)) {
141
				setLongestStringField(pub, field);
142
			}
143
			for (final String field : DNGFUtils.getFieldNames(Publication.Metadata.getDescriptor(), Publication.Metadata.SOURCE_FIELD_NUMBER)) {
144
				setUniqueStringField(pub, field);
145
			}
146

    
147
			// remove the inner authors, rely on the children
148
			builder.getEntityBuilder().getPublicationBuilder().clearAuthor();
149
			break;
150
		case dataset:
151
			final Dataset.Metadata.Builder dataset = builder.getEntityBuilder().getDatasetBuilder().getMetadataBuilder();
152
			setTitle(dataset);
153

    
154
			// for (String field : Lists.newArrayList("subject", "relevantdate")) {
155
			for (final String field : DNGFUtils.getFieldNames(Publication.Metadata.getDescriptor(), Publication.Metadata.SUBJECT_FIELD_NUMBER,
156
					Publication.Metadata.RELEVANTDATE_FIELD_NUMBER)) {
157
				setStructuredProperty(dataset, field);
158
			}
159
			for (final String field : DNGFUtils.getFieldNames(Publication.Metadata.getDescriptor(), Publication.Metadata.DESCRIPTION_FIELD_NUMBER)) {
160
				setLongestStringField(dataset, field);
161
			}
162
			for (final String field : DNGFUtils.getFieldNames(Publication.Metadata.getDescriptor(), Publication.Metadata.SOURCE_FIELD_NUMBER)) {
163
				setUniqueStringField(dataset, field);
164
			}
165

    
166
			// remove the inner authors, rely on the children
167
			builder.getEntityBuilder().getDatasetBuilder().clearAuthor();
168
			break;
169
		default:
170
			break;
171
		}
172
		return builder;
173
	}
174

    
175
	/**
176
	 * Helper method, avoid duplicated StructuredProperties in the given builder for the given fieldName
177
	 *
178
	 * @param builder
179
	 * @param fieldName
180
	 */
181
	@SuppressWarnings("unchecked")
182
	private void setStructuredProperty(final Builder builder, final String fieldName) {
183
		final Map<String, StructuredProperty> map = Maps.newHashMap();
184
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
185
		final List<StructuredProperty> sps = (List<StructuredProperty>) builder.getField(fd);
186

    
187
		if ((sps != null) && !sps.isEmpty()) {
188
			for (final StructuredProperty sp : sps) {
189
				map.put(sp.getValue(), sp);
190
			}
191

    
192
			if (!map.isEmpty()) {
193
				builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
194
			}
195
		}
196
	}
197

    
198
	/**
199
	 * Helper method, avoid duplicated KeyValues in the given builder for the given fieldName
200
	 *
201
	 * @param builder
202
	 * @param fieldName
203
	 */
204
	@SuppressWarnings("unchecked")
205
	private void setKeyValues(final Builder builder, final String fieldName) {
206
		final Map<String, KeyValue> map = Maps.newHashMap();
207
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
208
		final List<KeyValue> kvs = (List<KeyValue>) builder.getField(fd);
209

    
210
		if ((kvs != null) && !kvs.isEmpty()) {
211
			for (final KeyValue sp : kvs) {
212
				map.put(sp.getKey(), sp);
213
			}
214

    
215
			if (!map.isEmpty()) {
216
				builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
217
			}
218
		}
219
	}
220

    
221
	@SuppressWarnings("unchecked")
222
	private void setSingleString(final Builder builder, final String fieldName) {
223

    
224
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
225
		final List<StringField> field = (List<StringField>) builder.getField(fd);
226
		if ((field != null) && !field.isEmpty()) {
227
			final StringField s = (StringField) Iterables.getLast(Iterables.filter(field, skipEmptyStringField), "");
228

    
229
			if ((s != null) && (s.getValue() != null) && !s.getValue().isEmpty()) {
230
				builder.clearField(fd).setField(fd, Lists.newArrayList(s));
231
			}
232
		}
233
	}
234

    
235
	@SuppressWarnings("unchecked")
236
	private void setLongestStringField(final Builder builder, final String fieldName) {
237

    
238
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
239
		final List<StringField> field = (List<StringField>) builder.getField(fd);
240

    
241
		if ((field != null) && !field.isEmpty()) {
242
			final StringField.Builder max = StringField.newBuilder().setValue("");
243
			int maxLength = 0;
244
			for (final StringField sf : field) {
245
				if (sf.getValue().length() > maxLength) {
246
					maxLength = sf.getValue().length();
247
					max.clear();
248
					max.mergeFrom(sf);
249
				}
250
			}
251

    
252
			builder.clearField(fd).setField(fd, Lists.newArrayList(max.build()));
253
		}
254
	}
255

    
256
	@SuppressWarnings("unchecked")
257
	private void setUniqueStringField(final Builder builder, final String fieldName) {
258

    
259
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
260
		final List<StringField> field = (List<StringField>) builder.getField(fd);
261
		final Map<String, StringField> map = Maps.newHashMap();
262
		if ((field != null) && !field.isEmpty()) {
263
			for (final StringField s : Iterables.filter(field, skipEmptyStringField)) {
264
				map.put(s.getValue(), s);
265
			}
266

    
267
			builder.clearField(fd).setField(fd, Lists.newArrayList(map.values()));
268
		}
269
	}
270

    
271
	@SuppressWarnings("unchecked")
272
	private void setUniqueString(final Builder builder, final String fieldName) {
273

    
274
		final FieldDescriptor fd = builder.getDescriptorForType().findFieldByName(fieldName);
275
		final List<String> field = (List<String>) builder.getField(fd);
276
		final Set<String> set = Sets.newHashSet();
277
		if ((field != null) && !field.isEmpty()) {
278
			for (final String s : Iterables.filter(field, skipEmptyString)) {
279
				set.add(s);
280
			}
281

    
282
			builder.clearField(fd).setField(fd, Lists.newArrayList(set));
283
		}
284
	}
285

    
286
	private void setTitle(final Publication.Metadata.Builder metadata) {
287
		final Iterable<StructuredProperty> filtered = Iterables.filter(metadata.getTitleList(), DNGFUtils.mainTitleFilter());
288

    
289
		if (!Iterables.isEmpty(filtered)) {
290
			metadata.clearTitle().addTitle(Iterables.getLast(filtered));
291
		}
292
	}
293

    
294
	private void setTitle(final Dataset.Metadata.Builder metadata) {
295
		final Iterable<StructuredProperty> filtered = Iterables.filter(metadata.getTitleList(), DNGFUtils.mainTitleFilter());
296

    
297
		if (!Iterables.isEmpty(filtered)) {
298
			metadata.clearTitle().addTitle(Iterables.getLast(filtered));
299
		}
300
	}
301

    
302
}
(2-2/5)