Project

General

Profile

1
package eu.dnetlib.data.mapreduce.util;
2

    
3
import java.nio.ByteBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import eu.dnetlib.data.mapreduce.Algorithms;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.proto.DedupProtos.Dedup;
import eu.dnetlib.data.proto.DedupProtos.Dedup.RelName;
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
import eu.dnetlib.pace.config.DedupConfig;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
27

    
28
public class DedupUtils {
29

    
30
	public static final String CF_SEPARATOR = "_";
31

    
32
	public static final String ROOT = "dedup_wf";
33

    
34
	public static final String BODY_S = "body";
35

    
36
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
37

    
38
	public static String dedupPrefix(final String dedupRun) {
39
		return "|" + ROOT + "_" + dedupRun + "::";
40
	}
41

    
42
	public static String newId(final String id, final String dedupRun) {
43
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
44

    
45
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
46
	}
47

    
48
	public static byte[] newIdBytes(final String s, final String dedupRun) {
49
		return newId(s, dedupRun).getBytes();
50
	}
51

    
52
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
53
		return newId(new String(b.array()), dedupRun).getBytes();
54
	}
55

    
56
	public static boolean isRoot(final String s) {
57
		return s.contains(ROOT);
58
	}
59

    
60
	public static boolean isRoot(final ImmutableBytesWritable s) {
61
		return isRoot(s.copyBytes());
62
	}
63

    
64
	public static boolean isRoot(final byte[] s) {
65
		return isRoot(new String(s));
66
	}
67

    
68
	public static String getDedupCF_merges(final Type type) {
69
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
70
	}
71

    
72
	public static String getDedupCF_merges(final String type) {
73
		return getDedupCF_merges(Type.valueOf(type));
74
	}
75

    
76
	public static byte[] getDedupCF_mergesBytes(final Type type) {
77
		return Bytes.toBytes(getDedupCF_merges(type));
78
	}
79

    
80
	public static byte[] getDedupCF_mergesBytes(final String type) {
81
		return getDedupCF_mergesBytes(Type.valueOf(type));
82
	}
83

    
84
	public static String getDedupCF_mergedIn(final Type type) {
85
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
86
	}
87

    
88
	public static String getDedupCF_mergedIn(final String type) {
89
		return getDedupCF_mergedIn(Type.valueOf(type));
90
	}
91

    
92
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
93
		return Bytes.toBytes(getDedupCF_mergedIn(type));
94
	}
95

    
96
	public static byte[] getDedupCF_mergedInBytes(final String type) {
97
		return getDedupCF_mergedInBytes(Type.valueOf(type));
98
	}
99

    
100
	public static String getSimilarityCF(final Type type) {
101
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
102
	}
103

    
104
	public static String getSimilarityCF(final String type) {
105
		return getSimilarityCF(Type.valueOf(type));
106
	}
107

    
108
	public static byte[] getSimilarityCFBytes(final Type type) {
109
		return Bytes.toBytes(getSimilarityCF(type));
110
	}
111

    
112
	public static byte[] getSimilarityCFBytes(final String type) {
113
		return getSimilarityCFBytes(Type.valueOf(type));
114
	}
115

    
116
	public static String getRelTypeString(final Type type) {
117
		return getRelType(type).toString();
118
	}
119

    
120
	public static RelType getRelType(final Type type) {
121
		switch (type) {
122
		case organization:
123
			return RelType.organizationOrganization;
124
		case person:
125
			return RelType.personPerson;
126
		case result:
127
			return RelType.resultResult;
128
		default:
129
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
130
		}
131
	}
132

    
133
	public static ColumnFamily decodeCF(final byte[] b) {
134
		final String[] s = new String(b).split(CF_SEPARATOR);
135
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
136
	}
137

    
138
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
139
		return new ColumnFamily(relType, subRelType);
140
	}
141

    
142
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
143
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
144
		final Builder oafRel = getRelBuilder(from, to, relClass.name(), DedupUtils.getRelType(type));
145
		switch (type) {
146
		case organization:
147
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
148
					dedup(relClass.name(), "dnet:organization_organization_relations")));
149
			break;
150
		case person:
151
			oafRel.setPersonPerson(PersonPerson.newBuilder().setDedup(DedupUtils.dedup(relClass.name(), "dnet:person_person_relations")));
152
			break;
153
		case result:
154
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass.name(), "dnet:result_result_relations")));
155
			break;
156
		default:
157
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
158
		}
159
		return oafRel;
160
	}
161

    
162
	public static OafRel.Builder getDedupSimilarity(final DedupConfig dedupConf, final String from, final String to) {
163
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
164
		final String isSimilarTo = DedupSimilarity.RelName.isSimilarTo.name();
165
		final Builder oafRel = getRelBuilder(from, to, isSimilarTo, DedupUtils.getRelType(type));
166
		switch (type) {
167
		case organization:
168
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedupSimilarity(
169
					dedupSimilarity(isSimilarTo, "dnet:organization_organization_relations")));
170
			break;
171
		case person:
172
			oafRel.setPersonPerson(PersonPerson.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:person_person_relations")));
173
			break;
174
		case result:
175
			oafRel.setResultResult(ResultResult.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:result_result_relations")));
176
			break;
177
		default:
178
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
179
		}
180
		return oafRel;
181
	}
182

    
183
	private static Builder getRelBuilder(final String from, final String to, final String relClass, final RelType relType) {
184
		return OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass).setChild(false)
185
				.setSource(new String(from)).setTarget(new String(to));
186
	}
187

    
188
	private static Dedup.Builder dedup(final String relClass, final String scheme) {
189
		return Dedup.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
190
	}
191

    
192
	private static DedupSimilarity.Builder dedupSimilarity(final String relClass, final String scheme) {
193
		return DedupSimilarity.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
194
	}
195

    
196
	private static RelMetadata.Builder getRelMetadata(final String relClass, final String scheme) {
197
		return RelMetadata.newBuilder().setSemantics(
198
				Qualifier.newBuilder().setClassid(relClass).setClassname(relClass).setSchemeid(scheme).setSchemename(scheme));
199
	}
200

    
201
	public static Oaf.Builder buildRel(final DedupConfig dedupConf, final OafRel.Builder oafRel, final double trust) {
202
		final double sTrust = Algorithms.scale(trust, JobParams.MAX_DEDUP_TRUST);
203
		final Oaf.Builder oaf =
204
				Oaf.newBuilder()
205
						.setKind(Kind.relation)
206
						.setLastupdatetimestamp(System.currentTimeMillis())
207
						.setDataInfo(
208
								AbstractDNetXsltFunctions.getDataInfo(null, "", String.valueOf(sTrust), false, true).setInferenceprovenance(
209
										dedupConf.getWf().getConfigurationId())).setRel(oafRel);
210
		return oaf;
211
	}
212

    
213
	class ColumnFamily {
214

    
215
		private final RelType relType;
216
		private final SubRelType subRelType;
217

    
218
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
219
			this.relType = relType;
220
			this.subRelType = subRelType;
221
		}
222

    
223
		@Override
224
		public String toString() {
225
			return getRelType() + CF_SEPARATOR + getSubRelType();
226
		}
227

    
228
		public RelType getRelType() {
229
			return relType;
230
		}
231

    
232
		public SubRelType getSubRelType() {
233
			return subRelType;
234
		}
235

    
236
	}
237

    
238
}
(1-1/8)