Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.mapreduce.util;
2
3
import java.nio.ByteBuffer;
4
5 44467 claudio.at
import eu.dnetlib.data.mapreduce.Algorithms;
6
import eu.dnetlib.data.mapreduce.JobParams;
7 28058 claudio.at
import eu.dnetlib.data.proto.DedupProtos.Dedup;
8
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
9 28308 claudio.at
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
10 49029 claudio.at
import eu.dnetlib.data.proto.KindProtos.Kind;
11
import eu.dnetlib.data.proto.OafProtos.Oaf;
12 28308 claudio.at
import eu.dnetlib.data.proto.OafProtos.OafRel;
13
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
14
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
15
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
16 27941 claudio.at
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
17
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
18 28308 claudio.at
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
19 27941 claudio.at
import eu.dnetlib.data.proto.TypeProtos.Type;
20 49029 claudio.at
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
21 36670 claudio.at
import eu.dnetlib.pace.config.DedupConfig;
22 57767 michele.de
import org.apache.commons.lang3.StringUtils;
23 49029 claudio.at
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
24
import org.apache.hadoop.hbase.util.Bytes;
25 26600 sandro.lab
26 27941 claudio.at
public class DedupUtils {
27
28
	public static final String CF_SEPARATOR = "_";
29
30 26600 sandro.lab
	public static final String ROOT = "dedup_wf";
31
32 28308 claudio.at
	public static final String BODY_S = "body";
33
34
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
35
36
	public static String dedupPrefix(final String dedupRun) {
37 26600 sandro.lab
		return "|" + ROOT + "_" + dedupRun + "::";
38
	}
39 27941 claudio.at
40 28308 claudio.at
	public static String newId(final String id, final String dedupRun) {
41
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
42 27941 claudio.at
43 57767 michele.de
		return StringUtils.substringBefore(id,"|") + dedupPrefix(dedupRun) + "::" + AbstractDNetXsltFunctions.md5(id);
44 26600 sandro.lab
	}
45
46 28308 claudio.at
	public static byte[] newIdBytes(final String s, final String dedupRun) {
47 26600 sandro.lab
		return newId(s, dedupRun).getBytes();
48
	}
49
50 28308 claudio.at
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
51 26600 sandro.lab
		return newId(new String(b.array()), dedupRun).getBytes();
52
	}
53
54 28308 claudio.at
	public static boolean isRoot(final String s) {
55 26600 sandro.lab
		return s.contains(ROOT);
56
	}
57 27941 claudio.at
58 28308 claudio.at
	public static boolean isRoot(final ImmutableBytesWritable s) {
59 26600 sandro.lab
		return isRoot(s.copyBytes());
60
	}
61 27941 claudio.at
62 28308 claudio.at
	public static boolean isRoot(final byte[] s) {
63 26600 sandro.lab
		return isRoot(new String(s));
64
	}
65 27941 claudio.at
66 28308 claudio.at
	public static String getDedupCF_merges(final Type type) {
67 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
68 27941 claudio.at
	}
69
70 36164 claudio.at
	public static String getDedupCF_merges(final String type) {
71
		return getDedupCF_merges(Type.valueOf(type));
72
	}
73
74 28308 claudio.at
	public static byte[] getDedupCF_mergesBytes(final Type type) {
75 28058 claudio.at
		return Bytes.toBytes(getDedupCF_merges(type));
76 27941 claudio.at
	}
77
78 36164 claudio.at
	public static byte[] getDedupCF_mergesBytes(final String type) {
79
		return getDedupCF_mergesBytes(Type.valueOf(type));
80
	}
81
82 28308 claudio.at
	public static String getDedupCF_mergedIn(final Type type) {
83 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
84
	}
85
86 36164 claudio.at
	public static String getDedupCF_mergedIn(final String type) {
87
		return getDedupCF_mergedIn(Type.valueOf(type));
88
	}
89
90 28308 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
91 28058 claudio.at
		return Bytes.toBytes(getDedupCF_mergedIn(type));
92
	}
93
94 36164 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final String type) {
95
		return getDedupCF_mergedInBytes(Type.valueOf(type));
96
	}
97
98 28308 claudio.at
	public static String getSimilarityCF(final Type type) {
99 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
100 27941 claudio.at
	}
101
102 36164 claudio.at
	public static String getSimilarityCF(final String type) {
103
		return getSimilarityCF(Type.valueOf(type));
104
	}
105
106 28308 claudio.at
	public static byte[] getSimilarityCFBytes(final Type type) {
107 27941 claudio.at
		return Bytes.toBytes(getSimilarityCF(type));
108
	}
109
110 36164 claudio.at
	public static byte[] getSimilarityCFBytes(final String type) {
111
		return getSimilarityCFBytes(Type.valueOf(type));
112
	}
113
114 28308 claudio.at
	public static String getRelTypeString(final Type type) {
115 27941 claudio.at
		return getRelType(type).toString();
116
	}
117
118 28308 claudio.at
	public static RelType getRelType(final Type type) {
119 27941 claudio.at
		switch (type) {
120
		case organization:
121
			return RelType.organizationOrganization;
122
		case result:
123
			return RelType.resultResult;
124
		default:
125
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
126
		}
127
	}
128
129 28308 claudio.at
	public static ColumnFamily decodeCF(final byte[] b) {
130 36164 claudio.at
		final String[] s = new String(b).split(CF_SEPARATOR);
131 27941 claudio.at
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
132
	}
133
134 28308 claudio.at
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
135 27941 claudio.at
		return new ColumnFamily(relType, subRelType);
136
	}
137
138 28308 claudio.at
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
139 36670 claudio.at
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
140 44467 claudio.at
		final Builder oafRel = getRelBuilder(from, to, relClass.name(), DedupUtils.getRelType(type));
141 28411 claudio.at
		switch (type) {
142 28308 claudio.at
		case organization:
143
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
144 44467 claudio.at
					dedup(relClass.name(), "dnet:organization_organization_relations")));
145 28308 claudio.at
			break;
146
		case result:
147 44467 claudio.at
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass.name(), "dnet:result_result_relations")));
148 28308 claudio.at
			break;
149
		default:
150 36670 claudio.at
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
151 28308 claudio.at
		}
152
		return oafRel;
153
	}
154
155 44467 claudio.at
	public static OafRel.Builder getDedupSimilarity(final DedupConfig dedupConf, final String from, final String to) {
156
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
157
		final String isSimilarTo = DedupSimilarity.RelName.isSimilarTo.name();
158
		final Builder oafRel = getRelBuilder(from, to, isSimilarTo, DedupUtils.getRelType(type));
159
		switch (type) {
160
		case organization:
161
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedupSimilarity(
162
					dedupSimilarity(isSimilarTo, "dnet:organization_organization_relations")));
163
			break;
164
		case result:
165
			oafRel.setResultResult(ResultResult.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:result_result_relations")));
166
			break;
167
		default:
168
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
169
		}
170
		return oafRel;
171 28308 claudio.at
	}
172
173 44467 claudio.at
	private static Builder getRelBuilder(final String from, final String to, final String relClass, final RelType relType) {
174
		return OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass).setChild(false)
175
				.setSource(new String(from)).setTarget(new String(to));
176
	}
177
178
	private static Dedup.Builder dedup(final String relClass, final String scheme) {
179
		return Dedup.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
180
	}
181
182
	private static DedupSimilarity.Builder dedupSimilarity(final String relClass, final String scheme) {
183
		return DedupSimilarity.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
184
	}
185
186
	private static RelMetadata.Builder getRelMetadata(final String relClass, final String scheme) {
187
		return RelMetadata.newBuilder().setSemantics(
188
				Qualifier.newBuilder().setClassid(relClass).setClassname(relClass).setSchemeid(scheme).setSchemename(scheme));
189
	}
190
191
	public static Oaf.Builder buildRel(final DedupConfig dedupConf, final OafRel.Builder oafRel, final double trust) {
192
		final double sTrust = Algorithms.scale(trust, JobParams.MAX_DEDUP_TRUST);
193
		final Oaf.Builder oaf =
194
				Oaf.newBuilder()
195
						.setKind(Kind.relation)
196
						.setLastupdatetimestamp(System.currentTimeMillis())
197
						.setDataInfo(
198
								AbstractDNetXsltFunctions.getDataInfo(null, "", String.valueOf(sTrust), false, true).setInferenceprovenance(
199
										dedupConf.getWf().getConfigurationId())).setRel(oafRel);
200
		return oaf;
201
	}
202
203 27941 claudio.at
	class ColumnFamily {
204
205
		private final RelType relType;
206
		private final SubRelType subRelType;
207
208 28308 claudio.at
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
209 27941 claudio.at
			this.relType = relType;
210
			this.subRelType = subRelType;
211
		}
212
213
		@Override
214
		public String toString() {
215
			return getRelType() + CF_SEPARATOR + getSubRelType();
216
		}
217
218
		public RelType getRelType() {
219
			return relType;
220
		}
221
222
		public SubRelType getSubRelType() {
223
			return subRelType;
224
		}
225
226
	}
227
228 26600 sandro.lab
}