Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.mapreduce.util;
2
3
import java.nio.ByteBuffer;
4
5 44467 claudio.at
import eu.dnetlib.data.mapreduce.Algorithms;
6
import eu.dnetlib.data.mapreduce.JobParams;
7 28058 claudio.at
import eu.dnetlib.data.proto.DedupProtos.Dedup;
8
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
9 28308 claudio.at
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
10 49029 claudio.at
import eu.dnetlib.data.proto.KindProtos.Kind;
11
import eu.dnetlib.data.proto.OafProtos.Oaf;
12 28308 claudio.at
import eu.dnetlib.data.proto.OafProtos.OafRel;
13
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
14
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
15
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
16 27941 claudio.at
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
17
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
18 28308 claudio.at
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
19 27941 claudio.at
import eu.dnetlib.data.proto.TypeProtos.Type;
20 49029 claudio.at
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
21 36670 claudio.at
import eu.dnetlib.pace.config.DedupConfig;
22 49029 claudio.at
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
23
import org.apache.hadoop.hbase.util.Bytes;
24 26600 sandro.lab
25 27941 claudio.at
public class DedupUtils {
26
27
	public static final String CF_SEPARATOR = "_";
28
29 26600 sandro.lab
	public static final String ROOT = "dedup_wf";
30
31 28308 claudio.at
	public static final String BODY_S = "body";
32
33
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
34
35
	public static String dedupPrefix(final String dedupRun) {
36 26600 sandro.lab
		return "|" + ROOT + "_" + dedupRun + "::";
37
	}
38 27941 claudio.at
39 28308 claudio.at
	public static String newId(final String id, final String dedupRun) {
40
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
41 27941 claudio.at
42 26600 sandro.lab
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
43
	}
44
45 28308 claudio.at
	public static byte[] newIdBytes(final String s, final String dedupRun) {
46 26600 sandro.lab
		return newId(s, dedupRun).getBytes();
47
	}
48
49 28308 claudio.at
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
50 26600 sandro.lab
		return newId(new String(b.array()), dedupRun).getBytes();
51
	}
52
53 28308 claudio.at
	public static boolean isRoot(final String s) {
54 26600 sandro.lab
		return s.contains(ROOT);
55
	}
56 27941 claudio.at
57 28308 claudio.at
	public static boolean isRoot(final ImmutableBytesWritable s) {
58 26600 sandro.lab
		return isRoot(s.copyBytes());
59
	}
60 27941 claudio.at
61 28308 claudio.at
	public static boolean isRoot(final byte[] s) {
62 26600 sandro.lab
		return isRoot(new String(s));
63
	}
64 27941 claudio.at
65 28308 claudio.at
	public static String getDedupCF_merges(final Type type) {
66 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
67 27941 claudio.at
	}
68
69 36164 claudio.at
	public static String getDedupCF_merges(final String type) {
70
		return getDedupCF_merges(Type.valueOf(type));
71
	}
72
73 28308 claudio.at
	public static byte[] getDedupCF_mergesBytes(final Type type) {
74 28058 claudio.at
		return Bytes.toBytes(getDedupCF_merges(type));
75 27941 claudio.at
	}
76
77 36164 claudio.at
	public static byte[] getDedupCF_mergesBytes(final String type) {
78
		return getDedupCF_mergesBytes(Type.valueOf(type));
79
	}
80
81 28308 claudio.at
	public static String getDedupCF_mergedIn(final Type type) {
82 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
83
	}
84
85 36164 claudio.at
	public static String getDedupCF_mergedIn(final String type) {
86
		return getDedupCF_mergedIn(Type.valueOf(type));
87
	}
88
89 28308 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
90 28058 claudio.at
		return Bytes.toBytes(getDedupCF_mergedIn(type));
91
	}
92
93 36164 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final String type) {
94
		return getDedupCF_mergedInBytes(Type.valueOf(type));
95
	}
96
97 28308 claudio.at
	public static String getSimilarityCF(final Type type) {
98 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
99 27941 claudio.at
	}
100
101 36164 claudio.at
	public static String getSimilarityCF(final String type) {
102
		return getSimilarityCF(Type.valueOf(type));
103
	}
104
105 28308 claudio.at
	public static byte[] getSimilarityCFBytes(final Type type) {
106 27941 claudio.at
		return Bytes.toBytes(getSimilarityCF(type));
107
	}
108
109 36164 claudio.at
	public static byte[] getSimilarityCFBytes(final String type) {
110
		return getSimilarityCFBytes(Type.valueOf(type));
111
	}
112
113 28308 claudio.at
	public static String getRelTypeString(final Type type) {
114 27941 claudio.at
		return getRelType(type).toString();
115
	}
116
117 28308 claudio.at
	public static RelType getRelType(final Type type) {
118 27941 claudio.at
		switch (type) {
119
		case organization:
120
			return RelType.organizationOrganization;
121
		case result:
122
			return RelType.resultResult;
123
		default:
124
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
125
		}
126
	}
127
128 28308 claudio.at
	public static ColumnFamily decodeCF(final byte[] b) {
129 36164 claudio.at
		final String[] s = new String(b).split(CF_SEPARATOR);
130 27941 claudio.at
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
131
	}
132
133 28308 claudio.at
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
134 27941 claudio.at
		return new ColumnFamily(relType, subRelType);
135
	}
136
137 28308 claudio.at
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
138 36670 claudio.at
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
139 44467 claudio.at
		final Builder oafRel = getRelBuilder(from, to, relClass.name(), DedupUtils.getRelType(type));
140 28411 claudio.at
		switch (type) {
141 28308 claudio.at
		case organization:
142
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
143 44467 claudio.at
					dedup(relClass.name(), "dnet:organization_organization_relations")));
144 28308 claudio.at
			break;
145
		case result:
146 44467 claudio.at
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass.name(), "dnet:result_result_relations")));
147 28308 claudio.at
			break;
148
		default:
149 36670 claudio.at
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
150 28308 claudio.at
		}
151
		return oafRel;
152
	}
153
154 44467 claudio.at
	public static OafRel.Builder getDedupSimilarity(final DedupConfig dedupConf, final String from, final String to) {
155
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
156
		final String isSimilarTo = DedupSimilarity.RelName.isSimilarTo.name();
157
		final Builder oafRel = getRelBuilder(from, to, isSimilarTo, DedupUtils.getRelType(type));
158
		switch (type) {
159
		case organization:
160
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedupSimilarity(
161
					dedupSimilarity(isSimilarTo, "dnet:organization_organization_relations")));
162
			break;
163
		case result:
164
			oafRel.setResultResult(ResultResult.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:result_result_relations")));
165
			break;
166
		default:
167
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
168
		}
169
		return oafRel;
170 28308 claudio.at
	}
171
172 44467 claudio.at
	private static Builder getRelBuilder(final String from, final String to, final String relClass, final RelType relType) {
173
		return OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass).setChild(false)
174
				.setSource(new String(from)).setTarget(new String(to));
175
	}
176
177
	private static Dedup.Builder dedup(final String relClass, final String scheme) {
178
		return Dedup.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
179
	}
180
181
	private static DedupSimilarity.Builder dedupSimilarity(final String relClass, final String scheme) {
182
		return DedupSimilarity.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
183
	}
184
185
	private static RelMetadata.Builder getRelMetadata(final String relClass, final String scheme) {
186
		return RelMetadata.newBuilder().setSemantics(
187
				Qualifier.newBuilder().setClassid(relClass).setClassname(relClass).setSchemeid(scheme).setSchemename(scheme));
188
	}
189
190
	public static Oaf.Builder buildRel(final DedupConfig dedupConf, final OafRel.Builder oafRel, final double trust) {
191
		final double sTrust = Algorithms.scale(trust, JobParams.MAX_DEDUP_TRUST);
192
		final Oaf.Builder oaf =
193
				Oaf.newBuilder()
194
						.setKind(Kind.relation)
195
						.setLastupdatetimestamp(System.currentTimeMillis())
196
						.setDataInfo(
197
								AbstractDNetXsltFunctions.getDataInfo(null, "", String.valueOf(sTrust), false, true).setInferenceprovenance(
198
										dedupConf.getWf().getConfigurationId())).setRel(oafRel);
199
		return oaf;
200
	}
201
202 27941 claudio.at
	class ColumnFamily {
203
204
		private final RelType relType;
205
		private final SubRelType subRelType;
206
207 28308 claudio.at
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
208 27941 claudio.at
			this.relType = relType;
209
			this.subRelType = subRelType;
210
		}
211
212
		@Override
213
		public String toString() {
214
			return getRelType() + CF_SEPARATOR + getSubRelType();
215
		}
216
217
		public RelType getRelType() {
218
			return relType;
219
		}
220
221
		public SubRelType getSubRelType() {
222
			return subRelType;
223
		}
224
225
	}
226
227 26600 sandro.lab
}