1 |
26600
|
sandro.lab
|
package eu.dnetlib.data.mapreduce.util;
|
2 |
|
|
|
3 |
|
|
import java.nio.ByteBuffer;
|
4 |
|
|
|
5 |
44467
|
claudio.at
|
import eu.dnetlib.data.mapreduce.Algorithms;
|
6 |
|
|
import eu.dnetlib.data.mapreduce.JobParams;
|
7 |
28058
|
claudio.at
|
import eu.dnetlib.data.proto.DedupProtos.Dedup;
|
8 |
|
|
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
|
9 |
28308
|
claudio.at
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
10 |
49029
|
claudio.at
|
import eu.dnetlib.data.proto.KindProtos.Kind;
|
11 |
|
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
12 |
28308
|
claudio.at
|
import eu.dnetlib.data.proto.OafProtos.OafRel;
|
13 |
|
|
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
|
14 |
|
|
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
|
15 |
|
|
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
|
16 |
27941
|
claudio.at
|
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
|
17 |
|
|
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
|
18 |
28308
|
claudio.at
|
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
|
19 |
27941
|
claudio.at
|
import eu.dnetlib.data.proto.TypeProtos.Type;
|
20 |
49029
|
claudio.at
|
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
|
21 |
36670
|
claudio.at
|
import eu.dnetlib.pace.config.DedupConfig;
|
22 |
49029
|
claudio.at
|
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
|
23 |
|
|
import org.apache.hadoop.hbase.util.Bytes;
|
24 |
26600
|
sandro.lab
|
|
25 |
27941
|
claudio.at
|
public class DedupUtils {
|
26 |
|
|
|
27 |
|
|
/** Separator placed between the parts of the column family names built and parsed by this class. */
public static final String CF_SEPARATOR = "_";
|
28 |
|
|
|
29 |
26600
|
sandro.lab
|
/** Marker embedded in the prefix built by {@link #dedupPrefix(String)} and searched for by {@link #isRoot(String)}. */
public static final String ROOT = "dedup_wf";
|
30 |
|
|
|
31 |
28308
|
claudio.at
|
/** The name "body" as a String; {@link #BODY_B} holds the same name as bytes. */
public static final String BODY_S = "body";
|
32 |
|
|
|
33 |
|
|
/** {@link #BODY_S} converted to a byte array with {@code Bytes.toBytes}. */
public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
|
34 |
|
|
|
35 |
|
|
/**
 * Builds the identifier prefix for a dedup run.
 *
 * @param dedupRun
 *            the dedup run identifier, appended as-is (a null value is rendered as "null")
 * @return "|" followed by {@link #ROOT}, "_", the given run identifier and "::"
 */
public static String dedupPrefix(final String dedupRun) {
    final StringBuilder prefix = new StringBuilder("|");
    prefix.append(ROOT).append("_").append(dedupRun).append("::");
    return prefix.toString();
}
|
38 |
27941
|
claudio.at
|
|
39 |
28308
|
claudio.at
|
/**
 * Rewrites an identifier so that its prefix section is the one of the given dedup run.
 * <p>
 * The first match of the pattern {@code \|.*\:\:} in {@code id} (greedy, so it extends to the last "::" it can
 * reach) is replaced by {@link #dedupPrefix(String)}. An id without such a section is returned unchanged.
 *
 * @param id
 *            the identifier to rewrite
 * @param dedupRun
 *            the dedup run identifier, exactly 3 characters long
 * @return the rewritten identifier
 * @throws IllegalArgumentException
 *             if {@code dedupRun} is null or its length is not 3
 */
public static String newId(final String id, final String dedupRun) {
    if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");

    // The prefix is literal text: quote it so that a '$' or '\' inside dedupRun is not
    // interpreted by replaceFirst as a group reference or an escape.
    final String replacement = java.util.regex.Matcher.quoteReplacement(dedupPrefix(dedupRun));
    return id.replaceFirst("\\|.*\\:\\:", replacement);
}
|
44 |
|
|
|
45 |
28308
|
claudio.at
|
/**
 * Same as {@link #newId(String, String)}, returning the rewritten identifier as bytes.
 *
 * @param s
 *            the identifier to rewrite
 * @param dedupRun
 *            the dedup run identifier
 * @return the rewritten identifier encoded as UTF-8
 */
public static byte[] newIdBytes(final String s, final String dedupRun) {
    // Explicit charset: the no-arg getBytes() depends on the JVM default and may differ between nodes.
    return newId(s, dedupRun).getBytes(java.nio.charset.StandardCharsets.UTF_8);
}
|
48 |
|
|
|
49 |
28308
|
claudio.at
|
/**
 * Same as {@link #newId(String, String)}, reading the identifier from a buffer and returning it as bytes.
 * <p>
 * Only the bytes between the buffer's position and limit are read, decoded as UTF-8. The buffer itself is not
 * modified, and buffers without an accessible backing array (read-only, direct) are supported.
 *
 * @param b
 *            buffer whose remaining bytes hold the identifier
 * @param dedupRun
 *            the dedup run identifier
 * @return the rewritten identifier encoded as UTF-8
 */
public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
    // Work on a duplicate so the caller's position/limit stay untouched.
    final ByteBuffer view = b.duplicate();
    final byte[] raw = new byte[view.remaining()];
    view.get(raw);

    final String id = new String(raw, java.nio.charset.StandardCharsets.UTF_8);
    return newId(id, dedupRun).getBytes(java.nio.charset.StandardCharsets.UTF_8);
}
|
52 |
|
|
|
53 |
28308
|
claudio.at
|
/**
 * Tells whether the given identifier carries the {@link #ROOT} marker.
 *
 * @param s
 *            the identifier to inspect
 * @return true if {@link #ROOT} occurs anywhere in {@code s}
 */
public static boolean isRoot(final String s) {
    final int markerAt = s.indexOf(ROOT);
    return markerAt != -1;
}
|
56 |
27941
|
claudio.at
|
|
57 |
28308
|
claudio.at
|
/**
 * Variant of {@link #isRoot(byte[])} working on a copy of the bytes held by the given writable.
 *
 * @param s
 *            the writable holding the identifier bytes
 * @return the outcome of {@link #isRoot(byte[])} on {@code s.copyBytes()}
 */
public static boolean isRoot(final ImmutableBytesWritable s) {
    final byte[] idBytes = s.copyBytes();
    return isRoot(idBytes);
}
|
60 |
27941
|
claudio.at
|
|
61 |
28308
|
claudio.at
|
/**
 * Variant of {@link #isRoot(String)} for identifiers available as bytes.
 *
 * @param s
 *            the identifier bytes, decoded as UTF-8
 * @return the outcome of {@link #isRoot(String)} on the decoded identifier
 */
public static boolean isRoot(final byte[] s) {
    // Explicit charset: new String(byte[]) would use the JVM default, which is environment dependent.
    return isRoot(new String(s, java.nio.charset.StandardCharsets.UTF_8));
}
|
64 |
27941
|
claudio.at
|
|
65 |
28308
|
claudio.at
|
/**
 * Builds the name of the column family holding the "merges" dedup relations of an entity type.
 *
 * @param type
 *            the entity type
 * @return {@link #getRelType(Type)}, {@code SubRelType.dedup} and {@code Dedup.RelName.merges} joined by
 *         {@link #CF_SEPARATOR}
 */
public static String getDedupCF_merges(final Type type) {
    final StringBuilder cf = new StringBuilder();
    cf.append(getRelType(type));
    cf.append(CF_SEPARATOR).append(SubRelType.dedup);
    cf.append(CF_SEPARATOR).append(Dedup.RelName.merges);
    return cf.toString();
}
|
68 |
|
|
|
69 |
36164
|
claudio.at
|
/**
 * String-typed variant of {@link #getDedupCF_merges(Type)}.
 *
 * @param type
 *            name of the entity type, resolved with {@code Type.valueOf}
 * @return the column family name for the resolved type
 */
public static String getDedupCF_merges(final String type) {
    final Type entityType = Type.valueOf(type);
    return getDedupCF_merges(entityType);
}
|
72 |
|
|
|
73 |
28308
|
claudio.at
|
/**
 * Byte form of {@link #getDedupCF_merges(Type)}.
 *
 * @param type
 *            the entity type
 * @return the column family name converted with {@code Bytes.toBytes}
 */
public static byte[] getDedupCF_mergesBytes(final Type type) {
    final String cf = getDedupCF_merges(type);
    return Bytes.toBytes(cf);
}
|
76 |
|
|
|
77 |
36164
|
claudio.at
|
/**
 * String-typed variant of {@link #getDedupCF_mergesBytes(Type)}.
 *
 * @param type
 *            name of the entity type, resolved with {@code Type.valueOf}
 * @return the column family name, as bytes, for the resolved type
 */
public static byte[] getDedupCF_mergesBytes(final String type) {
    final Type entityType = Type.valueOf(type);
    return getDedupCF_mergesBytes(entityType);
}
|
80 |
|
|
|
81 |
28308
|
claudio.at
|
/**
 * Builds the name of the column family holding the "isMergedIn" dedup relations of an entity type.
 *
 * @param type
 *            the entity type
 * @return {@link #getRelType(Type)}, {@code SubRelType.dedup} and {@code Dedup.RelName.isMergedIn} joined by
 *         {@link #CF_SEPARATOR}
 */
public static String getDedupCF_mergedIn(final Type type) {
    final StringBuilder cf = new StringBuilder();
    cf.append(getRelType(type));
    cf.append(CF_SEPARATOR).append(SubRelType.dedup);
    cf.append(CF_SEPARATOR).append(Dedup.RelName.isMergedIn);
    return cf.toString();
}
|
84 |
|
|
|
85 |
36164
|
claudio.at
|
/**
 * String-typed variant of {@link #getDedupCF_mergedIn(Type)}.
 *
 * @param type
 *            name of the entity type, resolved with {@code Type.valueOf}
 * @return the column family name for the resolved type
 */
public static String getDedupCF_mergedIn(final String type) {
    final Type entityType = Type.valueOf(type);
    return getDedupCF_mergedIn(entityType);
}
|
88 |
|
|
|
89 |
28308
|
claudio.at
|
/**
 * Byte form of {@link #getDedupCF_mergedIn(Type)}.
 *
 * @param type
 *            the entity type
 * @return the column family name converted with {@code Bytes.toBytes}
 */
public static byte[] getDedupCF_mergedInBytes(final Type type) {
    final String cf = getDedupCF_mergedIn(type);
    return Bytes.toBytes(cf);
}
|
92 |
|
|
|
93 |
36164
|
claudio.at
|
/**
 * String-typed variant of {@link #getDedupCF_mergedInBytes(Type)}.
 *
 * @param type
 *            name of the entity type, resolved with {@code Type.valueOf}
 * @return the column family name, as bytes, for the resolved type
 */
public static byte[] getDedupCF_mergedInBytes(final String type) {
    final Type entityType = Type.valueOf(type);
    return getDedupCF_mergedInBytes(entityType);
}
|
96 |
|
|
|
97 |
28308
|
claudio.at
|
/**
 * Builds the name of the column family holding the similarity relations of an entity type.
 *
 * @param type
 *            the entity type
 * @return {@link #getRelType(Type)}, {@code SubRelType.dedupSimilarity} and
 *         {@code DedupSimilarity.RelName.isSimilarTo} joined by {@link #CF_SEPARATOR}
 */
public static String getSimilarityCF(final Type type) {
    final StringBuilder cf = new StringBuilder();
    cf.append(getRelType(type));
    cf.append(CF_SEPARATOR).append(SubRelType.dedupSimilarity);
    cf.append(CF_SEPARATOR).append(DedupSimilarity.RelName.isSimilarTo);
    return cf.toString();
}
|
100 |
|
|
|
101 |
36164
|
claudio.at
|
/**
 * String-typed variant of {@link #getSimilarityCF(Type)}.
 *
 * @param type
 *            name of the entity type, resolved with {@code Type.valueOf}
 * @return the column family name for the resolved type
 */
public static String getSimilarityCF(final String type) {
    final Type entityType = Type.valueOf(type);
    return getSimilarityCF(entityType);
}
|
104 |
|
|
|
105 |
28308
|
claudio.at
|
/**
 * Byte form of {@link #getSimilarityCF(Type)}.
 *
 * @param type
 *            the entity type
 * @return the column family name converted with {@code Bytes.toBytes}
 */
public static byte[] getSimilarityCFBytes(final Type type) {
    final String cf = getSimilarityCF(type);
    return Bytes.toBytes(cf);
}
|
108 |
|
|
|
109 |
36164
|
claudio.at
|
/**
 * String-typed variant of {@link #getSimilarityCFBytes(Type)}.
 *
 * @param type
 *            name of the entity type, resolved with {@code Type.valueOf}
 * @return the column family name, as bytes, for the resolved type
 */
public static byte[] getSimilarityCFBytes(final String type) {
    final Type entityType = Type.valueOf(type);
    return getSimilarityCFBytes(entityType);
}
|
112 |
|
|
|
113 |
28308
|
claudio.at
|
/**
 * String form of {@link #getRelType(Type)}.
 *
 * @param type
 *            the entity type
 * @return {@code toString()} of the relation type associated to {@code type}
 */
public static String getRelTypeString(final Type type) {
    final RelType relType = getRelType(type);
    return relType.toString();
}
|
116 |
|
|
|
117 |
28308
|
claudio.at
|
/**
 * Maps an entity type to the relation type used for its deduplication relations.
 *
 * @param type
 *            the entity type; only {@code organization} and {@code result} are supported
 * @return {@code RelType.organizationOrganization} or {@code RelType.resultResult}
 * @throws IllegalArgumentException
 *             for any other entity type
 */
public static RelType getRelType(final Type type) {
    final RelType relType;
    switch (type) {
    case organization:
        relType = RelType.organizationOrganization;
        break;
    case result:
        relType = RelType.resultResult;
        break;
    default:
        throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
    }
    return relType;
}
|
127 |
|
|
|
128 |
28308
|
claudio.at
|
/**
 * Parses a column family name into its relation type and sub relation type.
 * <p>
 * The bytes are decoded as UTF-8 and split on {@link #CF_SEPARATOR}; the first part is resolved with
 * {@code RelType.valueOf}, the second with {@code SubRelType.valueOf}. Any further part is ignored.
 *
 * @param b
 *            the column family name, as bytes
 * @return the decoded {@link ColumnFamily}
 * @throws IllegalArgumentException
 *             if the name has fewer than two parts
 */
public static ColumnFamily decodeCF(final byte[] b) {
    final String name = new String(b, java.nio.charset.StandardCharsets.UTF_8);
    final String[] s = name.split(CF_SEPARATOR);

    // Fail with a message naming the offending input instead of a bare ArrayIndexOutOfBoundsException.
    if (s.length < 2) throw new IllegalArgumentException("Invalid column family name: " + name);

    // ColumnFamily is a non-static inner class, hence the DedupUtils instance.
    return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
}
|
132 |
|
|
|
133 |
28308
|
claudio.at
|
/**
 * Creates a {@link ColumnFamily} bound to this instance.
 *
 * @param relType
 *            the relation type
 * @param subRelType
 *            the sub relation type
 * @return a new ColumnFamily holding the two values
 */
private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
    final ColumnFamily cf = new ColumnFamily(relType, subRelType);
    return cf;
}
|
136 |
|
|
|
137 |
28308
|
claudio.at
|
/**
 * Prepares a dedup relation between two identifiers.
 * <p>
 * The entity type is read from {@code dedupConf.getWf().getEntityType()}; the relation is created by
 * {@code getRelBuilder} and then completed with the type-specific payload (organization or result).
 *
 * @param dedupConf
 *            the dedup configuration providing the entity type
 * @param from
 *            the relation source
 * @param to
 *            the relation target
 * @param relClass
 *            the dedup relation name, used as relation class
 * @return the relation builder
 * @throws IllegalArgumentException
 *             if the entity type is neither organization nor result
 */
public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
    final Type entityType = Type.valueOf(dedupConf.getWf().getEntityType());
    final String relName = relClass.name();
    final Builder rel = getRelBuilder(from, to, relName, DedupUtils.getRelType(entityType));

    switch (entityType) {
    case organization: {
        final Dedup.Builder payload = dedup(relName, "dnet:organization_organization_relations");
        rel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(payload));
        break;
    }
    case result: {
        final Dedup.Builder payload = DedupUtils.dedup(relName, "dnet:result_result_relations");
        rel.setResultResult(ResultResult.newBuilder().setDedup(payload));
        break;
    }
    default:
        throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
    }
    return rel;
}
|
153 |
|
|
|
154 |
44467
|
claudio.at
|
/**
 * Prepares an "isSimilarTo" relation between two identifiers.
 * <p>
 * The entity type is read from {@code dedupConf.getWf().getEntityType()}. The relation is created by
 * {@code getRelBuilder}, its sub relation type is set to {@code SubRelType.dedupSimilarity} (the same value
 * {@link #getSimilarityCF(Type)} uses to name the similarity column family), and the type-specific payload
 * (organization or result) is attached.
 *
 * @param dedupConf
 *            the dedup configuration providing the entity type
 * @param from
 *            the relation source
 * @param to
 *            the relation target
 * @return the relation builder
 * @throws IllegalArgumentException
 *             if the entity type is neither organization nor result
 */
public static OafRel.Builder getDedupSimilarity(final DedupConfig dedupConf, final String from, final String to) {
    final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
    final String isSimilarTo = DedupSimilarity.RelName.isSimilarTo.name();
    final Builder oafRel = getRelBuilder(from, to, isSimilarTo, DedupUtils.getRelType(type));

    // getRelBuilder always sets SubRelType.dedup; a similarity relation must carry dedupSimilarity,
    // otherwise it disagrees with the column family name produced by getSimilarityCF.
    oafRel.setSubRelType(SubRelType.dedupSimilarity);

    switch (type) {
    case organization:
        oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedupSimilarity(
                dedupSimilarity(isSimilarTo, "dnet:organization_organization_relations")));
        break;
    case result:
        oafRel.setResultResult(ResultResult.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:result_result_relations")));
        break;
    default:
        throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
    }
    return oafRel;
}
|
171 |
|
|
|
172 |
44467
|
claudio.at
|
/**
 * Creates a relation builder with the fields shared by the relations built in this class.
 * <p>
 * The sub relation type is always {@code SubRelType.dedup} and the child flag is always false.
 *
 * @param from
 *            the relation source (copied into a new String)
 * @param to
 *            the relation target (copied into a new String)
 * @param relClass
 *            the relation class
 * @param relType
 *            the relation type
 * @return the populated builder
 */
private static Builder getRelBuilder(final String from, final String to, final String relClass, final RelType relType) {
    final Builder rel = OafRel.newBuilder();
    rel.setRelType(relType);
    rel.setSubRelType(SubRelType.dedup);
    rel.setRelClass(relClass);
    rel.setChild(false);
    rel.setSource(new String(from));
    rel.setTarget(new String(to));
    return rel;
}
|
176 |
|
|
|
177 |
|
|
/**
 * Creates a Dedup payload whose metadata come from {@code getRelMetadata(relClass, scheme)}.
 *
 * @param relClass
 *            the relation class
 * @param scheme
 *            the scheme of the relation class
 * @return the Dedup builder
 */
private static Dedup.Builder dedup(final String relClass, final String scheme) {
    final RelMetadata.Builder metadata = getRelMetadata(relClass, scheme);
    return Dedup.newBuilder().setRelMetadata(metadata);
}
|
180 |
|
|
|
181 |
|
|
/**
 * Creates a DedupSimilarity payload whose metadata come from {@code getRelMetadata(relClass, scheme)}.
 *
 * @param relClass
 *            the relation class
 * @param scheme
 *            the scheme of the relation class
 * @return the DedupSimilarity builder
 */
private static DedupSimilarity.Builder dedupSimilarity(final String relClass, final String scheme) {
    final RelMetadata.Builder metadata = getRelMetadata(relClass, scheme);
    return DedupSimilarity.newBuilder().setRelMetadata(metadata);
}
|
184 |
|
|
|
185 |
|
|
/**
 * Creates relation metadata whose semantics qualifier uses {@code relClass} as both class id and class name,
 * and {@code scheme} as both scheme id and scheme name.
 *
 * @param relClass
 *            the relation class
 * @param scheme
 *            the scheme of the relation class
 * @return the RelMetadata builder
 */
private static RelMetadata.Builder getRelMetadata(final String relClass, final String scheme) {
    final Qualifier.Builder semantics = Qualifier.newBuilder();
    semantics.setClassid(relClass);
    semantics.setClassname(relClass);
    semantics.setSchemeid(scheme);
    semantics.setSchemename(scheme);
    return RelMetadata.newBuilder().setSemantics(semantics);
}
|
189 |
|
|
|
190 |
|
|
public static Oaf.Builder buildRel(final DedupConfig dedupConf, final OafRel.Builder oafRel, final double trust) {
|
191 |
|
|
final double sTrust = Algorithms.scale(trust, JobParams.MAX_DEDUP_TRUST);
|
192 |
|
|
final Oaf.Builder oaf =
|
193 |
|
|
Oaf.newBuilder()
|
194 |
|
|
.setKind(Kind.relation)
|
195 |
|
|
.setLastupdatetimestamp(System.currentTimeMillis())
|
196 |
|
|
.setDataInfo(
|
197 |
|
|
AbstractDNetXsltFunctions.getDataInfo(null, "", String.valueOf(sTrust), false, true).setInferenceprovenance(
|
198 |
|
|
dedupConf.getWf().getConfigurationId())).setRel(oafRel);
|
199 |
|
|
return oaf;
|
200 |
|
|
}
|
201 |
|
|
|
202 |
27941
|
claudio.at
|
class ColumnFamily {
|
203 |
|
|
|
204 |
|
|
private final RelType relType;
|
205 |
|
|
private final SubRelType subRelType;
|
206 |
|
|
|
207 |
28308
|
claudio.at
|
public ColumnFamily(final RelType relType, final SubRelType subRelType) {
|
208 |
27941
|
claudio.at
|
this.relType = relType;
|
209 |
|
|
this.subRelType = subRelType;
|
210 |
|
|
}
|
211 |
|
|
|
212 |
|
|
@Override
|
213 |
|
|
public String toString() {
|
214 |
|
|
return getRelType() + CF_SEPARATOR + getSubRelType();
|
215 |
|
|
}
|
216 |
|
|
|
217 |
|
|
public RelType getRelType() {
|
218 |
|
|
return relType;
|
219 |
|
|
}
|
220 |
|
|
|
221 |
|
|
public SubRelType getSubRelType() {
|
222 |
|
|
return subRelType;
|
223 |
|
|
}
|
224 |
|
|
|
225 |
|
|
}
|
226 |
|
|
|
227 |
26600
|
sandro.lab
|
}
|