1 |
26600
|
sandro.lab
|
package eu.dnetlib.data.mapreduce.util;
|
2 |
|
|
|
3 |
|
|
import java.nio.ByteBuffer;
|
4 |
|
|
|
5 |
|
|
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
|
6 |
27941
|
claudio.at
|
import org.apache.hadoop.hbase.util.Bytes;
|
7 |
26600
|
sandro.lab
|
|
8 |
28058
|
claudio.at
|
import eu.dnetlib.data.proto.DedupProtos.Dedup;
|
9 |
|
|
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
|
10 |
28308
|
claudio.at
|
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
11 |
|
|
import eu.dnetlib.data.proto.OafProtos.OafRel;
|
12 |
|
|
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
|
13 |
|
|
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
|
14 |
|
|
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
|
15 |
|
|
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
|
16 |
27941
|
claudio.at
|
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
|
17 |
|
|
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
|
18 |
28308
|
claudio.at
|
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
|
19 |
27941
|
claudio.at
|
import eu.dnetlib.data.proto.TypeProtos.Type;
|
20 |
36670
|
claudio.at
|
import eu.dnetlib.pace.config.DedupConfig;
|
21 |
26600
|
sandro.lab
|
|
22 |
27941
|
claudio.at
|
public class DedupUtils {
|
23 |
|
|
|
24 |
|
|
public static final String CF_SEPARATOR = "_";
|
25 |
|
|
|
26 |
26600
|
sandro.lab
|
public static final String ROOT = "dedup_wf";
|
27 |
|
|
|
28 |
28308
|
claudio.at
|
public static final String BODY_S = "body";
|
29 |
|
|
|
30 |
|
|
public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
|
31 |
|
|
|
32 |
|
|
public static String dedupPrefix(final String dedupRun) {
|
33 |
26600
|
sandro.lab
|
return "|" + ROOT + "_" + dedupRun + "::";
|
34 |
|
|
}
|
35 |
27941
|
claudio.at
|
|
36 |
28308
|
claudio.at
|
public static String newId(final String id, final String dedupRun) {
|
37 |
|
|
if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
|
38 |
27941
|
claudio.at
|
|
39 |
26600
|
sandro.lab
|
return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
|
40 |
|
|
}
|
41 |
|
|
|
42 |
28308
|
claudio.at
|
public static byte[] newIdBytes(final String s, final String dedupRun) {
|
43 |
26600
|
sandro.lab
|
return newId(s, dedupRun).getBytes();
|
44 |
|
|
}
|
45 |
|
|
|
46 |
28308
|
claudio.at
|
public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
|
47 |
26600
|
sandro.lab
|
return newId(new String(b.array()), dedupRun).getBytes();
|
48 |
|
|
}
|
49 |
|
|
|
50 |
28308
|
claudio.at
|
public static boolean isRoot(final String s) {
|
51 |
26600
|
sandro.lab
|
return s.contains(ROOT);
|
52 |
|
|
}
|
53 |
27941
|
claudio.at
|
|
54 |
28308
|
claudio.at
|
public static boolean isRoot(final ImmutableBytesWritable s) {
|
55 |
26600
|
sandro.lab
|
return isRoot(s.copyBytes());
|
56 |
|
|
}
|
57 |
27941
|
claudio.at
|
|
58 |
28308
|
claudio.at
|
public static boolean isRoot(final byte[] s) {
|
59 |
26600
|
sandro.lab
|
return isRoot(new String(s));
|
60 |
|
|
}
|
61 |
27941
|
claudio.at
|
|
62 |
28308
|
claudio.at
|
public static String getDedupCF_merges(final Type type) {
|
63 |
28058
|
claudio.at
|
return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
|
64 |
27941
|
claudio.at
|
}
|
65 |
|
|
|
66 |
36164
|
claudio.at
|
public static String getDedupCF_merges(final String type) {
|
67 |
|
|
return getDedupCF_merges(Type.valueOf(type));
|
68 |
|
|
}
|
69 |
|
|
|
70 |
28308
|
claudio.at
|
public static byte[] getDedupCF_mergesBytes(final Type type) {
|
71 |
28058
|
claudio.at
|
return Bytes.toBytes(getDedupCF_merges(type));
|
72 |
27941
|
claudio.at
|
}
|
73 |
|
|
|
74 |
36164
|
claudio.at
|
public static byte[] getDedupCF_mergesBytes(final String type) {
|
75 |
|
|
return getDedupCF_mergesBytes(Type.valueOf(type));
|
76 |
|
|
}
|
77 |
|
|
|
78 |
28308
|
claudio.at
|
public static String getDedupCF_mergedIn(final Type type) {
|
79 |
28058
|
claudio.at
|
return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
|
80 |
|
|
}
|
81 |
|
|
|
82 |
36164
|
claudio.at
|
public static String getDedupCF_mergedIn(final String type) {
|
83 |
|
|
return getDedupCF_mergedIn(Type.valueOf(type));
|
84 |
|
|
}
|
85 |
|
|
|
86 |
28308
|
claudio.at
|
public static byte[] getDedupCF_mergedInBytes(final Type type) {
|
87 |
28058
|
claudio.at
|
return Bytes.toBytes(getDedupCF_mergedIn(type));
|
88 |
|
|
}
|
89 |
|
|
|
90 |
36164
|
claudio.at
|
public static byte[] getDedupCF_mergedInBytes(final String type) {
|
91 |
|
|
return getDedupCF_mergedInBytes(Type.valueOf(type));
|
92 |
|
|
}
|
93 |
|
|
|
94 |
28308
|
claudio.at
|
public static String getSimilarityCF(final Type type) {
|
95 |
28058
|
claudio.at
|
return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
|
96 |
27941
|
claudio.at
|
}
|
97 |
|
|
|
98 |
36164
|
claudio.at
|
public static String getSimilarityCF(final String type) {
|
99 |
|
|
return getSimilarityCF(Type.valueOf(type));
|
100 |
|
|
}
|
101 |
|
|
|
102 |
28308
|
claudio.at
|
public static byte[] getSimilarityCFBytes(final Type type) {
|
103 |
27941
|
claudio.at
|
return Bytes.toBytes(getSimilarityCF(type));
|
104 |
|
|
}
|
105 |
|
|
|
106 |
36164
|
claudio.at
|
public static byte[] getSimilarityCFBytes(final String type) {
|
107 |
|
|
return getSimilarityCFBytes(Type.valueOf(type));
|
108 |
|
|
}
|
109 |
|
|
|
110 |
28308
|
claudio.at
|
public static String getRelTypeString(final Type type) {
|
111 |
27941
|
claudio.at
|
return getRelType(type).toString();
|
112 |
|
|
}
|
113 |
|
|
|
114 |
28308
|
claudio.at
|
public static RelType getRelType(final Type type) {
|
115 |
27941
|
claudio.at
|
switch (type) {
|
116 |
|
|
case organization:
|
117 |
|
|
return RelType.organizationOrganization;
|
118 |
|
|
case person:
|
119 |
|
|
return RelType.personPerson;
|
120 |
|
|
case result:
|
121 |
|
|
return RelType.resultResult;
|
122 |
|
|
default:
|
123 |
|
|
throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
|
124 |
|
|
}
|
125 |
|
|
}
|
126 |
|
|
|
127 |
28308
|
claudio.at
|
public static ColumnFamily decodeCF(final byte[] b) {
|
128 |
36164
|
claudio.at
|
final String[] s = new String(b).split(CF_SEPARATOR);
|
129 |
27941
|
claudio.at
|
return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
|
130 |
|
|
}
|
131 |
|
|
|
132 |
28308
|
claudio.at
|
private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
|
133 |
27941
|
claudio.at
|
return new ColumnFamily(relType, subRelType);
|
134 |
|
|
}
|
135 |
|
|
|
136 |
28308
|
claudio.at
|
public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
|
137 |
36670
|
claudio.at
|
final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
|
138 |
36164
|
claudio.at
|
final RelType relType = DedupUtils.getRelType(type);
|
139 |
|
|
final Builder oafRel =
|
140 |
28308
|
claudio.at
|
OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass.toString()).setChild(false)
|
141 |
36670
|
claudio.at
|
.setSource(new String(from)).setTarget(new String(to));
|
142 |
28411
|
claudio.at
|
switch (type) {
|
143 |
28308
|
claudio.at
|
case organization:
|
144 |
|
|
oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
|
145 |
|
|
DedupUtils.dedup(relClass, "dnet:organization_organization_relations")));
|
146 |
|
|
break;
|
147 |
|
|
case person:
|
148 |
|
|
oafRel.setPersonPerson(PersonPerson.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:person_person_relations")));
|
149 |
|
|
break;
|
150 |
|
|
case result:
|
151 |
|
|
oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:result_result_relations")));
|
152 |
|
|
break;
|
153 |
|
|
default:
|
154 |
36670
|
claudio.at
|
throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
|
155 |
28308
|
claudio.at
|
}
|
156 |
|
|
return oafRel;
|
157 |
|
|
}
|
158 |
|
|
|
159 |
|
|
private static Dedup.Builder dedup(final Dedup.RelName relClass, final String scheme) {
|
160 |
|
|
return Dedup.newBuilder().setRelMetadata(
|
161 |
|
|
RelMetadata.newBuilder().setSemantics(
|
162 |
|
|
Qualifier.newBuilder().setClassid(relClass.toString()).setClassname(relClass.toString()).setSchemeid(scheme).setSchemename(scheme)));
|
163 |
|
|
}
|
164 |
|
|
|
165 |
27941
|
claudio.at
|
class ColumnFamily {
|
166 |
|
|
|
167 |
|
|
private final RelType relType;
|
168 |
|
|
private final SubRelType subRelType;
|
169 |
|
|
|
170 |
28308
|
claudio.at
|
public ColumnFamily(final RelType relType, final SubRelType subRelType) {
|
171 |
27941
|
claudio.at
|
this.relType = relType;
|
172 |
|
|
this.subRelType = subRelType;
|
173 |
|
|
}
|
174 |
|
|
|
175 |
|
|
@Override
|
176 |
|
|
public String toString() {
|
177 |
|
|
return getRelType() + CF_SEPARATOR + getSubRelType();
|
178 |
|
|
}
|
179 |
|
|
|
180 |
|
|
public RelType getRelType() {
|
181 |
|
|
return relType;
|
182 |
|
|
}
|
183 |
|
|
|
184 |
|
|
public SubRelType getSubRelType() {
|
185 |
|
|
return subRelType;
|
186 |
|
|
}
|
187 |
|
|
|
188 |
|
|
}
|
189 |
|
|
|
190 |
26600
|
sandro.lab
|
}
|