Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.mapreduce.util;
2
3
import java.nio.ByteBuffer;
4
5
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
6 27941 claudio.at
import org.apache.hadoop.hbase.util.Bytes;
7 26600 sandro.lab
8 28058 claudio.at
import eu.dnetlib.data.proto.DedupProtos.Dedup;
9
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
10 28308 claudio.at
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
11
import eu.dnetlib.data.proto.OafProtos.OafRel;
12
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
13
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
14
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
15
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
16 27941 claudio.at
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
17
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
18 28308 claudio.at
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
19 27941 claudio.at
import eu.dnetlib.data.proto.TypeProtos.Type;
20 36670 claudio.at
import eu.dnetlib.pace.config.DedupConfig;
21 26600 sandro.lab
22 27941 claudio.at
public class DedupUtils {
23
24
	public static final String CF_SEPARATOR = "_";
25
26 26600 sandro.lab
	public static final String ROOT = "dedup_wf";
27
28 28308 claudio.at
	public static final String BODY_S = "body";
29
30
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
31
32
	public static String dedupPrefix(final String dedupRun) {
33 26600 sandro.lab
		return "|" + ROOT + "_" + dedupRun + "::";
34
	}
35 27941 claudio.at
36 28308 claudio.at
	public static String newId(final String id, final String dedupRun) {
37
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
38 27941 claudio.at
39 26600 sandro.lab
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
40
	}
41
42 28308 claudio.at
	public static byte[] newIdBytes(final String s, final String dedupRun) {
43 26600 sandro.lab
		return newId(s, dedupRun).getBytes();
44
	}
45
46 28308 claudio.at
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
47 26600 sandro.lab
		return newId(new String(b.array()), dedupRun).getBytes();
48
	}
49
50 28308 claudio.at
	public static boolean isRoot(final String s) {
51 26600 sandro.lab
		return s.contains(ROOT);
52
	}
53 27941 claudio.at
54 28308 claudio.at
	public static boolean isRoot(final ImmutableBytesWritable s) {
55 26600 sandro.lab
		return isRoot(s.copyBytes());
56
	}
57 27941 claudio.at
58 28308 claudio.at
	public static boolean isRoot(final byte[] s) {
59 26600 sandro.lab
		return isRoot(new String(s));
60
	}
61 27941 claudio.at
62 28308 claudio.at
	public static String getDedupCF_merges(final Type type) {
63 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
64 27941 claudio.at
	}
65
66 36164 claudio.at
	public static String getDedupCF_merges(final String type) {
67
		return getDedupCF_merges(Type.valueOf(type));
68
	}
69
70 28308 claudio.at
	public static byte[] getDedupCF_mergesBytes(final Type type) {
71 28058 claudio.at
		return Bytes.toBytes(getDedupCF_merges(type));
72 27941 claudio.at
	}
73
74 36164 claudio.at
	public static byte[] getDedupCF_mergesBytes(final String type) {
75
		return getDedupCF_mergesBytes(Type.valueOf(type));
76
	}
77
78 28308 claudio.at
	public static String getDedupCF_mergedIn(final Type type) {
79 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
80
	}
81
82 36164 claudio.at
	public static String getDedupCF_mergedIn(final String type) {
83
		return getDedupCF_mergedIn(Type.valueOf(type));
84
	}
85
86 28308 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
87 28058 claudio.at
		return Bytes.toBytes(getDedupCF_mergedIn(type));
88
	}
89
90 36164 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final String type) {
91
		return getDedupCF_mergedInBytes(Type.valueOf(type));
92
	}
93
94 28308 claudio.at
	public static String getSimilarityCF(final Type type) {
95 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
96 27941 claudio.at
	}
97
98 36164 claudio.at
	public static String getSimilarityCF(final String type) {
99
		return getSimilarityCF(Type.valueOf(type));
100
	}
101
102 28308 claudio.at
	public static byte[] getSimilarityCFBytes(final Type type) {
103 27941 claudio.at
		return Bytes.toBytes(getSimilarityCF(type));
104
	}
105
106 36164 claudio.at
	public static byte[] getSimilarityCFBytes(final String type) {
107
		return getSimilarityCFBytes(Type.valueOf(type));
108
	}
109
110 28308 claudio.at
	public static String getRelTypeString(final Type type) {
111 27941 claudio.at
		return getRelType(type).toString();
112
	}
113
114 28308 claudio.at
	public static RelType getRelType(final Type type) {
115 27941 claudio.at
		switch (type) {
116
		case organization:
117
			return RelType.organizationOrganization;
118
		case person:
119
			return RelType.personPerson;
120
		case result:
121
			return RelType.resultResult;
122
		default:
123
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
124
		}
125
	}
126
127 28308 claudio.at
	public static ColumnFamily decodeCF(final byte[] b) {
128 36164 claudio.at
		final String[] s = new String(b).split(CF_SEPARATOR);
129 27941 claudio.at
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
130
	}
131
132 28308 claudio.at
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
133 27941 claudio.at
		return new ColumnFamily(relType, subRelType);
134
	}
135
136 28308 claudio.at
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
137 36670 claudio.at
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
138 36164 claudio.at
		final RelType relType = DedupUtils.getRelType(type);
139
		final Builder oafRel =
140 28308 claudio.at
				OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass.toString()).setChild(false)
141 36670 claudio.at
						.setSource(new String(from)).setTarget(new String(to));
142 28411 claudio.at
		switch (type) {
143 28308 claudio.at
		case organization:
144
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
145
					DedupUtils.dedup(relClass, "dnet:organization_organization_relations")));
146
			break;
147
		case person:
148
			oafRel.setPersonPerson(PersonPerson.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:person_person_relations")));
149
			break;
150
		case result:
151
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:result_result_relations")));
152
			break;
153
		default:
154 36670 claudio.at
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
155 28308 claudio.at
		}
156
		return oafRel;
157
	}
158
159
	private static Dedup.Builder dedup(final Dedup.RelName relClass, final String scheme) {
160
		return Dedup.newBuilder().setRelMetadata(
161
				RelMetadata.newBuilder().setSemantics(
162
						Qualifier.newBuilder().setClassid(relClass.toString()).setClassname(relClass.toString()).setSchemeid(scheme).setSchemename(scheme)));
163
	}
164
165 27941 claudio.at
	class ColumnFamily {
166
167
		private final RelType relType;
168
		private final SubRelType subRelType;
169
170 28308 claudio.at
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
171 27941 claudio.at
			this.relType = relType;
172
			this.subRelType = subRelType;
173
		}
174
175
		@Override
176
		public String toString() {
177
			return getRelType() + CF_SEPARATOR + getSubRelType();
178
		}
179
180
		public RelType getRelType() {
181
			return relType;
182
		}
183
184
		public SubRelType getSubRelType() {
185
			return subRelType;
186
		}
187
188
	}
189
190 26600 sandro.lab
}