Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.mapreduce.util;
2
3
import java.nio.ByteBuffer;
4
5
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
6 27941 claudio.at
import org.apache.hadoop.hbase.util.Bytes;
7 26600 sandro.lab
8 28058 claudio.at
import eu.dnetlib.data.proto.DedupProtos.Dedup;
9
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
10 28308 claudio.at
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
11
import eu.dnetlib.data.proto.OafProtos.OafRel;
12
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
13
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
14
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
15
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
16 27941 claudio.at
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
17
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
18 28308 claudio.at
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
19 27941 claudio.at
import eu.dnetlib.data.proto.TypeProtos.Type;
20 28308 claudio.at
import eu.dnetlib.pace.util.DedupConfig;
21 26600 sandro.lab
22 27941 claudio.at
public class DedupUtils {
23
24
	public static final String CF_SEPARATOR = "_";
25
26 26600 sandro.lab
	public static final String ROOT = "dedup_wf";
27
28 28308 claudio.at
	public static final String BODY_S = "body";
29
30
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
31
32
	public static String dedupPrefix(final String dedupRun) {
33 26600 sandro.lab
		return "|" + ROOT + "_" + dedupRun + "::";
34
	}
35 27941 claudio.at
36 28308 claudio.at
	public static String newId(final String id, final String dedupRun) {
37
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
38 27941 claudio.at
39 26600 sandro.lab
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
40
	}
41
42 28308 claudio.at
	public static byte[] newIdBytes(final String s, final String dedupRun) {
43 26600 sandro.lab
		return newId(s, dedupRun).getBytes();
44
	}
45
46 28308 claudio.at
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
47 26600 sandro.lab
		return newId(new String(b.array()), dedupRun).getBytes();
48
	}
49
50 28308 claudio.at
	public static boolean isRoot(final String s) {
51 26600 sandro.lab
		return s.contains(ROOT);
52
	}
53 27941 claudio.at
54 28308 claudio.at
	public static boolean isRoot(final ImmutableBytesWritable s) {
55 26600 sandro.lab
		return isRoot(s.copyBytes());
56
	}
57 27941 claudio.at
58 28308 claudio.at
	public static boolean isRoot(final byte[] s) {
59 26600 sandro.lab
		return isRoot(new String(s));
60
	}
61 27941 claudio.at
62 28308 claudio.at
	public static String getDedupCF_merges(final Type type) {
63 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
64 27941 claudio.at
	}
65
66 28308 claudio.at
	public static byte[] getDedupCF_mergesBytes(final Type type) {
67 28058 claudio.at
		return Bytes.toBytes(getDedupCF_merges(type));
68 27941 claudio.at
	}
69
70 28308 claudio.at
	public static String getDedupCF_mergedIn(final Type type) {
71 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
72
	}
73
74 28308 claudio.at
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
75 28058 claudio.at
		return Bytes.toBytes(getDedupCF_mergedIn(type));
76
	}
77
78 28308 claudio.at
	public static String getSimilarityCF(final Type type) {
79 28058 claudio.at
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
80 27941 claudio.at
	}
81
82 28308 claudio.at
	public static byte[] getSimilarityCFBytes(final Type type) {
83 27941 claudio.at
		return Bytes.toBytes(getSimilarityCF(type));
84
	}
85
86 28308 claudio.at
	public static String getRelTypeString(final Type type) {
87 27941 claudio.at
		return getRelType(type).toString();
88
	}
89
90 28308 claudio.at
	public static RelType getRelType(final Type type) {
91 27941 claudio.at
		switch (type) {
92
		case organization:
93
			return RelType.organizationOrganization;
94
		case person:
95
			return RelType.personPerson;
96
		case result:
97
			return RelType.resultResult;
98
		default:
99
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
100
		}
101
	}
102
103 28308 claudio.at
	public static ColumnFamily decodeCF(final byte[] b) {
104 27941 claudio.at
		String[] s = new String(b).split(CF_SEPARATOR);
105
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
106
	}
107
108 28308 claudio.at
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
109 27941 claudio.at
		return new ColumnFamily(relType, subRelType);
110
	}
111
112 28308 claudio.at
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
113 28411 claudio.at
		Type type = Type.valueOf(dedupConf.getEntityType());
114
		RelType relType = DedupUtils.getRelType(type);
115 28308 claudio.at
		Builder oafRel =
116
				OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass.toString()).setChild(false)
117
						.setSource(new String(from)).setTarget(new String(to));
118 28411 claudio.at
		switch (type) {
119 28308 claudio.at
		case organization:
120
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
121
					DedupUtils.dedup(relClass, "dnet:organization_organization_relations")));
122
			break;
123
		case person:
124
			oafRel.setPersonPerson(PersonPerson.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:person_person_relations")));
125
			break;
126
		case result:
127
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:result_result_relations")));
128
			break;
129
		default:
130
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getEntityType());
131
		}
132
		return oafRel;
133
	}
134
135
	private static Dedup.Builder dedup(final Dedup.RelName relClass, final String scheme) {
136
		return Dedup.newBuilder().setRelMetadata(
137
				RelMetadata.newBuilder().setSemantics(
138
						Qualifier.newBuilder().setClassid(relClass.toString()).setClassname(relClass.toString()).setSchemeid(scheme).setSchemename(scheme)));
139
	}
140
141 27941 claudio.at
	class ColumnFamily {
142
143
		private final RelType relType;
144
		private final SubRelType subRelType;
145
146 28308 claudio.at
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
147 27941 claudio.at
			this.relType = relType;
148
			this.subRelType = subRelType;
149
		}
150
151
		@Override
152
		public String toString() {
153
			return getRelType() + CF_SEPARATOR + getSubRelType();
154
		}
155
156
		public RelType getRelType() {
157
			return relType;
158
		}
159
160
		public SubRelType getSubRelType() {
161
			return subRelType;
162
		}
163
164
	}
165
166 26600 sandro.lab
}