Project

General

Profile

1
package eu.dnetlib.data.mapreduce.util;
2

    
3
import java.nio.ByteBuffer;
4

    
5
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
6
import org.apache.hadoop.hbase.util.Bytes;
7

    
8
import eu.dnetlib.data.proto.DedupProtos.Dedup;
9
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
10
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
11
import eu.dnetlib.data.proto.OafProtos.OafRel;
12
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
13
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
14
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
15
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
16
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
17
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
18
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
19
import eu.dnetlib.data.proto.TypeProtos.Type;
20
import eu.dnetlib.pace.util.DedupConfig;
21

    
22
public class DedupUtils {
23

    
24
	/** Separator placed between the parts of a column family name; also used by decodeCF to split such a name. */
	public static final String CF_SEPARATOR = "_";
25

    
26
	/** Marker that dedupPrefix embeds into rewritten identifiers and that isRoot looks for. */
	public static final String ROOT = "dedup_wf";
27

    
28
	/** The name "body" as a String. NOTE(review): presumably a column/qualifier name - confirm against callers. */
	public static final String BODY_S = "body";
29

    
30
	/** {@link #BODY_S} encoded to bytes through {@code Bytes.toBytes}. */
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
31

    
32
	public static String dedupPrefix(final String dedupRun) {
33
		return "|" + ROOT + "_" + dedupRun + "::";
34
	}
35

    
36
	public static String newId(final String id, final String dedupRun) {
37
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
38

    
39
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
40
	}
41

    
42
	public static byte[] newIdBytes(final String s, final String dedupRun) {
43
		return newId(s, dedupRun).getBytes();
44
	}
45

    
46
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
47
		return newId(new String(b.array()), dedupRun).getBytes();
48
	}
49

    
50
	public static boolean isRoot(final String s) {
51
		return s.contains(ROOT);
52
	}
53

    
54
	public static boolean isRoot(final ImmutableBytesWritable s) {
55
		return isRoot(s.copyBytes());
56
	}
57

    
58
	public static boolean isRoot(final byte[] s) {
59
		return isRoot(new String(s));
60
	}
61

    
62
	public static String getDedupCF_merges(final Type type) {
63
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
64
	}
65

    
66
	public static String getDedupCF_merges(final String type) {
67
		return getDedupCF_merges(Type.valueOf(type));
68
	}
69

    
70
	public static byte[] getDedupCF_mergesBytes(final Type type) {
71
		return Bytes.toBytes(getDedupCF_merges(type));
72
	}
73

    
74
	public static byte[] getDedupCF_mergesBytes(final String type) {
75
		return getDedupCF_mergesBytes(Type.valueOf(type));
76
	}
77

    
78
	public static String getDedupCF_mergedIn(final Type type) {
79
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
80
	}
81

    
82
	public static String getDedupCF_mergedIn(final String type) {
83
		return getDedupCF_mergedIn(Type.valueOf(type));
84
	}
85

    
86
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
87
		return Bytes.toBytes(getDedupCF_mergedIn(type));
88
	}
89

    
90
	public static byte[] getDedupCF_mergedInBytes(final String type) {
91
		return getDedupCF_mergedInBytes(Type.valueOf(type));
92
	}
93

    
94
	public static String getSimilarityCF(final Type type) {
95
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
96
	}
97

    
98
	public static String getSimilarityCF(final String type) {
99
		return getSimilarityCF(Type.valueOf(type));
100
	}
101

    
102
	public static byte[] getSimilarityCFBytes(final Type type) {
103
		return Bytes.toBytes(getSimilarityCF(type));
104
	}
105

    
106
	public static byte[] getSimilarityCFBytes(final String type) {
107
		return getSimilarityCFBytes(Type.valueOf(type));
108
	}
109

    
110
	public static String getRelTypeString(final Type type) {
111
		return getRelType(type).toString();
112
	}
113

    
114
	public static RelType getRelType(final Type type) {
115
		switch (type) {
116
		case organization:
117
			return RelType.organizationOrganization;
118
		case person:
119
			return RelType.personPerson;
120
		case result:
121
			return RelType.resultResult;
122
		default:
123
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
124
		}
125
	}
126

    
127
	public static ColumnFamily decodeCF(final byte[] b) {
128
		final String[] s = new String(b).split(CF_SEPARATOR);
129
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
130
	}
131

    
132
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
133
		return new ColumnFamily(relType, subRelType);
134
	}
135

    
136
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
137
		final Type type = Type.valueOf(dedupConf.getEntityType());
138
		final RelType relType = DedupUtils.getRelType(type);
139
		final Builder oafRel =
140
				OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass.toString()).setChild(false)
141
						.setSource(new String(from)).setTarget(new String(to));
142
		switch (type) {
143
		case organization:
144
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
145
					DedupUtils.dedup(relClass, "dnet:organization_organization_relations")));
146
			break;
147
		case person:
148
			oafRel.setPersonPerson(PersonPerson.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:person_person_relations")));
149
			break;
150
		case result:
151
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:result_result_relations")));
152
			break;
153
		default:
154
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getEntityType());
155
		}
156
		return oafRel;
157
	}
158

    
159
	private static Dedup.Builder dedup(final Dedup.RelName relClass, final String scheme) {
160
		return Dedup.newBuilder().setRelMetadata(
161
				RelMetadata.newBuilder().setSemantics(
162
						Qualifier.newBuilder().setClassid(relClass.toString()).setClassname(relClass.toString()).setSchemeid(scheme).setSchemename(scheme)));
163
	}
164

    
165
	class ColumnFamily {
166

    
167
		private final RelType relType;
168
		private final SubRelType subRelType;
169

    
170
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
171
			this.relType = relType;
172
			this.subRelType = subRelType;
173
		}
174

    
175
		@Override
176
		public String toString() {
177
			return getRelType() + CF_SEPARATOR + getSubRelType();
178
		}
179

    
180
		public RelType getRelType() {
181
			return relType;
182
		}
183

    
184
		public SubRelType getSubRelType() {
185
			return subRelType;
186
		}
187

    
188
	}
189

    
190
}
(1-1/10)