Project

General

Profile

1
package eu.dnetlib.data.mapreduce.util;
2

    
3
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import eu.dnetlib.data.mapreduce.Algorithms;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.proto.DedupProtos.Dedup;
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;

public class DedupUtils {
26

    
27
	public static final String CF_SEPARATOR = "_";
28

    
29
	public static final String ROOT = "dedup_wf";
30

    
31
	public static final String BODY_S = "body";
32

    
33
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
34

    
35
	public static String dedupPrefix(final String dedupRun) {
36
		return "|" + ROOT + "_" + dedupRun + "::";
37
	}
38

    
39
	public static String newId(final String id, final String dedupRun) {
40
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
41

    
42
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
43
	}
44

    
45
	public static byte[] newIdBytes(final String s, final String dedupRun) {
46
		return newId(s, dedupRun).getBytes();
47
	}
48

    
49
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
50
		return newId(new String(b.array()), dedupRun).getBytes();
51
	}
52

    
53
	public static boolean isRoot(final String s) {
54
		return s.contains(ROOT);
55
	}
56

    
57
	public static boolean isRoot(final ImmutableBytesWritable s) {
58
		return isRoot(s.copyBytes());
59
	}
60

    
61
	public static boolean isRoot(final byte[] s) {
62
		return isRoot(new String(s));
63
	}
64

    
65
	public static String getDedupCF_merges(final Type type) {
66
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
67
	}
68

    
69
	public static String getDedupCF_merges(final String type) {
70
		return getDedupCF_merges(Type.valueOf(type));
71
	}
72

    
73
	public static byte[] getDedupCF_mergesBytes(final Type type) {
74
		return Bytes.toBytes(getDedupCF_merges(type));
75
	}
76

    
77
	public static byte[] getDedupCF_mergesBytes(final String type) {
78
		return getDedupCF_mergesBytes(Type.valueOf(type));
79
	}
80

    
81
	public static String getDedupCF_mergedIn(final Type type) {
82
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
83
	}
84

    
85
	public static String getDedupCF_mergedIn(final String type) {
86
		return getDedupCF_mergedIn(Type.valueOf(type));
87
	}
88

    
89
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
90
		return Bytes.toBytes(getDedupCF_mergedIn(type));
91
	}
92

    
93
	public static byte[] getDedupCF_mergedInBytes(final String type) {
94
		return getDedupCF_mergedInBytes(Type.valueOf(type));
95
	}
96

    
97
	public static String getSimilarityCF(final Type type) {
98
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
99
	}
100

    
101
	public static String getSimilarityCF(final String type) {
102
		return getSimilarityCF(Type.valueOf(type));
103
	}
104

    
105
	public static byte[] getSimilarityCFBytes(final Type type) {
106
		return Bytes.toBytes(getSimilarityCF(type));
107
	}
108

    
109
	public static byte[] getSimilarityCFBytes(final String type) {
110
		return getSimilarityCFBytes(Type.valueOf(type));
111
	}
112

    
113
	public static String getRelTypeString(final Type type) {
114
		return getRelType(type).toString();
115
	}
116

    
117
	public static RelType getRelType(final Type type) {
118
		switch (type) {
119
		case organization:
120
			return RelType.organizationOrganization;
121
		case result:
122
			return RelType.resultResult;
123
		default:
124
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
125
		}
126
	}
127

    
128
	public static ColumnFamily decodeCF(final byte[] b) {
129
		final String[] s = new String(b).split(CF_SEPARATOR);
130
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
131
	}
132

    
133
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
134
		return new ColumnFamily(relType, subRelType);
135
	}
136

    
137
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
138
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
139
		final Builder oafRel = getRelBuilder(from, to, relClass.name(), DedupUtils.getRelType(type));
140
		switch (type) {
141
		case organization:
142
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
143
					dedup(relClass.name(), "dnet:organization_organization_relations")));
144
			break;
145
		case result:
146
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass.name(), "dnet:result_result_relations")));
147
			break;
148
		default:
149
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
150
		}
151
		return oafRel;
152
	}
153

    
154
	public static OafRel.Builder getDedupSimilarity(final DedupConfig dedupConf, final String from, final String to) {
155
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
156
		final String isSimilarTo = DedupSimilarity.RelName.isSimilarTo.name();
157
		final Builder oafRel = getRelBuilder(from, to, isSimilarTo, DedupUtils.getRelType(type));
158
		switch (type) {
159
		case organization:
160
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedupSimilarity(
161
					dedupSimilarity(isSimilarTo, "dnet:organization_organization_relations")));
162
			break;
163
		case result:
164
			oafRel.setResultResult(ResultResult.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:result_result_relations")));
165
			break;
166
		default:
167
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
168
		}
169
		return oafRel;
170
	}
171

    
172
	private static Builder getRelBuilder(final String from, final String to, final String relClass, final RelType relType) {
173
		return OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass).setChild(false)
174
				.setSource(new String(from)).setTarget(new String(to));
175
	}
176

    
177
	private static Dedup.Builder dedup(final String relClass, final String scheme) {
178
		return Dedup.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
179
	}
180

    
181
	private static DedupSimilarity.Builder dedupSimilarity(final String relClass, final String scheme) {
182
		return DedupSimilarity.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
183
	}
184

    
185
	private static RelMetadata.Builder getRelMetadata(final String relClass, final String scheme) {
186
		return RelMetadata.newBuilder().setSemantics(
187
				Qualifier.newBuilder().setClassid(relClass).setClassname(relClass).setSchemeid(scheme).setSchemename(scheme));
188
	}
189

    
190
	public static Oaf.Builder buildRel(final DedupConfig dedupConf, final OafRel.Builder oafRel, final double trust) {
191
		final double sTrust = Algorithms.scale(trust, JobParams.MAX_DEDUP_TRUST);
192
		final Oaf.Builder oaf =
193
				Oaf.newBuilder()
194
						.setKind(Kind.relation)
195
						.setLastupdatetimestamp(System.currentTimeMillis())
196
						.setDataInfo(
197
								AbstractDNetXsltFunctions.getDataInfo(null, "", String.valueOf(sTrust), false, true).setInferenceprovenance(
198
										dedupConf.getWf().getConfigurationId())).setRel(oafRel);
199
		return oaf;
200
	}
201

    
202
	class ColumnFamily {
203

    
204
		private final RelType relType;
205
		private final SubRelType subRelType;
206

    
207
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
208
			this.relType = relType;
209
			this.subRelType = subRelType;
210
		}
211

    
212
		@Override
213
		public String toString() {
214
			return getRelType() + CF_SEPARATOR + getSubRelType();
215
		}
216

    
217
		public RelType getRelType() {
218
			return relType;
219
		}
220

    
221
		public SubRelType getSubRelType() {
222
			return subRelType;
223
		}
224

    
225
	}
226

    
227
}
(1-1/7)