Project

General

Profile

1
package eu.dnetlib.data.mapreduce.util;
2

    
3
import java.nio.ByteBuffer;
4

    
5
import eu.dnetlib.data.mapreduce.Algorithms;
6
import eu.dnetlib.data.mapreduce.JobParams;
7
import eu.dnetlib.data.proto.DedupProtos.Dedup;
8
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
9
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
10
import eu.dnetlib.data.proto.KindProtos.Kind;
11
import eu.dnetlib.data.proto.OafProtos.Oaf;
12
import eu.dnetlib.data.proto.OafProtos.OafRel;
13
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
14
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
15
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
16
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
17
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
18
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
19
import eu.dnetlib.data.proto.TypeProtos.Type;
20
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
21
import eu.dnetlib.pace.config.DedupConfig;
22
import org.apache.commons.lang3.StringUtils;
23
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
24
import org.apache.hadoop.hbase.util.Bytes;
25

    
26
public class DedupUtils {
27

    
28
	public static final String CF_SEPARATOR = "_";
29

    
30
	public static final String ROOT = "dedup_wf";
31

    
32
	public static final String BODY_S = "body";
33

    
34
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
35

    
36
	public static String dedupPrefix(final String dedupRun) {
37
		return "|" + ROOT + "_" + dedupRun + "::";
38
	}
39

    
40
	public static String newId(final String id, final String dedupRun) {
41
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
42

    
43
		return StringUtils.substringBefore(id,"|") + dedupPrefix(dedupRun) + "::" + AbstractDNetXsltFunctions.md5(id);
44
	}
45

    
46
	public static byte[] newIdBytes(final String s, final String dedupRun) {
47
		return newId(s, dedupRun).getBytes();
48
	}
49

    
50
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
51
		return newId(new String(b.array()), dedupRun).getBytes();
52
	}
53

    
54
	public static boolean isRoot(final String s) {
55
		return s.contains(ROOT);
56
	}
57

    
58
	public static boolean isRoot(final ImmutableBytesWritable s) {
59
		return isRoot(s.copyBytes());
60
	}
61

    
62
	public static boolean isRoot(final byte[] s) {
63
		return isRoot(new String(s));
64
	}
65

    
66
	public static String getDedupCF_merges(final Type type) {
67
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
68
	}
69

    
70
	public static String getDedupCF_merges(final String type) {
71
		return getDedupCF_merges(Type.valueOf(type));
72
	}
73

    
74
	public static byte[] getDedupCF_mergesBytes(final Type type) {
75
		return Bytes.toBytes(getDedupCF_merges(type));
76
	}
77

    
78
	public static byte[] getDedupCF_mergesBytes(final String type) {
79
		return getDedupCF_mergesBytes(Type.valueOf(type));
80
	}
81

    
82
	public static String getDedupCF_mergedIn(final Type type) {
83
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
84
	}
85

    
86
	public static String getDedupCF_mergedIn(final String type) {
87
		return getDedupCF_mergedIn(Type.valueOf(type));
88
	}
89

    
90
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
91
		return Bytes.toBytes(getDedupCF_mergedIn(type));
92
	}
93

    
94
	public static byte[] getDedupCF_mergedInBytes(final String type) {
95
		return getDedupCF_mergedInBytes(Type.valueOf(type));
96
	}
97

    
98
	public static String getSimilarityCF(final Type type) {
99
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
100
	}
101

    
102
	public static String getSimilarityCF(final String type) {
103
		return getSimilarityCF(Type.valueOf(type));
104
	}
105

    
106
	public static byte[] getSimilarityCFBytes(final Type type) {
107
		return Bytes.toBytes(getSimilarityCF(type));
108
	}
109

    
110
	public static byte[] getSimilarityCFBytes(final String type) {
111
		return getSimilarityCFBytes(Type.valueOf(type));
112
	}
113

    
114
	public static String getRelTypeString(final Type type) {
115
		return getRelType(type).toString();
116
	}
117

    
118
	public static RelType getRelType(final Type type) {
119
		switch (type) {
120
		case organization:
121
			return RelType.organizationOrganization;
122
		case result:
123
			return RelType.resultResult;
124
		default:
125
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
126
		}
127
	}
128

    
129
	public static ColumnFamily decodeCF(final byte[] b) {
130
		final String[] s = new String(b).split(CF_SEPARATOR);
131
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
132
	}
133

    
134
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
135
		return new ColumnFamily(relType, subRelType);
136
	}
137

    
138
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
139
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
140
		final Builder oafRel = getRelBuilder(from, to, relClass.name(), DedupUtils.getRelType(type));
141
		switch (type) {
142
		case organization:
143
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
144
					dedup(relClass.name(), "dnet:organization_organization_relations")));
145
			break;
146
		case result:
147
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass.name(), "dnet:result_result_relations")));
148
			break;
149
		default:
150
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
151
		}
152
		return oafRel;
153
	}
154

    
155
	public static OafRel.Builder getDedupSimilarity(final DedupConfig dedupConf, final String from, final String to) {
156
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
157
		final String isSimilarTo = DedupSimilarity.RelName.isSimilarTo.name();
158
		final Builder oafRel = getRelBuilder(from, to, isSimilarTo, DedupUtils.getRelType(type));
159
		switch (type) {
160
		case organization:
161
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedupSimilarity(
162
					dedupSimilarity(isSimilarTo, "dnet:organization_organization_relations")));
163
			break;
164
		case result:
165
			oafRel.setResultResult(ResultResult.newBuilder().setDedupSimilarity(dedupSimilarity(isSimilarTo, "dnet:result_result_relations")));
166
			break;
167
		default:
168
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getWf().getEntityType());
169
		}
170
		return oafRel;
171
	}
172

    
173
	private static Builder getRelBuilder(final String from, final String to, final String relClass, final RelType relType) {
174
		return OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass).setChild(false)
175
				.setSource(new String(from)).setTarget(new String(to));
176
	}
177

    
178
	private static Dedup.Builder dedup(final String relClass, final String scheme) {
179
		return Dedup.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
180
	}
181

    
182
	private static DedupSimilarity.Builder dedupSimilarity(final String relClass, final String scheme) {
183
		return DedupSimilarity.newBuilder().setRelMetadata(getRelMetadata(relClass, scheme));
184
	}
185

    
186
	private static RelMetadata.Builder getRelMetadata(final String relClass, final String scheme) {
187
		return RelMetadata.newBuilder().setSemantics(
188
				Qualifier.newBuilder().setClassid(relClass).setClassname(relClass).setSchemeid(scheme).setSchemename(scheme));
189
	}
190

    
191
	public static Oaf.Builder buildRel(final DedupConfig dedupConf, final OafRel.Builder oafRel, final double trust) {
192
		final double sTrust = Algorithms.scale(trust, JobParams.MAX_DEDUP_TRUST);
193
		final Oaf.Builder oaf =
194
				Oaf.newBuilder()
195
						.setKind(Kind.relation)
196
						.setLastupdatetimestamp(System.currentTimeMillis())
197
						.setDataInfo(
198
								AbstractDNetXsltFunctions.getDataInfo(null, "", String.valueOf(sTrust), false, true).setInferenceprovenance(
199
										dedupConf.getWf().getConfigurationId())).setRel(oafRel);
200
		return oaf;
201
	}
202

    
203
	class ColumnFamily {
204

    
205
		private final RelType relType;
206
		private final SubRelType subRelType;
207

    
208
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
209
			this.relType = relType;
210
			this.subRelType = subRelType;
211
		}
212

    
213
		@Override
214
		public String toString() {
215
			return getRelType() + CF_SEPARATOR + getSubRelType();
216
		}
217

    
218
		public RelType getRelType() {
219
			return relType;
220
		}
221

    
222
		public SubRelType getSubRelType() {
223
			return subRelType;
224
		}
225

    
226
	}
227

    
228
}
(1-1/8)