Project

General

Profile

1
package eu.dnetlib.data.mapreduce.util;
2

    
3
import java.nio.ByteBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;

import eu.dnetlib.data.proto.DedupProtos.Dedup;
import eu.dnetlib.data.proto.DedupSimilarityProtos.DedupSimilarity;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.OafProtos.OafRel.Builder;
import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
import eu.dnetlib.data.proto.PersonPersonProtos.PersonPerson;
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.util.DedupConfig;
21

    
22
public class DedupUtils {
23

    
24
	public static final String CF_SEPARATOR = "_";
25

    
26
	public static final String ROOT = "dedup_wf";
27

    
28
	public static final String BODY_S = "body";
29

    
30
	public static final byte[] BODY_B = Bytes.toBytes(BODY_S);
31

    
32
	public static String dedupPrefix(final String dedupRun) {
33
		return "|" + ROOT + "_" + dedupRun + "::";
34
	}
35

    
36
	public static String newId(final String id, final String dedupRun) {
37
		if ((dedupRun == null) || (dedupRun.length() != 3)) throw new IllegalArgumentException("wrong dedupRun param");
38

    
39
		return id.replaceFirst("\\|.*\\:\\:", dedupPrefix(dedupRun));
40
	}
41

    
42
	public static byte[] newIdBytes(final String s, final String dedupRun) {
43
		return newId(s, dedupRun).getBytes();
44
	}
45

    
46
	public static byte[] newIdBytes(final ByteBuffer b, final String dedupRun) {
47
		return newId(new String(b.array()), dedupRun).getBytes();
48
	}
49

    
50
	public static boolean isRoot(final String s) {
51
		return s.contains(ROOT);
52
	}
53

    
54
	public static boolean isRoot(final ImmutableBytesWritable s) {
55
		return isRoot(s.copyBytes());
56
	}
57

    
58
	public static boolean isRoot(final byte[] s) {
59
		return isRoot(new String(s));
60
	}
61

    
62
	public static String getDedupCF_merges(final Type type) {
63
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.merges;
64
	}
65

    
66
	public static byte[] getDedupCF_mergesBytes(final Type type) {
67
		return Bytes.toBytes(getDedupCF_merges(type));
68
	}
69

    
70
	public static String getDedupCF_mergedIn(final Type type) {
71
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedup + CF_SEPARATOR + Dedup.RelName.isMergedIn;
72
	}
73

    
74
	public static byte[] getDedupCF_mergedInBytes(final Type type) {
75
		return Bytes.toBytes(getDedupCF_mergedIn(type));
76
	}
77

    
78
	public static String getSimilarityCF(final Type type) {
79
		return getRelType(type) + CF_SEPARATOR + SubRelType.dedupSimilarity + CF_SEPARATOR + DedupSimilarity.RelName.isSimilarTo;
80
	}
81

    
82
	public static byte[] getSimilarityCFBytes(final Type type) {
83
		return Bytes.toBytes(getSimilarityCF(type));
84
	}
85

    
86
	public static String getRelTypeString(final Type type) {
87
		return getRelType(type).toString();
88
	}
89

    
90
	public static RelType getRelType(final Type type) {
91
		switch (type) {
92
		case organization:
93
			return RelType.organizationOrganization;
94
		case person:
95
			return RelType.personPerson;
96
		case result:
97
			return RelType.resultResult;
98
		default:
99
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + type);
100
		}
101
	}
102

    
103
	public static ColumnFamily decodeCF(final byte[] b) {
104
		String[] s = new String(b).split(CF_SEPARATOR);
105
		return new DedupUtils().getCF(RelType.valueOf(s[0]), SubRelType.valueOf(s[1]));
106
	}
107

    
108
	private ColumnFamily getCF(final RelType relType, final SubRelType subRelType) {
109
		return new ColumnFamily(relType, subRelType);
110
	}
111

    
112
	public static OafRel.Builder getDedup(final DedupConfig dedupConf, final String from, final String to, final Dedup.RelName relClass) {
113
		Type type = Type.valueOf(dedupConf.getEntityType());
114
		RelType relType = DedupUtils.getRelType(type);
115
		Builder oafRel =
116
				OafRel.newBuilder().setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass.toString()).setChild(false)
117
						.setSource(new String(from)).setTarget(new String(to));
118
		switch (type) {
119
		case organization:
120
			oafRel.setOrganizationOrganization(OrganizationOrganization.newBuilder().setDedup(
121
					DedupUtils.dedup(relClass, "dnet:organization_organization_relations")));
122
			break;
123
		case person:
124
			oafRel.setPersonPerson(PersonPerson.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:person_person_relations")));
125
			break;
126
		case result:
127
			oafRel.setResultResult(ResultResult.newBuilder().setDedup(DedupUtils.dedup(relClass, "dnet:result_result_relations")));
128
			break;
129
		default:
130
			throw new IllegalArgumentException("Deduplication not supported for entity type: " + dedupConf.getEntityType());
131
		}
132
		return oafRel;
133
	}
134

    
135
	private static Dedup.Builder dedup(final Dedup.RelName relClass, final String scheme) {
136
		return Dedup.newBuilder().setRelMetadata(
137
				RelMetadata.newBuilder().setSemantics(
138
						Qualifier.newBuilder().setClassid(relClass.toString()).setClassname(relClass.toString()).setSchemeid(scheme).setSchemename(scheme)));
139
	}
140

    
141
	class ColumnFamily {
142

    
143
		private final RelType relType;
144
		private final SubRelType subRelType;
145

    
146
		public ColumnFamily(final RelType relType, final SubRelType subRelType) {
147
			this.relType = relType;
148
			this.subRelType = subRelType;
149
		}
150

    
151
		@Override
152
		public String toString() {
153
			return getRelType() + CF_SEPARATOR + getSubRelType();
154
		}
155

    
156
		public RelType getRelType() {
157
			return relType;
158
		}
159

    
160
		public SubRelType getSubRelType() {
161
			return subRelType;
162
		}
163

    
164
	}
165

    
166
}
(1-1/8)