Project

General

Profile

1
//package eu.dnetlib.data.mapreduce.hbase.dedup;
2
//
3
//import java.io.IOException;
4
//import java.nio.ByteBuffer;
5
//import java.util.Map;
6
//
7
//import org.apache.hadoop.hbase.client.Result;
8
//import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
9
//import org.apache.hadoop.hbase.mapreduce.TableMapper;
10
//import org.apache.hadoop.hbase.util.Bytes;
11
//
12
//import com.google.protobuf.InvalidProtocolBufferException;
13
//import com.googlecode.protobuf.format.JsonFormat;
14
//
15
//import eu.dnetlib.data.mapreduce.JobParams;
16
//import eu.dnetlib.data.mapreduce.util.DedupUtils;
17
//import eu.dnetlib.data.mapreduce.util.DNGFDecoder;
18
//import eu.dnetlib.data.mapreduce.util.DNGFRowKeyDecoder;
19
//import eu.dnetlib.data.proto.DNGFProtos.DNGF;
20
//import eu.dnetlib.data.proto.PersonProtos.Person;
21
//import eu.dnetlib.data.proto.TypeProtos.Type;
22
//import eu.dnetlib.pace.config.DedupConfig;
23
//import eu.dnetlib.pace.model.gt.GTAuthor;
24
//
25
//public class DedupFindRootsPersonMapper extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {
26
//
27
//	private DedupConfig dedupConf;
28
//
29
//	private ImmutableBytesWritable outKey;
30
//
31
//	private ImmutableBytesWritable outValue;
32
//
33
//	@Override
34
//	protected void setup(final Context context) throws IOException, InterruptedException {
35
//		dedupConf = DedupConfig.load(context.getConfiguration().get(JobParams.DEDUP_CONF));
36
//		System.out.println("dedup findRoots mapper\nwf conf: " + dedupConf.toString());
37
//
38
//		outKey = new ImmutableBytesWritable();
39
//		outValue = new ImmutableBytesWritable();
40
//	}
41
//
42
//	@Override
43
//	protected void map(final ImmutableBytesWritable rowkey, final Result value, final Context context) throws IOException, InterruptedException {
44
//		// System.out.println("Find root mapping: " + new String(rowkey.copyBytes()));
45
//
46
//		final DNGFRowKeyDecoder rkd = DNGFRowKeyDecoder.decode(rowkey.copyBytes());
47
//
48
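//		// NOTE(review): a non-person row is only counted as "skipped" here; there is no return,
//		// so the row is still processed by the code below. Confirm the intent before re-enabling.
//		if (!Type.person.equals(rkd.getType())) {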
49
//			context.getCounter(rkd.getType().toString(), "skipped").increment(1);
50
//		}
51
//
52
//		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
53
//		final Map<byte[], byte[]> similarRels = value.getFamilyMap(DedupUtils.getSimilarityCFBytes(type));
54
//
55
//		if ((similarRels != null) && !similarRels.isEmpty()) {
56
//			final ByteBuffer min = findMin(rowkey.copyBytes(), similarRels.keySet());
57
//
58
//			final byte[] groupingKey = DedupUtils.newIdBytes(min, dedupConf.getWf().getDedupRun());
59
//
60
//			final GTAuthor gta = asGTA(rowkey, value.getValue(Bytes.toBytes(dedupConf.getWf().getEntityType()), DedupUtils.BODY_B));
61
//
62
//			emitBody(context, groupingKey, gta);
63
//		} else {
64
//			context.getCounter(dedupConf.getWf().getEntityType(), "row not in similarity mesh").increment(1);
65
//		}
66
//	}
67
//
68
//	private GTAuthor asGTA(final ImmutableBytesWritable rowkey, final byte[] input) {
69
//
70
//		final DNGFDecoder decoder = DNGFDecoder.decode(input);
71
//		final DNGF oaf = decoder.getDNGF();
72
//
73
//		final Person person = oaf.getEntity().getPerson();
74
//
75
//		final GTAuthor gta = GTAuthor.fromDNGFJson(new JsonFormat().printToString(person));
76
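//		// NOTE(review): new String(byte[]) decodes with the platform default charset;
//		// consider an explicit charset if this class is re-enabled.
//		final String id = new String(rowkey.copyBytes());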
77
//		gta.setId(id);
78
//		gta.getAuthor().setId(id);
79
//		return gta;
80
//	}
81
//
82
//	private ByteBuffer findMin(final byte[] key, final Iterable<byte[]> keys) {
83
//		ByteBuffer bb = ByteBuffer.wrap(key);
84
//		for (final byte[] q : keys) {
85
//			final ByteBuffer iq = ByteBuffer.wrap(q);
86
//			if (bb.compareTo(iq) > 0) {
87
//				bb = iq;
88
//			}
89
//		}
90
//		return bb;
91
//	}
92
//
93
//	private void emitBody(final Context context, final byte[] row, final GTAuthor gta) throws IOException, InterruptedException {
94
//
95
//		outKey.set(row);
96
//		outValue.set(toDNGFByteArray(gta));
97
//
98
//		context.write(outKey, outValue);
99
//		context.getCounter(dedupConf.getWf().getEntityType(), "in").increment(1);
100
//	}
101
//
102
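//	// NOTE(review): despite its name, this method returns the bytes of gta.toString();
//	// the DNGF-based serialization inside it is commented out.
//	public byte[] toDNGFByteArray(final GTAuthor gta) {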
103
//		// final DNGF oaf = new GTAuthorMapper().map(gta);
104
//		// return oaf.toByteArray();
105
//		return Bytes.toBytes(gta.toString());
106
//	}
107
//
108
//}
(6-6/22)