Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dedup;
2

    
3
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;

import eu.dnetlib.data.graph.utils.RelDescriptor;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.proto.DNGFProtos.DNGF;
import eu.dnetlib.data.proto.KindProtos.Kind;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.Text;

import static eu.dnetlib.data.mapreduce.util.UpdateMerger.mergeBodyUpdates;
import static eu.dnetlib.data.mapreduce.util.dao.HBaseTableDAO.*;
21

    
22
public class DedupBuildRootsMapper extends TableMapper<Text, ImmutableBytesWritable> {
23

    
24
	private static final Log log = LogFactory.getLog(DedupBuildRootsMapper.class);
25

    
26
	private DedupConfig dedupConf;
27

    
28
	private ImmutableBytesWritable ibw;
29

    
30
	@Override
31
	protected void setup(final Context context) {
32
		dedupConf = DedupConfig.load(context.getConfiguration().get(JobParams.DEDUP_CONF));
33
		System.out.println("dedup buildRoots mapper\nwf conf: " + dedupConf.toString());
34

    
35
		ibw = new ImmutableBytesWritable();
36
	}
37

    
38
	@Override
39
	protected void map(final ImmutableBytesWritable rowkey, final Result value, final Context context) throws IOException, InterruptedException {
40
		// System.out.println("Find root mapping: " + new String(rowkey.copyBytes()));
41

    
42
		final Type type = Type.valueOf(dedupConf.getWf().getEntityType());
43
		final List<String> mergedIn = getTargetIds(value, "isMergedIn");
44

    
45
		if ((mergedIn != null) && !mergedIn.isEmpty()) {
46

    
47
			if (mergedIn.size() > 1) {
48
				throw new RuntimeException("found more than one merged in relation for row key: " + new String(rowkey.copyBytes()));
49
			}
50
			final Text rootId = new Text(mergedIn.get(0));
51

    
52
			context.getCounter(dedupConf.getWf().getEntityType(), "merged").increment(1);
53

    
54
			final DNGF dngfMerged = mergeBodyUpdates(context, value.getFamilyMap(cfMetadataByte()), type);
55
			if (dngfMerged == null) {
56
				context.getCounter(dedupConf.getWf().getEntityType(), "missing body").increment(1);
57
				System.out.println("missing body in: " + new String(rowkey.copyBytes()));
58
				return;
59
			}
60
			emit(context, rootId, dngfMerged.toByteArray());
61

    
62
			rel(value, "isMergedIn", "merges", "isSimilarTo").values().forEach(dngf -> {
63
				if (!isRelMarkedDeleted(dngf)) {
64
					emit(context, rootId, dngf.toByteArray());
65
				} else {
66
                    //context.getCounter(RelDescriptor.asString(dngf.getRel().getRelType()), "rel marked deleted").increment(1);
67
                }
68
            });
69

    
70
		} else {
71
			//context.getCounter(dedupConf.getWf().getEntityType(), "not in duplicate group").increment(1);
72

    
73
			final List<String> mergesRels = getTargetIds(value, "merges");
74
			if (mergesRels != null && !mergesRels.isEmpty()) {
75
				final byte[] body = value.getValue(cfMetadataByte(), type.toString().getBytes());
76
				if (body != null) {
77
					context.getCounter(type.toString(), "root").increment(1);
78
					emit(context, new Text(rowkey.copyBytes()), body);
79
				}
80
			}
81
		}
82

    
83
	}
84

    
85
	private void emit(final Context context, final Text rootId, final byte[] value) {
86
		ibw.set(value);
87
		try {
88
			context.write(rootId, ibw);
89
		} catch (Exception e) {
90
			throw new RuntimeException(e);
91
		}
92
	}
93

    
94
	private boolean isRelMarkedDeleted(final DNGF dngf) {
95
		return dngf.getKind().equals(Kind.relation) && dngf.getDataInfo().getDeletedbyinference();
96
	}
97

    
98
}
(1-1/22)