Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dedup;
2

    
3
import java.io.IOException;
4
import java.util.Collection;
5
import java.util.List;
6
import java.util.Map;
7

    
8
import com.google.common.collect.Maps;
9
import eu.dnetlib.data.graph.model.DNGFDecoder;
10
import eu.dnetlib.data.mapreduce.JobParams;
11
import eu.dnetlib.data.mapreduce.util.dao.HBaseTableDAO;
12
import eu.dnetlib.data.proto.DNGFProtos.DNGFEntity;
13
import eu.dnetlib.data.proto.DliFieldTypeProtos;
14
import eu.dnetlib.data.proto.DliProtos;
15
import eu.dnetlib.data.proto.TypeProtos.Type;
16
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
17
import eu.dnetlib.pace.config.DedupConfig;
18
import eu.dnetlib.pace.model.MapDocument;
19
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
20
import org.apache.commons.logging.Log;
21
import org.apache.commons.logging.LogFactory;
22
import org.apache.hadoop.hbase.client.Result;
23
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
24
import org.apache.hadoop.hbase.mapreduce.TableMapper;
25
import org.apache.hadoop.hbase.util.Bytes;
26
import org.apache.hadoop.io.Text;
27

    
28
public class DedupMapper extends TableMapper<Text, ImmutableBytesWritable> {
29

    
30
	private static final Log log = LogFactory.getLog(DedupMapper.class);
31

    
32
	private DedupConfig dedupConf;
33

    
34
	private Map<String, List<String>> blackListMap = Maps.newHashMap();
35

    
36
	private Text outKey;
37

    
38
	private ImmutableBytesWritable ibw;
39

    
40
	@Override
41
	protected void setup(final Context context) throws IOException, InterruptedException {
42

    
43
		final String dedupConfJson = context.getConfiguration().get(JobParams.DEDUP_CONF);
44

    
45
		log.info("pace conf strings");
46
		log.info("pace conf: " + dedupConfJson);
47

    
48
		dedupConf = DedupConfig.load(dedupConfJson);
49

    
50
		blackListMap = dedupConf.getPace().getBlacklists();
51

    
52
		outKey = new Text();
53
		ibw = new ImmutableBytesWritable();
54

    
55
		log.info("pace conf");
56
		log.info("entity type: " + dedupConf.getWf().getEntityType());
57
		log.info("clustering: " + dedupConf.getPace().getClustering());
58
		log.info("conditions: " + dedupConf.getPace().getConditions());
59
		log.info("fields: " + dedupConf.getPace().getModel());
60
		log.info("blacklists: " + blackListMap);
61
		log.info("wf conf: " + dedupConf.toString());
62
	}
63

    
64
	@Override
65
	protected void map(final ImmutableBytesWritable keyIn, final Result result, final Context context) throws IOException, InterruptedException {
66
		// log.info("got key: " + new String(keyIn.copyBytes()));
67

    
68
		final byte[] body = result.getValue(HBaseTableDAO.cfMetadataByte(), dedupConf.getWf().getEntityType().getBytes());
69

    
70
		if (body != null) {
71

    
72
			final DNGFDecoder decoder = DNGFDecoder.decode(body, DliFieldTypeProtos.completionStatus, DliProtos.completionStatus, DliProtos.resolvedfrom, DliProtos.typedIdentifier);
73
			if (decoder.getDNGF().getDataInfo().getDeletedbyinference()) {
74
				context.getCounter(dedupConf.getWf().getEntityType(), "deleted by inference").increment(1);
75
				return;
76
			}
77

    
78
			final DNGFEntity entity = decoder.getEntity();
79

    
80
			context.getCounter(entity.getType().toString(), "decoded").increment(1);
81

    
82
			if (entity.getType().equals(Type.valueOf(dedupConf.getWf().getEntityType()))) {
83

    
84
				// GeneratedMessage metadata = DNGFEntityDecoder.decode(entity).getEntity();
85
				final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
86
				emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
87
			}
88
		} else {
89
			context.getCounter(dedupConf.getWf().getEntityType(), "missing body").increment(1);
90
		}
91
	}
92

    
93
	private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> ngrams) throws IOException, InterruptedException {
94
		for (final String ngram : ngrams) {
95
			outKey.set(ngram);
96
			ibw.set(doc.toByteArray());
97
			context.write(outKey, ibw);
98
		}
99
	}
100

    
101
}
(10-10/22)