Revision 57655
Added by Michele De Bonis over 4 years ago
DedupReducer.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.dedup; |
2 | 2 |
|
3 |
import java.io.IOException; |
|
4 |
import java.util.*; |
|
5 |
|
|
6 | 3 |
import com.google.common.base.Function; |
7 | 4 |
import com.google.common.collect.Iterables; |
8 |
import com.google.common.collect.Lists; |
|
9 |
import com.google.protobuf.InvalidProtocolBufferException; |
|
10 | 5 |
import eu.dnetlib.data.mapreduce.JobParams; |
11 | 6 |
import eu.dnetlib.data.mapreduce.util.DedupUtils; |
12 |
import eu.dnetlib.data.mapreduce.util.StreamUtils; |
|
13 |
import eu.dnetlib.data.proto.OafProtos; |
|
14 |
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; |
|
15 |
import eu.dnetlib.data.proto.TypeProtos.Type; |
|
16 |
import eu.dnetlib.pace.clustering.NGramUtils; |
|
17 | 7 |
import eu.dnetlib.pace.config.DedupConfig; |
18 |
import eu.dnetlib.pace.config.WfConfig; |
|
19 |
import eu.dnetlib.pace.distance.PaceDocumentDistance; |
|
20 |
import eu.dnetlib.pace.distance.eval.ScoreResult; |
|
21 |
import eu.dnetlib.pace.model.*; |
|
8 |
import eu.dnetlib.pace.model.MapDocument; |
|
9 |
import eu.dnetlib.pace.model.MapDocumentSerializer; |
|
22 | 10 |
import eu.dnetlib.pace.util.BlockProcessor; |
23 | 11 |
import eu.dnetlib.pace.util.Reporter; |
24 |
import org.apache.commons.lang.StringUtils; |
|
25 | 12 |
import org.apache.commons.logging.Log; |
26 | 13 |
import org.apache.commons.logging.LogFactory; |
27 | 14 |
import org.apache.hadoop.hbase.client.Durability; |
... | ... | |
32 | 19 |
import org.apache.hadoop.io.Text; |
33 | 20 |
|
34 | 21 |
import javax.annotation.Nullable; |
22 |
import java.io.IOException; |
|
35 | 23 |
|
36 | 24 |
public class DedupReducer extends TableReducer<Text, ImmutableBytesWritable, ImmutableBytesWritable> { |
37 | 25 |
|
Also available in: Unified diff
Modified to fit the tree-dedup.