Revision 47483
Added by Claudio Atzori almost 7 years ago
DedupMapper.java | ||
---|---|---|
79 | 79 |
|
80 | 80 |
if (entity.getType().equals(Type.valueOf(dedupConf.getWf().getEntityType()))) { |
81 | 81 |
|
82 |
// TODO: remove this hack - here because we don't want to dedup datasets
|
|
83 |
if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("dataset")) return;
|
|
82 |
// TODO: remove this hack - here because we want to dedup only publications
|
|
83 |
if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) {
|
|
84 | 84 |
|
85 |
// GeneratedMessage metadata = OafEntityDecoder.decode(entity).getEntity(); |
|
86 |
final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel()); |
|
87 |
emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap)); |
|
85 |
// GeneratedMessage metadata = OafEntityDecoder.decode(entity).getEntity(); |
|
86 |
final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel()); |
|
87 |
emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap)); |
|
88 |
} |
|
88 | 89 |
} |
89 | 90 |
} else { |
90 | 91 |
context.getCounter(dedupConf.getWf().getEntityType(), "missing body").increment(1); |
Also available in: Unified diff
Instead of excluding datasets from the deduplication process, we include only publications.