Revision 49517
Added by Claudio Atzori over 6 years ago
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/DedupMapper.java | ||
---|---|---|
79 | 79 |
|
80 | 80 |
if (entity.getType().equals(Type.valueOf(dedupConf.getWf().getEntityType()))) { |
81 | 81 |
|
82 |
// TODO: remove this hack - here because we want to dedup only publications |
|
83 |
if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) { |
|
84 |
|
|
85 |
// GeneratedMessage metadata = OafEntityDecoder.decode(entity).getEntity(); |
|
82 |
// TODO: remove this hack - here because we want to dedup only publications and organizations
|
83 |
if (shouldDedup(entity)) { |
|
86 | 84 |
final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel()); |
87 | 85 |
emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap)); |
88 | 86 |
} |
... | ... | |
92 | 90 |
} |
93 | 91 |
} |
94 | 92 |
|
93 |
private boolean shouldDedup(final OafEntity entity) { |
|
94 |
return (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) ||
|
95 |
entity.getType().equals(Type.organization); |
|
96 |
} |
|
97 |
|
|
95 | 98 |
private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> ngrams) throws IOException, InterruptedException { |
96 | 99 |
for (final String ngram : ngrams) { |
97 | 100 |
outKey.set(ngram); |
Also available in: Unified diff
exclude from the deduplication process results that aren't publications