Project

General

Profile

« Previous | Next » 

Revision 47483

Instead of excluding datasets from the deduplication process, we now include only publications.

View differences:

DedupMapper.java
79 79

  
80 80
			if (entity.getType().equals(Type.valueOf(dedupConf.getWf().getEntityType()))) {
81 81

  
82
				// TODO: remove this hack - here because we don't want to dedup datasets
83
				if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("dataset")) return;
82
				// TODO: remove this hack - here because we want to dedup only publications
83
				if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) {
84 84

  
85
				// GeneratedMessage metadata = OafEntityDecoder.decode(entity).getEntity();
86
				final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
87
				emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
85
					// GeneratedMessage metadata = OafEntityDecoder.decode(entity).getEntity();
86
					final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
87
					emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
88
				}
88 89
			}
89 90
		} else {
90 91
			context.getCounter(dedupConf.getWf().getEntityType(), "missing body").increment(1);

Also available in: Unified diff