Project

General

Profile

« Previous | Next » 

Revision 49517

Exclude results that aren't publications from the deduplication process

View differences:

DedupMapper.java
79 79

  
80 80
			if (entity.getType().equals(Type.valueOf(dedupConf.getWf().getEntityType()))) {
81 81

  
82
				// TODO: remove this hack - here because we want to dedup only publications
83
				if (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) {
84

  
85
					// GeneratedMessage metadata = OafEntityDecoder.decode(entity).getEntity();
82
				// TODO: remove this hack - here because we want to dedup only publications and organizations
83
				if (shouldDedup(entity)) {
86 84
					final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
87 85
					emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
88 86
				}
......
92 90
		}
93 91
	}
94 92

  
93
	private boolean shouldDedup(final OafEntity entity) {
94
		return (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) |
95
				entity.getType().equals(Type.organization);
96
	}
97

  
95 98
	private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> ngrams) throws IOException, InterruptedException {
96 99
		for (final String ngram : ngrams) {
97 100
			outKey.set(ngram);

Also available in: Unified diff