Revision 52878
Added by Claudio Atzori over 5 years ago
introduced subType in pace wf configuration

DedupMapper.java

```diff
@@ -13,6 +13,8 @@
 import eu.dnetlib.data.proto.TypeProtos.Type;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.config.WfConfig;
+import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.model.ProtoDocumentBuilder;
 import org.apache.commons.logging.Log;
@@ -22,6 +24,7 @@
 import org.apache.hadoop.hbase.mapreduce.TableMapper;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobTracker.IllegalStateException;
 
 public class DedupMapper extends TableMapper<Text, ImmutableBytesWritable> {
 
@@ -50,12 +53,12 @@
         outKey = new Text();
         ibw = new ImmutableBytesWritable();
 
-        log.info("pace conf");
-        log.info("entity type: " + dedupConf.getWf().getEntityType());
-        log.info("clustering: " + dedupConf.getPace().getClustering());
-        log.info("conditions: " + dedupConf.getPace().getConditions());
-        log.info("fields: " + dedupConf.getPace().getModel());
-        log.info("blacklists: " + blackListMap);
+        //log.info("pace conf");
+        //log.info("entity type: " + dedupConf.getWf().getEntityType());
+        //log.info("clustering: " + dedupConf.getPace().getClustering());
+        //log.info("conditions: " + dedupConf.getPace().getConditions());
+        //log.info("fields: " + dedupConf.getPace().getModel());
+        //log.info("blacklists: " + blackListMap);
         log.info("wf conf: " + dedupConf.toString());
     }
 
@@ -63,13 +66,14 @@
     protected void map(final ImmutableBytesWritable keyIn, final Result result, final Context context) throws IOException, InterruptedException {
         // log.info("got key: " + new String(keyIn.copyBytes()));
 
-        final byte[] body = result.getValue(dedupConf.getWf().getEntityType().getBytes(), DedupUtils.BODY_B);
+        final WfConfig wf = dedupConf.getWf();
+        final byte[] body = result.getValue(wf.getEntityType().getBytes(), DedupUtils.BODY_B);
 
         if (body != null) {
 
             final OafDecoder decoder = OafDecoder.decode(body);
             if (decoder.getOaf().getDataInfo().getDeletedbyinference()) {
-                context.getCounter(dedupConf.getWf().getEntityType(), "deleted by inference").increment(1);
+                context.getCounter(wf.getEntityType(), "deleted by inference").increment(1);
                 return;
             }
 
@@ -77,24 +81,35 @@
 
             context.getCounter(entity.getType().toString(), "decoded").increment(1);
 
-            if (entity.getType().equals(Type.valueOf(dedupConf.getWf().getEntityType()))) {
+            if (entity.getType().equals(Type.valueOf(wf.getEntityType()))) {
 
-                // TODO: remove this hack - here because we want to dedup only publications and organizazions
-                if (shouldDedup(entity)) {
-                    final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
+                final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
+                context.getCounter(entity.getType().toString(), "converted as MapDocument").increment(1);
+
+                if (wf.hasSubType()) {
+
+                    final Map<String, Field> fields = doc.getFieldMap();
+
+                    if (!fields.containsKey(wf.getSubEntityType())) {
+                        throw new IllegalStateException(String.format("model map does not contain field %s", wf.getSubEntityType()));
+                    }
+
+                    final String subType = fields.get(wf.getSubEntityType()).stringValue();
+                    if (wf.getSubEntityValue().equalsIgnoreCase(subType)) {
+                        context.getCounter(subType, "converted as MapDocument").increment(1);
+                        emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
+                    } else {
+                        context.getCounter(subType, "ignored").increment(1);
+                    }
+                } else {
                     emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
                 }
             }
         } else {
-            context.getCounter(dedupConf.getWf().getEntityType(), "missing body").increment(1);
+            context.getCounter(wf.getEntityType(), "missing body").increment(1);
         }
     }
 
-    private boolean shouldDedup(final OafEntity entity) {
-        return (entity.getType().equals(Type.result) && entity.getResult().getMetadata().getResulttype().getClassid().equals("publication")) ||
-                entity.getType().equals(Type.organization);
-    }
-
     private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> ngrams) throws IOException, InterruptedException {
         for (final String ngram : ngrams) {
             outKey.set(ngram);
```
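In short, this revision replaces the hard-coded `shouldDedup()` hack (which limited deduplication to publications and organizations) with a configuration-driven filter: when the workflow configuration declares a sub-type (`wf.hasSubType()`), the mapper looks up the configured sub-entity field in the document's field map and only emits n-grams for documents whose value matches `subEntityValue`. The following self-contained sketch illustrates that gating logic; the stub `Wf` class and the `accept`/`SubTypeGateSketch` names are hypothetical stand-ins for illustration, not the real `eu.dnetlib.pace` API, though the accessor names mirror those used in the diff.

```java
import java.util.HashMap;
import java.util.Map;

public class SubTypeGateSketch {

    // Minimal stand-in for the sub-type part of WfConfig (hypothetical).
    static class Wf {
        String subEntityType;   // e.g. "resulttype"
        String subEntityValue;  // e.g. "publication"

        boolean hasSubType() {
            return subEntityType != null && subEntityValue != null;
        }
    }

    // Returns true when the document should proceed to clustering/emission.
    static boolean accept(final Wf wf, final Map<String, String> fields) {
        if (!wf.hasSubType()) {
            return true; // no sub-type filter configured: dedup everything
        }
        final String subType = fields.get(wf.subEntityType);
        if (subType == null) {
            // mirrors the diff: the model must expose the sub-entity field
            throw new IllegalStateException(String.format("model map does not contain field %s", wf.subEntityType));
        }
        return wf.subEntityValue.equalsIgnoreCase(subType); // emit only matching sub-type
    }

    public static void main(final String[] args) {
        final Wf wf = new Wf();
        wf.subEntityType = "resulttype";
        wf.subEntityValue = "publication";

        final Map<String, String> doc = new HashMap<>();
        doc.put("resulttype", "publication");

        System.out.println(accept(wf, doc)); // true: document would be emitted
    }
}
```

The design choice here is to push the "which entities to dedup" decision out of the mapper and into the pace workflow configuration, so new sub-type restrictions can be expressed without code changes; mismatching documents are counted under an "ignored" counter rather than silently dropped.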