/modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkCustomReducer.java - D-Net - D-Net project tracking tool

dnet40/modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkCustomReducer.java @ 45878

       package eu.dnetlib.data.mapreduce.hbase.lodExport.linkage;
       import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.caching.RedisUtils;
       import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.compators.MyComparator;
       import org.apache.hadoop.io.Text;
       import org.apache.hadoop.mapreduce.Reducer;
       import org.apache.log4j.Logger;
       import java.io.IOException;
       import java.util.HashMap;
       import java.util.Map;
       public class LinkCustomReducer extends Reducer<Text, Text, Text, Text> {
           private Logger log = Logger.getLogger(LinkCustomReducer.class);
           private RedisUtils redisUtils;
           private static final String SEPERATOR = ",";
           private static double RECORD_SIMILARITY_THRESHOLD = 0.8;
           public static enum LINK_RECUDER_COUNTERS {
               TARGET_TRIPLES,
               SOURCE_TRIPLES,
               WRITTEN_OUT_ENTITIES,
               COMPARISONS,
               COMPARISONS_PER_BLOCK,
               BLOCKS
+          }
           @Override
           protected void setup(Context context) throws IOException, InterruptedException {
               try {
                   redisUtils = new RedisUtils(context);
               } catch (Exception e) {
                   log.error("Error connecting to Redis " + e.toString());
                   throw new RuntimeException(e);
+              }
+          }
           @Override
           protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {
               //each item in the list is a block with the given key
               for (Text block : values) {
                   context.getCounter(LINK_RECUDER_COUNTERS.BLOCKS).increment(1);
                   try {
                       linkRecords(block.toString(), context);
                   } catch (Exception e) {
                       log.error("Error comparing records" + e.toString());
                       throw new RuntimeException(e);
+                  }
+              }
+          }
           private void linkRecords(String block, Context context) throws Exception {
               String[] split = block.split(SEPERATOR);
               Map<String, String> sourceRecords = new HashMap<>();
               Map<String, String> targetRecords = new HashMap<>();
               for (String recordId : split) {
                   if (recordId.contains("source_")) {
                       sourceRecords.put(recordId, "");
                       context.getCounter(LINK_RECUDER_COUNTERS.SOURCE_TRIPLES).increment(1);
                   } else {
                       targetRecords.put(recordId, redisUtils.getValue(recordId));
                       context.getCounter(LINK_RECUDER_COUNTERS.TARGET_TRIPLES).increment(1);
+                  }
+              }
               for (String sourceId : sourceRecords.keySet()) {
                   String sourceRecord = redisUtils.getValue(sourceId);
                   for (String targetId : targetRecords.keySet()) {
                       //store target records to have less I/O ops per record
                       //test also version where we get each record again in each loop
                       //
                       //TODO
                       //String targetRecord = redisUtils.getValue(targetId);}
                       String targetRecord = targetRecords.get(targetId);
                       double similarity = MyComparator.findMatchingPair(sourceRecord, targetRecord);
                       context.getCounter(LINK_RECUDER_COUNTERS.COMPARISONS).increment(1);
                       if (similarity >= RECORD_SIMILARITY_THRESHOLD) {
                           try {
                               context.write(new Text(sourceId), new Text(targetId + SEPERATOR + similarity));
                               context.getCounter(LINK_RECUDER_COUNTERS.WRITTEN_OUT_ENTITIES).increment(1);
                           } catch (Exception ex) {
                               log.error("Error while writing records to output : " + ex.toString());
                               throw new IOException("Error while writing records to output", ex);
+                          }
+                      }
+                  }
+              }
+          }
           @Override
           protected void cleanup(Context context) throws IOException
                   , InterruptedException {
               context.getCounter(LINK_RECUDER_COUNTERS.COMPARISONS_PER_BLOCK).setValue(context.getCounter(LINK_RECUDER_COUNTERS.COMPARISONS).getValue() /
                       context.getCounter(LINK_RECUDER_COUNTERS.BLOCKS).getValue());
+          }
+      }

(4-4/5)

Project

General

Profile

D-Net