package eu.dnetlib.data.mapreduce.hbase.lodExport.preprocessing;

import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.LodConfiguration;
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.configuration.Properties;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * Mapper class that reads LOD dump files and prepares them for the LOD export preprocessing.
 */
/*
 -> Parse LOD dump files

 Process LOD input files and divide them by entity type (both source and target).
 Transform to (id, array of [properties]) format.
 Store to HDFS.

 For -> multiple outputs and inputs:
 Multiple inputs: all source and target datasets and their corresponding mappings.
 Multiple outputs: separate output files for each dataset; mark records so that they are written to the correct one.
*/

/*
 Example input triples:

 <http://lod.openaire.eu/data/result/doajarticles::89217af00809a91acc15a416e56b3782> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.eurocris.org/ontologies/cerif/1.3#ResultEntity> .
 <http://lod.openaire.eu/data/result/doajarticles::89217af00809a91acc15a416e56b3782> <http://www.eurocris.org/ontologies/cerif/1.3#name> "Une nouvelle anomalie à allure héréditaire chez des agneaux it khouzistans /it" .
*/
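/*
 For illustration, assuming the loaded mapping accepts the "result" entity type and the cerif
 "name" predicate, the triples above would be emitted roughly as:

   key:   OA,result,<subject URI>
   value: <subject> \t <predicate> \t <object> \t.\t ...   (one segment per accepted predicate)
*/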
public class SourceMapper extends Mapper<LongWritable, Text, Text, Text> {

    private static final String OA_PREFIX = "OA";
    private static final String TAB_CHARACTER = "\t";
    // despite its name, this is the N-Triples statement terminator "." surrounded by tabs,
    // used to separate the triples that belong to the same record in the output value
    private static final String DOUBLE_TAB = "\t.\t";
    // despite its name, this holds the double-quote character that delimits literal values
    private static final String BACKSLASH = "\"";
    private static final String SLASH = "/";
    private static final String SPACE = "\\s";
    private Logger log = Logger.getLogger(SourceMapper.class);
    private LodConfiguration lodConfiguration;

    public enum SOURCE_COUNTERS {
        SOURCE_ENTITIES, TOTAL_ENTITIES
    }

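    /**
     * Loads the source-side mapping configuration (entity types and predicates to keep)
     * from the LOD_SOURCE_MAPPINGS job property; it drives the filtering done in map().
     */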
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        lodConfiguration = new LodConfiguration();
        lodConfiguration.load(context.getConfiguration().get(Properties.LOD_SOURCE_MAPPINGS));
    }

    @Override
    protected void map(final LongWritable keyIn, final Text result, final Context context) throws IOException {

        try {
            context.getCounter(SOURCE_COUNTERS.TOTAL_ENTITIES).increment(1);
            // get ID - output source_recordID so we can group by id and get all props of a record

            String[] inputRecord = parseInputRecord(result);
            String[] fields = StringUtils.join(inputRecord, BACKSLASH).split(TAB_CHARACTER);

            if (fields.length < 2) {
                log.error("Not a valid record");
                return;
            }

            String subject = fields[0];

            // add all fields as array props, separated by "\t", so we can write them directly
            // to the output; do NOT include the id - we'll get it from the output key

            // extract entity type from subject
            String type = extractType(fields[0]);

            if (lodConfiguration.entityExists(type)) {
                String record = buildRecord(fields, subject);
                // write out type,source_ID as key and the rest of the props as value
                writeKeyValuePair(context, type, subject, record);
            }

        } catch (Exception e) {
            log.error("Error while preprocessing", e);
        }
    }

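    /**
     * Extracts the entity type from a subject URI. For a subject such as
     * <http://lod.openaire.eu/data/result/doajarticles::...>, splitting on "/" leaves the
     * entity type ("result" in this case) at index 4.
     */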
    private String extractType(String value) throws Exception {
        String[] tmp = value.split(SLASH);
        if (tmp.length < 5) {
            log.error("Not a valid record");
            throw new Exception("Error while reading record: missing subject");
        }
        return tmp[4];
    }

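    /**
     * Treats fields[1..] as alternating predicate/object pairs and keeps only the predicates
     * accepted by the mapping configuration, appending "subject \t predicate \t object \t.\t"
     * for each one.
     */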
    private String buildRecord(String[] fields, String subject) {
        StringBuilder value = new StringBuilder();
        for (int i = 1; i < fields.length - 1; i += 2) {
            String field = fields[i];
            String fieldValue = fields[i + 1];
            if (lodConfiguration.isValidField(field)) {
                value.append(subject).append(TAB_CHARACTER).append(field).append(TAB_CHARACTER).append(fieldValue).append(DOUBLE_TAB);
            }
        }
        return value.toString();
    }

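    /**
     * Writes the record under an "OA,type,subject" key (the OA prefix marks records of the
     * source dataset, presumably OpenAIRE, so that downstream jobs can route them to the
     * correct output) and counts it as a source entity.
     */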
    private void writeKeyValuePair(Context context, String type, String subject, String value) throws IOException, InterruptedException {

        if (value.length() > 0) {
            Text key = new Text(OA_PREFIX + "," + type + "," + subject);
            // emit the assembled properties as the value
            context.write(key, new Text(value));
            context.getCounter(SOURCE_COUNTERS.SOURCE_ENTITIES).increment(1);
        }
    }

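    /**
     * Splits the raw N-Triples line on double quotes: parts at even indices lie outside
     * quoted literals and have their whitespace replaced with tabs, while parts at odd
     * indices (inside literals) are left untouched so literal values keep their spaces.
     */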
    private String[] parseInputRecord(Text result) {
        String[] inputParts = result.toString().split(BACKSLASH);

        for (int i = 0; i < inputParts.length; i += 2) {
            inputParts[i] = inputParts[i].replaceAll(SPACE, TAB_CHARACTER);
        }
        return inputParts;
    }

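    /**
     * Strips the angle brackets and the trailing statement terminator "." from a triple line.
     * Currently unused in this mapper.
     */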
    private static String cleanInput(Text result) {
        String resultString = result.toString().replace("<", "").replace(">", "");

        int ind = resultString.lastIndexOf(".");
        if (ind >= 0) {
            resultString = resultString.substring(0, ind);
        }

        return resultString;
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
    }

}