Revision 49831
Added by Claudio Atzori almost 7 years ago
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/AbstractHBaseMapReduceJob.java

```java
package eu.dnetlib.data.mapreduce.hbase;

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Properties;

import javax.annotation.Resource;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.springframework.beans.factory.BeanNameAware;

import eu.dnetlib.data.hadoop.config.ClusterName;
import eu.dnetlib.data.hadoop.config.ConfigurationEnumerator;
import eu.dnetlib.data.mapreduce.HadoopJob;

public abstract class AbstractHBaseMapReduceJob extends Configured implements HadoopJob, BeanNameAware {

    protected static final Log log = LogFactory.getLog(AbstractHBaseMapReduceJob.class);

    protected String jobName;

    @Resource
    protected ConfigurationEnumerator configurationEnumerator;

    protected abstract Job setJobDetails(Job job, Properties p) throws Exception;

    @Override
    public Job setJobDetails(ClusterName name, Properties p) {
        try {
            final Job job = createJobCommon(name, p);
            return setJobDetails(job, p);
        } catch (Exception e) {
            throw new RuntimeException("unable to define Job: " + getClass().getSimpleName(), e);
        }
    }

    protected Job createJobCommon(ClusterName name, Properties p) throws IOException {
        Job job = new Job(configurationEnumerator.get(name), getClass().getSimpleName());
        merge(job, p);
        return job;
    }

    protected void merge(final Job job, final Properties p) {
        for (Entry<Object, Object> e : p.entrySet()) {
            job.getConfiguration().set((String) e.getKey(), (String) e.getValue());
        }
    }

    protected void deleteHdfsFile(Job job, Path path) {
        try {
            FileSystem hdfs = FileSystem.get(job.getConfiguration());
            if (hdfs.exists(path)) {
                hdfs.delete(path, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void setBeanName(String name) {
        this.jobName = name;
    }

    @Override
    public String getName() {
        return jobName;
    }
}
```
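The base class is a classic template method: `setJobDetails(ClusterName, Properties)` builds the `Job` from the per-cluster configuration, merges the workflow properties into its configuration, then delegates the job-specific wiring to the subclass hook. A minimal sketch of a concrete subclass (the class name and mapper wiring below are hypothetical, not part of this changeset):

```java
import java.util.Properties;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

import eu.dnetlib.data.mapreduce.hbase.AbstractHBaseMapReduceJob;

// Hypothetical example: the subclass only wires job-specific details;
// the base class handles cluster configuration and property merging.
public class ExampleNoopJob extends AbstractHBaseMapReduceJob {

    @Override
    protected Job setJobDetails(final Job job, final Properties p) throws Exception {
        job.setMapperClass(Mapper.class);      // identity mapper, for illustration only
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(0);              // map-only, like most jobs in this module
        return job;
    }
}
```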
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/actions/CommitActionsJob.java

```java
package eu.dnetlib.data.mapreduce.hbase.actions;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public class CommitActionsJob extends AbstractActionsJob {

    @Override
    protected Class<? extends TableMapper<ImmutableBytesWritable, ?>> getMapperClass() {
        return CommitActionsMapper.class;
    }

    @Override
    protected Class<?> getMapOutputValueClass() {
        return Put.class;
    }

    @Override
    protected void addSpecificFilters(FilterList filters) {
        filters.addFilter(new PrefixFilter(Bytes.toBytes("aac|")));
    }

    @Override
    protected void addSpecificScanClauses(Scan scan) {
        //scan.addFamily(Bytes.toBytes("target"));
    }
}
```
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/actions/DeleteActionsMapper.java

```java
package eu.dnetlib.data.mapreduce.hbase.actions;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public class DeleteActionsMapper extends TableMapper<ImmutableBytesWritable, Delete> {

    private String set = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        set = context.getConfiguration().get("set");
        if (set != null) {
            System.out.println("Deleting set: " + set);
        } else {
            System.out.println("Deleting ALL sets");
        }
    }

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        if (set == null || value.getColumnLatest(Bytes.toBytes("set"), Bytes.toBytes(set)) != null) {

            final byte[] rowKey = key.copyBytes();
            System.out.println("Deleting action " + Bytes.toString(rowKey));

            context.getCounter("Actions", "N. Deletes").increment(1);
            context.write(key, new Delete(rowKey));
        }
    }
}
```
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/actions/DeleteActionsJob.java

```java
package eu.dnetlib.data.mapreduce.hbase.actions;

import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;

public class DeleteActionsJob extends AbstractActionsJob {

    @Override
    protected Class<? extends TableMapper<ImmutableBytesWritable, ?>> getMapperClass() {
        return DeleteActionsMapper.class;
    }

    @Override
    protected Class<?> getMapOutputValueClass() {
        return Delete.class;
    }

    @Override
    protected void addSpecificFilters(FilterList filters) {
        // NOT necessary
    }

    @Override
    protected void addSpecificScanClauses(Scan scan) {
        // NOT necessary
    }
}
```
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/actions/CommitActionsMapper.java

```java
package eu.dnetlib.data.mapreduce.hbase.actions;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public class CommitActionsMapper extends TableMapper<ImmutableBytesWritable, Put> {

    private String set = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        set = context.getConfiguration().get("set");
        if (set != null) {
            System.out.println("Committing set: " + set);
        } else {
            System.out.println("Committing ALL sets");
        }
    }

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {

        if (set == null || value.getColumnLatest(Bytes.toBytes("set"), Bytes.toBytes(set)) != null) {
            System.out.println("Processing action " + Bytes.toString(key.copyBytes()));

            byte[] cf = Bytes.toBytes("target");

            byte[] tkey = value.getValue(cf, Bytes.toBytes("rowKey"));
            byte[] tcf = value.getValue(cf, Bytes.toBytes("columnFamily"));
            byte[] tc = value.getValue(cf, Bytes.toBytes("column"));
            byte[] tv = value.getValue(cf, Bytes.toBytes("content"));

            Put put = new Put(tkey);
            put.add(tcf, tc, tv);

            context.getCounter("Actions", Bytes.toString(tcf)).increment(1);

            context.write(new ImmutableBytesWritable(tkey), put);
        }
    }
}
```
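The mapper replays "actions": each row in the actions table (selected by the `aac|` prefix filter in CommitActionsJob) stores, under its `target` family, the coordinates (`rowKey`, `columnFamily`, `column`) and payload (`content`) of the `Put` to apply to the data table. A hedged sketch of how such an action row might be produced; all literal values are invented, only the family and qualifier names come from the mapper above:

```java
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical sketch of an action row as CommitActionsMapper expects to read it.
public final class ActionRowSketch {

    private ActionRowSketch() {}

    public static Put exampleAction() {
        final byte[] target = Bytes.toBytes("target");
        final Put action = new Put(Bytes.toBytes("aac|someActionId"));                 // "aac|" prefix, row id invented
        action.add(target, Bytes.toBytes("rowKey"), Bytes.toBytes("someTargetRow"));   // row to write in the data table
        action.add(target, Bytes.toBytes("columnFamily"), Bytes.toBytes("result"));    // hypothetical target family
        action.add(target, Bytes.toBytes("column"), Bytes.toBytes("body"));            // hypothetical target qualifier
        action.add(target, Bytes.toBytes("content"), Bytes.toBytes("<payload/>"));     // hypothetical cell content
        return action;
    }
}
```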
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/actions/AbstractActionsJob.java

```java
package eu.dnetlib.data.mapreduce.hbase.actions;

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.FilterList.Operator;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.hbase.AbstractHBaseMapReduceJob;

abstract public class AbstractActionsJob extends AbstractHBaseMapReduceJob {

    @Override
    protected Job setJobDetails(Job job, Properties p) throws Exception {
        initMapper(job, getScan(p), p.getProperty(JobParams.HBASE_SOURCE_TABLE));

        job.setOutputFormatClass(TableOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(getMapOutputValueClass());
        job.setNumReduceTasks(0);

        return job;
    }

    private void initMapper(final Job job, final Scan scan, final String sourceTable) {
        try {
            TableMapReduceUtil.initTableMapperJob(sourceTable, scan, getMapperClass(), Text.class, ImmutableBytesWritable.class, job);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    abstract protected Class<? extends TableMapper<ImmutableBytesWritable, ?>> getMapperClass();

    abstract protected Class<?> getMapOutputValueClass();

    abstract protected void addSpecificFilters(FilterList filters);

    abstract protected void addSpecificScanClauses(Scan scan);

    private Scan getScan(Properties p) {

        Scan scan = new Scan();
        scan.setCaching(500);
        scan.setCacheBlocks(false);

        FilterList filters = new FilterList(Operator.MUST_PASS_ALL);

        //if (p.containsKey("set")) {
        //filters.addFilter(new FamilyFilter(CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("set"))));
        //filters.addFilter(new QualifierFilter(CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(p.getProperty("set")))));
        //scan.addFamily(Bytes.toBytes("set"));
        // byte[] qualifier = Bytes.toBytes(p.getProperty("set"));
        // SingleColumnValueFilter filter = new SingleColumnValueFilter(Bytes.toBytes("set"), qualifier, CompareOp.EQUAL, qualifier);
        // filter.setFilterIfMissing(true);
        // filters.addFilter(filter);
        //scan.addColumn(Bytes.toBytes("set"), Bytes.toBytes(p.getProperty("set")));
        //}

        addSpecificFilters(filters);
        addSpecificScanClauses(scan);

        scan.setFilter(filters);
        return scan;
    }

}
```
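The commented-out block in `getScan` documents an abandoned attempt at per-set filtering; instead, the set membership check is currently done row by row inside the mappers. Should it ever be revived, a sketch along the lines of the dead code might look like this (assumption: the `set` family stores the set name as both qualifier and value, as the `SingleColumnValueFilter` lines above suggest):

```java
import java.util.Properties;

import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical helper re-enabling the per-set filtering sketched in the dead code.
public final class SetFilterSupport {

    private SetFilterSupport() {}

    public static void addSetFilter(final FilterList filters, final Properties p) {
        if (p.containsKey("set")) {
            final byte[] qualifier = Bytes.toBytes(p.getProperty("set"));
            // Keep only rows whose set:<setName> column equals the set name itself.
            final SingleColumnValueFilter filter =
                    new SingleColumnValueFilter(Bytes.toBytes("set"), qualifier, CompareOp.EQUAL, qualifier);
            filter.setFilterIfMissing(true); // drop rows that lack the set column entirely
            filters.addFilter(filter);
        }
    }
}
```

Server-side filtering would avoid shipping non-matching rows to the mappers at all, at the cost of the `setFilterIfMissing` subtlety the commented-out code already hints at.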
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/index/DocumentDatabaseFeedJob.java

```java
package eu.dnetlib.data.mapreduce.hbase.index;

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.hbase.AbstractHBaseMapReduceJob;

public class DocumentDatabaseFeedJob extends AbstractHBaseMapReduceJob {

    @Override
    public Job setJobDetails(final Job job, final Properties p) {

        job.setInputFormatClass(SequenceFileInputFormat.class);
        try {
            FileInputFormat.setInputPaths(job, p.getProperty(JobParams.MAPRED_INPUT_DIR));
            FileOutputFormat.setOutputPath(job, new Path(p.getProperty(JobParams.DOCUMENT_DB_ROTTEN_FILE)));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        job.setMapperClass(DocumentDatabaseMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(0);

        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapreduce.map.speculative", false);

        job.getConfiguration().setBoolean("mapred.compress.map.output", true);

        return job;
    }
}
```
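Design note: the job sets both the Hadoop 1 key (`mapred.map.tasks.speculative.execution`) and its Hadoop 2 replacement (`mapreduce.map.speculative`) to false, presumably so the same code behaves identically on either cluster generation; disabling speculative execution prevents duplicate map attempts from feeding the same documents to the target database twice.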
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataexport/HBaseCopyTableJob.java

```java
package eu.dnetlib.data.mapreduce.hbase.dataexport;

import java.util.Properties;

import org.apache.hadoop.hbase.mapreduce.CopyTable;
import org.apache.hadoop.mapreduce.Job;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

import eu.dnetlib.data.mapreduce.hbase.AbstractHBaseMapReduceJob;

/**
 * HBaseCopyTableJob is a simple wrapper over org.apache.hadoop.hbase.mapreduce.CopyTable that makes it invokable from a
 * D-Net workflow, passing the required parameters.
 *
 * Copies the given table to the remote hbase instance.
 *
 * @author claudio
 */
public class HBaseCopyTableJob extends AbstractHBaseMapReduceJob {

    @Override
    protected Job setJobDetails(Job job, Properties p) throws Exception {

        String remoteCluster = "--peer.adr=" + p.getProperty("peername");
        String tablename = p.getProperty("tablename");

        String[] args = Iterables.toArray(Lists.newArrayList(remoteCluster, tablename), String.class);

        return CopyTable.createSubmittableJob(job.getConfiguration(), args);
    }

}
```
Commit message: cleanup