params) {
+ return new XsltRowTransformerFactory().getTransformer(xslt, params);
+ }
+
+}
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (revision 45289)
@@ -0,0 +1,96 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Writer.Option;
+import org.apache.hadoop.io.Writable;
+import org.springframework.beans.factory.annotation.Required;
+
+/**
+ * Factory for SequenceFile.Writer instances
+ *
+ * @author claudio
+ *
+ */
+public class SequenceFileWriterFactory {
+
+ private Class<? extends Writable> keyClass;
+
+ private Class<? extends Writable> valueClass;
+
+ private String compressionType;
+
+ private String blockSize = "64M";
+
+ private short replication = 3;
+
+ public SequenceFile.Writer getSequenceFileWriter(
+ final Class<? extends Writable> keyClass,
+ final Class<? extends Writable> valueClass,
+ final Configuration conf,
+ final Path sequenceFilePath) throws IOException {
+
+ Configuration newConf = new Configuration(conf);
+ newConf.set("dfs.blocksize", getBlockSize());
+
+ Option oFile = SequenceFile.Writer.file(sequenceFilePath);
+ Option oKey = SequenceFile.Writer.keyClass(keyClass);
+ Option oValue = SequenceFile.Writer.valueClass(valueClass);
+ Option oCmp = SequenceFile.Writer.compression(CompressionType.valueOf(getCompressionType()));
+ Option oRpl = SequenceFile.Writer.replication(getReplication());
+
+ return SequenceFile.createWriter(newConf, oFile, oKey, oValue, oCmp, oRpl);
+ }
+
+ public SequenceFile.Writer getSequenceFileWriter(final Configuration conf, final Path sequenceFilePath) throws IOException {
+ return getSequenceFileWriter(getKeyClass(), getValueClass(), conf, sequenceFilePath);
+ }
+
+ public Class<? extends Writable> getKeyClass() {
+ return keyClass;
+ }
+
+ @Required
+ public void setKeyClass(final Class<? extends Writable> keyClass) {
+ this.keyClass = keyClass;
+ }
+
+ public Class<? extends Writable> getValueClass() {
+ return valueClass;
+ }
+
+ @Required
+ public void setValueClass(final Class<? extends Writable> valueClass) {
+ this.valueClass = valueClass;
+ }
+
+ public String getCompressionType() {
+ return compressionType;
+ }
+
+ @Required
+ public void setCompressionType(final String compressionType) {
+ this.compressionType = compressionType;
+ }
+
+ public String getBlockSize() {
+ return blockSize;
+ }
+
+ public void setBlockSize(final String blockSize) {
+ this.blockSize = blockSize;
+ }
+
+ public short getReplication() {
+ return replication;
+ }
+
+ public void setReplication(final short replication) {
+ this.replication = replication;
+ }
+
+}
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/ColumnType.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/ColumnType.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/ColumnType.java (revision 45289)
@@ -0,0 +1,6 @@
+package eu.dnetlib.data.transform;
+
+public enum ColumnType {
+ base64,
+ text,
+}
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/deploy.info
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/deploy.info (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/deploy.info (revision 45289)
@@ -0,0 +1 @@
+{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-hadoop-commons/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-hadoop-commons"}
\ No newline at end of file
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java (revision 45289)
@@ -0,0 +1,99 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+
+import eu.dnetlib.miscutils.collections.Pair;
+
+/**
+ *
+ * {@link java.util.Iterator} over a {@link SequenceFile}'s keys and values, as a {@link Pair} containing key and value.
+ *
+ */
+public final class SequenceFileIterator<K extends Writable, V extends Writable> extends AbstractIterator<Pair<K, V>> implements Closeable {
+
+ private final SequenceFile.Reader reader;
+ private final Configuration conf;
+ private final Class<K> keyClass;
+ private final Class<V> valueClass;
+ private final boolean noValue;
+ private K key;
+ private V value;
+ private final boolean reuseKeyValueInstances;
+
+ /**
+ * @throws IOException
+ * if path can't be read, or its key or value class can't be instantiated
+ */
+ @SuppressWarnings("unchecked")
+ public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException {
+ key = null;
+ value = null;
+ FileSystem fs = path.getFileSystem(conf);
+ path = path.makeQualified(fs);
+ reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
+ this.conf = conf;
+ keyClass = (Class<K>) reader.getKeyClass();
+ valueClass = (Class<V>) reader.getValueClass();
+ noValue = NullWritable.class.equals(valueClass);
+ this.reuseKeyValueInstances = reuseKeyValueInstances;
+ }
+
+ public Class<K> getKeyClass() {
+ return keyClass;
+ }
+
+ public Class<V> getValueClass() {
+ return valueClass;
+ }
+
+ @Override
+ public void close() {
+ key = null;
+ value = null;
+ try {
+ Closeables.close(reader, false);
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ endOfData();
+ }
+
+ @Override
+ protected Pair<K, V> computeNext() {
+ if (!reuseKeyValueInstances || value == null) {
+ key = ReflectionUtils.newInstance(keyClass, conf);
+ if (!noValue) {
+ value = ReflectionUtils.newInstance(valueClass, conf);
+ }
+ }
+ try {
+ boolean available;
+ if (noValue) {
+ available = reader.next(key);
+ } else {
+ available = reader.next(key, value);
+ }
+ if (!available) {
+ close();
+ return null;
+ }
+ return new Pair<K, V>(key, value);
+ } catch (IOException ioe) {
+ close();
+ throw new IllegalStateException(ioe);
+ }
+ }
+
+}
\ No newline at end of file
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/transform/XsltRowTransformer.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/transform/XsltRowTransformer.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/transform/XsltRowTransformer.java (revision 45289)
@@ -0,0 +1,111 @@
+package eu.dnetlib.data.transform;
+
+import java.io.StringReader;
+import java.util.List;
+
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Element;
+import org.dom4j.Node;
+import org.dom4j.io.DocumentResult;
+import org.dom4j.io.DocumentSource;
+import org.dom4j.io.SAXReader;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Lists;
+
+public class XsltRowTransformer implements Function<String, List<Row>> {
+
+ private static final Log log = LogFactory.getLog(XsltRowTransformer.class); // NOPMD by marko on 11/24/08 5:02 PM
+
+ private Transformer transformer;
+
+ private SAXReader reader = new SAXReader();
+
+ public XsltRowTransformer(final Transformer transformer) {
+ this.transformer = transformer;
+ }
+
+ private Document transform(final Document doc) {
+ if (transformer == null)
+ return doc;
+ final DocumentResult result = new DocumentResult();
+
+ try {
+ transformer.transform(new DocumentSource(doc), result);
+ return result.getDocument();
+ } catch (final TransformerException e) {
+ throw new RuntimeException("Unable to transform document:\n" + doc.asXML(), e);
+ }
+ }
+
+ private Document transform(final String xml) {
+ try {
+ return transform(reader.read(new StringReader(xml)));
+ } catch (final DocumentException e) {
+ log.error("Error parsing xml:\n" + xml, e);
+ throw new RuntimeException("Unable to parse document:\n" + xml, e);
+ }
+ }
+
+ private List<Row> transformAsListRow(final String xml) {
+ final Document doc = transform(xml);
+
+ final List<Row> rows = Lists.newArrayList();
+
+ if (doc == null)
+ return rows;
+
+ for (final Object or : doc.selectNodes("//ROW")) {
+ final Element row = (Element) or;
+
+ final String columnFamily = row.valueOf("@columnFamily");
+ final String key = row.valueOf("@key");
+
+ if ((key == null) || key.isEmpty())
+ throw new RuntimeException("Attribute 'key' is missing in XSLT");
+
+ if ((columnFamily == null) || columnFamily.isEmpty())
+ throw new RuntimeException("Attribute 'columnFamily' is missing in XSLT");
+
+ final List<Column<String, byte[]>> cols = Lists.newArrayList();
+
+ for (final Object of : row.selectNodes("./QUALIFIER")) {
+ final Node node = (Node) of;
+
+ final String name = node.valueOf("@name");
+ final String type = node.valueOf("@type");
+
+ final byte[] value = decode(node.getText().trim(), type);
+
+ cols.add(new Column<String, byte[]>(name, value));
+ }
+ rows.add(new Row(columnFamily, key, cols));
+ }
+ return rows;
+ }
+
+ public String getTransformerClassName() {
+ return transformer != null ? transformer.getClass().getName() : "null";
+ }
+
+ private byte[] decode(final String value, final String type) {
+
+ if ("base64".equals(type))
+ return Base64.decodeBase64(value);
+
+ return value.getBytes();
+ }
+
+ @Override
+ public List<Row> apply(final String xml) {
+ return transformAsListRow(xml);
+ }
+
+}
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationEnumerator.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationEnumerator.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationEnumerator.java (revision 45289)
@@ -0,0 +1,66 @@
+package eu.dnetlib.data.hadoop.config;
+
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.springframework.beans.BeansException;
+import org.springframework.beans.factory.BeanFactory;
+import org.springframework.beans.factory.BeanFactoryAware;
+import org.springframework.beans.factory.ListableBeanFactory;
+import org.springframework.beans.factory.NoSuchBeanDefinitionException;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Maps;
+
+public class ConfigurationEnumerator implements BeanFactoryAware {
+
+ private static final Log log = LogFactory.getLog(ConfigurationEnumerator.class); // NOPMD by marko on 11/24/08 5:02 PM
+
+ /**
+ * bean factory.
+ */
+ private ListableBeanFactory beanFactory;
+
+ private final Function<ConfigurationFactory, Configuration> asConfiguration = new Function<ConfigurationFactory, Configuration>() {
+
+ @Override
+ public Configuration apply(ConfigurationFactory factory) {
+ return factory.getConfiguration();
+ }
+ };
+
+ /**
+ * Get all beans implementing the ConfiguredJob interface.
+ *
+ * @return
+ */
+ public Map<String, Configuration> getAll() {
+ return Maps.transformValues(beanFactory.getBeansOfType(ConfigurationFactory.class), asConfiguration);
+ }
+
+ @Override
+ public void setBeanFactory(final BeanFactory beanFactory) throws BeansException {
+ this.beanFactory = (ListableBeanFactory) beanFactory;
+ }
+
+ public ListableBeanFactory getBeanFactory() {
+ return beanFactory;
+ }
+
+ /**
+ * Get given Configuration or null.
+ *
+ * @param name
+ * @return
+ */
+ public Configuration get(final ClusterName name) {
+ try {
+ return (beanFactory.getBean(name.toString(), Configuration.class));
+ } catch (final NoSuchBeanDefinitionException e) {
+ log.error("undefined bean: " + name, e);
+ return null;
+ }
+ }
+}
Property changes on: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-2.0.0/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationEnumerator.java
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (revision 45289)
@@ -0,0 +1,96 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Writer.Option;
+import org.apache.hadoop.io.Writable;
+import org.springframework.beans.factory.annotation.Required;
+
+/**
+ * Factory for SequenceFile.Writer instances
+ *
+ * @author claudio
+ *
+ */
+public class SequenceFileWriterFactory {
+
+ private Class<? extends Writable> keyClass;
+
+ private Class<? extends Writable> valueClass;
+
+ private String compressionType;
+
+ private String blockSize = "64M";
+
+ private short replication = 3;
+
+ public SequenceFile.Writer getSequenceFileWriter(
+ final Class<? extends Writable> keyClass,
+ final Class<? extends Writable> valueClass,
+ final Configuration conf,
+ final Path sequenceFilePath) throws IOException {
+
+ Configuration newConf = new Configuration(conf);
+ newConf.set("dfs.blocksize", getBlockSize());
+
+ Option oFile = SequenceFile.Writer.file(sequenceFilePath);
+ Option oKey = SequenceFile.Writer.keyClass(keyClass);
+ Option oValue = SequenceFile.Writer.valueClass(valueClass);
+ Option oCmp = SequenceFile.Writer.compression(CompressionType.valueOf(getCompressionType()));
+ Option oRpl = SequenceFile.Writer.replication(getReplication());
+
+ return SequenceFile.createWriter(newConf, oFile, oKey, oValue, oCmp, oRpl);
+ }
+
+ public SequenceFile.Writer getSequenceFileWriter(final Configuration conf, final Path sequenceFilePath) throws IOException {
+ return getSequenceFileWriter(getKeyClass(), getValueClass(), conf, sequenceFilePath);
+ }
+
+ public Class<? extends Writable> getKeyClass() {
+ return keyClass;
+ }
+
+ @Required
+ public void setKeyClass(final Class<? extends Writable> keyClass) {
+ this.keyClass = keyClass;
+ }
+
+ public Class<? extends Writable> getValueClass() {
+ return valueClass;
+ }
+
+ @Required
+ public void setValueClass(final Class<? extends Writable> valueClass) {
+ this.valueClass = valueClass;
+ }
+
+ public String getCompressionType() {
+ return compressionType;
+ }
+
+ @Required
+ public void setCompressionType(final String compressionType) {
+ this.compressionType = compressionType;
+ }
+
+ public String getBlockSize() {
+ return blockSize;
+ }
+
+ public void setBlockSize(final String blockSize) {
+ this.blockSize = blockSize;
+ }
+
+ public short getReplication() {
+ return replication;
+ }
+
+ public void setReplication(final short replication) {
+ this.replication = replication;
+ }
+
+}
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java (revision 45289)
@@ -0,0 +1,34 @@
+package eu.dnetlib.data.hadoop.config;
+
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+/**
+ * Enumeration of the managed clusters.
+ *
+ * @author claudio
+ *
+ */
+public enum ClusterName {
+ DM, // Data Management
+ IIS; // Information Inference Service(s)
+
+ public static List<String> asStringList() {
+ return Lists.newArrayList(Iterables.transform(asList(), new Function<ClusterName, String>() {
+
+ @Override
+ public String apply(final ClusterName clusterName) {
+ return clusterName.toString();
+ }
+ }));
+ }
+
+ public static List<ClusterName> asList() {
+ return Arrays.asList(ClusterName.values());
+ }
+
+}
Property changes on: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (revision 45289)
@@ -0,0 +1,96 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Writer.Option;
+import org.apache.hadoop.io.Writable;
+import org.springframework.beans.factory.annotation.Required;
+
+/**
+ * Factory for SequenceFile.Writer instances
+ *
+ * @author claudio
+ *
+ */
+public class SequenceFileWriterFactory {
+
+ private Class<? extends Writable> keyClass;
+
+ private Class<? extends Writable> valueClass;
+
+ private String compressionType;
+
+ private String blockSize = "64M";
+
+ private short replication = 3;
+
+ public SequenceFile.Writer getSequenceFileWriter(
+ final Class<? extends Writable> keyClass,
+ final Class<? extends Writable> valueClass,
+ final Configuration conf,
+ final Path sequenceFilePath) throws IOException {
+
+ Configuration newConf = new Configuration(conf);
+ newConf.set("dfs.blocksize", getBlockSize());
+
+ Option oFile = SequenceFile.Writer.file(sequenceFilePath);
+ Option oKey = SequenceFile.Writer.keyClass(keyClass);
+ Option oValue = SequenceFile.Writer.valueClass(valueClass);
+ Option oCmp = SequenceFile.Writer.compression(CompressionType.valueOf(getCompressionType()));
+ Option oRpl = SequenceFile.Writer.replication(getReplication());
+
+ return SequenceFile.createWriter(newConf, oFile, oKey, oValue, oCmp, oRpl);
+ }
+
+ public SequenceFile.Writer getSequenceFileWriter(final Configuration conf, final Path sequenceFilePath) throws IOException {
+ return getSequenceFileWriter(getKeyClass(), getValueClass(), conf, sequenceFilePath);
+ }
+
+ public Class<? extends Writable> getKeyClass() {
+ return keyClass;
+ }
+
+ @Required
+ public void setKeyClass(final Class<? extends Writable> keyClass) {
+ this.keyClass = keyClass;
+ }
+
+ public Class<? extends Writable> getValueClass() {
+ return valueClass;
+ }
+
+ @Required
+ public void setValueClass(final Class<? extends Writable> valueClass) {
+ this.valueClass = valueClass;
+ }
+
+ public String getCompressionType() {
+ return compressionType;
+ }
+
+ @Required
+ public void setCompressionType(final String compressionType) {
+ this.compressionType = compressionType;
+ }
+
+ public String getBlockSize() {
+ return blockSize;
+ }
+
+ public void setBlockSize(final String blockSize) {
+ this.blockSize = blockSize;
+ }
+
+ public short getReplication() {
+ return replication;
+ }
+
+ public void setReplication(final short replication) {
+ this.replication = replication;
+ }
+
+}
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/Column.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/Column.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/Column.java (revision 45289)
@@ -0,0 +1,26 @@
+package eu.dnetlib.data.transform;
+
+public class Column<N, V> {
+
+ private N name;
+
+ private V value;
+
+ public static <N, V> Column<N, V> newInstance(final N name, final V value) {
+ return new Column<N, V>(name, value);
+ }
+
+ public Column(final N name, final V value) {
+ this.name = name;
+ this.value = value;
+ }
+
+ public N getName() {
+ return name;
+ }
+
+ public V getValue() {
+ return value;
+ }
+
+}
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileUtils.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileUtils.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileUtils.java (revision 45289)
@@ -0,0 +1,57 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.miscutils.collections.Pair;
+
+public class SequenceFileUtils {
+
+ public static final int NO_LIMIT = -1;
+
+ public static Iterable<Pair<Text, Text>> read(Path path, final Configuration conf) throws IOException {
+ return Iterables.concat(Iterables.transform(new SequenceFileUtils().doRead(path, conf), new Function<Path, SequenceFileIterable<Text, Text>>() {
+ @Override
+ public SequenceFileIterable<Text, Text> apply(Path path) {
+ return new SequenceFileIterable<Text, Text>(path, conf);
+ }
+ }));
+ }
+
+ public static Iterable<Pair<Text, Text>> read(Path path, final Configuration conf, final int readMax) throws IOException {
+ if (readMax != NO_LIMIT)
+ return Iterables.limit(read(path, conf), readMax);
+ else
+ return read(path, conf);
+ }
+
+ private final PathFilter f = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ return path.getName().contains("part-r");
+ }
+ };
+
+ private Iterable<Path> doRead(Path path, final Configuration conf) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+ return Iterables.transform(Lists.newArrayList(fs.listStatus(path, f)), new Function<FileStatus, Path>() {
+ @Override
+ public Path apply(FileStatus fs) {
+ Path path = fs.getPath();
+ System.out.println("downloading xml files from path: " + path.toString());
+ return path;
+ }
+ });
+ }
+
+}
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterable.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterable.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterable.java (revision 45289)
@@ -0,0 +1,56 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+
+import eu.dnetlib.miscutils.collections.Pair;
+
+/**
+ *
+ * {@link Iterable} counterpart to {@link SequenceFileIterator}.
+ *
+ */
+public final class SequenceFileIterable<K extends Writable, V extends Writable> implements Iterable<Pair<K, V>> {
+
+ private final Path path;
+ private final boolean reuseKeyValueInstances;
+ private final Configuration conf;
+
+ /**
+ * Like {@link #SequenceFileIterable(Path, boolean, Configuration)} but key and value instances are not reused by
+ * default.
+ *
+ * @param path
+ * file to iterate over
+ */
+ public SequenceFileIterable(Path path, Configuration conf) {
+ this(path, false, conf);
+ }
+
+ /**
+ * @param path
+ * file to iterate over
+ * @param reuseKeyValueInstances
+ * if true, reuses instances of the key and value object instead of creating a new one for each read from
+ * the file
+ */
+ public SequenceFileIterable(Path path, boolean reuseKeyValueInstances, Configuration conf) {
+ this.path = path;
+ this.reuseKeyValueInstances = reuseKeyValueInstances;
+ this.conf = conf;
+ }
+
+ @Override
+ public Iterator<Pair<K, V>> iterator() {
+ try {
+ return new SequenceFileIterator<K, V>(path, reuseKeyValueInstances, conf);
+ } catch (IOException ioe) {
+ throw new IllegalStateException(path.toString(), ioe);
+ }
+ }
+
+}
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java (revision 45289)
@@ -0,0 +1,99 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+
+import eu.dnetlib.miscutils.collections.Pair;
+
+/**
+ *
+ * {@link java.util.Iterator} over a {@link SequenceFile}'s keys and values, as a {@link Pair} containing key and value.
+ *
+ */
+public final class SequenceFileIterator<K extends Writable, V extends Writable> extends AbstractIterator<Pair<K, V>> implements Closeable {
+
+ private final SequenceFile.Reader reader;
+ private final Configuration conf;
+ private final Class<K> keyClass;
+ private final Class<V> valueClass;
+ private final boolean noValue;
+ private K key;
+ private V value;
+ private final boolean reuseKeyValueInstances;
+
+ /**
+ * @throws IOException
+ * if path can't be read, or its key or value class can't be instantiated
+ */
+ @SuppressWarnings("unchecked")
+ public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException {
+ key = null;
+ value = null;
+ FileSystem fs = path.getFileSystem(conf);
+ path = path.makeQualified(fs);
+ reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
+ this.conf = conf;
+ keyClass = (Class<K>) reader.getKeyClass();
+ valueClass = (Class<V>) reader.getValueClass();
+ noValue = NullWritable.class.equals(valueClass);
+ this.reuseKeyValueInstances = reuseKeyValueInstances;
+ }
+
+ public Class<K> getKeyClass() {
+ return keyClass;
+ }
+
+ public Class<V> getValueClass() {
+ return valueClass;
+ }
+
+ @Override
+ public void close() {
+ key = null;
+ value = null;
+ try {
+ Closeables.close(reader, false);
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ endOfData();
+ }
+
+ @Override
+ protected Pair<K, V> computeNext() {
+ if (!reuseKeyValueInstances || value == null) {
+ key = ReflectionUtils.newInstance(keyClass, conf);
+ if (!noValue) {
+ value = ReflectionUtils.newInstance(valueClass, conf);
+ }
+ }
+ try {
+ boolean available;
+ if (noValue) {
+ available = reader.next(key);
+ } else {
+ available = reader.next(key, value);
+ }
+ if (!available) {
+ close();
+ return null;
+ }
+ return new Pair<K, V>(key, value);
+ } catch (IOException ioe) {
+ close();
+ throw new IllegalStateException(ioe);
+ }
+ }
+
+}
\ No newline at end of file
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/Row.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/Row.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/Row.java (revision 45289)
@@ -0,0 +1,76 @@
+package eu.dnetlib.data.transform;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Maps;
+
+public class Row implements Iterable<Column<String, byte[]>> {
+
+ private String columnFamily;
+
+ private String key;
+
+ private Map<String, Column<String, byte[]>> columns;
+
+ public Row(final Row row) {
+ this.columnFamily = row.getColumnFamily();
+ this.key = row.getKey();
+ this.columns = row.columns;
+ }
+
+ public Row(final String columnFamily, final String key) {
+ this.columnFamily = columnFamily;
+ this.key = key;
+ this.columns = Maps.newHashMap();
+ }
+
+ public Row(final String columnFamily, final String key, final List<Column<String, byte[]>> columns) {
+ this(columnFamily, key);
+ this.setColumns(columns);
+ }
+
+ public void addColumn(final Column<String, byte[]> column) {
+ getColumns().add(column);
+ }
+
+ public String getKey() {
+ return key;
+ }
+
+ public Collection<Column<String, byte[]>> getColumns() {
+ return columns.values();
+ }
+
+ public Column<String, byte[]> getColumn(final String name) {
+ return columns.get(name);
+ }
+
+ public void setColumn(final String name, final Column<String, byte[]> newColumn) {
+ columns.put(name, newColumn);
+ }
+
+ public void setColumns(final List<Column<String, byte[]>> columns) {
+ this.columns.clear();
+ for (final Column<String, byte[]> col : columns) {
+ this.columns.put(col.getName(), col);
+ }
+ }
+
+ public String getColumnFamily() {
+ return columnFamily;
+ }
+
+ @Override
+ public Iterator<Column<String, byte[]>> iterator() {
+ return getColumns().iterator();
+ }
+
+ @Override
+ public String toString() {
+ return "\nRow [ KEY: " + key + ",\n\tCF: " + columnFamily + "\n\tCOLS: {" + columns + "}]";
+ }
+
+}
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/ColumnType.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/ColumnType.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/transform/ColumnType.java (revision 45289)
@@ -0,0 +1,6 @@
+package eu.dnetlib.data.transform;
+
+/**
+ * Encoding of a column value when rendered as text.
+ * NOTE(review): semantics inferred from constant names only — base64
+ * presumably marks base64-encoded binary content, text marks plain text;
+ * confirm at the usage sites.
+ */
+public enum ColumnType {
+ base64,
+ text,
+}
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java (revision 45289)
@@ -0,0 +1,34 @@
+package eu.dnetlib.data.hadoop.config;
+
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+/**
+ * Enumeration of the managed clusters.
+ *
+ * @author claudio
+ *
+ */
+public enum ClusterName {
+ DM, // Data Management
+ IIS; // Information Inference Service(s)
+
+ /** @return the managed cluster names rendered as strings, in declaration order */
+ public static List asStringList() {
+ return Lists.newArrayList(Iterables.transform(asList(), new Function() {
+
+ // maps each enum constant to its string form
+ @Override
+ public String apply(final ClusterName clusterName) {
+ return clusterName.toString();
+ }
+ }));
+ }
+
+ /** @return all managed cluster names as a fixed-size list (Arrays.asList view) */
+ public static List asList() {
+ return Arrays.asList(ClusterName.values());
+ }
+
+}
Property changes on: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.3/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties (revision 45289)
@@ -0,0 +1,44 @@
+dnet.clustername = IIS
+
+#CORE-SITE
+fs.defaultFS = hdfs://iis-cluster-nn
+
+hadoop.security.authentication = simple
+hadoop.security.auth_to_local = DEFAULT
+
+#HBASE-SITE
+hbase.rootdir = hdfs://iis-cluster-nn/hbase
+
+hbase.security.authentication = simple
+zookeeper.znode.rootserver = root-region-server
+hbase.zookeeper.quorum = master1.hadoop.iis.openaire.eu,master2.hadoop.iis.openaire.eu,master3.hadoop.iis.openaire.eu
+hbase.zookeeper.property.clientPort = 2181
+hbase.zookeeper.client.port = 2181
+zookeeper.znode.parent = /hbase
+
+#HDFS-SITE
+dfs.replication = 2
+dfs.nameservices = iis-cluster-nn
+dfs.ha.namenodes.iis-cluster-nn = nn1,nn2
+
+dfs.namenode.rpc-address.iis-cluster-nn.nn1=namenode1.hadoop.iis.openaire.eu:8020
+dfs.namenode.http-address.iis-cluster-nn.nn1=namenode1.hadoop.iis.openaire.eu:50070
+dfs.namenode.rpc-address.iis-cluster-nn.nn2=namenode2.hadoop.iis.openaire.eu:8020
+dfs.namenode.http-address.iis-cluster-nn.nn2=namenode2.hadoop.iis.openaire.eu:50070
+
+dfs.client.failover.proxy.provider.iis-cluster-nn=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
+
+#MAPRED-SITE
+mapred.job.tracker = iis-cluster-jt
+mapred.jobtrackers.iis-cluster-jt = jt1,jt2
+
+mapred.jobtracker.rpc-address.iis-cluster-jt.jt1 = master1.hadoop.iis.openaire.eu:8021
+mapred.jobtracker.rpc-address.iis-cluster-jt.jt2 = master1.hadoop.iis.openaire.eu:8022
+
+mapred.client.failover.proxy.provider.iis-cluster-jt = org.apache.hadoop.mapred.ConfiguredFailoverProxyProvider
+
+mapred.mapper.new-api = true
+mapred.reducer.new-api = true
+
+#OOZIE SERVER
+oozie.service.loc = http://oozie.hadoop.iis.openaire.eu:11000/oozie
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java (revision 45289)
@@ -0,0 +1,99 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+
+import eu.dnetlib.miscutils.collections.Pair;
+
+/**
+ *
+ * {@link java.util.Iterator} over a {@link SequenceFile}'s keys and values, as a {@link Pair} containing key and value.
+ *
+ */
+public final class SequenceFileIterator extends AbstractIterator> implements Closeable {
+
+ private final SequenceFile.Reader reader; // underlying sequence-file reader
+ private final Configuration conf; // used to reflectively instantiate keys/values
+ private final Class keyClass; // key type declared by the file
+ private final Class valueClass; // value type declared by the file
+ private final boolean noValue; // true when the file's value type is NullWritable
+ private K key; // current key instance (may be reused across calls)
+ private V value; // current value instance (may be reused across calls)
+ private final boolean reuseKeyValueInstances; // reuse Writable instances between reads
+
+ /**
+  * Opens the sequence file at the given path and prepares iteration.
+  *
+  * @throws IOException
+  * if path can't be read, or its key or value class can't be instantiated
+  */
+ @SuppressWarnings("unchecked")
+ public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException {
+ key = null;
+ value = null;
+ FileSystem fs = path.getFileSystem(conf);
+ path = path.makeQualified(fs);
+ reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
+ this.conf = conf;
+ keyClass = (Class) reader.getKeyClass();
+ valueClass = (Class) reader.getValueClass();
+ noValue = NullWritable.class.equals(valueClass);
+ this.reuseKeyValueInstances = reuseKeyValueInstances;
+ }
+
+ /** @return the key type declared by the underlying file */
+ public Class getKeyClass() {
+ return keyClass;
+ }
+
+ /** @return the value type declared by the underlying file */
+ public Class getValueClass() {
+ return valueClass;
+ }
+
+ /**
+  * Releases the reader and marks the iteration as finished.
+  * NOTE(review): a close-time IOException is rethrown as IllegalStateException.
+  */
+ @Override
+ public void close() {
+ key = null;
+ value = null;
+ try {
+ Closeables.close(reader, false);
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ endOfData();
+ }
+
+ /**
+  * Reads the next key/value pair, closing the reader and ending iteration
+  * at end-of-file. Fresh Writable instances are allocated on the first call
+  * and, when reuse is disabled, on every call.
+  * NOTE(review): when the value type is NullWritable, 'value' stays null,
+  * so a fresh key is allocated on every call even if reuse was requested.
+  */
+ @Override
+ protected Pair computeNext() {
+ if (!reuseKeyValueInstances || value == null) {
+ key = ReflectionUtils.newInstance(keyClass, conf);
+ if (!noValue) {
+ value = ReflectionUtils.newInstance(valueClass, conf);
+ }
+ }
+ try {
+ boolean available;
+ if (noValue) {
+ available = reader.next(key);
+ } else {
+ available = reader.next(key, value);
+ }
+ if (!available) {
+ close();
+ return null;
+ }
+ return new Pair(key, value);
+ } catch (IOException ioe) {
+ // fail fast: release the reader, then surface the error to the caller
+ close();
+ throw new IllegalStateException(ioe);
+ }
+ }
+
+}
\ No newline at end of file
Index: modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java
===================================================================
--- modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (nonexistent)
+++ modules/dnet-hadoop-commons/tags/dnet-hadoop-commons-1.0.4/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java (revision 45289)
@@ -0,0 +1,96 @@
+package eu.dnetlib.data.hadoop.hdfs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Writer.Option;
+import org.apache.hadoop.io.Writable;
+import org.springframework.beans.factory.annotation.Required;
+
+/**
+ * Factory for SequenceFile.Writer instances
+ *
+ * @author claudio
+ *
+ */
+public class SequenceFileWriterFactory {
+
+ // NOTE(review): generic bounds ("Class extends Writable>") look stripped in
+ // this patch text — likely Class<? extends Writable>; restore before applying.
+ // default key type for writers created via the two-argument overload
+ private Class extends Writable> keyClass;
+
+ // default value type for writers created via the two-argument overload
+ private Class extends Writable> valueClass;
+
+ // name of a SequenceFile.CompressionType constant, resolved via valueOf()
+ private String compressionType;
+
+ // HDFS block size for the created file (set as "dfs.blocksize")
+ private String blockSize = "64M";
+
+ // HDFS replication factor for the created file
+ private short replication = 3;
+
+ /**
+  * Creates a SequenceFile.Writer for the given path and key/value classes,
+  * applying the configured compression, block size and replication.
+  *
+  * @throws IOException when the writer cannot be created
+  */
+ public SequenceFile.Writer getSequenceFileWriter(
+ final Class extends Writable> keyClass,
+ final Class extends Writable> valueClass,
+ final Configuration conf,
+ final Path sequenceFilePath) throws IOException {
+
+ // work on a copy so the block-size override does not leak to the caller's conf
+ Configuration newConf = new Configuration(conf);
+ newConf.set("dfs.blocksize", getBlockSize());
+
+ Option oFile = SequenceFile.Writer.file(sequenceFilePath);
+ Option oKey = SequenceFile.Writer.keyClass(keyClass);
+ Option oValue = SequenceFile.Writer.valueClass(valueClass);
+ Option oCmp = SequenceFile.Writer.compression(CompressionType.valueOf(getCompressionType()));
+ Option oRpl = SequenceFile.Writer.replication(getReplication());
+
+ return SequenceFile.createWriter(newConf, oFile, oKey, oValue, oCmp, oRpl);
+ }
+
+ /** Convenience overload using the bean-configured key and value classes. */
+ public SequenceFile.Writer getSequenceFileWriter(final Configuration conf, final Path sequenceFilePath) throws IOException {
+ return getSequenceFileWriter(getKeyClass(), getValueClass(), conf, sequenceFilePath);
+ }
+
+ public Class extends Writable> getKeyClass() {
+ return keyClass;
+ }
+
+ @Required
+ public void setKeyClass(final Class extends Writable> keyClass) {
+ this.keyClass = keyClass;
+ }
+
+ public Class extends Writable> getValueClass() {
+ return valueClass;
+ }
+
+ @Required
+ public void setValueClass(final Class extends Writable> valueClass) {
+ this.valueClass = valueClass;
+ }
+
+ /** @return the compression type name; must match a CompressionType constant */
+ public String getCompressionType() {
+ return compressionType;
+ }
+
+ @Required
+ public void setCompressionType(final String compressionType) {
+ this.compressionType = compressionType;
+ }
+
+ public String getBlockSize() {
+ return blockSize;
+ }
+
+ public void setBlockSize(final String blockSize) {
+ this.blockSize = blockSize;
+ }
+
+ public short getReplication() {
+ return replication;
+ }
+
+ public void setReplication(final short replication) {
+ this.replication = replication;
+ }
+
+}
Index: modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationFactory.java
===================================================================
--- modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationFactory.java (nonexistent)
+++ modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationFactory.java (revision 45289)
@@ -0,0 +1,73 @@
+package eu.dnetlib.data.hadoop.config;
+
+import java.io.IOException;
+import java.util.Map.Entry;
+import java.util.Properties;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.springframework.beans.factory.FactoryBean;
+import org.springframework.beans.factory.annotation.Required;
+import org.springframework.core.io.Resource;
+
+/**
+ * Factory bean for hadoop cluster configuration object
+ *
+ * @author claudio
+ *
+ */
+public class ConfigurationFactory implements FactoryBean {
+
+ private static final Log log = LogFactory.getLog(ConfigurationFactory.class); // NOPMD by marko on 11/24/08 5:02 PM
+
+ private Resource defaults;
+
+ public Configuration getConfiguration() {
+ try {
+ return getObject();
+ } catch (Exception e) {
+ throw new IllegalStateException("Unable to load hadoop configuration", e);
+ }
+ }
+
+ @Override
+ public Configuration getObject() throws Exception {
+ return initConfiguration(defaultProperties());
+ }
+
+ @Override
+ public Class> getObjectType() {
+ return Configuration.class;
+ }
+
+ @Override
+ public boolean isSingleton() {
+ return true;
+ }
+
+ private Configuration initConfiguration(final Properties p) {
+ final Configuration conf = new Configuration(false);
+ for (Entry