Project

General

Profile

« Previous | Next » 

Revision 45289

Codebase used to migrate the production system to Java 8.

View differences:

modules/dnet-hadoop-commons/trunk/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-hadoop-commons/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-hadoop-commons"}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterable.java
1
package eu.dnetlib.data.hadoop.hdfs;
2

  
3
import java.io.IOException;
4
import java.util.Iterator;
5

  
6
import org.apache.hadoop.conf.Configuration;
7
import org.apache.hadoop.fs.Path;
8
import org.apache.hadoop.io.Writable;
9

  
10
import eu.dnetlib.miscutils.collections.Pair;
11

  
12
/**
13
 * <p>
14
 * {@link Iterable} counterpart to {@link SequenceFileIterator}.
15
 * </p>
16
 */
17
public final class SequenceFileIterable<K extends Writable, V extends Writable> implements Iterable<Pair<K, V>> {
18

  
19
	private final Path path;
20
	private final boolean reuseKeyValueInstances;
21
	private final Configuration conf;
22

  
23
	/**
24
	 * Like {@link #SequenceFileIterable(Path, boolean, Configuration)} but key and value instances are not reused by
25
	 * default.
26
	 * 
27
	 * @param path
28
	 *            file to iterate over
29
	 */
30
	public SequenceFileIterable(Path path, Configuration conf) {
31
		this(path, false, conf);
32
	}
33

  
34
	/**
35
	 * @param path
36
	 *            file to iterate over
37
	 * @param reuseKeyValueInstances
38
	 *            if true, reuses instances of the key and value object instead of creating a new one for each read from
39
	 *            the file
40
	 */
41
	public SequenceFileIterable(Path path, boolean reuseKeyValueInstances, Configuration conf) {
42
		this.path = path;
43
		this.reuseKeyValueInstances = reuseKeyValueInstances;
44
		this.conf = conf;
45
	}
46

  
47
	@Override
48
	public Iterator<Pair<K, V>> iterator() {
49
		try {
50
			return new SequenceFileIterator<K, V>(path, reuseKeyValueInstances, conf);
51
		} catch (IOException ioe) {
52
			throw new IllegalStateException(path.toString(), ioe);
53
		}
54
	}
55

  
56
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java
1
package eu.dnetlib.data.hadoop.hdfs;
2

  
3
import java.io.Closeable;
4
import java.io.IOException;
5

  
6
import org.apache.hadoop.conf.Configuration;
7
import org.apache.hadoop.fs.FileSystem;
8
import org.apache.hadoop.fs.Path;
9
import org.apache.hadoop.io.NullWritable;
10
import org.apache.hadoop.io.SequenceFile;
11
import org.apache.hadoop.io.Writable;
12
import org.apache.hadoop.util.ReflectionUtils;
13

  
14
import com.google.common.collect.AbstractIterator;
15
import com.google.common.io.Closeables;
16

  
17
import eu.dnetlib.miscutils.collections.Pair;
18

  
19
/**
20
 * <p>
21
 * {@link java.util.Iterator} over a {@link SequenceFile}'s keys and values, as a {@link Pair} containing key and value.
22
 * </p>
23
 */
24
public final class SequenceFileIterator<K extends Writable, V extends Writable> extends AbstractIterator<Pair<K, V>> implements Closeable {
25

  
26
	private final SequenceFile.Reader reader;
27
	private final Configuration conf;
28
	private final Class<K> keyClass;
29
	private final Class<V> valueClass;
30
	private final boolean noValue;
31
	private K key;
32
	private V value;
33
	private final boolean reuseKeyValueInstances;
34

  
35
	/**
36
	 * @throws IOException
37
	 *             if path can't be read, or its key or value class can't be instantiated
38
	 */
39
	@SuppressWarnings("unchecked")
40
	public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException {
41
		key = null;
42
		value = null;
43
		FileSystem fs = path.getFileSystem(conf);
44
		path = path.makeQualified(fs);
45
		reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
46
		this.conf = conf;
47
		keyClass = (Class<K>) reader.getKeyClass();
48
		valueClass = (Class<V>) reader.getValueClass();
49
		noValue = NullWritable.class.equals(valueClass);
50
		this.reuseKeyValueInstances = reuseKeyValueInstances;
51
	}
52

  
53
	public Class<K> getKeyClass() {
54
		return keyClass;
55
	}
56

  
57
	public Class<V> getValueClass() {
58
		return valueClass;
59
	}
60

  
61
	@Override
62
	public void close() {
63
		key = null;
64
		value = null;
65
		try {
66
			Closeables.close(reader, false);
67
		} catch (IOException e) {
68
			throw new IllegalStateException(e);
69
		}
70
		endOfData();
71
	}
72

  
73
	@Override
74
	protected Pair<K, V> computeNext() {
75
		if (!reuseKeyValueInstances || value == null) {
76
			key = ReflectionUtils.newInstance(keyClass, conf);
77
			if (!noValue) {
78
				value = ReflectionUtils.newInstance(valueClass, conf);
79
			}
80
		}
81
		try {
82
			boolean available;
83
			if (noValue) {
84
				available = reader.next(key);
85
			} else {
86
				available = reader.next(key, value);
87
			}
88
			if (!available) {
89
				close();
90
				return null;
91
			}
92
			return new Pair<K, V>(key, value);
93
		} catch (IOException ioe) {
94
			close();
95
			throw new IllegalStateException(ioe);
96
		}
97
	}
98

  
99
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileUtils.java
1
package eu.dnetlib.data.hadoop.hdfs;
2

  
3
import java.io.IOException;
4

  
5
import org.apache.hadoop.conf.Configuration;
6
import org.apache.hadoop.fs.FileStatus;
7
import org.apache.hadoop.fs.FileSystem;
8
import org.apache.hadoop.fs.Path;
9
import org.apache.hadoop.fs.PathFilter;
10
import org.apache.hadoop.io.Text;
11

  
12
import com.google.common.base.Function;
13
import com.google.common.collect.Iterables;
14
import com.google.common.collect.Lists;
15

  
16
import eu.dnetlib.miscutils.collections.Pair;
17

  
18
public class SequenceFileUtils {
19

  
20
	public static final int NO_LIMIT = -1;
21

  
22
	public static Iterable<Pair<Text, Text>> read(Path path, final Configuration conf) throws IOException {
23
		return Iterables.concat(Iterables.transform(new SequenceFileUtils().doRead(path, conf), new Function<Path, SequenceFileIterable<Text, Text>>() {
24
			@Override
25
			public SequenceFileIterable<Text, Text> apply(Path path) {
26
				return new SequenceFileIterable<Text, Text>(path, conf);
27
			}
28
		}));
29
	}
30

  
31
	public static Iterable<Pair<Text, Text>> read(Path path, final Configuration conf, final int readMax) throws IOException {
32
		if (readMax != NO_LIMIT)
33
			return Iterables.limit(read(path, conf), readMax);
34
		else
35
			return read(path, conf);
36
	}
37

  
38
	private final PathFilter f = new PathFilter() {
39
		@Override
40
		public boolean accept(Path path) {
41
			return path.getName().contains("part-r");
42
		}
43
	};
44

  
45
	private Iterable<Path> doRead(Path path, final Configuration conf) throws IOException {
46
		FileSystem fs = FileSystem.get(conf);
47
		return Iterables.transform(Lists.newArrayList(fs.listStatus(path, f)), new Function<FileStatus, Path>() {
48
			@Override
49
			public Path apply(FileStatus fs) {
50
				Path path = fs.getPath();
51
				System.out.println("downloading xml files from path: " + path.toString());
52
				return path;
53
			}
54
		});
55
	}
56

  
57
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileWriterFactory.java
1
package eu.dnetlib.data.hadoop.hdfs;
2

  
3
import java.io.IOException;
4

  
5
import org.apache.hadoop.conf.Configuration;
6
import org.apache.hadoop.fs.Path;
7
import org.apache.hadoop.io.SequenceFile;
8
import org.apache.hadoop.io.SequenceFile.CompressionType;
9
import org.apache.hadoop.io.SequenceFile.Writer.Option;
10
import org.apache.hadoop.io.Writable;
11
import org.springframework.beans.factory.annotation.Required;
12

  
13
/**
14
 * Factory for SequenceFile.Writer instances
15
 *
16
 * @author claudio
17
 *
18
 */
19
public class SequenceFileWriterFactory {
20

  
21
	private Class<? extends Writable> keyClass;
22

  
23
	private Class<? extends Writable> valueClass;
24

  
25
	private String compressionType;
26

  
27
	private String blockSize = "64M";
28

  
29
	private short replication = 3;
30

  
31
	public SequenceFile.Writer getSequenceFileWriter(
32
			final Class<? extends Writable> keyClass,
33
			final Class<? extends Writable> valueClass,
34
			final Configuration conf,
35
			final Path sequenceFilePath) throws IOException {
36

  
37
		Configuration newConf = new Configuration(conf);
38
		newConf.set("dfs.blocksize", getBlockSize());
39

  
40
		Option oFile = SequenceFile.Writer.file(sequenceFilePath);
41
		Option oKey = SequenceFile.Writer.keyClass(keyClass);
42
		Option oValue = SequenceFile.Writer.valueClass(valueClass);
43
		Option oCmp = SequenceFile.Writer.compression(CompressionType.valueOf(getCompressionType()));
44
		Option oRpl = SequenceFile.Writer.replication(getReplication());
45

  
46
		return SequenceFile.createWriter(newConf, oFile, oKey, oValue, oCmp, oRpl);
47
	}
48

  
49
	public SequenceFile.Writer getSequenceFileWriter(final Configuration conf, final Path sequenceFilePath) throws IOException {
50
		return getSequenceFileWriter(getKeyClass(), getValueClass(), conf, sequenceFilePath);
51
	}
52

  
53
	public Class<? extends Writable> getKeyClass() {
54
		return keyClass;
55
	}
56

  
57
	@Required
58
	public void setKeyClass(final Class<? extends Writable> keyClass) {
59
		this.keyClass = keyClass;
60
	}
61

  
62
	public Class<? extends Writable> getValueClass() {
63
		return valueClass;
64
	}
65

  
66
	@Required
67
	public void setValueClass(final Class<? extends Writable> valueClass) {
68
		this.valueClass = valueClass;
69
	}
70

  
71
	public String getCompressionType() {
72
		return compressionType;
73
	}
74

  
75
	@Required
76
	public void setCompressionType(final String compressionType) {
77
		this.compressionType = compressionType;
78
	}
79

  
80
	public String getBlockSize() {
81
		return blockSize;
82
	}
83

  
84
	public void setBlockSize(final String blockSize) {
85
		this.blockSize = blockSize;
86
	}
87

  
88
	public short getReplication() {
89
		return replication;
90
	}
91

  
92
	public void setReplication(final short replication) {
93
		this.replication = replication;
94
	}
95

  
96
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationFactory.java
1
package eu.dnetlib.data.hadoop.config;
2

  
3
import java.io.IOException;
4
import java.util.Map.Entry;
5
import java.util.Properties;
6

  
7
import org.apache.commons.logging.Log;
8
import org.apache.commons.logging.LogFactory;
9
import org.apache.hadoop.conf.Configuration;
10
import org.springframework.beans.factory.FactoryBean;
11
import org.springframework.beans.factory.annotation.Required;
12
import org.springframework.core.io.Resource;
13

  
14
/**
15
 * Factory bean for hadoop cluster configuration object
16
 * 
17
 * @author claudio
18
 * 
19
 */
20
public class ConfigurationFactory implements FactoryBean<Configuration> {
21

  
22
	private static final Log log = LogFactory.getLog(ConfigurationFactory.class); // NOPMD by marko on 11/24/08 5:02 PM
23

  
24
	private Resource defaults;
25

  
26
	public Configuration getConfiguration() {
27
		try {
28
			return getObject();
29
		} catch (Exception e) {
30
			throw new IllegalStateException("Unable to load hadoop configuration", e);
31
		}
32
	}
33

  
34
	@Override
35
	public Configuration getObject() throws Exception {
36
		return initConfiguration(defaultProperties());
37
	}
38

  
39
	@Override
40
	public Class<?> getObjectType() {
41
		return Configuration.class;
42
	}
43

  
44
	@Override
45
	public boolean isSingleton() {
46
		return true;
47
	}
48

  
49
	private Configuration initConfiguration(final Properties p) {
50
		final Configuration conf = new Configuration(false);
51
		for (Entry<Object, Object> e : p.entrySet()) {
52
			conf.set(e.getKey().toString(), e.getValue().toString());
53
			log.info(e.getKey().toString() + ": " + conf.get(e.getKey().toString()));
54
		}
55
		return conf;
56
	}
57

  
58
	private Properties defaultProperties() throws IOException {
59
		Properties p = new Properties();
60
		p.load(defaults.getInputStream());
61
		return p;
62
	}
63

  
64
	public Resource getDefaults() {
65
		return defaults;
66
	}
67

  
68
	@Required
69
	public void setDefaults(Resource defaults) {
70
		this.defaults = defaults;
71
	}
72

  
73
}
0 74

  
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ClusterName.java
1
package eu.dnetlib.data.hadoop.config;
2

  
3
import java.util.Arrays;
4
import java.util.List;
5

  
6
import com.google.common.base.Function;
7
import com.google.common.collect.Iterables;
8
import com.google.common.collect.Lists;
9

  
10
/**
11
 * Enumeration of the managed clusters.
12
 *
13
 * @author claudio
14
 *
15
 */
16
public enum ClusterName {
17
	DM, // Data Management
18
	IIS; // Information Inference Service(s)
19

  
20
	public static List<String> asStringList() {
21
		return Lists.newArrayList(Iterables.transform(asList(), new Function<ClusterName, String>() {
22

  
23
			@Override
24
			public String apply(final ClusterName clusterName) {
25
				return clusterName.toString();
26
			}
27
		}));
28
	}
29

  
30
	public static List<ClusterName> asList() {
31
		return Arrays.asList(ClusterName.values());
32
	}
33

  
34
}
0 35

  
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/hadoop/config/ConfigurationEnumerator.java
1
package eu.dnetlib.data.hadoop.config;
2

  
3
import java.util.Map;
4

  
5
import org.apache.commons.logging.Log;
6
import org.apache.commons.logging.LogFactory;
7
import org.apache.hadoop.conf.Configuration;
8
import org.springframework.beans.BeansException;
9
import org.springframework.beans.factory.BeanFactory;
10
import org.springframework.beans.factory.BeanFactoryAware;
11
import org.springframework.beans.factory.ListableBeanFactory;
12
import org.springframework.beans.factory.NoSuchBeanDefinitionException;
13

  
14
import com.google.common.base.Function;
15
import com.google.common.collect.Maps;
16

  
17
public class ConfigurationEnumerator implements BeanFactoryAware {
18

  
19
	private static final Log log = LogFactory.getLog(ConfigurationEnumerator.class); // NOPMD by marko on 11/24/08 5:02 PM
20

  
21
	/**
22
	 * bean factory.
23
	 */
24
	private ListableBeanFactory beanFactory;
25

  
26
	private final Function<ConfigurationFactory, Configuration> asConfiguration = new Function<ConfigurationFactory, Configuration>() {
27

  
28
		@Override
29
		public Configuration apply(ConfigurationFactory factory) {
30
			return factory.getConfiguration();
31
		}
32
	};
33

  
34
	/**
35
	 * Get all beans implementing the ConfiguredJob interface.
36
	 * 
37
	 * @return
38
	 */
39
	public Map<String, Configuration> getAll() {
40
		return Maps.transformValues(beanFactory.getBeansOfType(ConfigurationFactory.class), asConfiguration);
41
	}
42

  
43
	@Override
44
	public void setBeanFactory(final BeanFactory beanFactory) throws BeansException {
45
		this.beanFactory = (ListableBeanFactory) beanFactory;
46
	}
47

  
48
	public ListableBeanFactory getBeanFactory() {
49
		return beanFactory;
50
	}
51

  
52
	/**
53
	 * Get given Configuration or null.
54
	 * 
55
	 * @param name
56
	 * @return
57
	 */
58
	public Configuration get(final ClusterName name) {
59
		try {
60
			return (beanFactory.getBean(name.toString(), Configuration.class));
61
		} catch (final NoSuchBeanDefinitionException e) {
62
			log.error("undefined bean: " + name, e);
63
			return null;
64
		}
65
	}
66
}
0 67

  
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/ColumnType.java
1
package eu.dnetlib.data.transform;
2

  
3
/**
 * Supported encodings for a column's textual value (see XsltRowTransformer.decode):
 * base64-encoded bytes, or plain text.
 */
public enum ColumnType {
	base64,
	text,
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/XsltRowTransformer.java
1
package eu.dnetlib.data.transform;
2

  
3
import java.io.StringReader;
4
import java.util.List;
5

  
6
import javax.xml.transform.Transformer;
7
import javax.xml.transform.TransformerException;
8

  
9
import org.apache.commons.codec.binary.Base64;
10
import org.apache.commons.logging.Log;
11
import org.apache.commons.logging.LogFactory;
12
import org.dom4j.Document;
13
import org.dom4j.DocumentException;
14
import org.dom4j.Element;
15
import org.dom4j.Node;
16
import org.dom4j.io.DocumentResult;
17
import org.dom4j.io.DocumentSource;
18
import org.dom4j.io.SAXReader;
19

  
20
import com.google.common.base.Function;
21
import com.google.common.collect.Lists;
22

  
23
public class XsltRowTransformer implements Function<String, List<Row>> {
24

  
25
	private static final Log log = LogFactory.getLog(XsltRowTransformer.class); // NOPMD by marko on 11/24/08 5:02 PM
26

  
27
	private Transformer transformer;
28

  
29
	private SAXReader reader = new SAXReader();
30

  
31
	public XsltRowTransformer(final Transformer transformer) {
32
		this.transformer = transformer;
33
		log.info(String.format("using trasformer: '%s'", getTransformerClassName()));
34
	}
35

  
36
	private Document transform(final Document doc) {
37
		if (transformer == null)
38
			return doc;
39
		final DocumentResult result = new DocumentResult();
40

  
41
		try {
42
			transformer.transform(new DocumentSource(doc), result);
43
			return result.getDocument();
44
		} catch (final TransformerException e) {
45
			throw new RuntimeException("Unable to transform document:\n" + doc.asXML(), e);
46
		}
47
	}
48

  
49
	private Document transform(final String xml) {
50
		try {
51
			return transform(reader.read(new StringReader(xml)));
52
		} catch (final DocumentException e) {
53
			log.error("Error parsing xml:\n" + xml, e);
54
			throw new RuntimeException("Unable to parse document:\n" + xml, e);
55
		}
56
	}
57

  
58
	private List<Row> transformAsListRow(final String xml) {
59
		final Document doc = transform(xml);
60

  
61
		final List<Row> rows = Lists.newArrayList();
62

  
63
		if (doc == null)
64
			return rows;
65

  
66
		for (final Object or : doc.selectNodes("//ROW")) {
67
			final Element row = (Element) or;
68

  
69
			final String columnFamily = row.valueOf("@columnFamily");
70
			final String key = row.valueOf("@key");
71

  
72
			if ((key == null) || key.isEmpty())
73
				throw new RuntimeException("Attribute 'key' is missing in XSLT");
74

  
75
			if ((columnFamily == null) || columnFamily.isEmpty())
76
				throw new RuntimeException("Attribute 'columnFamily' is missing in XSLT");
77

  
78
			final List<Column<String, byte[]>> cols = Lists.newArrayList();
79

  
80
			for (final Object of : row.selectNodes("./QUALIFIER")) {
81
				final Node node = (Node) of;
82

  
83
				final String name = node.valueOf("@name");
84
				final String type = node.valueOf("@type");
85

  
86
				final byte[] value = decode(node.getText().trim(), type);
87

  
88
				cols.add(new Column<String, byte[]>(name, value));
89
			}
90
			rows.add(new Row(columnFamily, key, cols));
91
		}
92
		return rows;
93
	}
94

  
95
	public String getTransformerClassName() {
96
		return transformer != null ? transformer.getClass().getName() : "null";
97
	}
98

  
99
	private byte[] decode(final String value, final String type) {
100

  
101
		if ("base64".equals(type))
102
			return Base64.decodeBase64(value);
103

  
104
		return value.getBytes();
105
	}
106

  
107
	@Override
108
	public List<Row> apply(final String xml) {
109
		return transformAsListRow(xml);
110
	}
111

  
112
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/Row.java
1
package eu.dnetlib.data.transform;
2

  
3
import java.util.Collection;
4
import java.util.Iterator;
5
import java.util.List;
6
import java.util.Map;
7

  
8
import com.google.common.collect.Maps;
9

  
10
public class Row implements Iterable<Column<String, byte[]>> {
11

  
12
	private String columnFamily;
13

  
14
	private String key;
15

  
16
	private Map<String, Column<String, byte[]>> columns;
17

  
18
	public Row(final Row row) {
19
		this.columnFamily = row.getColumnFamily();
20
		this.key = row.getKey();
21
		this.columns = row.columns;
22
	}
23

  
24
	public Row(final String columnFamily, final String key) {
25
		this.columnFamily = columnFamily;
26
		this.key = key;
27
		this.columns = Maps.newHashMap();
28
	}
29

  
30
	public Row(final String columnFamily, final String key, final List<Column<String, byte[]>> columns) {
31
		this(columnFamily, key);
32
		this.setColumns(columns);
33
	}
34

  
35
	public void addColumn(final Column<String, byte[]> column) {
36
		getColumns().add(column);
37
	}
38

  
39
	public String getKey() {
40
		return key;
41
	}
42

  
43
	public Collection<Column<String, byte[]>> getColumns() {
44
		return columns.values();
45
	}
46

  
47
	public Column<String, byte[]> getColumn(final String name) {
48
		return columns.get(name);
49
	}
50

  
51
	public void setColumn(final String name, final Column<String, byte[]> newColumn) {
52
		columns.put(name, newColumn);
53
	}
54

  
55
	public void setColumns(final List<Column<String, byte[]>> columns) {
56
		this.columns.clear();
57
		for (final Column<String, byte[]> col : columns) {
58
			this.columns.put(col.getName(), col);
59
		}
60
	}
61

  
62
	public String getColumnFamily() {
63
		return columnFamily;
64
	}
65

  
66
	@Override
67
	public Iterator<Column<String, byte[]>> iterator() {
68
		return getColumns().iterator();
69
	}
70

  
71
	@Override
72
	public String toString() {
73
		return "\nRow [ KEY: " + key + ",\n\tCF: " + columnFamily + "\n\tCOLS: {" + columns + "}]";
74
	}
75

  
76
}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/Column.java
1
package eu.dnetlib.data.transform;
2

  
3
/**
 * An immutable (name, value) pair representing a single column.
 *
 * @param <N>
 *            type of the column name
 * @param <V>
 *            type of the column value
 */
public class Column<N, V> {

	// final: a Column never changes after construction (no setters existed)
	private final N name;

	private final V value;

	/**
	 * Static factory; lets the compiler infer the type parameters at the call site.
	 */
	public static <N, V> Column<N, V> newInstance(final N name, final V value) {
		return new Column<N, V>(name, value);
	}

	public Column(final N name, final V value) {
		this.name = name;
		this.value = value;
	}

	public N getName() {
		return name;
	}

	public V getValue() {
		return value;
	}

}
modules/dnet-hadoop-commons/trunk/src/main/java/eu/dnetlib/data/transform/XsltRowTransformerFactory.java
1
package eu.dnetlib.data.transform;
2

  
3
import java.io.StringReader;
4
import java.util.Map;
5
import java.util.Map.Entry;
6

  
7
import javax.xml.transform.Transformer;
8
import javax.xml.transform.TransformerFactory;
9

  
10
import org.apache.commons.collections.MapUtils;
11
import org.apache.commons.logging.Log;
12
import org.apache.commons.logging.LogFactory;
13
import org.dom4j.io.DocumentSource;
14
import org.dom4j.io.SAXReader;
15

  
16
public class XsltRowTransformerFactory {
17

  
18
	private static final Log log = LogFactory.getLog(XsltRowTransformerFactory.class); // NOPMD by marko on 11/24/08 5:02 PM
19

  
20
	public XsltRowTransformer getTransformer(final String xslt) {
21
		return doGetTransformer(xslt, null);
22
	}
23

  
24
	public XsltRowTransformer getTransformer(final String xslt, final Map<String, Object> params) {
25
		return doGetTransformer(xslt, params);
26
	}
27

  
28
	private XsltRowTransformer doGetTransformer(final String xslt, final Map<String, Object> params) {
29
		try {
30
			if ((xslt == null) || xslt.isEmpty()) return new XsltRowTransformer(null);
31

  
32
			final TransformerFactory factory = TransformerFactory.newInstance();
33

  
34
			if (log.isDebugEnabled()) {
35
				log.debug("using transformer factory: '" + factory.getClass().getCanonicalName() + "'");
36
			}
37

  
38
			final Transformer t = factory.newTransformer(new DocumentSource(new SAXReader().read(new StringReader(xslt))));
39
			if (!MapUtils.isEmpty(params)) {
40
				for (final Entry<String, Object> e : params.entrySet()) {
41
					log.debug(String.format("using xslt param: %s - %s", e.getKey(), e.getValue()));
42
					t.setParameter(e.getKey(), e.getValue());
43
				}
44
			}
45
			return new XsltRowTransformer(t);
46
		} catch (final Exception e) {
47
			throw new RuntimeException("Error generating transformer from xslt:\n" + xslt, e);
48
		}
49
	}
50

  
51
	public static XsltRowTransformer newInstance(final String xslt) {
52
		return new XsltRowTransformerFactory().getTransformer(xslt);
53
	}
54

  
55
	public static XsltRowTransformer newInstance(final String xslt, final Map<String, Object> params) {
56
		return new XsltRowTransformerFactory().getTransformer(xslt, params);
57
	}
58

  
59
}
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/applicationContext-hadoop.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<beans xmlns="http://www.springframework.org/schema/beans"
3
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:p="http://www.springframework.org/schema/p"
4
	xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
5

  
6
	<bean id="configurationEnumerator" class="eu.dnetlib.data.hadoop.config.ConfigurationEnumerator" />
7

  
8
	<bean id="DM" class="eu.dnetlib.data.hadoop.config.ConfigurationFactory"
9
		p:defaults="${services.data.hadoop.dm.properties}" />
10

  
11
	<bean id="IIS" class="eu.dnetlib.data.hadoop.config.ConfigurationFactory"
12
		p:defaults="${services.data.hadoop.iis.properties}" />
13

  
14
	<bean id="sequenceFileWriterFactory"
15
		class="eu.dnetlib.data.hadoop.hdfs.SequenceFileWriterFactory"
16
		scope="prototype" p:keyClass="${services.data.hadoop.hdfs.seqfilewriterfactory.keyclass}"
17
		p:valueClass="${services.data.hadoop.hdfs.seqfilewriterfactory.valueclass}"
18
		p:compressionType="${services.data.hadoop.hdfs.seqfilewriterfactory.compressiontype}"
19
		p:blockSize="${services.data.hadoop.hdfs.seqfilewriterfactory.blocksize}" 
20
		p:replication="${services.data.hadoop.hdfs.seqfilewriterfactory.replication}"/>
21

  
22
</beans>
0 23

  
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.dm.local.properties
1
dnet.clustername				=	DM
2

  
3
#CORE-SITE
4
fs.defaultFS					=	hdfs://localhost:8020
5

  
6
hadoop.security.authentication	=	simple
7
hadoop.security.auth_to_local	=	DEFAULT
8

  
9
hadoop.rpc.socket.factory.class.default	=	org.apache.hadoop.net.StandardSocketFactory
10

  
11
#HBASE-SITE
12
hbase.rootdir					=	hdfs://localhost:8020/hbase
13

  
14
hbase.security.authentication	=	simple
15
zookeeper.znode.rootserver		=	root-region-server
16
hbase.zookeeper.quorum			=	localhost
17
hbase.zookeeper.property.clientPort	=	2181
18
hbase.zookeeper.client.port		=	2181
19
zookeeper.znode.parent			=	/hbase
20

  
21
#HDFS-SITE
22
dfs.replication					=	2
23
dfs.namenode.servicerpc-address	=	localhost:8022
24
dfs.namenode.http-address		=	localhost:50070
25

  
26

  
27

  
28
#MAPRED-SITE
29
mapred.job.tracker					=	localhost:8021
30

  
31
#OOZIE SERVER 
32
oozie.service.loc				=	http://localhost:11000/oozie
33

  
34

  
35
######### CDH QUICKSTART
36

  
37
#HDFS-SITE
38
#dfs.namenode.servicerpc-address=cloudera-vm:8022
39
#dfs.https.port=50470
40
#dfs.namenode.http-address=cloudera-vm:50070
41
#dfs.replication=3
42
#dfs.blocksize=134217728
43
#dfs.client.use.datanode.hostname=false
44
#fs.permissions.umask-mode=022
45
#dfs.client.read.shortcircuit=false
46
#dfs.domain.socket.path=/var/run/hdfs-sockets/dn
47
#dfs.client.read.shortcircuit.skip.checksum=false
48
#dfs.client.domain.socket.data.traffic=false
49
#dfs.datanode.hdfs-blocks-metadata.enabled=true
50
#
51
#
52
##HBASE-SITE
53
#hbase.rootdir=hdfs://cloudera-vm:8020/hbase
54
#hbase.client.write.buffer=2097152
55
#hbase.client.pause=1000
56
#hbase.client.retries.number=10
57
#hbase.client.scanner.caching=1
58
#hbase.client.keyvalue.maxsize=10485760
59
#hbase.rpc.timeout=60000
60
#hbase.security.authentication=simple
61
#zookeeper.session.timeout=60000
62
#zookeeper.znode.parent=/hbase
63
#zookeeper.znode.rootserver=root-region-server
64
#hbase.zookeeper.quorum=cloudera-vm
65
#hbase.zookeeper.property.clientPort=2181
66
#
67
##CORE-SITE
68
#fs.defaultFS=hdfs://cloudera-vm:8020
69
#fs.trash.interval=1
70
#hadoop.security.authentication=simple
71
#hadoop.rpc.protection=authentication
72
#hadoop.security.auth_to_local=DEFAULT
73

  
74

  
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties
1
dnet.clustername				=	IIS
2

  
3
#CORE-SITE
4
fs.defaultFS					=	hdfs://iis-cluster-nn
5

  
6
hadoop.security.authentication	=	simple
7
hadoop.security.auth_to_local	=	DEFAULT
8

  
9
#HBASE-SITE
10
hbase.rootdir					=	hdfs://iis-cluster-nn/hbase
11

  
12
hbase.security.authentication	=	simple
13
zookeeper.znode.rootserver		=	root-region-server
14
hbase.zookeeper.quorum			=	master1.hadoop.iis.openaire.eu,master2.hadoop.iis.openaire.eu,master3.hadoop.iis.openaire.eu
15
hbase.zookeeper.property.clientPort	=	2181
16
hbase.zookeeper.client.port		=	2181
17
zookeeper.znode.parent			=	/hbase
18

  
19
#HDFS-SITE
20
dfs.replication					=	2
21
dfs.nameservices				=	iis-cluster-nn
22
dfs.ha.namenodes.iis-cluster-nn	=	nn1,nn2
23

  
24
dfs.namenode.rpc-address.iis-cluster-nn.nn1=namenode1.hadoop.iis.openaire.eu:8020
25
dfs.namenode.http-address.iis-cluster-nn.nn1=namenode1.hadoop.iis.openaire.eu:50070
26
dfs.namenode.rpc-address.iis-cluster-nn.nn2=namenode2.hadoop.iis.openaire.eu:8020
27
dfs.namenode.http-address.iis-cluster-nn.nn2=namenode2.hadoop.iis.openaire.eu:50070
28

  
29
dfs.client.failover.proxy.provider.iis-cluster-nn=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
30

  
31
#MAPRED-SITE
32
mapred.job.tracker					=	iis-cluster-jt
33
mapred.jobtrackers.iis-cluster-jt	=	jt1,jt2
34

  
35
mapred.jobtracker.rpc-address.iis-cluster-jt.jt1 = master1.hadoop.iis.openaire.eu:8021
36
mapred.jobtracker.rpc-address.iis-cluster-jt.jt2 = master1.hadoop.iis.openaire.eu:8022
37

  
38
mapred.client.failover.proxy.provider.iis-cluster-jt = org.apache.hadoop.mapred.ConfiguredFailoverProxyProvider
39

  
40
mapred.mapper.new-api			=	true
41
mapred.reducer.new-api			=	true
42

  
43
#OOZIE SERVER
44
oozie.service.loc				=	http://oozie.hadoop.iis.openaire.eu:11000/oozie
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.iis-cdh5.icm.properties
1
dnet.clustername				=	IIS
2

  
3
#CORE-SITE
4
fs.defaultFS				    =	hdfs://nameservice1
5

  
6
#HDFS-SITE
7
dfs.replication					=	2
8

  
9
#MAPRED-SITE
10
jobTracker                      =   yarnRM
11
mapred.job.tracker              =   yarnRM
12
mapred.mapper.new-api			=	true
13
mapred.reducer.new-api			=	true
14

  
15
#OOZIE SERVER
16
oozie.service.loc               =   http://iis-cdh5-test-m3.ocean.icm.edu.pl:11000/oozie
17

  
18
#MISC
19
importerQueueName               =   dm_import
20
oozieLauncherQueueName          =   default
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.dm.cnr.properties
1
dnet.clustername				=	DM
2

  
3
#CORE-SITE
4
fs.defaultFS					=	hdfs://nmis-hadoop-cluster
5

  
6
hadoop.security.authentication	=	simple
7
hadoop.security.auth_to_local	=	DEFAULT
8

  
9
hadoop.rpc.socket.factory.class.default	=	org.apache.hadoop.net.StandardSocketFactory
10

  
11
#HBASE-SITE
12
hbase.rootdir					=	hdfs://nmis-hadoop-cluster/hbase
13

  
14
hbase.security.authentication	=	simple
15
zookeeper.znode.rootserver		=	root-region-server
16
hbase.zookeeper.quorum			=	quorum1.t.hadoop.research-infrastructures.eu,quorum2.t.hadoop.research-infrastructures.eu,quorum3.t.hadoop.research-infrastructures.eu,quorum4.t.hadoop.research-infrastructures.eu,jobtracker.t.hadoop.research-infrastructures.eu
17
hbase.zookeeper.property.clientPort	=	2182
18
hbase.zookeeper.client.port		=	2182
19
zookeeper.znode.parent			=	/hbase
20

  
21
#HDFS-SITE
22
dfs.replication					=	2
23
dfs.nameservices						=	nmis-hadoop-cluster
24
dfs.ha.namenodes.nmis-hadoop-cluster	=	nn1,nn2
25

  
26
dfs.namenode.rpc-address.nmis-hadoop-cluster.nn1=quorum1.t.hadoop.research-infrastructures.eu:8020
27
dfs.namenode.http-address.nmis-hadoop-cluster.nn1=quorum1.t.hadoop.research-infrastructures.eu:50070
28
dfs.namenode.rpc-address.nmis-hadoop-cluster.nn2=quorum2.t.hadoop.research-infrastructures.eu:8020
29
dfs.namenode.http-address.nmis-hadoop-cluster.nn2=quorum2.t.hadoop.research-infrastructures.eu:50070
30

  
31
dfs.client.failover.proxy.provider.nmis-hadoop-cluster=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
32

  
33
#MAPRED-SITE
34
mapred.job.tracker					=	nmis-hadoop-jt
35
mapred.jobtrackers.nmis-hadoop-jt	=	jt1,jt2
36

  
37
mapred.jobtracker.rpc-address.nmis-hadoop-jt.jt1 = jobtracker.t.hadoop.research-infrastructures.eu:8021
38
mapred.jobtracker.rpc-address.nmis-hadoop-jt.jt2 = quorum4.t.hadoop.research-infrastructures.eu:8022
39

  
40
mapred.client.failover.proxy.provider.nmis-hadoop-jt = org.apache.hadoop.mapred.ConfiguredFailoverProxyProvider
41

  
42
mapred.mapper.new-api			=	true
43
mapred.reducer.new-api			=	true
44

  
45
#OOZIE SERVER
46
oozie.service.loc				=	http://oozie.t.hadoop.research-infrastructures.eu:11000/oozie
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/applicationContext-hadoop.properties
1
services.data.hadoop.dm.properties = classpath:/eu/dnetlib/data/hadoop/config/hadoop-default.dm.cnr.properties
2
services.data.hadoop.iis.properties = classpath:/eu/dnetlib/data/hadoop/config/hadoop-default.iis.icm.properties
3

  
4
services.data.hadoop.hdfs.seqfilewriterfactory.keyclass = org.apache.hadoop.io.Text
5
services.data.hadoop.hdfs.seqfilewriterfactory.valueclass = org.apache.hadoop.io.Text
6
services.data.hadoop.hdfs.seqfilewriterfactory.compressiontype = BLOCK
7
services.data.hadoop.hdfs.seqfilewriterfactory.blocksize = 16M
8
services.data.hadoop.hdfs.seqfilewriterfactory.replication = 3
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.dm.stdl.properties
1
dnet.clustername				=	DM
2

  
3
#CORE-SITE
4
fs.defaultFS					=	hdfs://d-net2.house.cnr.it:8020
5

  
6
hadoop.security.authentication	=	simple
7
hadoop.security.auth_to_local	=	DEFAULT
8

  
9
hadoop.rpc.socket.factory.class.default	=	org.apache.hadoop.net.StandardSocketFactory
10

  
11
#HBASE-SITE
12
hbase.rootdir					=	hdfs://d-net2.house.cnr.it:8020/hbase
13

  
14
hbase.security.authentication	=	simple
15
zookeeper.znode.rootserver		=	root-region-server
16
hbase.zookeeper.quorum			=	d-net2.house.cnr.it
17
hbase.zookeeper.property.clientPort	=	2181
18
hbase.zookeeper.client.port		=	2181
19
zookeeper.znode.parent			=	/hbase
20

  
21
#HDFS-SITE
22
dfs.replication					=	2
23
dfs.namenode.servicerpc-address	=	d-net2.house.cnr.it:8022
24
dfs.namenode.http-address		=	d-net2.house.cnr.it:50070
25
# NOTE(review): duplicate key — this silently overrides dfs.replication=2 set a few lines above; keep only one value
dfs.replication					= 	1
26

  
27
#MAPRED-SITE
28
mapred.job.tracker				=	d-net2.house.cnr.it:8021
29
mapred.child.java.opts			=	-Djava.net.preferIPv4Stack=true -Xmx1073741824
30

  
31
mapred.mapper.new-api			=	true
32
mapred.reducer.new-api			=	true
33

  
34
#OOZIE SERVER
35
#oozie.service.loc				=	http://oozie.t.hadoop.research-infrastructures.eu:11000/oozie
modules/dnet-hadoop-commons/trunk/src/main/resources/eu/dnetlib/data/hadoop/config/hadoop-default.dm.icm.properties
1
dnet.clustername				=	DM
2

  
3
#CORE-SITE
4
fs.defaultFS					=	hdfs://dm-cluster-nn
5

  
6
hadoop.security.authentication	=	simple
7
hadoop.security.auth_to_local	=	DEFAULT
8

  
9
hadoop.rpc.socket.factory.class.default	=	org.apache.hadoop.net.StandardSocketFactory
10

  
11
#HBASE-SITE
12
hbase.rootdir					=	hdfs://dm-cluster-nn/hbase
13

  
14
hbase.security.authentication	=	simple
15
zookeeper.znode.rootserver		=	root-region-server
16
hbase.zookeeper.quorum			=	namenode1.hadoop.dm.openaire.eu,namenode2.hadoop.dm.openaire.eu,jobtracker1.hadoop.dm.openaire.eu,jobtracker2.hadoop.dm.openaire.eu,hbase-master1.hadoop.dm.openaire.eu
17
hbase.zookeeper.property.clientPort	=	2181
18
hbase.zookeeper.client.port		=	2181
19
zookeeper.znode.parent			=	/hbase
20

  
21
#HDFS-SITE
22
dfs.replication					=	2
23
dfs.nameservices				=	dm-cluster-nn
24
dfs.ha.namenodes.dm-cluster-nn	=	nn1,nn2
25

  
26
dfs.namenode.rpc-address.dm-cluster-nn.nn1=namenode1.hadoop.dm.openaire.eu:8020
27
dfs.namenode.http-address.dm-cluster-nn.nn1=namenode1.hadoop.dm.openaire.eu:50070
28
dfs.namenode.rpc-address.dm-cluster-nn.nn2=namenode2.hadoop.dm.openaire.eu:8020
29
dfs.namenode.http-address.dm-cluster-nn.nn2=namenode2.hadoop.dm.openaire.eu:50070
30

  
31
dfs.client.failover.proxy.provider.dm-cluster-nn=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
32

  
33
#MAPRED-SITE
34
mapred.job.tracker					=	dm-cluster-jt
35
mapred.jobtrackers.dm-cluster-jt	=	jt1,jt2
36

  
37
mapred.jobtracker.rpc-address.dm-cluster-jt.jt1 = jobtracker1.hadoop.dm.openaire.eu:8021
38
mapred.jobtracker.rpc-address.dm-cluster-jt.jt2 = jobtracker2.hadoop.dm.openaire.eu:8021
39

  
40
mapred.client.failover.proxy.provider.dm-cluster-jt = org.apache.hadoop.mapred.ConfiguredFailoverProxyProvider
41

  
42
mapred.mapper.new-api			=	true
43
mapred.reducer.new-api			=	true
44

  
45
#OOZIE SERVER 
46
oozie.service.loc				=	http://oozie.hadoop.dm.openaire.eu:11000/oozie
modules/dnet-hadoop-commons/trunk/pom.xml
1
<?xml version="1.0" encoding="UTF-8"?>
2
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
	<parent>
4
		<groupId>eu.dnetlib</groupId>
5
		<artifactId>dnet-parent</artifactId>
6
		<version>1.0.0</version>
7
		<relativePath />
8
	</parent>
9
	<modelVersion>4.0.0</modelVersion>
10
	<groupId>eu.dnetlib</groupId>
11
	<artifactId>dnet-hadoop-commons</artifactId>
12
	<packaging>jar</packaging>
13
	<version>2.0.1-SNAPSHOT</version>
14
	<scm>
15
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-hadoop-commons/trunk</developerConnection>
16
	</scm>
17
	<repositories>
18
		<!-- Cloudera Repositories -->
19
		<repository>
20
			<snapshots>
21
				<enabled>false</enabled>
22
			</snapshots>
23
			<id>cloudera-central</id>
24
			<name>cloudera-libs-release</name>
25
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/cloudera-central</url>
26
		</repository>
27
		<repository>
28
			<id>cloudera-snapshots</id>
29
			<name>cloudera-libs-snapshot</name>
30
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/cloudera-snapshots</url>
31
		</repository>
32
		<repository>
33
			<id>typesafe</id>
34
			<name>typesafe-releases</name>
35
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/typesafe</url>
36
		</repository>
37
	</repositories>
38

  
39
	<dependencies>
40
		<dependency>
41
			<groupId>eu.dnetlib</groupId>
42
			<artifactId>cnr-misc-utils</artifactId>
43
			<version>[1.0.0,2.0.0)</version>
44
			<exclusions>
45
				<exclusion>
46
					<artifactId>slf4j-api</artifactId>
47
					<groupId>org.slf4j</groupId>
48
				</exclusion>
49
			</exclusions>
50
		</dependency>
51
		<dependency>
52
			<groupId>org.apache.hadoop</groupId>
53
			<artifactId>hadoop-common</artifactId>
54
			<version>${hadoop.common.version}</version>
55
			<exclusions>
56
				<exclusion>
57
					<groupId>javax.servlet</groupId>
58
					<artifactId>servlet-api</artifactId>
59
				</exclusion>
60
				<exclusion>
61
					<artifactId>slf4j-api</artifactId>
62
					<groupId>org.slf4j</groupId>
63
				</exclusion>
64
				<exclusion>
65
					<artifactId>slf4j-log4j12</artifactId>
66
					<groupId>org.slf4j</groupId>
67
				</exclusion>
68
				<exclusion>
69
					<artifactId>jsp-api</artifactId>
70
					<groupId>javax.servlet.jsp</groupId>
71
				</exclusion>
72
				<exclusion>
73
					<artifactId>guava</artifactId>
74
					<groupId>com.google.guava</groupId>
75
				</exclusion>
76
			</exclusions>
77
		</dependency>
78
		<dependency>
79
			<groupId>org.apache.hadoop</groupId>
80
			<artifactId>hadoop-core</artifactId>
81
			<version>${hadoop.core.version}</version>
82
			<exclusions>
83
				<exclusion>
84
					<groupId>tomcat</groupId>
85
					<artifactId>jasper-runtime</artifactId>
86
				</exclusion>
87
				<exclusion>
88
					<groupId>tomcat</groupId>
89
					<artifactId>jasper-compiler</artifactId>
90
				</exclusion>
91
				<exclusion>
92
					<artifactId>jsp-api</artifactId>
93
					<groupId>javax.servlet.jsp</groupId>
94
				</exclusion>
95
				<exclusion>
96
					<groupId>javax.servlet</groupId>
97
					<artifactId>servlet-api</artifactId>
98
				</exclusion>
99
			</exclusions>
100
		</dependency>
101
		<dependency>
102
			<groupId>org.springframework</groupId>
103
			<artifactId>spring-beans</artifactId>
104
			<version>${spring.version}</version>
105
		</dependency>
106

  
107

  
108
		<!-- DO NOT REMOVE THIS DEPENDENCY!!! MODULES ACCESSING HDFS NEEDS IT EVEN 
109
			IF mvn dependency:analyze SUGGESTS TO REMOVE IT -->
110
		<dependency>
111
			<groupId>org.apache.hadoop</groupId>
112
			<artifactId>hadoop-hdfs</artifactId>
113
			<version>${hadoop.hdfs.version}</version>
114
			<exclusions>
115
				<exclusion>
116
					<groupId>javax.servlet</groupId>
117
					<artifactId>servlet-api</artifactId>
118
				</exclusion>
119
				<exclusion>
120
					<artifactId>jsp-api</artifactId>
121
					<groupId>javax.servlet.jsp</groupId>
122
				</exclusion>
123
				<exclusion>
124
					<artifactId>guava</artifactId>
125
					<groupId>com.google.guava</groupId>
126
				</exclusion>
127
			</exclusions>
128
		</dependency>
129
	</dependencies>
130
</project>
modules/dnet-hadoop-commons/branches/CDH-5.3.X/deploy.info
1
{"type_source": "SVN", "goal": "package -U -T 4C source:jar", "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-hadoop-commons/trunk/", "deploy_repository": "dnet4-snapshots", "version": "4", "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it", "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet4-snapshots", "name": "dnet-hadoop-commons"}
modules/dnet-hadoop-commons/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterable.java
1
package eu.dnetlib.data.hadoop.hdfs;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;

import eu.dnetlib.miscutils.collections.Pair;

/**
 * <p>
 * {@link Iterable} counterpart to {@link SequenceFileIterator}: every call to {@link #iterator()}
 * opens a fresh iterator over the key/value pairs of the sequence file at the configured path.
 * </p>
 */
public final class SequenceFileIterable<K extends Writable, V extends Writable> implements Iterable<Pair<K, V>> {

	// Hadoop configuration used to resolve the file system for the path.
	private final Configuration conf;

	// Location of the sequence file to iterate over.
	private final Path path;

	// When true, the same key/value Writable instances are recycled across reads.
	private final boolean reuseKeyValueInstances;

	/**
	 * Like {@link #SequenceFileIterable(Path, boolean, Configuration)} but key and value instances
	 * are freshly allocated for every record (no reuse).
	 *
	 * @param path
	 *            file to iterate over
	 * @param conf
	 *            Hadoop configuration
	 */
	public SequenceFileIterable(Path path, Configuration conf) {
		this(path, false, conf);
	}

	/**
	 * @param path
	 *            file to iterate over
	 * @param reuseKeyValueInstances
	 *            if true, reuses instances of the key and value object instead of creating a new one
	 *            for each read from the file
	 * @param conf
	 *            Hadoop configuration
	 */
	public SequenceFileIterable(Path path, boolean reuseKeyValueInstances, Configuration conf) {
		this.path = path;
		this.reuseKeyValueInstances = reuseKeyValueInstances;
		this.conf = conf;
	}

	/**
	 * Opens a new {@link SequenceFileIterator} over the configured file.
	 *
	 * @return a fresh iterator over the file's key/value pairs
	 * @throws IllegalStateException
	 *             wrapping the {@link IOException} if the sequence file cannot be opened
	 */
	@Override
	public Iterator<Pair<K, V>> iterator() {
		try {
			return new SequenceFileIterator<K, V>(path, reuseKeyValueInstances, conf);
		} catch (final IOException ioe) {
			// Iterable#iterator() cannot throw a checked exception; surface the path in the message.
			throw new IllegalStateException(path.toString(), ioe);
		}
	}

}
modules/dnet-hadoop-commons/branches/CDH-5.3.X/src/main/java/eu/dnetlib/data/hadoop/hdfs/SequenceFileIterator.java
1
package eu.dnetlib.data.hadoop.hdfs;
2

  
3
import java.io.Closeable;
4
import java.io.IOException;
5

  
6
import org.apache.hadoop.conf.Configuration;
7
import org.apache.hadoop.fs.FileSystem;
8
import org.apache.hadoop.fs.Path;
9
import org.apache.hadoop.io.NullWritable;
10
import org.apache.hadoop.io.SequenceFile;
11
import org.apache.hadoop.io.Writable;
12
import org.apache.hadoop.util.ReflectionUtils;
13

  
14
import com.google.common.collect.AbstractIterator;
15
import com.google.common.io.Closeables;
16

  
17
import eu.dnetlib.miscutils.collections.Pair;
18

  
19
/**
20
 * <p>
21
 * {@link java.util.Iterator} over a {@link SequenceFile}'s keys and values, as a {@link Pair} containing key and value.
22
 * </p>
23
 */
24
public final class SequenceFileIterator<K extends Writable, V extends Writable> extends AbstractIterator<Pair<K, V>> implements Closeable {
25

  
26
	private final SequenceFile.Reader reader;
27
	private final Configuration conf;
28
	private final Class<K> keyClass;
29
	private final Class<V> valueClass;
30
	private final boolean noValue;
31
	private K key;
32
	private V value;
33
	private final boolean reuseKeyValueInstances;
34

  
35
	/**
36
	 * @throws IOException
37
	 *             if path can't be read, or its key or value class can't be instantiated
38
	 */
39
	@SuppressWarnings("unchecked")
40
	public SequenceFileIterator(Path path, boolean reuseKeyValueInstances, Configuration conf) throws IOException {
41
		key = null;
42
		value = null;
43
		FileSystem fs = path.getFileSystem(conf);
44
		path = path.makeQualified(fs);
45
		reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
46
		this.conf = conf;
47
		keyClass = (Class<K>) reader.getKeyClass();
48
		valueClass = (Class<V>) reader.getValueClass();
49
		noValue = NullWritable.class.equals(valueClass);
50
		this.reuseKeyValueInstances = reuseKeyValueInstances;
51
	}
52

  
53
	public Class<K> getKeyClass() {
54
		return keyClass;
55
	}
56

  
57
	public Class<V> getValueClass() {
58
		return valueClass;
59
	}
60

  
61
	@Override
62
	public void close() {
63
		key = null;
64
		value = null;
65
		try {
66
			Closeables.close(reader, false);
67
		} catch (IOException e) {
68
			throw new IllegalStateException(e);
69
		}
70
		endOfData();
71
	}
72

  
73
	@Override
74
	protected Pair<K, V> computeNext() {
75
		if (!reuseKeyValueInstances || value == null) {
76
			key = ReflectionUtils.newInstance(keyClass, conf);
77
			if (!noValue) {
78
				value = ReflectionUtils.newInstance(valueClass, conf);
79
			}
80
		}
81
		try {
82
			boolean available;
83
			if (noValue) {
84
				available = reader.next(key);
85
			} else {
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff