package eu.dnetlib.dhp.common.spark.pipe;

import java.io.Serializable;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import eu.dnetlib.dhp.common.utils.AvroGsonFactory;
import scala.Tuple2;

/**
 * Executor of MapReduce scripts using Spark pipes.
 * It imitates Hadoop Streaming behavior.
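 * <br/><br/>
 * A minimal usage sketch; the input RDD, the script names and the {@code OutputRecord} class are
 * illustrative, and the scripts are assumed to have been shipped to the executors beforehand
 * (e.g. via {@code JavaSparkContext#addFile(String)}) so that {@code SparkFiles} can resolve them:
 * <pre>{@code
 * SparkPipeExecutor executor = new SparkPipeExecutor();
 * JavaPairRDD<String, String> mapped = executor.doMap(avroRecords, "mapper.py", "");
 * JavaPairRDD<AvroKey<GenericRecord>, NullWritable> reduced =
 *         executor.doReduce(mapped, "reducer.py", "", OutputRecord.class);
 * }</pre>
 *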
 * @author madryk
 */
public class SparkPipeExecutor implements Serializable {

	private static final long serialVersionUID = 1L;


	//------------------------ LOGIC --------------------------

	/**
	 * Imitates the map part of a Hadoop Streaming job.
	 * It executes the provided script for every key in the inputRecords RDD.
	 * <br/><br/>
	 * It is assumed that the provided script reads records from standard input (one line per record)
	 * and writes mapped records to standard output (also one line per record).
	 * A mapped record can be a key/value pair; in that case the script should emit the key and
	 * the value separated by a tab (\t) character on a single line.
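	 * <br/><br/>
	 * For illustration, a mapper output line with a hypothetical key and value would look like:
	 * <pre>
	 * someKey\tsomeValue
	 * </pre>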
	 */
	public JavaPairRDD<String, String> doMap(JavaPairRDD<AvroKey<GenericRecord>, NullWritable> inputRecords, String scriptName, String args) {

		// pipe the record keys through the external python script resolved from the Spark files cache
		JavaRDD<String> mappedRecords = inputRecords.keys().pipe("python " + SparkFiles.get(scriptName) + " " + args);

		// split each emitted line into a key/value pair; lines without a tab become (key, null)
		JavaPairRDD<String, String> outputRecords = mappedRecords
				.mapToPair(line -> {
					String[] splitPair = line.split("\t");
					return new Tuple2<String, String>(splitPair[0], (splitPair.length == 1) ? null : splitPair[1]);
				});

		return outputRecords;
	}

	/**
	 * Imitates the reduce part of a Hadoop Streaming job.
	 * <br/><br/>
	 * It is assumed that the provided script reads records from standard input (one line per record)
	 * and groups records with the same key into a single record (reduce).
	 * The method ensures that all input records with the same key are passed to the script in adjacent lines.
	 * Reduced records should be written by the script to standard output (one line per record).
	 * Reduced records must be JSON strings of the class provided as an argument.
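	 * <br/><br/>
	 * For illustration, assuming the output class declares fields {@code id} and {@code count}
	 * (hypothetical names), the reducer script would emit lines such as:
	 * <pre>
	 * {"id": "someKey", "count": 3}
	 * </pre>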
	 */
	public JavaPairRDD<AvroKey<GenericRecord>, NullWritable> doReduce(JavaPairRDD<String, String> inputRecords, String scriptName, String args, Class<? extends GenericRecord> outputClass) {

		// sort by key so that records sharing the same key reach the script in adjacent lines,
		// then serialize each pair back into the tab-separated line format used by doMap
		JavaRDD<String> reducedRecords = inputRecords.sortByKey()
				.map(record -> record._1 + ((record._2 == null) ? "" : ("\t" + record._2)))
				.pipe("python " + SparkFiles.get(scriptName) + " " + args);

		// parse the JSON lines emitted by the script into Avro records and wrap them
		// as (AvroKey, NullWritable) pairs, as expected by Avro output formats
		JavaPairRDD<AvroKey<GenericRecord>, NullWritable> outputRecords = reducedRecords
				.map(recordString -> AvroGsonFactory.create().fromJson(recordString, outputClass))
				.mapToPair(record -> new Tuple2<AvroKey<GenericRecord>, NullWritable>(new AvroKey<>(record), NullWritable.get()));

		return outputRecords;
	}

}