package eu.dnetlib.dhp.common.spark.pipe;

import java.io.Serializable;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import eu.dnetlib.dhp.common.utils.AvroGsonFactory;
import scala.Tuple2;

/**
|
17
|
* Executor of mapreduce scripts using spark pipes.
|
18
|
* It imitates hadoop streaming behavior.
|
19
|
*
|
20
|
* @author madryk
|
21
|
*
|
22
|
*/
|
23
|
public class SparkPipeExecutor implements Serializable {
|
24
|
|
25
|
private static final long serialVersionUID = 1L;
|
26
|
|
27
|
|
28
|
//------------------------ LOGIC --------------------------
|
29
|
|
30
|
/**
|
31
|
* Imitates map part of hadoop streaming job.
|
32
|
* It executes provided script for every key in inputRecords rdd.
|
33
|
* <br/><br/>
|
34
|
* It is assumed that provided script will read records from standard input (one line for one record)
|
35
|
* and write mapped record into standard output (also one line for one record).
|
36
|
* Mapped record can be a key/value pair. In that case script should return key and value
|
37
|
* splitted by tab (\t) character in single line.
|
38
|
*/
|
39
|
public JavaPairRDD<String, String> doMap(JavaPairRDD<AvroKey<GenericRecord>, NullWritable> inputRecords, String scriptName, String args) {
|
40
|
|
41
|
JavaRDD<String> mappedRecords = inputRecords.keys().pipe("python " + SparkFiles.get(scriptName) + " " + args);
|
42
|
|
43
|
JavaPairRDD<String, String> outputRecords = mappedRecords
|
44
|
.mapToPair(line -> {
|
45
|
String[] splittedPair = line.split("\t");
|
46
|
return new Tuple2<String, String>(splittedPair[0], (splittedPair.length == 1) ? null : splittedPair[1]);
|
47
|
});
|
48
|
|
49
|
return outputRecords;
|
50
|
}
|
51
|
|
52
|
/**
|
53
|
* Imitates reduce part of hadoop streaming job.
|
54
|
* <br/><br/>
|
55
|
* It is assumed that provided script will read records from standard input (one line for one record)
|
56
|
* and group records with the same key into single record (reduce).
|
57
|
* Method assures that all input records with the same key will be transfered in adjacent lines.
|
58
|
* Reduced records should be written by script into standard output (one line for one record).
|
59
|
* Reduced records must be json strings of class provided as argument.
|
60
|
*/
|
61
|
public JavaPairRDD<AvroKey<GenericRecord>, NullWritable> doReduce(JavaPairRDD<String, String> inputRecords, String scriptName, String args, Class<? extends GenericRecord> outputClass) {
|
62
|
|
63
|
JavaRDD<String> reducedRecords = inputRecords.sortByKey()
|
64
|
.map(record -> record._1 + ((record._2 == null) ? "" : ("\t" + record._2)))
|
65
|
.pipe("python " + SparkFiles.get(scriptName) + " " + args);
|
66
|
|
67
|
JavaPairRDD<AvroKey<GenericRecord>, NullWritable> outputRecords = reducedRecords
|
68
|
.map(recordString -> AvroGsonFactory.create().fromJson(recordString, outputClass))
|
69
|
.mapToPair(record -> new Tuple2<AvroKey<GenericRecord>, NullWritable>(new AvroKey<>(record), NullWritable.get()));
|
70
|
|
71
|
return outputRecords;
|
72
|
}
|
73
|
|
74
|
}
|