1
|
To Build:
|
2
|
|
3
|
mvn clean package
|
4
|
|
5
|
Hive Usage:
|
6
|
|
7
|
add jar avro-json-1.0-SNAPSHOT.jar;
|
8
|
CREATE TABLE doctors (foo string)
|
9
|
ROW FORMAT SERDE 'com.cloudera.science.avro.serde.AvroAsJSONSerde'
|
10
|
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
|
11
|
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
|
12
|
WITH SERDEPROPERTIES
|
13
|
('avro.schema.literal'='{
|
14
|
"namespace": "testing.hive.avro.serde",
|
15
|
"name": "doctors",
|
16
|
"type": "record",
|
17
|
"fields": [
|
18
|
{
|
19
|
"name":"number",
|
20
|
"type":"int",
|
21
|
"doc":"Order of playing the role"
|
22
|
},
|
23
|
{
|
24
|
"name":"first_name",
|
25
|
"type":"string",
|
26
|
"doc":"first name of actor playing role"
|
27
|
},
|
28
|
{
|
29
|
"name":"last_name",
|
30
|
"type":"string",
|
31
|
"doc":"last name of actor playing role"
|
32
|
},
|
33
|
{
|
34
|
"name":"extra_field",
|
35
|
"type":"string",
|
36
|
"doc":"an extra field not in the original file",
|
37
|
"default":"fishfingers and custard"
|
38
|
}
|
39
|
]
|
40
|
}');
|
41
|
DESCRIBE doctors;
|
42
|
SELECT * from doctors;
|
43
|
SELECT get_json_object(foo, '$.number') from doctors;
|
44
|
|
45
|
Streaming Usage:
|
46
|
|
47
|
hadoop jar hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \
|
48
|
-libjars avro-json-1.0-SNAPSHOT.jar \
|
49
|
-Dinput.schema.url=file:///doctors.avsc \
|
50
|
-Doutput.schema.url=file:///doctors.avsc \
|
51
|
-inputformat com.cloudera.science.avro.streaming.AvroAsJSONInputFormat \
|
52
|
-outputformat com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat \
|
53
|
-mapper '/bin/cat' \
|
54
|
-input doctors.avro \
|
55
|
-output foo
|
56
|
|
57
|
Streaming Over Multiple Files with Different Schemas (make sure that the order of the
|
58
|
comma-separated schema URLs in input.schema.url matches the order of the -input arguments to streaming):
|
59
|
|
60
|
hadoop jar hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \
|
61
|
-libjars avro-json-1.0-SNAPSHOT.jar \
|
62
|
-Dinput.schema.url=file:///schemas/doctors.avsc,file:///schemas/episodes.avsc \
|
63
|
-inputformat com.cloudera.science.avro.streaming.AvroAsJSONInputFormat \
|
64
|
-mapper '/bin/cat' \
|
65
|
-input /data/doctors.avro \
|
66
|
-input /data/episodes.avro \
|
67
|
-output multijson
|
68
|
|