1 |
18684
|
mateusz.ko
|
To Build:
|
2 |
|
|
|
3 |
|
|
mvn clean package
|
4 |
|
|
|
5 |
|
|
Hive Usage:
|
6 |
|
|
|
7 |
|
|
add jar avro-json-1.0-SNAPSHOT.jar;
|
8 |
|
|
CREATE TABLE doctors (foo string)
|
9 |
|
|
ROW FORMAT SERDE 'com.cloudera.science.avro.serde.AvroAsJSONSerde'
|
10 |
|
|
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
|
11 |
|
|
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
|
12 |
|
|
WITH SERDEPROPERTIES
|
13 |
|
|
('avro.schema.literal'='{
|
14 |
|
|
"namespace": "testing.hive.avro.serde",
|
15 |
|
|
"name": "doctors",
|
16 |
|
|
"type": "record",
|
17 |
|
|
"fields": [
|
18 |
|
|
{
|
19 |
|
|
"name":"number",
|
20 |
|
|
"type":"int",
|
21 |
|
|
"doc":"Order of playing the role"
|
22 |
|
|
},
|
23 |
|
|
{
|
24 |
|
|
"name":"first_name",
|
25 |
|
|
"type":"string",
|
26 |
|
|
"doc":"first name of actor playing role"
|
27 |
|
|
},
|
28 |
|
|
{
|
29 |
|
|
"name":"last_name",
|
30 |
|
|
"type":"string",
|
31 |
|
|
"doc":"last name of actor playing role"
|
32 |
|
|
},
|
33 |
|
|
{
|
34 |
|
|
"name":"extra_field",
|
35 |
|
|
"type":"string",
|
36 |
|
|
"doc":"an extra field not in the original file",
|
37 |
|
|
"default":"fishfingers and custard"
|
38 |
|
|
}
|
39 |
|
|
]
|
40 |
|
|
}');
|
41 |
|
|
DESCRIBE doctors;
|
42 |
|
|
SELECT * FROM doctors;
|
43 |
|
|
SELECT get_json_object(foo, '$.number') FROM doctors;
|
44 |
|
|
|
45 |
|
|
Streaming Usage:
|
46 |
|
|
|
47 |
|
|
hadoop jar hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \
|
48 |
|
|
-libjars avro-json-1.0-SNAPSHOT.jar \
|
49 |
|
|
-Dinput.schema.url=file:///doctors.avsc \
|
50 |
|
|
-Doutput.schema.url=file:///doctors.avsc \
|
51 |
|
|
-inputformat com.cloudera.science.avro.streaming.AvroAsJSONInputFormat \
|
52 |
|
|
-outputformat com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat \
|
53 |
|
|
-mapper '/bin/cat' \
|
54 |
|
|
-input doctors.avro \
|
55 |
|
|
-output foo
|
56 |
|
|
|
57 |
|
|
Streaming Over Multiple Files with Different Schemas (Make sure that the ordering of the
|
58 |
|
|
arguments to input.schema.url corresponds to the ordering of the -input args to streaming):
|
59 |
|
|
|
60 |
|
|
hadoop jar hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \
|
61 |
|
|
-libjars avro-json-1.0-SNAPSHOT.jar \
|
62 |
|
|
-Dinput.schema.url=file:///schemas/doctors.avsc,file:///schemas/episodes.avsc \
|
63 |
|
|
-inputformat com.cloudera.science.avro.streaming.AvroAsJSONInputFormat \
|
64 |
|
|
-mapper '/bin/cat' \
|
65 |
|
|
-input /data/doctors.avro \
|
66 |
|
|
-input /data/episodes.avro \
|
67 |
|
|
-output multijson
|