Hive复杂数据类型实战

发表: 2017-01-05 浏览: 4815

Hive

Hive的复杂数据类型
数组：ARRAY，元素有顺序，且每一个数组元素都必须是相同的数据类型，下标从0开始索引
键值对：MAP<primitive_type,data_type>, 例，map<string,float>
结构体：STRUCT<col_name1:data_type,col_name2:data_type….>
联合体：UNIONTYPE<data_type,data_type…>，目前对UNIONTYPE的支持还不是很完善，主要体现在执行一些例如Join/where/group by等子句的时候会报错。

1. Hive 3种复杂类型基本使用

在vi/vim中输入控制字符分隔符的方法：ctrl+v ctrl+A=\001，ctrl+v ctrl+B=\002，ctrl+v ctrl+C=\003

-- emp: employee table with one column of each complex type
-- (ARRAY for family, MAP for deductions, STRUCT for address).
-- Fields, collection items and map keys are separated by the control
-- characters \001, \002 and \003 respectively; rows end with a newline.
CREATE EXTERNAL TABLE IF NOT EXISTS emp (
    id         INT                                       COMMENT '员工id',
    name       STRING                                    COMMENT '员工姓名',
    salary     FLOAT                                     COMMENT '员工薪资',
    family     ARRAY<STRING>                             COMMENT '家庭成员姓名',
    deductions MAP<STRING,FLOAT>                         COMMENT '税收项',
    address    STRUCT<city:STRING,street:STRING,zip:INT> COMMENT '住址（城市，街道，编码）'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
    COLLECTION ITEMS TERMINATED BY '\002'
    MAP KEYS TERMINATED BY '\003'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- emp1: same columns as emp, but with custom, human-readable delimiters:
-- ',' between fields, ' ' between collection items, ':' between a map key
-- and its value.
CREATE EXTERNAL TABLE IF NOT EXISTS emp1 (
    id         INT                                       COMMENT '员工id',
    name       STRING                                    COMMENT '员工姓名',
    salary     FLOAT                                     COMMENT '员工薪资',
    family     ARRAY<STRING>                             COMMENT '家庭成员姓名',
    deductions MAP<STRING,FLOAT>                         COMMENT '税收项',
    address    STRUCT<city:STRING,street:STRING,zip:INT> COMMENT '住址（城市，街道，编码）'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ' '
    MAP KEYS TERMINATED BY ':';

-- Load the emp1 data from the local file system.
LOAD DATA LOCAL INPATH '/var/root/complex/emp1.txt'
INTO TABLE emp1;

包含复杂类型的表的操作

-- ---------------------------------------------------------------
-- ARRAY
-- ---------------------------------------------------------------
-- Element access by index (indexes start at 0).
-- An out-of-range index does not raise an error; it returns NULL.
SELECT family[2]
FROM emp1;

-- Number of elements in the array.
SELECT size(family)
FROM emp1;

-- Whether the array contains a given element; the result is true or false.
SELECT array_contains(family, 'Miao')
FROM emp1;

-- Sort the array elements.
SELECT sort_array(family)
FROM emp1;

-- ---------------------------------------------------------------
-- MAP
-- ---------------------------------------------------------------
-- Look up a value by its key.
SELECT deductions["Tax"]
FROM emp1;

-- Number of key/value pairs in the map.
SELECT size(deductions)
FROM emp1;

-- All keys of the map.
SELECT map_keys(deductions)
FROM emp1;

-- All values of the map.
SELECT map_values(deductions)
FROM emp1;

-- ---------------------------------------------------------------
-- STRUCT
-- ---------------------------------------------------------------
-- Each struct member is read like an attribute, using dot notation.
SELECT
    address.city,
    address.zip
FROM emp1;

2. Hive 3种复杂类型的嵌套

1. ARRAY 和 STRUCT嵌套

-- emp2: nests a STRUCT inside an ARRAY; info is an array whose elements
-- are (city, street, zip) structs.
CREATE EXTERNAL TABLE IF NOT EXISTS emp2 (
    id     INT,
    name   STRING,
    salary FLOAT,
    info   ARRAY<STRUCT<city:STRING,street:STRING,zip:INT>>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ' '
    MAP KEYS TERMINATED BY ':';

-- Load the emp2 data from the local file system.
LOAD DATA LOCAL INPATH '/var/root/complex/emp2.txt'
INTO TABLE emp2;

2. STRUCT 和 MAP嵌套

-- emp3: nests a MAP inside a STRUCT; info holds the salary together with
-- the deductions map.
-- The map member is spelled "deductions" (previously misspelled
-- "deductons"), matching the emp4 definition of the same struct.
-- NOTE(review): this layout has four nesting levels but only three
-- delimiters are declared here; the fourth level presumably falls back to
-- a Hive default -- confirm against the contents of emp3.txt.
CREATE EXTERNAL TABLE IF NOT EXISTS emp3 (
    id   INT,
    name STRING,
    info STRUCT<
        salary:FLOAT,
        deductions:MAP<STRING,FLOAT>
    >
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ' '
    MAP KEYS TERMINATED BY ':';

-- Load the emp3 data from the local file system.
LOAD DATA LOCAL INPATH '/var/root/complex/emp3.txt'
INTO TABLE emp3;
---4层嵌套关系
        1、各个字段之间
        2、Struct的各个成员之间，即salary和deductions之间
        3、deductions的各个KeyValue对之间，即Tax:.2 Insurance:.05
        4、每个KeyValue对的Key和Value之间,Tax:.2

-- emp4: same STRUCT-with-MAP layout as emp3, declared without a
-- ROW FORMAT clause, so no custom delimiters are specified.
CREATE EXTERNAL TABLE IF NOT EXISTS emp4 (
    id   INT,
    name STRING,
    info STRUCT<
        salary:FLOAT,
        deductions:MAP<STRING,FLOAT>
    >
);
emp4建表时没有指定分隔符，因此各层使用Hive的默认分隔符：
第一层：\001
第二层：\002
第三层：\003
第四层：\004
-- Load the emp4 data from the local file system.
LOAD DATA LOCAL INPATH '/var/root/complex/emp4.txt'
INTO TABLE emp4;

3. ARRAY/MAP/STRUCT

-- emp5: combines all three complex types; info is a STRUCT holding
-- family (an ARRAY of MAPs) and address (a nested STRUCT).
-- No ROW FORMAT clause is given, so no custom delimiters are specified.
CREATE EXTERNAL TABLE IF NOT EXISTS emp5 (
    id   INT,
    name STRING,
    info STRUCT<
        family:ARRAY<MAP<STRING,STRING>>,
        address:STRUCT<city:STRING,street:STRING,zip:INT>
    >
);
---5层嵌套
        1、字段之间
        2、info的成员之间，即family和address之间
        3、family的各个元素之间、address的各个成员之间
        4、family每个元素的各个KeyValue对之间
        5、family的每一个元素的Key和Value之间
-- Load the emp5 data from the local file system.
LOAD DATA LOCAL INPATH '/var/root/complex/emp5.txt'
INTO TABLE emp5;