How to Process JSON Data in Hive
Broadly speaking, Hive offers three ways to process JSON data:
the built-in functions get_json_object and json_tuple;
a custom UDF (one value in, one value out) or a custom UDTF (one row in, multiple rows out);
a third-party SerDe, e.g. JsonSerDe.
1. Using the built-in functions
get_json_object(string json_string, string path)
Return type: string
Description: parses the JSON string json_string and returns the content at the given path; returns NULL if the input JSON string is invalid; each call returns only a single value.
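For example, against a JSON literal:
select get_json_object('{"id": 1, "ids": [101,102,103]}', '$.id');     -- 1
select get_json_object('{"id": 1, "ids": [101,102,103]}', '$.ids[0]'); -- 101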
json_tuple(jsonStr, k1, k2, ...)
Return type: all input and output parameters are strings.
Description: takes a JSON string plus a set of keys k1, k2, ... and returns a tuple of the corresponding values. It is more efficient than get_json_object because several keys are extracted in a single call.
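For example, two keys in one call:
select json_tuple('{"id": 1, "total_number": 3}', 'id', 'total_number'); -- two string columns: 1, 3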
explode: splits a complex array or map structure in a single Hive row into multiple rows.
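For example:
select explode(array(101, 102, 103)); -- three rows: 101, 102, 103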
Test data (fields delimited by ';'):
user1;18;male;{"id": 1,"ids": [101,102,103],"total_number": 3}
user2;20;female;{"id": 2,"ids": [201,202,203,204],"total_number": 4}
user3;23;male;{"id": 3,"ids": [301,302,303,304,305],"total_number": 5}
user4;17;male;{"id": 4,"ids": [401,402,403,304],"total_number": 5}
user5;35;female;{"id": 5,"ids": [501,502,503],"total_number": 3}
Create the table and load the data:
CREATE TABLE IF NOT EXISTS jsont1(
username string,
age int,
sex string,
json string
)
row format delimited fields terminated by ';';
load data local inpath '/root/data/weibo.json' overwrite into table jsont1;
Working with the JSON column:
-- extract top-level values
select username, age, sex, get_json_object(json, "$.id") id,
get_json_object(json, "$.ids") ids,
get_json_object(json, "$.total_number") num
from jsont1;
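With the test data above, the first row comes back as (note that ids is returned as a single string, not an array):
user1 18 male 1 [101,102,103] 3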
-- extract array elements
select username, age, sex, get_json_object(json, "$.id") id,
get_json_object(json, "$.ids[0]") ids0,
get_json_object(json, "$.ids[1]") ids1,
get_json_object(json, "$.ids[2]") ids2,
get_json_object(json, "$.ids[3]") ids3,
get_json_object(json, "$.total_number") num
from jsont1;
-- use json_tuple to extract multiple fields in one call
select json_tuple(json, 'id', 'ids', 'total_number')
from jsont1;
-- syntax error: json_tuple is a UDTF, so it cannot be mixed with other select expressions; use lateral view instead (see below)
select username, age, sex, json_tuple(json, 'id', 'ids', 'total_number')
from jsont1;
-- use explode + lateral view
-- building on the previous step, expand the array into separate rows
-- step 1: strip the [ and ] from [101,102,103]
-- select "[101,102,103]"
-- select "101,102,103"
select regexp_replace("[101,102,103]", "\\[|\\]", "");
-- step 2: turn the string from the previous step into an array
select split(regexp_replace("[101,102,103]", "\\[|\\]", ""), ",");
-- step 3: use explode + lateral view to expand the array into rows
select username, age, sex, id, ids, num
from jsont1
lateral view json_tuple(json, 'id', 'ids', 'total_number') t1 as id, ids, num;
with tmp as(
select username, age, sex, id, ids, num
from jsont1
lateral view json_tuple(json, 'id', 'ids', 'total_number') t1 as id, ids, num
)
select username, age, sex, id, ids1, num
from tmp
lateral view explode(split(regexp_replace(ids, "\\[|\\]", ""), ",")) t1 as ids1;
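With the test data above, user1 now expands into three rows:
user1 18 male 1 101 3
user1 18 male 1 102 3
user1 18 male 1 103 3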
Summary: json_tuple can parse several JSON fields in one call, but handling nested structures with it is cumbersome.
2. Using a custom UDF
Write a custom UDF to handle the array inside the JSON string:
Input: the JSON string and the key of the array
Output: an array of strings
pom.xml:
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.7</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.1.23</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
package cn.lagou.dw.hive.udf;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.google.common.base.Strings;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.junit.Test;

import java.util.ArrayList;

public class ParseJsonArray extends UDF {
    public ArrayList<String> evaluate(String jsonStr, String arrKey) {
        // empty or null input => NULL
        if (Strings.isNullOrEmpty(jsonStr)) {
            return null;
        }
        try {
            JSONObject object = JSON.parseObject(jsonStr);
            // pull out the array stored under the given key
            JSONArray jsonArray = object.getJSONArray(arrKey);
            ArrayList<String> result = new ArrayList<>();
            for (Object o : jsonArray) {
                result.add(o.toString());
            }
            return result;
        } catch (JSONException e) {
            // invalid JSON => NULL
            return null;
        }
    }

    @Test
    public void JunitParseJsonArray() {
        String str = "{\"id\": 1,\"ids\": [101,102,103],\"total_number\": 3}";
        String key = "ids";
        ArrayList<String> evaluate = evaluate(str, key);
        System.out.println(JSON.toJSONString(evaluate));
    }
}
Using the custom UDF:
-- add the jar you built (in the Hive CLI)
add jar /root/edu_jars/my_udf.jar;
-- create a temporary function; the class name must be fully qualified (package + class)
create temporary function lagou_json_array as "cn.lagou.dw.hive.udf.ParseJsonArray";
-- run the queries
-- parse the array inside the JSON string
select username, age, sex, lagou_json_array(json, "ids") ids
from jsont1;
-- parse the array inside the JSON string and expand it
select username, age, sex, ids1
from jsont1
lateral view explode(lagou_json_array(json, "ids")) t1 as ids1;
-- extract id and total_number from the JSON string
select username, age, sex, id, num
from jsont1
lateral view json_tuple(json, 'id', 'total_number') t1 as id, num;
-- parse and expand the array while also extracting id and total_number
select username, age, sex, ids1, id, num
from jsont1
lateral view explode(lagou_json_array(json, "ids")) t1 as ids1
lateral view json_tuple(json, 'id', 'total_number') t2 as id, num;
3. Using a custom UDTF
The UDTF takes the JSON string and emits one row per element of the ids array:
id  ids  total_number
1   101  3
1   102  3
1   103  3
package com.lagou.edu.udtf;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.List;

/**
 * UDTF developer guide on the official wiki: /confluence/display/Hive/DeveloperGuide+UDTF
 */
public class ParseJsonArrayUDTF extends GenericUDTF {
    private String[] obj = new String[3];

    /**
     * Declares the output column names and data types.
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // output column names
        List<String> colName = Lists.newLinkedList();
        colName.add("id");
        colName.add("ids");
        colName.add("total_number");
        // output column data types
        List<ObjectInspector> resType = Lists.newLinkedList();
        resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        // return the column names together with their types
        return ObjectInspectorFactory.getStandardStructObjectInspector(colName, resType);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // skip NULL input
        if (args[0] == null) {
            return;
        }
        String jsonStr = args[0].toString();
        JSONObject object = JSON.parseObject(jsonStr);
        // extract id, total_number and the ids array
        String id = object.getString("id");
        String total_number = object.getString("total_number");
        JSONArray ids = object.getJSONArray("ids");
        obj[0] = id;
        obj[2] = total_number;
        // emit one output row per element of the ids array
        for (Object o : ids) {
            obj[1] = o.toString();
            System.out.println(obj[0] + "_" + obj[1] + "_" + obj[2]);
            forward(obj);
        }
    }

    @Override
    public void close() throws HiveException {
    }

    public static void main(String[] args) throws HiveException {
        ParseJsonArrayUDTF p = new ParseJsonArrayUDTF();
        String str = "{\"id\": 1,\"ids\": [101,102,103],\"total_number\": 3}";
        p.process(new String[]{str});
    }
}
hive> add jar /root/jars/myudtf.jar;
hive> create temporary function myudtf as 'com.lagou.edu.udtf.ParseJsonArrayUDTF';
select username, age, sex, t1.id, t1.ids, t1.total_number
from jsont1
lateral view myudtf(json) t1 as id,ids,total_number;
4. Using a SerDe
Serialization converts an object into a byte sequence; deserialization restores the object from that byte sequence.
Object serialization has two main uses:
persisting objects, i.e. saving them to a file as a byte sequence;
sending object data across the network.
SerDe is short for Serializer/Deserializer. Hive uses a SerDe to serialize and deserialize row objects; this is what ultimately maps file contents onto the typed columns of a Hive table. A SerDe covers both directions:
Serialize turns the Java objects Hive works with into a byte sequence that can be written to HDFS, or into a stream that other systems can consume;
Deserialize turns a string or binary stream into Java objects that Hive can work with.
Read : HDFS files => InputFileFormat => <key, value> => Deserializer => Row object
Write : Row object => Serializer => <key, value> => OutputFileFormat => HDFS files
Hive ships with several built-in SerDes, and various third-party SerDes are also available. desc formatted shows which SerDe a table uses:
create table t11(id string)
stored as parquet;
create table t12(id string)
stored as ORC;
desc formatted t11;
desc formatted t12;
LazySimpleSerDe (the default SerDe)
ParquetHiveSerDe
OrcSerde
For data where each line is a plain JSON object, JsonSerDe can be used:
{"id": 1,"ids": [101,102,103],"total_number": 3}
{"id": 2,"ids": [201,202,203,204],"total_number": 4}
{"id": 3,"ids": [301,302,303,304,305],"total_number": 5}
{"id": 4,"ids": [401,402,403,304],"total_number": 5}
{"id": 5,"ids": [501,502,503],"total_number": 3}