经典sql:统计连续登陆的用户数(用hive、mysql均可)[含建表语句]
排版可能不太好,因为我是先记在自己的语雀笔记里的,但想到CSDN也教了我这么多,想回馈CSDN,因此二次发在这里。
我这边对语句进⾏了更新,话不多说,开始吧。
1.建表语句
· 源数据,文件是以逗号(,)隔开的txt文本
· 这⾥数据重复了,但不要去重哈
id,date
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
A,2018-09-04
B,2018-09-04
C,2018-09-04
A,2018-09-05
A,2018-09-05
C,2018-09-05
A,2018-09-06
B,2018-09-06
C,2018-09-06
· 插⼊语句
-- Seed rows for tb_use: (user id, login date).
-- The sample data contains duplicate rows on purpose; they are inserted as-is.
insert into table tb_use values
    ("A", "2018-09-04"),
    ("B", "2018-09-04"),
    ("C", "2018-09-04"),
    ("A", "2018-09-05"),
    ("A", "2018-09-05"),
    ("C", "2018-09-05"),
    ("A", "2018-09-06"),
    ("B", "2018-09-06"),
    ("C", "2018-09-06"),
    ("A", "2018-09-04"),
    ("B", "2018-09-04"),
    ("C", "2018-09-04"),
    ("A", "2018-09-05"),
    ("A", "2018-09-05"),
    ("C", "2018-09-05"),
    ("A", "2018-09-06"),
    ("B", "2018-09-06"),
    ("C", "2018-09-06");
· 设置本地模式
-- Let Hive decide automatically whether a query is small enough to run in local mode.
set hive.exec.mode.local.auto=true;
· 在hive中创建表
-- Annotated version: creates tb_use and explains each storage clause.
--   * The date column is named datee because `date` is a reserved keyword in recent Hive versions,
--     and every query below refers to it as datee.
--   * fields terminated by ',' : column (field) separator.
--   * lines terminated by '\n' : row (line) separator; '\n' is the default.
--   * stored as textfile       : storage format; textfile is Hive's default
--                                (other formats: SequenceFile, RCFile, ORCFile).
--   * Optional clauses, not used here:
--       partitioned by (daystr string)  -- partition columns; must come before the row format clause
--       location 'hdfs_path'            -- explicit HDFS storage path
drop table if exists tb_use;
create table tb_use (
    id    string,
    datee string
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile;

-- Compact version (the one actually used in this article); same definition as above.
drop table if exists tb_use;
create table tb_use (
    id    string,
    datee string
)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile;
· 从本地加载数据到表中
-- Load the comma-separated source data from the local filesystem into tb_use,
-- replacing whatever the table already contains (overwrite).
-- NOTE(review): '/export/' looks like a truncated path; confirm it points at the actual data file.
load data local inpath '/export/'
    overwrite
    into table tb_use;
2.基础知识
· 将⽇期格式的字符串转为⽇期格式
-- Convert a 'yyyy-MM-dd' string into a DATE value.
select cast('2018-09-05' as date);
-- Result: 2018-09-05
· ⽇期增加函数:date_add(string startdate, int days)
-- Add 10 days to a date (Hive form; MySQL spells it date_add('2016-12-08', interval 10 day)).
select date_add('2016-12-08', 10);
-- Result: 2016-12-18
· ⽇期减少函数:date_sub (string startdate, int days)
-- Subtract 10 days from a date (Hive form; MySQL spells it date_sub('2016-12-08', interval 10 day)).
select date_sub('2016-12-08', 10);
-- Result: 2016-11-28
· lead函数(第三个参数是取不到后一行记录时返回的默认值,下面的例子设为-1;不指定时默认为null)
-- Example 1: for every login row, fetch the next row's datee within the same user.
-- Rows are ordered newest-first, so the "next" row is an earlier (or equal) login date.
-- No default is given, so date1 is NULL on each user's last row.
select
    id,
    datee,
    lead(datee, 1) over (partition by id order by datee desc) as date1
from tb_use;

-- Example 2: same idea, but rows are first collapsed to one per (id, datee)
-- and -1 is returned instead of NULL when there is no following row.
select
    id,
    datee,
    lead(datee, 1, -1) over (partition by id order by datee desc) as date1
from tb_use
group by
    id,
    datee;
3.解法1:lead搭配date_sub函数
· 展现连续登陆两天的⽤户信息(⽤窗⼝函数分析⽅法)
-- List the users who logged in on at least two consecutive days (Hive).
-- Inner query: one row per (user, day); date1 is that user's previous distinct login day
-- (rows are ordered newest-first, so LEAD looks back in time), or -1 when there is none.
-- Outer filter: a day counts when the day before it equals the previous login day.
-- Rows carrying the -1 sentinel are expected to drop out because the cast does not yield a date.
select distinct
    b.id as c1
from (
    select
        id,
        datee,
        lead(datee, 1, -1) over (partition by id order by datee desc) as date1
    from tb_use as a
    group by
        id,
        datee
) as b
where date_sub(cast(b.datee as date), 1) = cast(b.date1 as date);
· 统计连续登陆两天的用户数(hive版)/ 展现连续登陆两天的用户id(mysql版)(用窗口函数分析方法)
-- Hive version: count the distinct users who logged in on at least two consecutive days.
-- The subquery keeps one row per (user, day) and pairs each day with the user's previous
-- distinct login day (date1, or -1 when there is none); the filter keeps the days whose
-- preceding calendar day equals date1.
select
    count(distinct login_pair.id) as c1
from (
    select
        id,
        datee,
        lead(datee, 1, -1) over (partition by id order by datee desc) as date1
    from tb_use as src
    group by
        id,
        datee
) as login_pair
where date_sub(cast(login_pair.datee as date), 1) = cast(login_pair.date1 as date);
-- MySQL version (needs MySQL 8.0+ for window functions):
-- list the users who logged in on at least two consecutive days.
--   * MySQL's date_sub only accepts the INTERVAL form, so "1" becomes "interval 1 day".
--   * distinct stops a user with a longer streak from being listed once per qualifying day.
--   * lead() is left with its NULL default: on a user's earliest day the comparison is then
--     NULL and the row is filtered out, which is what the -1 sentinel was used for.
select distinct
    b.id as c1
from (
    select
        id,
        datee,
        lead(datee, 1) over (partition by id order by datee desc) as date1
    from tb_use as a
    group by
        id,
        datee
) as b
where date_sub(cast(b.datee as date), interval 1 day) = cast(b.date1 as date);
-- Solution 2 (MySQL 8.0+): row_number() combined with date_sub.
--   step1 (distinct_login): collapse to one row per (id, datee), because a user can log in
--                           several times on the same day.
--   step2 (numbered_login): number each user's login days in ascending date order.
--   step3 (streak_key)    : subtract the row number (in days) from the date; days that belong
--                           to the same consecutive run share the same ds value.
--   final                 : a (id, ds) group with more than 2 rows is a run of 3 or more
--                           consecutive days; distinct lists each such user once.
-- NOTE(review): the threshold here is 3+ days, while the LEAD-based queries above look for
-- 2 consecutive days; use "count(*) >= 2" if the same rule is intended.
WITH distinct_login AS (
    SELECT
        id,
        datee
    FROM tb_use
    GROUP BY
        id,
        datee
),
numbered_login AS (
    SELECT
        id,
        datee,
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY datee) AS rn
    FROM distinct_login
),
streak_key AS (
    SELECT
        id,
        datee,
        rn,
        DATE_SUB(datee, INTERVAL rn DAY) AS ds
    FROM numbered_login
)
SELECT DISTINCT
    id
FROM streak_key
GROUP BY
    id,
    ds
HAVING COUNT(*) > 2;
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论