在SQLServer中如何⽐较两个表的各组数据
开始
前⼀阵⼦,在项⽬中碰到这样⼀个SQL查询需求,有两个相同结构的表(table_left & table_right),如下:
图1.
检查表table_left的各组(groupId),是否在表table_right中存在有⼀组(groupId)数据(data)与它的数据(data)完全相等.
如图1. 可以看出表table_left和table_right存在两组数据完整相等:
图2.
分析
从上⾯的两个表,可以知道它们存放的是⼀组⼀组的数据;那么,接下来我借助数学集合的列举法和运算进⾏分析。
先通过集合的列举法描述两个表的各组数据:
图3.
这⾥只有两种情况,相等和不相等。对于不相等,可再分为部分相等、包含、和完全不相等。使⽤集合描述,可使⽤交集,⼦集,并集。如下⾯图4.,我列举出这⼏种常见的情况:
图4.
实现
在数据库中,要出表table_left和表table_right存在相同数据的组,⽅法很多,这⾥我列出两种常⽤的⽅法。
(下⾯的SQL脚本,是以图4.的数据为基础参考)
⽅法1:
通过"Select … From …Order by … xml for path('') "把各组的data列数据连串起来(如,图4.把table_left的组#11的列data连串起来成"data1-data2-data3"),其他分组(包含表table_right)以此⽅
法实现data列数据连串起来;然后通过⽐较两表的连串后字段是否存在相等,若是相等就说明这⽐较多两组数据相等,由此可以判断出表table_left的哪组数据在表table_right存在与它数据完全相等的组。
针对⽅法1,需要对原表增加⼀个字段dataPath,⽤于存储data列数据连串的结果,如:
alter table table_left add dataPath nvarchar(200)
alter table table_right add dataPath nvarchar(200)
分组连串data列数据并update⾄刚新增的列dataPath,如:
update a
set dataPath=b.dataPath
from table_left a
cross apply(select (select'-'+x.data from table_left x upId order by x.data for xml path(''))as dataPath)b
update a
set dataPath=b.dataPath
from table_right a
cross apply(select (select'-'+x.data from table_right x upId order by x.data for xml path(''))as dataPath)b
接下来就是查询了,如:
select upId
from table_left a
where exists(select 1 from table_right x where x.dataPath=a.dataPath)
完整代码:
View Code
use tempdb
go
if object_id('table_left') is not null drop table table_left
if object_id('table_right') is not null drop table table_right
go
create table table_left(groupId nvarchar(5),data nvarchar(10))
create table table_right(groupId nvarchar(5),data nvarchar(10))
go
alter table table_left add dataPath nvarchar(200)
alter table table_right add dataPath nvarchar(200)
go
create nonclustered index ix_left on table_left(dataPath)
create nonclustered index ix_right on table_right(dataPath)
go
set nocount on
go
insert into table_right(groupId,data)
select'#1','data1'union all
select'#1','data2'union all
select'#1','data3'union all
select'#2','data55'union all
select'#2','data55'union all
select'#3','data91'union all
select'#3','data92'union all
select'#4','data65'union all
select'#4','data66'union all
select'#4','data67'union all
select'#4','data68'union all
select'#4','data69'union all
select'#5','data77'union all
select'#5','data79'
insert into table_left(groupId,data)
select'#11','data1'union all
select'#11','data2'union all
select'#11','data3'union all
select'#22','data55'union all
select'#22','data57'union all
select'#33','data99'union all
select'#33','data99'union all
select'#44','data66'union all
select'#44','data68'union all
select'#55','data77'union all
select'#55','data78'union all
select'#55','data79'
go
update a
set dataPath=b.dataPath
from table_left a
cross apply(select (select'-'+x.data from table_left x upId order by x.data for xml path('')) as dataPath)b
update a
set dataPath=b.dataPath
from table_right a
cross apply(select (select'-'+x.data from table_right x upId order by x.data for xml path('')) as dataPath)b
--
select upId
from table_left a
where exists(select1from table_right x where x.dataPath=a.dataPath)
⽅法2:
通过SQL Sever提供的集运算符"Except",判断两组⾮重复的数据。如果两组针对对⽅都不存在⾮重复的数据,就说明这两组数据完全相等。如,表table_left中的组#11和表table_right中的组#1,对列data进⾏"Except"集运算,⽆任是(#11 à #1)进⾏Except集运算,还是(#1 à #11 )进⾏Except集合运算,都返回空结果,这就说明组#1 和#11的data数据完全相等,如:
select data from table_left where groupId='#11'except select data from table_right where
groupId='#1'
select data from table_right where groupId='#1'except select data from table_left where
groupId='#11'
同样道理,我们把表table_left中的组#11和表 table_right中的组#2,对列data进⾏"Except"集运算,如:
select data from table_left where groupId='#11'except select data from table_right where
groupId='#2'
select data from table_right where groupId='#2'except select data from table_left where
groupId='#11'
只要(#11 à #2 )或 (#2 à #11 )的"Except"集运算结果有记录,就说明两组的数据不相等。
两张表的所有组都进⾏⽐较,我们需要通过以下SQL脚本实现,如:
select upId
from table_left a
inner join table_right b on b.data=a.data
where not exists(select x.data from table_left x upId except select y.data
from table_right y upId )
and not exists(select x.data from table_right x upId except select y.data
from table_left y upId )
完整代码:
View Code
use tempdb
go
if object_id('table_left') is not null drop table table_left
if object_id('table_right') is not null drop table table_right
go
create table table_left(groupId nvarchar(5),data nvarchar(10))
create table table_right(groupId nvarchar(5),data nvarchar(10))
go
create nonclustered index ix_left on table_left(data)
create nonclustered index ix_right on table_right(data)
go
set nocount on
go
insert into table_right(groupId,data)
select'#1','data1'union all
select'#1','data2'union all
select'#1','data3'union all
select'#2','data55'union all
select'#2','data55'union all
select'#3','data91'union all
select'#3','data92'union all
select'#4','data65'union all
select'#4','data66'union all
select'#4','data67'union all
select'#4','data68'union all
select'#4','data69'union all
select'#5','data77'union all
select'#5','data79'
insert into table_left(groupId,data)
sql中union多表合并select'#11','data1'union all
select'#11','data2'union all
select'#11','data3'union all
select'#22','data55'union all
select'#22','data57'union all
select'#33','data99'union all
select'#33','data99'union all
select'#44','data66'union all
select'#44','data68'union all
select'#55','data77'union all
select'#55','data78'union all
select'#55','data79'
go
--select
select upId
from table_left a
inner join table_right b on b.data=a.data
where not exists(select x.data from table_left x upId except select y.data from table_right y upId )
and not exists(select x.data from table_right x upId except select y.data from table_left y upId )
⽅法1 Vs. ⽅法2 :
⽅法1和⽅法2都能出表table_left在table_right存在数据完全相等的组#11。但性能⾓度上,⽅法2⽐
⽅法1略胜⼀筹,可以看它们执⾏过程的统计信息:
⽅法1:
图5.
⽅法2:
图6.
如果,数据量⼤情况下,那么⽅法2⽐⽅法1更具有明显的优点。因为⽅法1,多两个更新dataPath的部分,数据量随着增加,这⾥位置的更新就耗很多的资源;如果dataPath列
数据⼤⼩超过900字节,会导致⽆法在dataPath创建索引,影响后⾯的Select查询性能。
扩展
这⾥说扩展,主要是针对上⾯的⽅法2来说。在当列data的数据⼤⼩超过900字节,或者含有多个数据列要进⾏⽐较,看是否存在两组(groupId)的各对应列数据⼀⼀相等。
图7.
这样的情况,可对字段dataSub1 & dataSub2 创建⼀个哈希索引,如:
alter table table_left add dataChecksum as checksum(dataSub1,dataSub2)
alter table table_right add dataChecksum as checksum(dataSub1,dataSub2)
go
create nonclustered index ix_table_left_cs on table_right(dataChecksum)
create nonclustered index table_right_cs on table_right(dataChecksum)
后⾯的select查询语句,在Inner Join 部分稍改动下即可,如:
select upId
from table_left a
inner join table_right b on b.dataChecksum=a.dataChecksum
and b.dataSub1=a.dataSub1
and b.dataSub2=a.dataSub2
where not exists(select x.dataSub1,x.dataSub2from table_left x upId
except select y.dataSub1,y.dataSub2from table_right y upId )
and not exists(select x.dataSub1,x.dataSub2from table_right x upId
except select y.dataSub1,y.dataSub2from table_left y upId )
完整代码:
View Code
use tempdb
go
if object_id('table_left') is not null drop table table_left
if object_id('table_right') is not null drop table table_right
go
create table table_left(groupId nvarchar(5),dataSub1 nvarchar(10),dataSub2 nvarchar(10))
create table table_right(groupId nvarchar(5),dataSub1 nvarchar(10),dataSub2 nvarchar(10))
go
alter table table_left add dataChecksum as checksum(dataSub1,dataSub2)
alter table table_right add dataChecksum as checksum(dataSub1,dataSub2)
go
create nonclustered index ix_table_left_cs on table_left(dataChecksum)
create nonclustered index table_right_cs on table_right(dataChecksum)
go
set nocount on
go
insert into table_right(groupId,dataSub1,dataSub2)
select'#1','data1','data7'union all
select'#1','data2','data8'union all
select'#1','data3','data9'union all
select'#2','data55','data4'union all
select'#2','data55','data5'
insert into table_left(groupId,dataSub1,dataSub2)
select'#11','data1','data7'union all
select'#11','data2','data8'union all
select'#11','data3','data9'union all
select'#22','data55','data0'union all
select'#22','data57','data2'union all
select'#33','data99','data4'union all
select'#33','data99','data6'
go
--select
select upId
from table_left a
inner join table_right b on b.dataChecksum=a.dataChecksum
and b.dataSub1=a.dataSub1
and b.dataSub2=a.dataSub2
where not exists(select x.dataSub1,x.dataSub2 from table_left x upId except select y.dataSub1,y.dataSub2 from table_right y upId ) and not exists(select x.dataSub1,x.dataSub2 from table_right x upId except select y.dataSub1,y.dataSub2 from table_left y upId )⼩结
对于这个问题,可能还有其他的或更优的解决⽅法.⽽且在实际的⽣产环境中,可能碰到的情况会有所不同,⽆论如何,需要多分析,多动⼿多实验,到最优的解决⽅法。
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论