create table it201011a (
gal text not null
);
-- 輸入一億筆
insert into it201011a
select (array['小島南', '初川南', '相沢南', '架乃ゆら', '山岸逢花', '楓カレン', '坂道みる', '橋本ありな', '葵つかさ', '天使もえ', '水卜さくら'])[floor(random() * 11) + 1::int]
from generate_series(1, 1e8);
create index on it201011a(gal);
--
要找出欄位distinct , 一般直接的方式.
select distinct gal
from it201011a;
這樣會很慢....
我們只看一下 評估成本,就不用 timing.
explain (costs)
select distinct gal
from it201011a;
+------------------------------------------------------------------------------+
| QUERY PLAN |
+------------------------------------------------------------------------------+
| HashAggregate (cost=1790541.00..1790541.11 rows=11 width=12) |
| Group Key: gal |
| -> Seq Scan on it201011a (cost=0.00..1540541.00 rows=100000000 width=12) |
+------------------------------------------------------------------------------+
--
with recursive t1 as (
select min(t.gal) gal
from it201011a t
where t.gal is not null
union all
select (select min(gal)
from it201011a a
where a.gal > b.gal
and a.gal is not null)
from t1 b
where b.gal is not null
)
select *
from t1
where gal is not null;
+------------+
| gal |
+------------+
| 初川南 |
| 小島南 |
| 相沢南 |
| 坂道みる |
| 天使もえ |
| 山岸逢花 |
| 架乃ゆら |
| 楓カレン |
| 葵つかさ |
| 橋本ありな |
| 水卜さくら |
+------------+
(11 rows)
Time: 4.377 ms
速度超快!! 來看一下執行計畫
explain (analyze,timing,costs)
with recursive t1 as (
select min(t.gal) gal
from it201011a t
where t.gal is not null
union all
select (select min(gal)
from it201011a a
where a.gal > b.gal
and a.gal is not null)
from t1 b
where b.gal is not null
)
select *
from t1
where gal is not null;
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| QUERY PLAN |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| CTE Scan on t1 (cost=241.33..243.35 rows=100 width=32) (actual time=0.147..3.688 rows=11 loops=1) |
| Filter: (gal IS NOT NULL) |
| Rows Removed by Filter: 1 |
| CTE t1 |
| -> Recursive Union (cost=2.33..241.33 rows=101 width=32) (actual time=0.141..3.666 rows=12 loops=1) |
| -> Result (cost=2.33..2.34 rows=1 width=32) (actual time=0.107..0.109 rows=1 loops=1) |
| InitPlan 3 (returns $1) |
| -> Limit (cost=0.57..2.33 rows=1 width=12) (actual time=0.100..0.101 rows=1 loops=1) |
| -> Index Only Scan using it201011a_gal_idx on it201011a t (cost=0.57..176009683.02 rows=100000000 width=12) (actual time=0.098..0.099 rows=1 loops=1) |
| Index Cond: (gal IS NOT NULL) |
| Heap Fetches: 1 |
| -> WorkTable Scan on t1 b (cost=0.00..23.70 rows=10 width=32) (actual time=0.291..0.292 rows=1 loops=12) |
| Filter: (gal IS NOT NULL) |
| Rows Removed by Filter: 0 |
| SubPlan 2 |
| -> Result (cost=2.34..2.35 rows=1 width=32) (actual time=0.315..0.315 rows=1 loops=11) |
| InitPlan 1 (returns $3) |
| -> Limit (cost=0.57..2.34 rows=1 width=12) (actual time=0.313..0.313 rows=1 loops=11) |
| -> Index Only Scan using it201011a_gal_idx on it201011a a (cost=0.57..59072943.18 rows=33333333 width=12) (actual time=0.312..0.312 rows=1 loops=11) |
| Index Cond: ((gal > b.gal) AND (gal IS NOT NULL)) |
| Heap Fetches: 10 |
| Planning Time: 0.437 ms |
| Execution Time: 3.823 ms |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
其實就一個loop,依次比較,且逐步減少需要聚合的量.
-----------
接著來看第二個例子. 有一種情境是 一個 table 存放 device 相關資料,device 可以是 sensor, 車輛 等等;
另一個 table 存放 device 的紀錄, 例如感應數據,或是行車狀況等等.
為了簡便起見,以下的例子就不加上日期.
create table itdevice (
id int not null primary key
);
insert into itdevice
select generate_series(1,1000);
create table itdevval (
id int generated always as identity
, devid int not null
, ts timestamp not null
);
輸入600萬筆資料
insert into itdevval(devid, ts)
select b.devid
, clock_timestamp()
from (select generate_series(1,1e4)) a
, (select generate_series(1,600) devid) b;
建立 itdevval 中 devid 欄位的 index.
create index on itdevval(devid);
----
當我們想要查詢哪些device,沒有數據資料,例如設備故障等情況.
因為兩邊都有 device id 的 index, 一般直接查詢.
基本的 not in
select *
from itdevice
where id not in (
select distinct devid
from itdevval);
Time: 1372.260 ms (00:01.372)
select *
from itdevice
where id not in (
select devid
from itdevval);
Time: 249832.184 ms (04:09.832)
來看一下執行計畫
explain (costs)
select *
from itdevice
where id not in (
select distinct devid
from itdevval);
+--------------------------------------------------------------------------------+
| QUERY PLAN |
+--------------------------------------------------------------------------------+
| Seq Scan on itdevice (cost=107440.50..107458.00 rows=500 width=4) |
| Filter: (NOT (hashed SubPlan 1)) |
| SubPlan 1 |
| -> HashAggregate (cost=107433.00..107439.00 rows=600 width=4) |
| Group Key: itdevval.devid |
| -> Seq Scan on itdevval (cost=0.00..92433.00 rows=6000000 width=4) |
+--------------------------------------------------------------------------------+
explain (costs)
select *
from itdevice
where id not in (
select devid
from itdevval);
+--------------------------------------------------------------------------------+
| QUERY PLAN |
+--------------------------------------------------------------------------------+
| Seq Scan on itdevice (cost=0.00..80435517.50 rows=500 width=4) |
| Filter: (NOT (SubPlan 1)) |
| SubPlan 1 |
| -> Materialize (cost=0.00..145871.00 rows=6000000 width=4) |
| -> Seq Scan on itdevval (cost=0.00..92433.00 rows=6000000 width=4) |
+--------------------------------------------------------------------------------+
distinct 有聚合, HashAggregate Group Key: itdevval.devid
但是都是 Seq Scan
使用 outer join 的方式
select a.id
from itdevice a
left join itdevval b
on (a.id = b.devid)
where b.* is null;
Time: 2218.486 ms (00:02.218)
explain (costs)
select a.id
from itdevice a
left join itdevval b
on (a.id = b.devid)
where b.* is null;
+---------------------------------------------------------------------------+
| QUERY PLAN |
+---------------------------------------------------------------------------+
| Hash Right Join (cost=27.50..108277.25 rows=30000 width=4) |
| Hash Cond: (b.devid = a.id) |
| Filter: (b.* IS NULL) |
| -> Seq Scan on itdevval b (cost=0.00..92433.00 rows=6000000 width=44) |
| -> Hash (cost=15.00..15.00 rows=1000 width=4) |
| -> Seq Scan on itdevice a (cost=0.00..15.00 rows=1000 width=4) |
+---------------------------------------------------------------------------+
遞迴的方式
with recursive t1 as (
select min(devid) devid
from itdevval
where devid is not null
union all
select (select min(v.devid)
from itdevval v
where v.devid > t1.devid
and v.devid is not null)
from t1
where t1.devid is not null
), t2 as (
select devid
from t1
where devid is not null
-- t2 是有 itdevval 有資料的 devid
)
select id
from itdevice
where id not in (
select devid
from t2);
Time: 11.715 ms
原理與第一個例子相同.速度比一般方法快多了.