以Postgresql為主,再聊聊資料庫 PostgreSQL pg_stats 統計資訊與 index 的探討

postgresql sql rdbms

一級屠豬士 2020-09-25 23:42:34 ‧ 1898 瀏覽

分享至

在之前的 PostgreSQL BRIN index 介紹二 中,
使用了 pg_stats, 來查看關聯性.
我們接著來探討 pg_stats 中其他欄位與index的關係.

https://docs.postgresql.tw/internals/system-catalogs/pg_stats


create table it200925a (
  id int not null
, activated bool not null
);

insert into it200925a
select n
     , random() < 0.9 
  from generate_series(1, 1e5) as n;

select activated
     , count(*)
  from it200925a
 group by activated;

+-----------+-------+
| activated | count |
+-----------+-------+
| f         | 10104 |
| t         | 89896 |
+-----------+-------+
(2 rows)

analyze it200925a;

select attname
     , n_distinct
     , most_common_vals
     , most_common_freqs
  from pg_stats
 where tablename = 'it200925a';

+-----------+------------+------------------+-------------------------+
|  attname  | n_distinct | most_common_vals |    most_common_freqs    |
+-----------+------------+------------------+-------------------------+
| id        |         -1 | ¤                | ¤                       |
| activated |          2 | {t,f}            | {0.8986667,0.101333335} |
+-----------+------------+------------------+-------------------------+
(2 rows)

id 的 n_distinct 為 -1, 代表 unique ; 這是統計值.
我們在 create table 並沒有宣告 id 為 unique.
再insert 1000筆,然後再觀察.

insert into it200925a
select n
     , random() < 0.9 
  from generate_series(1, 1e3) as n;

commit;
analyze it200925a;

select attname
     , n_distinct
     , most_common_vals
     , most_common_freqs
  from pg_stats
 where tablename = 'it200925a';

+-----------+------------+------------------+--------------------------+
|  attname  | n_distinct | most_common_vals |    most_common_freqs     |
+-----------+------------+------------------+--------------------------+
| id        | -0.9828515 | ¤                | ¤                        |
| activated |          2 | {t,f}            | {0.89893335,0.101066664} |
+-----------+------------+------------------+--------------------------+

id 的 n_distinct 變為 -0.9828515 了.
接著觀察 activated 的 n_distinct 為 2, 因為只有 2種值,
most_common_vals, most_common_freqs 兩個都是 anyarray 形態, 也有對應關係.

接著使用 activated 建立 btree index 

create index on it200925a using btree(activated);


explain (costs off)
select *
  from it200925a
 where not activated;
+----------------------------------------------------+
|                     QUERY PLAN                     |
+----------------------------------------------------+
| Bitmap Heap Scan on it200925a                      |
|   Filter: (NOT activated)                          |
|   ->  Bitmap Index Scan on it200925a_activated_idx |
|         Index Cond: (activated = false)            |
+----------------------------------------------------+

activated 是 false 的情況約 10% , 優化器使用了 index

explain (costs off)
select *
  from it200925a
 where activated;
+-----------------------+
|      QUERY PLAN       |
+-----------------------+
| Seq Scan on it200925a |
|   Filter: activated   |
+-----------------------+

activated 是 true 的情況約 90% , 優化器不使用 index

---
接著再來看數值較多一些的情況.

create table it200925b (
 val text not null
);

insert into it200925b
select (array['foo','bar','baz','qux','quux'])[floor(random() * 5 + 1)]
  from generate_series(1, 1e6);

commit;
analyze it200925b;

select attname
     , n_distinct
     , most_common_vals
     , most_common_freqs
  from pg_stats
 where tablename = 'it200925b';

+---------+------------+------------------------+------------------------------------------------------+
| attname | n_distinct |    most_common_vals    |                  most_common_freqs                   |
+---------+------------+------------------------+------------------------------------------------------+
| val     |          5 | {qux,baz,bar,foo,quux} | {0.20163333,0.20066667,0.2006,0.19963333,0.19746667} |
+---------+------------+------------------------+------------------------------------------------------+

create index on it200925b using btree (val);
commit;

explain (costs off)
select *
  from it200925b
 where val = 'bar';

+----------------------------------------------+
|                  QUERY PLAN                  |
+----------------------------------------------+
| Bitmap Heap Scan on it200925b                |
|   Recheck Cond: (val = 'bar'::text)          |
|   ->  Bitmap Index Scan on it200925b_val_idx |
|         Index Cond: (val = 'bar'::text)      |
+----------------------------------------------+

explain (costs off)
select *
  from it200925b
 where val = 'foo';
 
+----------------------------------------------+
|                  QUERY PLAN                  |
+----------------------------------------------+
| Bitmap Heap Scan on it200925b                |
|   Recheck Cond: (val = 'foo'::text)          |
|   ->  Bitmap Index Scan on it200925b_val_idx |
|         Index Cond: (val = 'foo'::text)      |
+----------------------------------------------+

explain (costs off)
select *
  from it200925b
 where val in ('foo', 'bar', 'quux');

+--------------------------------------------------+
|                    QUERY PLAN                    |
+--------------------------------------------------+
| Seq Scan on it200925b                            |
|   Filter: (val = ANY ('{foo,bar,quux}'::text[])) |
+--------------------------------------------------+

由上面的結果,可以觀察到優化器的一些行為.
當然我們可以做一些參數調整,來調整優化器的評估基準,這是比較深入的課題了.

n_distinct 的數值,在一些探討資料庫 index 的資料中會用 選擇性 (Selective),
來表示.有興趣的可以用來 google.