For the final two days, let's run an experiment! Today I'll share the setup and the results, comparing the performance of two ways of storing logs in PostgreSQL:
The experiment generates 100,000 fake Apache access log entries as test data, drawn from 300 randomly generated IP addresses, and measures insert performance as well as query performance under various conditions.
To keep the comparison fair, two tables are created that hold the same amount of data but use different storage structures:
-- Table 1: direct mapping approach
CREATE TABLE apache_logs_direct (
    id SERIAL PRIMARY KEY,
    ip VARCHAR(15),
    timestamp TIMESTAMP,
    request_method VARCHAR(10),
    request_path TEXT,
    http_version VARCHAR(10),
    status_code INTEGER,
    response_size INTEGER,
    referer TEXT,
    user_agent TEXT,
    processing_time FLOAT
);
-- Table 2: JSONB approach
CREATE TABLE apache_logs_jsonb (
    id SERIAL PRIMARY KEY,
    log_data JSONB
);
Indexes are added to test whether indexing helps performance: a B-tree index on the ip column of the direct-mapping table, and a GIN index on the log_data column of the JSONB table.
-- B-tree index on the ip column of the direct mapping table
CREATE INDEX btree_apache_logs_direct_ip ON apache_logs_direct USING btree(ip);
-- GIN index on the log_data column of the JSONB table
CREATE INDEX gin_apache_logs_jsonb_log_data ON apache_logs_jsonb USING gin(log_data);
This lets us compare how much the two kinds of index affect insert performance.
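As an optional sanity check (not part of the original write-up), the pg_indexes view can confirm that both indexes exist before any data is loaded:
-- Optional: confirm both indexes were created
SELECT tablename, indexname, indexdef
FROM pg_indexes
WHERE tablename IN ('apache_logs_direct', 'apache_logs_jsonb');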
-- Generate 300 random IP addresses into a temporary table
CREATE TEMPORARY TABLE random_ips AS
WITH RECURSIVE ip_generator AS (
    SELECT 1 AS counter,
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER
           ) AS ip
    UNION ALL
    SELECT counter + 1,
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER
           )
    FROM ip_generator
    WHERE counter < 300
)
SELECT DISTINCT ip FROM ip_generator LIMIT 300;
-- Generate and insert 100,000 Apache logs with direct property mapping
DO $$
DECLARE
    v_ip TEXT;
    v_timestamp TIMESTAMP;
    v_request_method TEXT;
    v_request_path TEXT;
    v_http_version TEXT;
    v_status_code INTEGER;
    v_response_size INTEGER;
    v_referer TEXT;
    v_user_agent TEXT;
    v_processing_time FLOAT;
    v_methods TEXT[] := ARRAY['GET', 'POST', 'PUT', 'DELETE', 'HEAD'];
    v_paths TEXT[] := ARRAY[
        '/index.html', '/about', '/contact', '/products', '/services',
        '/blog', '/api/v1/users', '/api/v1/orders', '/login', '/logout',
        '/dashboard', '/profile', '/settings', '/images/logo.png', '/css/main.css'
    ];
    v_versions TEXT[] := ARRAY['HTTP/1.0', 'HTTP/1.1', 'HTTP/2.0'];
    v_status_codes INTEGER[] := ARRAY[200, 201, 204, 301, 302, 304, 400, 401, 403, 404, 500, 503];
    v_referers TEXT[] := ARRAY[
        'https://google.com', 'https://bing.com', 'https://yahoo.com',
        'https://example.com', 'https://facebook.com', 'https://twitter.com',
        '', 'https://linkedin.com', 'https://github.com'
    ];
    v_user_agents TEXT[] := ARRAY[
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
    ];
BEGIN
    -- Start timing for direct property mapping approach
    RAISE NOTICE 'Starting direct property mapping insert at %', clock_timestamp();
    FOR i IN 1..100000 LOOP
        -- Select a random IP from our pre-generated list
        SELECT ip INTO v_ip FROM random_ips ORDER BY RANDOM() LIMIT 1;
        -- Generate random values for other fields
        v_timestamp := NOW() - (RANDOM() * INTERVAL '365 days');
        v_request_method := v_methods[1 + floor(RANDOM() * array_length(v_methods, 1))];
        v_request_path := v_paths[1 + floor(RANDOM() * array_length(v_paths, 1))];
        v_http_version := v_versions[1 + floor(RANDOM() * array_length(v_versions, 1))];
        v_status_code := v_status_codes[1 + floor(RANDOM() * array_length(v_status_codes, 1))];
        v_response_size := floor(RANDOM() * 1048576)::INTEGER; -- Random size up to 1MB
        v_referer := v_referers[1 + floor(RANDOM() * array_length(v_referers, 1))];
        v_user_agent := v_user_agents[1 + floor(RANDOM() * array_length(v_user_agents, 1))];
        v_processing_time := RANDOM() * 2.0; -- Random processing time up to 2 seconds
        -- Insert the record
        INSERT INTO apache_logs_direct (
            ip, timestamp, request_method, request_path, http_version,
            status_code, response_size, referer, user_agent, processing_time
        ) VALUES (
            v_ip, v_timestamp, v_request_method, v_request_path, v_http_version,
            v_status_code, v_response_size, v_referer, v_user_agent, v_processing_time
        );
        -- Output progress every 10,000 records
        IF i % 10000 = 0 THEN
            RAISE NOTICE 'Inserted % direct mapping records at %', i, clock_timestamp();
        END IF;
    END LOOP;
    RAISE NOTICE 'Finished direct property mapping insert at %', clock_timestamp();
END $$;
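Once the block finishes, a quick count (optional, outside the timed section) confirms the direct-mapping table holds the expected rows:
-- Optional sanity check: should return 100000
SELECT count(*) FROM apache_logs_direct;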
-- Regenerate the 300 random IP addresses for the JSONB run
DROP TABLE IF EXISTS random_ips;
CREATE TEMPORARY TABLE random_ips AS
WITH RECURSIVE ip_generator AS (
    SELECT 1 AS counter,
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER
           ) AS ip
    UNION ALL
    SELECT counter + 1,
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER, '.',
               FLOOR(RANDOM() * 255)::INTEGER
           )
    FROM ip_generator
    WHERE counter < 300
)
SELECT DISTINCT ip FROM ip_generator LIMIT 300;
-- Generate and insert 100,000 Apache logs with JSONB approach
DO $$
DECLARE
    v_ip TEXT;
    v_log_data JSONB;
    v_timestamp TIMESTAMP;
    v_request_method TEXT;
    v_request_path TEXT;
    v_http_version TEXT;
    v_status_code INTEGER;
    v_response_size INTEGER;
    v_referer TEXT;
    v_user_agent TEXT;
    v_processing_time FLOAT;
    v_methods TEXT[] := ARRAY['GET', 'POST', 'PUT', 'DELETE', 'HEAD'];
    v_paths TEXT[] := ARRAY[
        '/index.html', '/about', '/contact', '/products', '/services',
        '/blog', '/api/v1/users', '/api/v1/orders', '/login', '/logout',
        '/dashboard', '/profile', '/settings', '/images/logo.png', '/css/main.css'
    ];
    v_versions TEXT[] := ARRAY['HTTP/1.0', 'HTTP/1.1', 'HTTP/2.0'];
    v_status_codes INTEGER[] := ARRAY[200, 201, 204, 301, 302, 304, 400, 401, 403, 404, 500, 503];
    v_referers TEXT[] := ARRAY[
        'https://google.com', 'https://bing.com', 'https://yahoo.com',
        'https://example.com', 'https://facebook.com', 'https://twitter.com',
        '', 'https://linkedin.com', 'https://github.com'
    ];
    v_user_agents TEXT[] := ARRAY[
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
    ];
BEGIN
    -- Start timing for JSONB approach
    RAISE NOTICE 'Starting JSONB insert at %', clock_timestamp();
    FOR i IN 1..100000 LOOP
        -- Select a random IP from our pre-generated list
        SELECT ip INTO v_ip FROM random_ips ORDER BY RANDOM() LIMIT 1;
        -- Generate random values for other fields
        v_timestamp := NOW() - (RANDOM() * INTERVAL '365 days');
        v_request_method := v_methods[1 + floor(RANDOM() * array_length(v_methods, 1))];
        v_request_path := v_paths[1 + floor(RANDOM() * array_length(v_paths, 1))];
        v_http_version := v_versions[1 + floor(RANDOM() * array_length(v_versions, 1))];
        v_status_code := v_status_codes[1 + floor(RANDOM() * array_length(v_status_codes, 1))];
        v_response_size := floor(RANDOM() * 1048576)::INTEGER; -- Random size up to 1MB
        v_referer := v_referers[1 + floor(RANDOM() * array_length(v_referers, 1))];
        v_user_agent := v_user_agents[1 + floor(RANDOM() * array_length(v_user_agents, 1))];
        v_processing_time := RANDOM() * 2.0; -- Random processing time up to 2 seconds
        -- Create JSONB object
        v_log_data := jsonb_build_object(
            'ip', v_ip,
            'timestamp', v_timestamp,
            'request_method', v_request_method,
            'request_path', v_request_path,
            'http_version', v_http_version,
            'status_code', v_status_code,
            'response_size', v_response_size,
            'referer', v_referer,
            'user_agent', v_user_agent,
            'processing_time', v_processing_time
        );
        -- Insert the record
        INSERT INTO apache_logs_jsonb (log_data) VALUES (v_log_data);
        -- Output progress every 10,000 records
        IF i % 10000 = 0 THEN
            RAISE NOTICE 'Inserted % JSONB records at %', i, clock_timestamp();
        END IF;
    END LOOP;
    RAISE NOTICE 'Finished JSONB insert at %', clock_timestamp();
END $$;
From the numbers below, inserting into the JSONB table with its GIN index is noticeably slower.
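Much of that cost is likely GIN index maintenance on every insert. As an optional follow-up (not one of the original measurements), the on-disk footprint of the two approaches can also be compared:
-- Optional: compare on-disk size per approach (total includes indexes and TOAST)
SELECT 'apache_logs_direct' AS approach,
       pg_size_pretty(pg_total_relation_size('apache_logs_direct')) AS total_size,
       pg_size_pretty(pg_relation_size('btree_apache_logs_direct_ip')) AS index_size
UNION ALL
SELECT 'apache_logs_jsonb',
       pg_size_pretty(pg_total_relation_size('apache_logs_jsonb')),
       pg_size_pretty(pg_relation_size('gin_apache_logs_jsonb_log_data'));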
Because the ip column in the direct-mapping table is indexed, and the JSONB query filters with the @> containment operator, which the GIN index can also optimize, we can test whether the two structures differ much when both queries can use an index.
SELECT id, ip, timestamp, request_method, request_path, status_code, response_size
FROM apache_logs_direct
WHERE ip = '191.162.228.64'
ORDER BY timestamp DESC;
SELECT id,
       log_data->>'ip' AS ip,
       (log_data->>'timestamp')::timestamp AS timestamp,
       log_data->>'request_method' AS request_method,
       log_data->>'request_path' AS request_path,
       (log_data->>'status_code')::integer AS status_code,
       (log_data->>'response_size')::integer AS response_size
FROM apache_logs_jsonb
WHERE log_data @> '{"ip": "21.72.118.249"}'
ORDER BY (log_data->>'timestamp')::timestamp DESC;
When both approaches can use their indexes, the performance difference is actually small.
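To verify that both queries really hit their indexes (a suggested check rather than part of the original results), EXPLAIN ANALYZE can be run on each; the direct-mapping query is expected to use the B-tree index, while the @> query should show a Bitmap Index Scan on the GIN index.
-- Optional: inspect the plans to confirm index usage
EXPLAIN (ANALYZE, BUFFERS)
SELECT id, ip, timestamp
FROM apache_logs_direct
WHERE ip = '191.162.228.64';

EXPLAIN (ANALYZE, BUFFERS)
SELECT id, log_data->>'ip' AS ip
FROM apache_logs_jsonb
WHERE log_data @> '{"ip": "21.72.118.249"}';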
The next queries check whether there is a clear difference when neither approach can use an index for the sort.
SELECT id, ip, timestamp, request_method, request_path, status_code, response_size
FROM apache_logs_direct
ORDER BY timestamp DESC;
SELECT id,
       log_data->>'ip' AS ip,
       (log_data->>'timestamp')::timestamp AS timestamp,
       log_data->>'request_method' AS request_method,
       log_data->>'request_path' AS request_path,
       (log_data->>'status_code')::integer AS status_code,
       (log_data->>'response_size')::integer AS response_size
FROM apache_logs_jsonb
ORDER BY (log_data->>'timestamp')::timestamp DESC;
With no index available to either approach, both end up scanning the whole table and sorting, and the performance difference is again small.
This query checks whether there is a clear difference when the direct-mapping sort can use an index but the JSONB sort cannot.
SELECT id, ip, timestamp, request_method, request_path, status_code, response_size
FROM apache_logs_direct
ORDER BY ip DESC;
SELECT id,
       log_data->>'ip' AS ip,
       (log_data->>'timestamp')::timestamp AS timestamp,
       log_data->>'request_method' AS request_method,
       log_data->>'request_path' AS request_path,
       (log_data->>'status_code')::integer AS status_code,
       (log_data->>'response_size')::integer AS response_size
FROM apache_logs_jsonb
ORDER BY (log_data->>'ip') DESC;
In this scenario JSONB is at a disadvantage: with direct mapping, an index can be bound to a specific column, so sorting by that column can be optimized through the index, whereas the JSONB data all lives in a single column and the GIN index cannot optimize sorting on an individual key inside the JSON. A possible workaround is sketched below.
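If ordering or sorting on one particular key matters, a B-tree expression index on that key is a possible mitigation (not tested in this experiment; the index name is just an example), at the cost of maintaining an extra index per key:
-- Hypothetical mitigation: a B-tree expression index on a single JSONB key,
-- which ORDER BY (log_data->>'ip') can then take advantage of
CREATE INDEX btree_apache_logs_jsonb_ip
    ON apache_logs_jsonb USING btree ((log_data->>'ip'));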
That's it for today; tomorrow I'll wrap up with a summary of the experiment and some takeaways.