iT邦幫忙

2025 iThome 鐵人賽

DAY 29
1
Software Development

我所不知道的PostgreSQL 30天系列 第 29

Day 29: 比較直接映射 和 JSONB效能:以 Apache access log為範例(上)

  • 分享至 

  • xImage
  •  

最後兩天,讓我們來做個實驗吧!今天將分享實驗的設置和結果,比較兩種在 PostgreSQL 中存儲 log 方法的效能:

  • 直接映射:每個日誌欄位映射到專用的表格欄位
  • JSONB:整個日誌條目作為 JSON 物件存儲在單一 JSONB 欄位中

這次的實驗會產生 10 萬筆假的 Apache access log 作為測資,以及 300 個隨機生成的 IP 地址,來測量的新增和在各種條件下查詢的效能。

實驗設置

新增表格

為確保公平,創建了兩個包含相同數量的數據但存儲結構不同的表格:

-- 表格 1:直接映射方法
CREATE TABLE apache_logs_direct (
    id SERIAL PRIMARY KEY,
    ip VARCHAR(15),
    timestamp TIMESTAMP,
    request_method VARCHAR(10),
    request_path TEXT,
    http_version VARCHAR(10),
    status_code INTEGER,
    response_size INTEGER,
    referer TEXT,
    user_agent TEXT,
    processing_time FLOAT
);

-- 表格 2:JSONB 方法
CREATE TABLE apache_logs_jsonb (
    id SERIAL PRIMARY KEY,
    log_data JSONB
);

添加索引

添加索引是為了測試索引是否對效能有幫助:

  • 直接映射表格中 ip 欄位的 B-tree 索引
  • JSONB 表格中 log_data 欄位的 GIN 索引
-- 直接映射表格中 ip 欄位的 B-tree 索引
CREATE INDEX btree_apache_logs_direct_ip ON apache_logs_direct USING btree(ip);

-- JSONB 表格中 log_data 欄位的 GIN 索引
CREATE INDEX gin_apache_logs_jsonb_log_data ON apache_logs_jsonb USING gin(log_data);

效能比較

新增 10 萬筆資料

這是為了比較兩種索引在新增時,效能是否差異很大。

  • 直接映射語法
CREATE TEMPORARY TABLE random_ips AS
WITH RECURSIVE ip_generator AS (
    SELECT 1 AS counter, 
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER
           ) AS ip
    UNION ALL
    SELECT counter + 1, 
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER
           )
    FROM ip_generator
    WHERE counter < 300
)
SELECT DISTINCT ip FROM ip_generator LIMIT 300;

-- Generate and insert 100,000 Apache logs with direct property mapping
DO $$
DECLARE
    v_ip TEXT;
    v_timestamp TIMESTAMP;
    v_request_method TEXT;
    v_request_path TEXT;
    v_http_version TEXT;
    v_status_code INTEGER;
    v_response_size INTEGER;
    v_referer TEXT;
    v_user_agent TEXT;
    v_processing_time FLOAT;
    v_methods TEXT[] := ARRAY['GET', 'POST', 'PUT', 'DELETE', 'HEAD'];
    v_paths TEXT[] := ARRAY[
        '/index.html', '/about', '/contact', '/products', '/services', 
        '/blog', '/api/v1/users', '/api/v1/orders', '/login', '/logout',
        '/dashboard', '/profile', '/settings', '/images/logo.png', '/css/main.css'
    ];
    v_versions TEXT[] := ARRAY['HTTP/1.0', 'HTTP/1.1', 'HTTP/2.0'];
    v_status_codes INTEGER[] := ARRAY[200, 201, 204, 301, 302, 304, 400, 401, 403, 404, 500, 503];
    v_referers TEXT[] := ARRAY[
        'https://google.com', 'https://bing.com', 'https://yahoo.com', 
        'https://example.com', 'https://facebook.com', 'https://twitter.com',
        '', 'https://linkedin.com', 'https://github.com'
    ];
    v_user_agents TEXT[] := ARRAY[
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
    ];
BEGIN
    -- Start timing for direct property mapping approach
    RAISE NOTICE 'Starting direct property mapping insert at %', clock_timestamp();
    
    FOR i IN 1..100000 LOOP
        -- Select a random IP from our pre-generated list
        SELECT ip INTO v_ip FROM random_ips ORDER BY RANDOM() LIMIT 1;
        
        -- Generate random values for other fields
        v_timestamp := NOW() - (RANDOM() * INTERVAL '365 days');
        v_request_method := v_methods[1 + floor(RANDOM() * array_length(v_methods, 1))];
        v_request_path := v_paths[1 + floor(RANDOM() * array_length(v_paths, 1))];
        v_http_version := v_versions[1 + floor(RANDOM() * array_length(v_versions, 1))];
        v_status_code := v_status_codes[1 + floor(RANDOM() * array_length(v_status_codes, 1))];
        v_response_size := floor(RANDOM() * 1048576)::INTEGER; -- Random size up to 1MB
        v_referer := v_referers[1 + floor(RANDOM() * array_length(v_referers, 1))];
        v_user_agent := v_user_agents[1 + floor(RANDOM() * array_length(v_user_agents, 1))];
        v_processing_time := RANDOM() * 2.0; -- Random processing time up to 2 seconds
        
        -- Insert the record
        INSERT INTO apache_logs_direct (
            ip, timestamp, request_method, request_path, http_version, 
            status_code, response_size, referer, user_agent, processing_time
        ) VALUES (
            v_ip, v_timestamp, v_request_method, v_request_path, v_http_version,
            v_status_code, v_response_size, v_referer, v_user_agent, v_processing_time
        );
        
        -- Output progress every 10,000 records
        IF i % 10000 = 0 THEN
            RAISE NOTICE 'Inserted % direct mapping records at %', i, clock_timestamp();
        END IF;
    END LOOP;
    
    RAISE NOTICE 'Finished direct property mapping insert at %', clock_timestamp();
END $$;
  • JSONB 語法
CREATE TEMPORARY TABLE random_ips AS
WITH RECURSIVE ip_generator AS (
    SELECT 1 AS counter, 
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER
           ) AS ip
    UNION ALL
    SELECT counter + 1, 
           CONCAT(
               FLOOR(RANDOM() * 223 + 1)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER, '.', 
               FLOOR(RANDOM() * 255)::INTEGER
           )
    FROM ip_generator
    WHERE counter < 300
)
SELECT ip FROM ip_generator;
-- Generate and insert 100,000 Apache logs with JSONB approach
DO $$
DECLARE
    v_ip TEXT;
    v_log_data JSONB;
    v_timestamp TIMESTAMP;
    v_request_method TEXT;
    v_request_path TEXT;
    v_http_version TEXT;
    v_status_code INTEGER;
    v_response_size INTEGER;
    v_referer TEXT;
    v_user_agent TEXT;
    v_processing_time FLOAT;
    v_methods TEXT[] := ARRAY['GET', 'POST', 'PUT', 'DELETE', 'HEAD'];
    v_paths TEXT[] := ARRAY[
        '/index.html', '/about', '/contact', '/products', '/services', 
        '/blog', '/api/v1/users', '/api/v1/orders', '/login', '/logout',
        '/dashboard', '/profile', '/settings', '/images/logo.png', '/css/main.css'
    ];
    v_versions TEXT[] := ARRAY['HTTP/1.0', 'HTTP/1.1', 'HTTP/2.0'];
    v_status_codes INTEGER[] := ARRAY[200, 201, 204, 301, 302, 304, 400, 401, 403, 404, 500, 503];
    v_referers TEXT[] := ARRAY[
        'https://google.com', 'https://bing.com', 'https://yahoo.com', 
        'https://example.com', 'https://facebook.com', 'https://twitter.com',
        '', 'https://linkedin.com', 'https://github.com'
    ];
    v_user_agents TEXT[] := ARRAY[
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
    ];
BEGIN
    -- Start timing for JSONB approach
    RAISE NOTICE 'Starting JSONB insert at %', clock_timestamp();
    
    FOR i IN 1..100000 LOOP
        -- Select a random IP from our pre-generated list
        SELECT ip INTO v_ip FROM random_ips ORDER BY RANDOM() LIMIT 1;
        
        -- Generate random values for other fields
        v_timestamp := NOW() - (RANDOM() * INTERVAL '365 days');
        v_request_method := v_methods[1 + floor(RANDOM() * array_length(v_methods, 1))];
        v_request_path := v_paths[1 + floor(RANDOM() * array_length(v_paths, 1))];
        v_http_version := v_versions[1 + floor(RANDOM() * array_length(v_versions, 1))];
        v_status_code := v_status_codes[1 + floor(RANDOM() * array_length(v_status_codes, 1))];
        v_response_size := floor(RANDOM() * 1048576)::INTEGER; -- Random size up to 1MB
        v_referer := v_referers[1 + floor(RANDOM() * array_length(v_referers, 1))];
        v_user_agent := v_user_agents[1 + floor(RANDOM() * array_length(v_user_agents, 1))];
        v_processing_time := RANDOM() * 2.0; -- Random processing time up to 2 seconds
        
        -- Create JSONB object
        v_log_data := jsonb_build_object(
            'ip', v_ip,
            'timestamp', v_timestamp,
            'request_method', v_request_method,
            'request_path', v_request_path,
            'http_version', v_http_version,
            'status_code', v_status_code,
            'response_size', v_response_size,
            'referer', v_referer,
            'user_agent', v_user_agent,
            'processing_time', v_processing_time
        );
        
        -- Insert the record
        INSERT INTO apache_logs_jsonb (log_data) VALUES (v_log_data);
        
        -- Output progress every 10,000 records
        IF i % 10000 = 0 THEN
            RAISE NOTICE 'Inserted % JSONB records at %', i, clock_timestamp();
        END IF;
    END LOOP;
    
    RAISE NOTICE 'Finished JSONB insert at %', clock_timestamp();
END $$;

從下面數據可以發現, JSONB 搭配 GIN 索引在新增資料部分,效能明顯比較不好。

查詢指定 ip

因為直接映射的 ip 欄位有綁索引,而且 JSONB 使用 @> 作為查詢條件,也會用使用索引做優化,可以測試兩種資料結構在有索引的情況下,效能是否會差異很大。

  • 直接映射語法
SELECT id, ip, timestamp, request_method, request_path, status_code, response_size
FROM apache_logs_direct
WHERE ip = '191.162.228.64'
ORDER BY timestamp DESC;
  • JSONB 語法
SELECT id, 
       log_data->>'ip' AS ip, 
       (log_data->>'timestamp')::timestamp AS timestamp,
       log_data->>'request_method' AS request_method, 
       log_data->>'request_path' AS request_path,
       (log_data->>'status_code')::integer AS status_code,
       (log_data->>'response_size')::integer AS response_size
FROM apache_logs_jsonb
WHERE log_data @> '{"ip": "21.72.118.249"}'
ORDER BY (log_data->>'timestamp')::timestamp DESC;

如果兩種方法都有吃到索引,其實效能差異不大。

查詢所有資料,並以 timestamp 做排序

這個查詢方式是想知道,在兩種方法都沒有使用索引的情況下做排序,是否在效能上會有明顯差異。

  • 直接映射語法
SELECT id, ip, timestamp, request_method, request_path, status_code, response_size
FROM apache_logs_direct
ORDER BY timestamp DESC;
  • JSONB 語法
SELECT id, 
       log_data->>'ip' AS ip, 
       (log_data->>'timestamp')::timestamp AS timestamp,
       log_data->>'request_method' AS request_method, 
       log_data->>'request_path' AS request_path,
       (log_data->>'status_code')::integer AS status_code,
       (log_data->>'response_size')::integer AS response_size
FROM apache_logs_jsonb
ORDER BY (log_data->>'timestamp')::timestamp DESC;

兩種方法都沒有使用索引,搜尋效能差異不大。

查詢所有資料,並以 ip 做排序

這個查詢方式是想知道,在直接映射排序時使用索引,但 JSONB 沒有使用索引的情況下,是否在效能上會有明顯差異。

  • 直接映射語法
SELECT id, ip, timestamp, request_method, request_path, status_code, response_size
FROM apache_logs_direct
ORDER BY ip DESC;
  • JSONB 語法
SELECT id, 
       log_data->>'ip' AS ip, 
       (log_data->>'timestamp')::timestamp AS timestamp,
       log_data->>'request_method' AS request_method, 
       log_data->>'request_path' AS request_path,
       (log_data->>'status_code')::integer AS status_code,
       (log_data->>'response_size')::integer AS response_size
FROM apache_logs_jsonb
ORDER BY (log_data->>'ip') DESC;

這種方式以目前情境來說, JSONB 就比較吃虧,因爲直接映射可以針對特定欄位綁定索引,所以以特定欄位排序就可以使用索引做優化,但是 JSONB 的資料被塞在同一個欄位, GIN 無法針對 JSON 裡面的單一欄位做排序的優化。

今天就到這裡,明天來總結一下這個實驗和心得。


上一篇
Day 28: pgAdmin 的 EXPLAIN ANALYZE 工具介紹
下一篇
Day 30: 比較直接映射 和 JSONB效能:以 Apache access log為範例(下)
系列文
我所不知道的PostgreSQL 30天30
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言