iT邦幫忙

2025 iThome 鐵人賽

DAY 18
0
Rust

Rust 實戰專案集:30 個漸進式專案從工具到服務系列 第 18

Log分析器 - 解析網頁伺服器 Daily Log 並產生統計

  • 分享至 

  • xImage
  •  

前言

Log分析是系統維運中不可或缺的一環。
我將實作一個高效的網頁伺服器日誌分析器,能夠解析常見的日誌格式(如 Apache/Nginx 的 Combined Log Format),
並產生有用的統計資訊,如最常訪問的頁面、HTTP 狀態碼分布、流量高峰時段等。
希望這段學習歷程能幫助我把 Rust 實際應用到各種實用的場景中。

學習實作目標

  • 使用正則表達式解析結構化文本
  • 實作資料聚合與統計分析
  • 處理大型文件的高效讀取
  • 使用 HashMap 進行資料統計
  • 實作命令列參數解析

開始專案

cargo new log_analyzer
cd log_analyzer

依賴 (Cargo.toml)

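[package]
name = "log_analyzer"
version = "0.1.0"
edition = "2021"
# 專案含有兩個執行檔,指定 `cargo run` 預設執行的是分析器本體
default-run = "log_analyzer"

# 後面「測試資料產生」一節的範例 log 產生器
[[bin]]
name = "generate_sample_log"
path = "src/generate_sample_log.rs"

[dependencies]
regex = "1.10"
chrono = "0.4"
clap = { version = "4.5", features = ["derive"] }
# 範例 log 產生器使用的亂數函式庫 (thread_rng / gen_range 為 0.8 版 API)
rand = "0.8"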

我們先定義 command line 常用的參數

use chrono::NaiveDateTime;
use chrono::Timelike;
use clap::Parser;
use regex::Regex;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;

// Command-line options of the log analyzer, parsed through clap's derive API.
// NOTE(review): the `///` lines below are kept verbatim; with clap's derive they
// presumably double as the `--help` text, so editing them would change program
// output — confirm before translating them.
#[derive(Parser, Debug)]
#[command(author, version, about = "網頁伺服器Log分析器", long_about = None)]
struct Args {
    // Path of the log file to analyze; passed as `-f` in the usage examples.
    /// Log檔案路徑
    #[arg(short, long)]
    file: PathBuf,

    // How many of the most requested paths to list (`-n`), 10 when omitted.
    /// 顯示 Top N 個最常訪問的路徑
    #[arg(short = 'n', long, default_value = "10")]
    top_urls: usize,

    // Flag (`-H` / `--hourly-stats`) enabling the per-hour request table.
    /// 是否顯示每小時流量統計
    #[arg(short = 'H', long)]
    hourly_stats: bool,

    // Flag (`-s` / `--status-codes`) enabling the HTTP status code table.
    /// 是否顯示狀態碼分布
    #[arg(short = 's', long)]
    status_codes: bool,
}

/// One successfully parsed access-log line.
#[derive(Debug, Clone)]
struct LogEntry {
    /// First field of the line (the client address column).
    ip: String,
    /// Date and time of the request; the timezone offset in the log is dropped
    /// during parsing, so this is the wall-clock time exactly as written.
    timestamp: NaiveDateTime,
    /// First token of the quoted request line (e.g. `GET`).
    method: String,
    /// Second token of the quoted request line (the requested path).
    path: String,
    /// Three-digit HTTP status code.
    status: u16,
    /// Response size in bytes; a `-` in the log is stored as 0.
    size: u64,
    /// Contents of the last quoted field (the user-agent column).
    user_agent: String,
}

/// Running totals accumulated over every parsed log entry.
#[derive(Debug, Default)]
struct LogStats {
    /// Number of entries added so far.
    total_requests: usize,
    /// Request count per requested path.
    path_counts: HashMap<String, usize>,
    /// Request count per HTTP status code.
    status_counts: HashMap<u16, usize>,
    /// Request count per hour of day, keyed by the entry timestamp's hour.
    hourly_requests: HashMap<u8, usize>,
    /// Sum of the response sizes of all entries, in bytes.
    total_bytes: u64,
    /// Request count per client address.
    ip_counts: HashMap<String, usize>,
}

解析器

/// Parser that turns raw access-log lines into `LogEntry` values.
struct LogParser {
    // Regular expression for the Combined Log Format.
    // Format: IP - - [timestamp] "METHOD path HTTP/version" status size "referer" "user-agent"
    regex: Regex,
}

impl LogParser {
    /// Builds a parser by compiling the Combined Log Format pattern.
    ///
    /// Capture groups: 1 = client address, 2 = bracketed timestamp,
    /// 3 = method, 4 = path, 5 = status, 6 = size (digits or `-`),
    /// 7 = referer, 8 = user agent.
    ///
    /// # Errors
    /// Returns the `regex::Error` reported when the pattern fails to compile.
    fn new() -> Result<Self, regex::Error> {
        Regex::new(
            r#"^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+) \S+" (\d{3}) (\d+|-) "([^"]*)" "([^"]*)""#,
        )
        .map(|regex| Self { regex })
    }

    /// Parses a single log line.
    ///
    /// Returns `None` when the line does not match the pattern or when the
    /// timestamp, status or size field cannot be converted.
    fn parse_line(&self, line: &str) -> Option<LogEntry> {
        let caps = self.regex.captures(line)?;
        // Text of capture group `idx`, if that group took part in the match.
        let field = |idx: usize| caps.get(idx).map(|m| m.as_str());

        // A size of "-" means no size was logged; it is counted as zero bytes.
        let size = match field(6)? {
            "-" => 0,
            digits => digits.parse().ok()?,
        };

        Some(LogEntry {
            ip: field(1)?.to_owned(),
            // Timestamp text looks like: 01/Jan/2024:12:00:00 +0800
            timestamp: self.parse_timestamp(field(2)?)?,
            method: field(3)?.to_owned(),
            path: field(4)?.to_owned(),
            status: field(5)?.parse().ok()?,
            size,
            // Group 7 (referer) is matched but not stored.
            user_agent: field(8)?.to_owned(),
        })
    }

    /// Parses the date/time portion of a timestamp such as
    /// `01/Jan/2024:12:00:00 +0800`.
    ///
    /// Only the first whitespace-separated token is used, so the timezone
    /// offset is ignored. Returns `None` for empty or malformed input.
    fn parse_timestamp(&self, s: &str) -> Option<NaiveDateTime> {
        s.split_whitespace()
            .next()
            .and_then(|stamp| NaiveDateTime::parse_from_str(stamp, "%d/%b/%Y:%H:%M:%S").ok())
    }
}

統計 (LogStats)

impl LogStats {
    fn new() -> Self {
        Self::default()
    }

    fn add_entry(&mut self, entry: &LogEntry) {
        self.total_requests += 1;
        
        // 統計路徑訪問次數
        *self.path_counts.entry(entry.path.clone()).or_insert(0) += 1;
        
        // 統計狀態碼分布
        *self.status_counts.entry(entry.status).or_insert(0) += 1;
        
        // 統計每小時請求數
        let hour = entry.timestamp.hour() as u8;
        *self.hourly_requests.entry(hour).or_insert(0) += 1;
        
        // 統計總流量
        self.total_bytes += entry.size;
        
        // 統計 IP 訪問次數
        *self.ip_counts.entry(entry.ip.clone()).or_insert(0) += 1;
    }

    fn get_top_paths(&self, n: usize) -> Vec<(&String, &usize)> {
        let mut paths: Vec<_> = self.path_counts.iter().collect();
        paths.sort_by(|a, b| b.1.cmp(a.1));
        paths.into_iter().take(n).collect()
    }

    fn get_top_ips(&self, n: usize) -> Vec<(&String, &usize)> {
        let mut ips: Vec<_> = self.ip_counts.iter().collect();
        ips.sort_by(|a, b| b.1.cmp(a.1));
        ips.into_iter().take(n).collect()
    }

    fn format_bytes(bytes: u64) -> String {
        const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];
        let mut size = bytes as f64;
        let mut unit_index = 0;

        while size >= 1024.0 && unit_index < UNITS.len() - 1 {
            size /= 1024.0;
            unit_index += 1;
        }

        format!("{:.2} {}", size, UNITS[unit_index])
    }
}

生成統計報告

/// Stateless namespace for the functions that print the report to stdout.
struct ReportGenerator;

impl ReportGenerator {
    /// Share of `count` in `total`, as a percentage.
    ///
    /// Returns 0.0 when `total` is zero so an empty log prints `0.00%`
    /// instead of `NaN%`.
    fn percentage(count: usize, total: usize) -> f64 {
        if total == 0 {
            0.0
        } else {
            (count as f64 / total as f64) * 100.0
        }
    }

    /// Prints the header plus overall totals: request count, traffic volume,
    /// distinct client addresses and, when there is at least one request,
    /// the average response size.
    fn print_summary(stats: &LogStats) {
        println!("\n{}", "=".repeat(60));
        println!("📊 日誌分析報告");
        println!("{}", "=".repeat(60));
        println!("總請求數: {}", stats.total_requests);
        println!("總流量: {}", LogStats::format_bytes(stats.total_bytes));
        println!("不重複 IP 數: {}", stats.ip_counts.len());

        // Guarded so an empty log does not divide by zero.
        if stats.total_requests > 0 {
            let avg_size = stats.total_bytes / stats.total_requests as u64;
            println!("平均請求大小: {}", LogStats::format_bytes(avg_size));
        }
        println!();
    }

    /// Prints the `n` most requested paths with their count and share.
    fn print_top_urls(stats: &LogStats, n: usize) {
        println!("🔥 Top {} 最常訪問的路徑:", n);
        println!("{:-<60}", "");

        // Iterate by value: each item is `(&String, &usize)`, so `*count` is
        // a plain `usize` (borrowing the Vec would add a second reference
        // layer that cannot be cast to f64).
        for (i, (path, count)) in stats.get_top_paths(n).into_iter().enumerate() {
            let percentage = Self::percentage(*count, stats.total_requests);
            println!("{:2}. {:4} 次 ({:5.2}%) - {}",
                     i + 1, count, percentage, path);
        }
        println!();
    }

    /// Prints every observed HTTP status code in ascending order with its
    /// count, share and a short description.
    fn print_status_codes(stats: &LogStats) {
        println!("📈 HTTP 狀態碼分布:");
        println!("{:-<60}", "");

        let mut status_vec: Vec<_> = stats.status_counts.iter().collect();
        status_vec.sort_unstable_by_key(|&(status, _)| *status);

        for (status, count) in status_vec {
            let percentage = Self::percentage(*count, stats.total_requests);
            let status_desc = Self::status_description(*status);
            println!("{} - {:4} 次 ({:5.2}%) - {}",
                     status, count, percentage, status_desc);
        }
        println!();
    }

    /// Prints one row per hour of day (00 through 23) with the request
    /// count, share and a bar; hours without requests are shown as zero.
    fn print_hourly_stats(stats: &LogStats) {
        println!("⏰ 每小時請求分布:");
        println!("{:-<60}", "");

        for hour in 0..24u8 {
            let count = stats.hourly_requests.get(&hour).copied().unwrap_or(0);
            let percentage = Self::percentage(count, stats.total_requests);
            // NOTE(review): the bar is scaled against the total request
            // count although `generate_bar` names its parameter `max`;
            // passing the busiest hour's count would give longer, easier to
            // compare bars — confirm which scaling is intended.
            let bar = Self::generate_bar(count, stats.total_requests);
            println!("{:02}:00 - {:4} 次 ({:5.2}%) {}",
                     hour, count, percentage, bar);
        }
        println!();
    }

    /// Prints the `n` most active client addresses with their count and share.
    fn print_top_ips(stats: &LogStats, n: usize) {
        println!("🌐 Top {} 活躍 IP 位址:", n);
        println!("{:-<60}", "");

        // By-value iteration for the same reason as in `print_top_urls`.
        for (i, (ip, count)) in stats.get_top_ips(n).into_iter().enumerate() {
            let percentage = Self::percentage(*count, stats.total_requests);
            println!("{:2}. {:4} 次 ({:5.2}%) - {}",
                     i + 1, count, percentage, ip);
        }
        println!();
    }

    /// Maps a status code to its reason phrase; codes outside the table
    /// yield `"Unknown"`.
    fn status_description(status: u16) -> &'static str {
        match status {
            200 => "OK",
            201 => "Created",
            204 => "No Content",
            301 => "Moved Permanently",
            302 => "Found",
            304 => "Not Modified",
            400 => "Bad Request",
            401 => "Unauthorized",
            403 => "Forbidden",
            404 => "Not Found",
            500 => "Internal Server Error",
            502 => "Bad Gateway",
            503 => "Service Unavailable",
            _ => "Unknown",
        }
    }

    /// Renders a 40-cell bar whose filled part is proportional to
    /// `count / max`, clamped to the bar width.
    ///
    /// A `max` of zero yields an empty bar.
    fn generate_bar(count: usize, max: usize) -> String {
        const BAR_WIDTH: usize = 40;
        let filled = if max == 0 {
            0
        } else {
            // Truncating float-to-int conversion, then clamp for count > max.
            (((count as f64 / max as f64) * BAR_WIDTH as f64) as usize).min(BAR_WIDTH)
        };
        format!("[{}{}]", "█".repeat(filled), "░".repeat(BAR_WIDTH - filled))
    }
}

main.rs 主程式

/// Entry point of the analyzer: parses the command line, streams the log
/// file line by line into the statistics, then prints the report.
///
/// # Errors
/// Returns an error when the pattern cannot be compiled, the file cannot be
/// opened, or reading fails for a reason other than invalid UTF-8. A missing
/// file terminates the process with exit code 1 instead.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Args::parse();

    // Fail early with a readable message when the file is missing.
    if !args.file.exists() {
        eprintln!("❌ 錯誤: 檔案不存在: {:?}", args.file);
        std::process::exit(1);
    }

    println!("📂 正在分析日誌檔案: {:?}", args.file);

    let parser = LogParser::new()?;
    let mut stats = LogStats::new();

    // Stream the file through a buffered reader instead of loading it whole.
    let file = File::open(&args.file)?;
    let reader = BufReader::new(file);

    let mut parse_errors: usize = 0;

    for (index, line) in reader.lines().enumerate() {
        let line_number = index + 1;

        let entry = match line {
            // Blank lines are skipped without counting as failures.
            Ok(text) if text.trim().is_empty() => continue,
            Ok(text) => parser.parse_line(&text),
            // A line that is not valid UTF-8 is reported as `InvalidData`;
            // the reader has already consumed it, so treat it as one more
            // unparsable line rather than aborting the whole analysis.
            Err(err) if err.kind() == std::io::ErrorKind::InvalidData => None,
            // Any other I/O failure is fatal.
            Err(err) => return Err(err.into()),
        };

        match entry {
            Some(entry) => stats.add_entry(&entry),
            None => {
                parse_errors += 1;
                // Only the first five failures are listed individually.
                if parse_errors <= 5 {
                    eprintln!("⚠️  第 {} 行解析失敗", line_number);
                }
            }
        }
    }

    if parse_errors > 0 {
        eprintln!("\n⚠️  共有 {} 行無法解析", parse_errors);
    }

    // The summary, top paths and top five addresses are always printed.
    ReportGenerator::print_summary(&stats);
    ReportGenerator::print_top_urls(&stats, args.top_urls);
    ReportGenerator::print_top_ips(&stats, 5);

    // The remaining sections are opt-in through command-line flags.
    if args.status_codes {
        ReportGenerator::print_status_codes(&stats);
    }

    if args.hourly_stats {
        ReportGenerator::print_hourly_stats(&stats);
    }

    Ok(())
}

完成!

測試資料產生

src/generate_sample_log.rs

這裡我們撰寫一個產生假資料(fake data)的小程式,用來生成一份 log,方便後續拿來測試分析器。

use chrono::{Datelike, NaiveDateTime, Timelike};
use rand::seq::SliceRandom;
use rand::Rng;
use std::fs::File;
use std::io::Write;

fn main() -> std::io::Result<()> {
    let mut file = File::create("sample.log")?;
    let mut rng = rand::thread_rng();

    let paths = vec![
        "/", "/about", "/contact", "/products", "/api/users",
        "/api/orders", "/login", "/signup", "/dashboard",
        "/static/css/style.css", "/static/js/app.js",
    ];

    let user_agents = vec![
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "curl/7.68.0",
    ];

    let statuses = vec![200, 200, 200, 200, 304, 404, 500, 302];

    // 生成 10000 條日誌
    for i in 0..10000 {
        let ip = format!(
            "{}.{}.{}.{}",
            rng.gen_range(1..255),
            rng.gen_range(0..255),
            rng.gen_range(0..255),
            rng.gen_range(1..255)
        );

        let timestamp = format!(
            "{:02}/{}/2024:{:02}:{:02}:{:02} +0800",
            rng.gen_range(1..29),
            ["Jan", "Feb", "Mar"].choose(&mut rng).unwrap(),
            rng.gen_range(0..24),
            rng.gen_range(0..60),
            rng.gen_range(0..60)
        );

        let method = "GET";
        let path = paths.choose(&mut rng).unwrap();
        let status = statuses.choose(&mut rng).unwrap();
        let size = rng.gen_range(100..50000);
        let user_agent = user_agents.choose(&mut rng).unwrap();

        writeln!(
            file,
            r#"{} - - [{}] "{} {} HTTP/1.1" {} {} "-" "{}""#,
            ip, timestamp, method, path, status, size, user_agent
        )?;
    }

    println!("✅ 已生成 sample.log");
    Ok(())
}

開始使用

# 編譯並執行
cargo build --release

# 生成測試資料
cargo run --bin generate_sample_log

# 基本分析
cargo run -- -f sample.log

# 顯示 Top 20 路徑
cargo run -- -f sample.log -n 20

# 顯示所有統計資訊
cargo run -- -f sample.log -n 15 --status-codes --hourly-stats

# 分析真實的 Nginx Log
cargo run -- -f /var/log/nginx/access.log -s -H

上一篇
CSV 資料清洗工具 - 處理和轉換 CSV 檔案
下一篇
JSON Schema 驗證器 - 驗證 JSON 資料格式
系列文
Rust 實戰專案集:30 個漸進式專案從工具到服務25
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言