[Day13] Batch URL Health Checker: Check Status Codes/Redirects/Latency, Output to CSV (Optional SQLite Integration)


Building on the links.csv and crawler.db link data from the past few days, today we'll build a super practical little tool:
check the availability of a whole batch of URLs in one go, recording the HTTP status code, success/failure, the final redirect URL, and the latency (ms), and output it all to a CSV. The results can also be written back to SQLite as a history log.

Required packages
(requests was already installed back on Day 4; if not, install it now)

pip install requests
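
If you're not sure whether it's already installed, a quick check from the command line (any recent requests 2.x should be fine):

python -c "import requests; print(requests.__version__)"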

Filename: link_check.py

# link_check.py — Day 13: batch URL health checker
from __future__ import annotations
import argparse, csv, sqlite3, time, datetime as dt
from pathlib import Path
from typing import List, Tuple, Iterable
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/124.0 Safari/537.36")

def load_from_csv(path: Path, limit: int | None) -> List[Tuple[int, str]]:
    """Read (id, url) pairs from a CSV that has a url column (id optional)."""
    urls: List[Tuple[int, str]] = []
    with path.open("r", encoding="utf-8", newline="") as f:
        r = csv.DictReader(f)
        for i, row in enumerate(r, 1):
            u = (row.get("url") or "").strip()
            if not u:
                continue
            rid = int(row.get("id") or i)  # fall back to the row number if id is missing or empty
            urls.append((rid, u))
            if limit and len(urls) >= limit:
                break
    return urls

def load_from_db(db: Path, limit: int | None) -> List[Tuple[int, str]]:
    """Read (id, url) pairs from the links table in SQLite."""
    con = sqlite3.connect(str(db))
    con.row_factory = sqlite3.Row
    q = "SELECT id, url FROM links ORDER BY id DESC"
    if limit:
        q += f" LIMIT {int(limit)}"  # limit is already an int, safe to inline
    cur = con.execute(q)
    rows = [(int(r["id"]), str(r["url"])) for r in cur.fetchall() if r["url"]]
    con.close()
    return rows

def check_one(url_id: int, url: str, timeout: float, insecure: bool) -> dict:
    """Check one URL; return status, success flag, final URL, and latency (ms)."""
    headers = {"User-Agent": UA}
    verify = not insecure
    t0 = time.perf_counter()
    try:
        # Try HEAD first; many servers reject it (405/403/...), so fall back to GET
        resp = requests.head(url, allow_redirects=True, timeout=timeout, headers=headers, verify=verify)
        if resp.status_code >= 400:
            resp = requests.get(url, allow_redirects=True, timeout=timeout, headers=headers, verify=verify, stream=True)
        elapsed_ms = int((time.perf_counter() - t0) * 1000)
        ok = 200 <= resp.status_code < 400
        resp.close()  # the GET fallback uses stream=True, so release the connection explicitly
        return {
            "id": url_id,
            "url": url,
            "status": resp.status_code,
            "ok": 1 if ok else 0,
            "final_url": resp.url,
            "elapsed_ms": elapsed_ms,
            "error": "",
            "checked_at": dt.datetime.now().isoformat(timespec="seconds"),
        }
    except requests.exceptions.SSLError as e:
        elapsed_ms = int((time.perf_counter() - t0) * 1000)
        return {"id": url_id, "url": url, "status": "", "ok": 0, "final_url": "",
                "elapsed_ms": elapsed_ms, "error": f"SSL: {e}", "checked_at": dt.datetime.now().isoformat(timespec="seconds")}
    except Exception as e:
        elapsed_ms = int((time.perf_counter() - t0) * 1000)
        return {"id": url_id, "url": url, "status": "", "ok": 0, "final_url": "",
                "elapsed_ms": elapsed_ms, "error": str(e), "checked_at": dt.datetime.now().isoformat(timespec="seconds")}

def write_csv(rows: Iterable[dict], out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    cols = ["id", "url", "status", "ok", "final_url", "elapsed_ms", "error", "checked_at"]
    with out_path.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for r in rows:
            w.writerow(r)

def save_to_db(db: Path, rows: Iterable[dict]):
    con = sqlite3.connect(str(db))
    con.execute("""
        CREATE TABLE IF NOT EXISTS link_checks(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url_id INTEGER,
            status INTEGER,
            ok INTEGER,
            final_url TEXT,
            elapsed_ms INTEGER,
            error TEXT,
            checked_at TEXT
        )
    """)
    con.executemany("""
        INSERT INTO link_checks(url_id, status, ok, final_url, elapsed_ms, error, checked_at)
        VALUES(:id, :status, :ok, :final_url, :elapsed_ms, :error, :checked_at)
    """, list(rows))
    con.commit()
    con.close()

def main():
    ap = argparse.ArgumentParser(description="Batch URL health checker: status/redirect/latency -> CSV, optionally saved back to SQLite")
    src = ap.add_mutually_exclusive_group(required=True)
    src.add_argument("--csv", type=Path, help="source CSV (must have a url column; an id column is recommended)")
    src.add_argument("--db", type=Path, help="source SQLite (reads the links table)")
    ap.add_argument("--out", type=Path, default=Path("check_result.csv"), help="output CSV path")
    ap.add_argument("--limit", type=int, default=0, help="max number of URLs to check (0 = all)")
    ap.add_argument("--timeout", type=float, default=6.0, help="timeout in seconds (default 6)")
    ap.add_argument("--workers", type=int, default=8, help="number of concurrent workers (default 8)")
    ap.add_argument("--insecure", action="store_true", help="skip SSL certificate verification (not recommended unless necessary)")
    ap.add_argument("--save-db", action="store_true", help="write results back to SQLite (requires --db)")
    args = ap.parse_args()

    limit = args.limit if args.limit and args.limit > 0 else None
    if args.csv:
        items = load_from_csv(args.csv, limit)
    else:
        items = load_from_db(args.db, limit)

    total = len(items)
    if total == 0:
        print("沒有可檢查的網址。")
        return

    print(f"開始檢查,共 {total} 筆,timeout={args.timeout}s,workers={args.workers} …")

    results: List[dict] = []
    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futs = [ex.submit(check_one, uid, url, args.timeout, args.insecure) for (uid, url) in items]
        for i, fut in enumerate(as_completed(futs), 1):
            r = fut.result()
            results.append(r)
            status = r["status"] if r["status"] != "" else "ERR"
            print(f"[{i:>4}/{total}] {status:>3}  {r['elapsed_ms']:>5}ms  {r['url'][:80]}")

    # Write the results to CSV
    write_csv(results, args.out)
    print(f"✅ CSV written: {args.out}")

    # Optional: write the results back to the DB
    if args.save_db:
        if not args.db:
            print("⚠️ --db is required to write results back to the database")
        else:
            save_to_db(args.db, results)
            print(f"✅ Saved to SQLite: link_checks table in {args.db}")

if __name__ == "__main__":
    main()
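
Before running the full CLI, you can smoke-test check_one from a Python shell (the URL below is just an illustration; any reachable site works):

from link_check import check_one

r = check_one(1, "https://example.com", timeout=6.0, insecure=False)
print(r["status"], r["ok"], r["final_url"], f"{r['elapsed_ms']}ms")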

Usage

  1. Check from a CSV (e.g. the links.csv produced on Day 5)

python link_check.py --csv links.csv --out check_links.csv --limit 50

  2. Check from SQLite (e.g. the links table in Day 6's crawler.db)

python link_check.py --db crawler.db --out check_db.csv --limit 100

  3. Raise the worker count and tune the timeout

python link_check.py --db crawler.db --out check_fast.csv --workers 16 --timeout 5

  4. Write the results back to the database, creating a link_checks history table

python link_check.py --db crawler.db --save-db --out check.csv
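
After a few runs with --save-db, the link_checks table accumulates a history. A minimal sketch for peeking at the latest rows, using the schema created above:

import sqlite3

con = sqlite3.connect("crawler.db")
for row in con.execute(
    "SELECT url_id, status, ok, elapsed_ms, checked_at "
    "FROM link_checks ORDER BY checked_at DESC LIMIT 10"
):
    print(row)
con.close()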

In action:
https://ithelp.ithome.com.tw/upload/images/20250926/20169368y6ojkPGDfL.png

Tips

SSL-related errors: the environment failed CA certificate verification; add --insecure for a quick test (not recommended long-term).
Lots of 403/429 responses: the site may be blocking crawlers; lower --workers, raise --timeout, or try another time of day (see the retry sketch below).
CSV missing a url column: make sure the source CSV has at least a url column; the id column is optional.
No data in DB mode: make sure crawler.db actually contains a links(text, url) table.
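
For the 403/429 case, here's a minimal retry-with-backoff sketch. check_with_retry is a hypothetical helper, not part of the script above; tune tries and base to taste:

import time
from link_check import check_one

def check_with_retry(url_id: int, url: str, timeout: float, tries: int = 3, base: float = 2.0) -> dict:
    # Re-run check_one with exponential backoff when the server
    # answers 403/429 (likely rate limiting).
    r = check_one(url_id, url, timeout, insecure=False)
    for attempt in range(1, tries):
        if r["status"] not in (403, 429):
            break
        time.sleep(base ** attempt)  # wait 2s, then 4s, ...
        r = check_one(url_id, url, timeout, insecure=False)
    return r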

