Yesterday, in [Day 10] GU價格提醒系統 (4) - 我要怎麼拿取GU資料? 爬蟲?, we worked out how to fetch GU product price data. Today we'll containerize that scraper script and turn it into a service n8n can call!
Putting the existing scraper straight into FastAPI
app.py
```python
# scraper/app.py
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import JSONResponse
import requests, time, random

app = FastAPI()

BASE_WEB = "https://www.gu-global.com"
HOST = "https://d.gu-global.com"
CATEGORY_API = f"{HOST}/tw/p/search/products/by-category"

BASE_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-TW,zh;q=0.9,en;q=0.8",
    "Content-Type": "application/json;charset=UTF-8",
    "Origin": BASE_WEB,
    "Connection": "keep-alive",
}


def _get_in(d, path, default=None):
    """Safely walk a nested dict along `path`; return `default` on any miss."""
    cur = d
    for k in path:
        if isinstance(cur, dict) and k in cur:
            cur = cur[k]
        else:
            return default
    return cur


def _normalize(p: dict):
    """Map the various GU product payload shapes onto one fixed schema."""
    def to_f(x):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    pc = p.get("productCode") or p.get("product_code") or p.get("code") or p.get("goodsCode")
    name = p.get("name") or p.get("goodsName") or p.get("title")
    if isinstance(p.get("price"), dict):
        price = p["price"].get("currentPrice")
        oprice = p["price"].get("originalPrice")
    else:
        price = p.get("minPrice") or p.get("price")
        oprice = p.get("originPrice")
    instock = _get_in(p, ("stock", "status")) or p.get("availability") or p.get("inStock")
    url = p.get("url") or p.get("path")
    return {
        "product_code": pc,
        "name": name,
        "price": to_f(price),
        "original_price": to_f(oprice),
        "in_stock": instock,
        "url": url,
        "raw": p,
    }


def _post_with_retry(session, url, headers, json, timeout=20, retries=4, backoff_base=0.6, backoff_cap=6.0):
    last_err = None
    for i in range(retries + 1):
        try:
            r = session.post(url, headers=headers, json=json, timeout=timeout)
            # retry on 5xx / 429
            if r.status_code >= 500 or r.status_code in (429,):
                raise requests.HTTPError(f"{r.status_code} {r.reason}", response=r)
            r.raise_for_status()
            return r
        except requests.RequestException as e:
            last_err = e
            if i == retries:
                break
            # exponential backoff with jitter, capped at backoff_cap seconds
            sleep = min(backoff_cap, backoff_base * (2 ** i)) + random.uniform(0, 0.3)
            time.sleep(sleep)
    raise last_err


def _fetch_page(session: requests.Session, category_code: str, page: int, page_size: int, stock_filter: str, referer: str, retries: int):
    payload = {
        "pageInfo": {"page": page, "pageSize": page_size},
        "belongTo": "pc",
        "rank": "overall",
        "priceRange": {"low": 0, "high": 0},
        "color": [], "size": [], "identity": [], "exist": [],
        "categoryCode": category_code,
        "searchFlag": False,
        "description": "",
        "stockFilter": stock_filter,
    }
    headers = {**BASE_HEADERS, "Referer": referer}
    r = _post_with_retry(session, CATEGORY_API, headers, payload, retries=retries)
    return r.json()


def _extract_list_and_total(j):
    """Pull (productList, total) out of the response shapes the API may return."""
    resp = j.get("resp")
    if isinstance(resp, list) and resp:
        products = resp[0].get("productList") or []
        total = resp[0].get("productSum") or resp[0].get("total") or _get_in(resp[0], ("pageInfo", "total"), 0)
        if products:
            return products, int(total or len(products))
    if isinstance(resp, dict):
        products = resp.get("productList") or resp.get("products") or []
        if products:
            total = resp.get("productSum") or resp.get("total") or _get_in(resp, ("pageInfo", "total"), len(products))
            return products, int(total or len(products))
    products = _get_in(j, ("data", "productList"), [])
    if products:
        total = _get_in(j, ("data", "pageInfo", "total"), len(products))
        return products, int(total or len(products))
    return [], 0


@app.get("/health")
def health():
    return {"ok": True}


@app.get("/scrape")
def scrape(
    category: str = Query(..., description="categoryCode, e.g., women_tshirtsweat"),
    delay: float = 1.0,
    page_size: int = 24,
    max_pages: int = 200,
    stock_filter: str = "warehouse",
    retries: int = 4,
    allow_partial: bool = True,
):
    s = requests.Session()
    s.headers.update(BASE_HEADERS)

    # warm-up: GET the category page first to pick up cookies/tokens
    referer = f"{BASE_WEB}/tw/zh_TW/c/{category}.html"
    try:
        s.get(referer, headers={"User-Agent": BASE_HEADERS["User-Agent"], "Referer": BASE_WEB}, timeout=20)
    except Exception:
        pass

    all_items, total_hint, page = [], None, 1
    start = int(time.time())
    try:
        while page <= max_pages:
            j = _fetch_page(s, category, page, page_size, stock_filter, referer, retries=retries)
            products, total = _extract_list_and_total(j)
            if total_hint is None and total:
                total_hint = int(total)
            if not products:
                break
            now = int(time.time())
            for p in products:
                norm = _normalize(p)
                norm["category"] = category
                norm["ts"] = now
                all_items.append(norm)
            if total_hint and len(all_items) >= total_hint:
                break
            page += 1
            time.sleep(max(0.0, delay))
        return JSONResponse({
            "ok": True,
            "category": category,
            "count": len(all_items),
            "total_hint": total_hint,
            "items": all_items,
            "started_at": start,
            "ended_at": int(time.time()),
        })
    except requests.RequestException as e:
        # allow returning "partial success" so the whole n8n flow doesn't fail
        if allow_partial and all_items:
            return JSONResponse({
                "ok": False,
                "partial": True,
                "error": f"{type(e).__name__}: {e}",
                "category": category,
                "count": len(all_items),
                "total_hint": total_hint,
                "items": all_items,
                "started_at": start,
                "ended_at": int(time.time()),
            })
        raise HTTPException(status_code=502, detail=f"Upstream HTTP error: {e}")
```
- Warm-up: first GET the category page to pick up the necessary cookies/token, which raises the success rate; a failed warm-up does not abort the run.
- Paginated fetching: each page is a POST to the category API; 429/5xx responses are retried with exponential backoff (plus jitter).
- Normalization: the differing response shapes are mapped onto fixed fields (product_code/name/price/original_price/in_stock/url), and the raw payload is kept for debugging.
- Stop conditions: the loop stops at the last page (the API returns no more products) or once total_hint items have been collected; it sleeps for `delay` seconds between pages to avoid rate limiting.
- Partial success: if a request fails midway and allow_partial=true, the items collected so far are still returned (along with the error message and stats), so the n8n flow doesn't break (see the consumer sketch after this list).
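To make the normalized schema and the partial-success envelope more concrete, here is a small consumer-side sketch. It is only an illustration: `pick_sale_items` and the "discounted and in stock" rule are my own example, while the field names simply mirror what `_normalize()` and `/scrape` return above.

```python
def pick_sale_items(scrape_response: dict) -> list[dict]:
    """Example consumer: keep discounted, in-stock items from a /scrape response."""
    if scrape_response.get("partial"):
        # Partial result: still usable, but surface it so the n8n flow can flag it.
        print("partial result:", scrape_response.get("error"))
    deals = []
    for item in scrape_response.get("items", []):
        price, original = item.get("price"), item.get("original_price")
        if price is None or original is None:
            continue
        if price < original and item.get("in_stock"):
            deals.append({
                "product_code": item["product_code"],
                "name": item["name"],
                "price": price,
                "original_price": original,
                "url": item["url"],
            })
    return deals
```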
| Parameter | Default | Description | When to adjust |
|---|---|---|---|
| `category` | (required) | GU category code (e.g. `women_tshirtsweat`) | Change this to scrape a different category |
| `delay` | 1.0 | Seconds to wait after each page | Increase (0.8–1.2) when you see many 429s |
| `page_size` | 24 | Items per page (passed to the GU API) | Drop to 12 if you get rate-limited |
| `max_pages` | 200 | Maximum number of pages to fetch | Safety cap against scraping far too much |
| `stock_filter` | `warehouse` | GU-side stock filter | Change or remove it to include out-of-stock items |
| `retries` | 4 | Max retries for each page's POST | Raise it on an unstable network |
| `allow_partial` | true | Allow "partial success" responses | Recommended to keep it true |
curl "http://localhost:8000/scrape?category=women_tshirtsweat&delay=0.8&page_size=24&retries=4"
With all of the above in place, we can deploy. We already covered this in [Day 7] GU價格提醒系統 (2) - 專案的環境架設 ( Docker + n8n ): the overall project structure, Dockerfile, docker-compose.yml and so on are all there. (Today's focus is how to turn the scraper into a callable API.)
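If you want to sanity-check the app before wiring it into Docker, the optional snippet below (my addition, not part of the original files) can be pasted at the bottom of scraper/app.py so that plain `python app.py` starts the server; in the containerized setup from Day 7 you would normally launch it with the uvicorn CLI (e.g. `uvicorn app:app --host 0.0.0.0 --port 8000`), which is an assumption on my part.

```python
# Optional, illustrative only: lets you run `python app.py` for local testing.
# The Docker setup would normally start the service via the uvicorn CLI instead.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
```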
If you run into 429s, increase `delay` a bit (e.g. 0.8–1.2) or drop `page_size` to 12; if you fire many requests in a short window, also add throttling on the n8n side (more on that later). Note that from inside n8n you call the service at `http://scraper:8000/…` (the compose service name), not `localhost`.

Today we turned the CLI script into a GU price-query API that can be deployed as a container: