前篇我們把 chat-service 和 memory-service 在本地跑起來了。這篇要把它們部署到 GCP 生產環境,並且做對所有容易踩坑的地方。
目標:1 小時內從零開始建立一個可擴展、可監控、安全的生產環境。
flowchart LR
DEV[Development<br/>本地 Docker] --> STAGING[Staging<br/>GCP 測試環境]
STAGING --> PROD[Production<br/>GCP 生產環境]
subgraph "部署檢查點"
CHECK1[✓ 功能測試]
CHECK2[✓ 整合測試]
CHECK3[✓ 效能測試]
CHECK4[✓ 安全檢查]
end
DEV --> CHECK1
STAGING --> CHECK2
STAGING --> CHECK3
PROD --> CHECK4
項目 | Development | Staging | Production |
---|---|---|---|
運算資源 | Docker Compose | Cloud Run (小) | Cloud Run (最佳化) |
資料庫 | SQLite | Cloud SQL (dev) | Cloud SQL (HA) |
網域 | localhost | staging.yourdomain.com | yourdomain.com |
SSL | HTTP | Google 代管憑證(Cloud Run 自動簽發) | Google 代管憑證(Cloud Run 自動簽發) |
監控 | 基本日誌 | Cloud Logging | 完整 APM |
備份 | 無 | 每日 | 每小時 + 跨區域 |
成本 | $0 | ~$50/月 | ~$200/月 |
#!/bin/bash
# scripts/setup-gcp-project.sh
#
# Bootstraps the GCP project: creates it when missing, selects it as the
# gcloud default, enables the required APIs and creates the Artifact
# Registry repository and Pub/Sub topics. Safe to re-run: every resource
# is only created when it does not exist yet.
#
# Optional environment overrides: PROJECT_ID, REGION, ZONE,
# STAGING_DOMAIN, PROD_DOMAIN (the defaults below are the tutorial values).
set -euo pipefail

# ============ Basic configuration ============
PROJECT_ID="${PROJECT_ID:-ai-assistant-prod}"                  # change to your project ID
REGION="${REGION:-asia-east1}"                                 # pick the closest region
ZONE="${ZONE:-asia-east1-a}"
STAGING_DOMAIN="${STAGING_DOMAIN:-staging.yourdomain.com}"     # change to your domain
PROD_DOMAIN="${PROD_DOMAIN:-yourdomain.com}"                   # change to your domain
# Exported so that a caller sourcing this file passes them on to later scripts.
export PROJECT_ID REGION ZONE STAGING_DOMAIN PROD_DOMAIN

echo "🚀 開始設定 GCP 專案: $PROJECT_ID"

# 1. Create the project (only when it does not exist yet)
if ! gcloud projects describe "$PROJECT_ID" &>/dev/null; then
  gcloud projects create "$PROJECT_ID" --name="AI Assistant"
  echo "✅ 專案已建立"
fi

# 2. Select the default project / region / zone
gcloud config set project "$PROJECT_ID"
gcloud config set compute/region "$REGION"
gcloud config set compute/zone "$ZONE"

# 3. Enable the required APIs (takes a few minutes)
echo "🔧 啟用 GCP API..."
required_apis=(
  cloudbuild.googleapis.com
  run.googleapis.com
  sql-component.googleapis.com
  sqladmin.googleapis.com
  secretmanager.googleapis.com
  pubsub.googleapis.com
  logging.googleapis.com
  monitoring.googleapis.com
  cloudresourcemanager.googleapis.com
  iam.googleapis.com
  artifactregistry.googleapis.com
  vpcaccess.googleapis.com
  # The service accounts are granted roles/aiplatform.user and the services
  # receive VERTEX_LOCATION, so the Vertex AI API has to be enabled as well.
  aiplatform.googleapis.com
)
gcloud services enable "${required_apis[@]}"
echo "✅ API 啟用完成"

# 4. Create the Artifact Registry repository
echo "📦 建立 Docker Registry..."
if ! gcloud artifacts repositories describe ai-assistant \
  --location="$REGION" &>/dev/null; then
  gcloud artifacts repositories create ai-assistant \
    --repository-format=docker \
    --location="$REGION" \
    --description="AI Assistant Docker Images"
fi

# 5. Create the Pub/Sub topics
# Existence is checked explicitly instead of using `|| true`, so permission
# or quota errors still abort the script instead of being swallowed.
echo "📨 建立 Pub/Sub Topics..."
for topic in chat-tasks chat-events; do
  if ! gcloud pubsub topics describe "$topic" &>/dev/null; then
    gcloud pubsub topics create "$topic" --quiet
  fi
done

echo "✅ GCP 基礎設施準備完成!"
#!/bin/bash
# scripts/setup-service-accounts.sh
#
# Creates one service account per service (chat, memory, worker) and grants
# each the project-level roles it needs. Safe to re-run.
#
# Required environment: PROJECT_ID
set -euo pipefail

: "${PROJECT_ID:?PROJECT_ID 未設定}"

#######################################
# Create a service account unless it already exists.
# Arguments: $1 - account id, $2 - display name, $3 - description
#######################################
ensure_service_account() {
  local name=$1 display_name=$2 description=$3
  local email="${name}@${PROJECT_ID}.iam.gserviceaccount.com"

  if gcloud iam service-accounts describe "$email" &>/dev/null; then
    return 0
  fi
  gcloud iam service-accounts create "$name" \
    --display-name="$display_name" \
    --description="$description"
}

#######################################
# Grant project-level roles to a service account.
# Arguments: $1 - account id, $2.. - role names
#######################################
grant_project_roles() {
  local name=$1
  shift
  local role
  for role in "$@"; do
    # --condition=None keeps gcloud from prompting when the project policy
    # already contains conditional bindings.
    gcloud projects add-iam-policy-binding "$PROJECT_ID" \
      --member="serviceAccount:${name}@${PROJECT_ID}.iam.gserviceaccount.com" \
      --role="$role" \
      --condition=None \
      --quiet >/dev/null
  done
}

echo "👤 建立 Service Accounts..."
ensure_service_account chat-service-sa \
  "Chat Service Account" "用於 chat-service 的服務帳號"
ensure_service_account memory-service-sa \
  "Memory Service Account" "用於 memory-service 的服務帳號"
ensure_service_account worker-service-sa \
  "Worker Service Account" "用於 worker-service 的服務帳號"

echo "🔐 設定 IAM 權限..."
# NOTE(review): a project-level secretAccessor lets every account read every
# secret in the project (staging and production alike). Consider relying only
# on per-secret bindings if least privilege is required.

# Chat Service
grant_project_roles chat-service-sa \
  roles/secretmanager.secretAccessor \
  roles/pubsub.publisher \
  roles/aiplatform.user

# Memory Service
grant_project_roles memory-service-sa \
  roles/secretmanager.secretAccessor \
  roles/cloudsql.client

# Worker Service
grant_project_roles worker-service-sa \
  roles/secretmanager.secretAccessor \
  roles/pubsub.subscriber \
  roles/aiplatform.user

echo "✅ Service Accounts 建立完成"
#!/bin/bash
# scripts/setup-database.sh
#
# Creates the staging and production Cloud SQL (PostgreSQL 15) instances,
# their databases and the `app-user` login. Safe to re-run.
#
# Required environment: REGION
# Optional environment: DB_CREDENTIALS_FILE (default ./.db-credentials.env)
#
# Outputs: STAGING_DB_PASSWORD, PROD_DB_PASSWORD, STAGING_CONNECTION_NAME and
# PROD_CONNECTION_NAME are left as shell variables (useful when this file is
# sourced) and written as `export` lines to DB_CREDENTIALS_FILE (mode 0600).
# Passwords are deliberately NOT printed, so they do not end up in terminal
# scrollback or CI logs.
set -euo pipefail

: "${REGION:?REGION 未設定}"
CREDENTIALS_FILE="${DB_CREDENTIALS_FILE:-./.db-credentials.env}"

#######################################
# Create a database inside an instance unless it already exists.
# Arguments: $1 - instance, $2 - database name
#######################################
ensure_database() {
  local instance=$1 database=$2
  if ! gcloud sql databases describe "$database" --instance="$instance" &>/dev/null; then
    gcloud sql databases create "$database" --instance="$instance"
  fi
}

#######################################
# Create `app-user`, or reset its password when it already exists, so the
# password recorded by this run is always the one that is valid.
# Arguments: $1 - instance, $2 - password
#######################################
ensure_app_user() {
  local instance=$1 password=$2
  if gcloud sql users list --instance="$instance" --format="value(name)" \
    | grep -qx 'app-user'; then
    gcloud sql users set-password app-user \
      --instance="$instance" --password="$password"
  else
    gcloud sql users create app-user \
      --instance="$instance" --password="$password"
  fi
}

echo "🗄️ 建立 Cloud SQL 實例..."

# Staging database (small configuration)
if ! gcloud sql instances describe ai-assistant-staging &>/dev/null; then
  gcloud sql instances create ai-assistant-staging \
    --database-version=POSTGRES_15 \
    --tier=db-f1-micro \
    --region="$REGION" \
    --storage-size=10GB \
    --storage-type=SSD \
    --backup-start-time=03:00 \
    --maintenance-window-day=SUN \
    --maintenance-window-hour=04 \
    --deletion-protection
fi

# Production database (regional / high-availability configuration)
if ! gcloud sql instances describe ai-assistant-prod &>/dev/null; then
  gcloud sql instances create ai-assistant-prod \
    --database-version=POSTGRES_15 \
    --tier=db-g1-small \
    --region="$REGION" \
    --storage-size=20GB \
    --storage-type=SSD \
    --availability-type=REGIONAL \
    --backup-start-time=02:00 \
    --maintenance-window-day=SUN \
    --maintenance-window-hour=03 \
    --deletion-protection
fi

# Databases
ensure_database ai-assistant-staging ai_assistant_staging
ensure_database ai-assistant-prod ai_assistant_prod

# Database users.
# Hex output only contains [0-9a-f], so the password can be embedded in a
# connection URL as-is (base64 would add '/', '+' and '=').
STAGING_DB_PASSWORD=$(openssl rand -hex 32)
PROD_DB_PASSWORD=$(openssl rand -hex 32)
ensure_app_user ai-assistant-staging "$STAGING_DB_PASSWORD"
ensure_app_user ai-assistant-prod "$PROD_DB_PASSWORD"

# Connection information
STAGING_CONNECTION_NAME=$(gcloud sql instances describe ai-assistant-staging \
  --format="value(connectionName)")
PROD_CONNECTION_NAME=$(gcloud sql instances describe ai-assistant-prod \
  --format="value(connectionName)")

# Persist the credentials with owner-only permissions; the subshell keeps the
# restrictive umask from leaking into the caller.
(
  umask 077
  cat > "$CREDENTIALS_FILE" <<EOF
export STAGING_DB_PASSWORD='${STAGING_DB_PASSWORD}'
export STAGING_CONNECTION_NAME='${STAGING_CONNECTION_NAME}'
export PROD_DB_PASSWORD='${PROD_DB_PASSWORD}'
export PROD_CONNECTION_NAME='${PROD_CONNECTION_NAME}'
EOF
)

echo "✅ 資料庫建立完成"
echo "📝 請記住這些資訊:"
echo "Staging Connection: $STAGING_CONNECTION_NAME"
echo "Prod Connection: $PROD_CONNECTION_NAME"
echo "資料庫密碼已寫入 $CREDENTIALS_FILE(權限 600);建立 Secrets 前請先 source 此檔,完成後請刪除"
-- scripts/init-database.sql
-- Run: psql -v ON_ERROR_STOP=1 -h [DB_HOST] -U app-user -d ai_assistant_staging < init-database.sql
--
-- Creates the schema used by memory-service. The whole script runs in one
-- transaction and every statement is re-runnable, so a failed or repeated
-- run never leaves the schema half-initialised.
BEGIN;

-- Required extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
CREATE EXTENSION IF NOT EXISTS "pg_trgm";

-- conversation_history: one row per chat message
CREATE TABLE IF NOT EXISTS conversation_history (
    id SERIAL PRIMARY KEY,
    chat_id VARCHAR(36) NOT NULL,
    user_id VARCHAR(36) NOT NULL,
    role VARCHAR(20) NOT NULL CHECK (role IN ('user', 'assistant', 'system')),
    content TEXT NOT NULL,
    metadata JSONB DEFAULT '{}',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes
-- NOTE(review): idx_conversation_user_id is a prefix of
-- idx_conversation_composite and is probably redundant; kept so existing
-- deployments keep the same index set.
CREATE INDEX IF NOT EXISTS idx_conversation_chat_id ON conversation_history(chat_id);
CREATE INDEX IF NOT EXISTS idx_conversation_user_id ON conversation_history(user_id);
CREATE INDEX IF NOT EXISTS idx_conversation_created_at ON conversation_history(created_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversation_composite ON conversation_history(user_id, chat_id, created_at DESC);

-- user_memory: one row per user (enforced by the UNIQUE constraint)
CREATE TABLE IF NOT EXISTS user_memory (
    id SERIAL PRIMARY KEY,
    user_id VARCHAR(36) UNIQUE NOT NULL,
    short_term_summary TEXT,
    long_term_memory JSONB DEFAULT '{}',
    preferences JSONB DEFAULT '{}',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- NOTE(review): the UNIQUE constraint on user_id already creates an index,
-- so this one duplicates it; kept for compatibility.
CREATE INDEX IF NOT EXISTS idx_user_memory_user_id ON user_memory(user_id);

-- Trigger function: stamp updated_at on every UPDATE
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- CREATE TRIGGER has no IF NOT EXISTS form, so drop first to stay re-runnable.
DROP TRIGGER IF EXISTS update_conversation_history_updated_at ON conversation_history;
CREATE TRIGGER update_conversation_history_updated_at
    BEFORE UPDATE ON conversation_history
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

DROP TRIGGER IF EXISTS update_user_memory_updated_at ON user_memory;
CREATE TRIGGER update_user_memory_updated_at
    BEFORE UPDATE ON user_memory
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

-- Seed data (staging only): the WHERE clause makes the insert a no-op when
-- this script is run against any other database, e.g. production.
INSERT INTO user_memory (user_id, short_term_summary, preferences)
SELECT 'test-user-123', '測試用戶', '{"language": "zh-TW", "tone": "friendly"}'::jsonb
WHERE current_database() = 'ai_assistant_staging'
ON CONFLICT (user_id) DO NOTHING;

COMMIT;
#!/bin/bash
# scripts/setup-secrets.sh
#
# Stores the JWT signing keys, the Gemini API key and the database URLs in
# Secret Manager for staging and production, then lets the chat and memory
# service accounts read them. Safe to re-run.
#
# Required environment:
#   PROJECT_ID
#   GEMINI_API_KEY                                  real key, never hard-coded here
#   STAGING_DB_PASSWORD, STAGING_CONNECTION_NAME    from setup-database.sh
#   PROD_DB_PASSWORD, PROD_CONNECTION_NAME          from setup-database.sh
set -euo pipefail

: "${PROJECT_ID:?PROJECT_ID 未設定}"
: "${GEMINI_API_KEY:?GEMINI_API_KEY 未設定}"
: "${STAGING_DB_PASSWORD:?STAGING_DB_PASSWORD 未設定}"
: "${STAGING_CONNECTION_NAME:?STAGING_CONNECTION_NAME 未設定}"
: "${PROD_DB_PASSWORD:?PROD_DB_PASSWORD 未設定}"
: "${PROD_CONNECTION_NAME:?PROD_CONNECTION_NAME 未設定}"

if [[ "$GEMINI_API_KEY" == "your-actual-gemini-api-key" ]]; then
  echo "❌ GEMINI_API_KEY 仍是範例值,請換成真實的 API Key" >&2
  exit 1
fi

#######################################
# Percent-encode a string for use in the userinfo part of a URL.
# Arguments: $1 - raw string
# Outputs:   encoded string on stdout
#######################################
urlencode() {
  local raw=$1 encoded='' ch hex i
  local LC_ALL=C
  for ((i = 0; i < ${#raw}; i++)); do
    ch=${raw:i:1}
    case "$ch" in
      [a-zA-Z0-9.~_-]) encoded+=$ch ;;
      *)
        printf -v hex '%%%02X' "'$ch"
        encoded+=$hex
        ;;
    esac
  done
  printf '%s' "$encoded"
}

secret_exists() {
  gcloud secrets describe "$1" </dev/null &>/dev/null
}

#######################################
# Store stdin as a secret: create it, or add a new version if it exists.
# Arguments: $1 - secret name
#######################################
put_secret() {
  local name=$1
  if secret_exists "$name"; then
    gcloud secrets versions add "$name" --data-file=-
  else
    gcloud secrets create "$name" --data-file=-
  fi
}

#######################################
# Build the libpq URL for the Cloud SQL unix socket that Cloud Run mounts
# under /cloudsql/<connection name>.
# Arguments: $1 - password, $2 - connection name, $3 - database
#######################################
database_url() {
  local password=$1 connection_name=$2 database=$3
  printf 'postgresql://app-user:%s@/%s?host=/cloudsql/%s' \
    "$(urlencode "$password")" "$database" "$connection_name"
}

echo "🔐 建立 Secret Manager 祕密..."

for env in staging prod; do
  # One independent signing key per environment, so a staging token can never
  # be replayed against production. An existing key is left untouched because
  # rotating it would invalidate every token already issued.
  if ! secret_exists "jwt-secret-${env}"; then
    openssl rand -base64 64 | tr -d '\n' | put_secret "jwt-secret-${env}"
  fi
  printf '%s' "$GEMINI_API_KEY" | put_secret "gemini-api-key-${env}"
done

database_url "$STAGING_DB_PASSWORD" "$STAGING_CONNECTION_NAME" ai_assistant_staging \
  | put_secret database-url-staging
database_url "$PROD_DB_PASSWORD" "$PROD_CONNECTION_NAME" ai_assistant_prod \
  | put_secret database-url-prod

# Per-secret access for the service accounts
for env in staging prod; do
  for secret in jwt-secret gemini-api-key database-url; do
    for sa in chat-service-sa memory-service-sa; do
      gcloud secrets add-iam-policy-binding "${secret}-${env}" \
        --member="serviceAccount:${sa}@${PROJECT_ID}.iam.gserviceaccount.com" \
        --role="roles/secretmanager.secretAccessor" >/dev/null
    done
  done
done

echo "✅ Secrets 建立完成"
# shared/secrets.py
import os
from google.cloud import secretmanager
from functools import lru_cache
import logging
logger = logging.getLogger(__name__)
class SecretManager:
    """Small caching wrapper around the Google Secret Manager client.

    Values are cached per instance, keyed by ``(secret_name, version)``.
    A per-instance dict is used instead of ``functools.lru_cache`` on the
    method: the decorator's cache lives on the class, keeps every instance
    alive for the life of the process and offers no per-instance
    invalidation.
    """

    def __init__(self, project_id: str):
        """Create the client for ``project_id`` and start with an empty cache."""
        self.project_id = project_id
        self.client = secretmanager.SecretManagerServiceClient()
        self._cache = {}

    def get_secret(self, secret_name: str, version: str = "latest") -> str:
        """Return the secret payload decoded as UTF-8 (cached after first read).

        Args:
            secret_name: Secret id inside ``self.project_id``.
            version: Version to read; ``"latest"`` by default. A cached
                ``"latest"`` is not refreshed until ``clear_cache()`` is called.

        Raises:
            Exception: whatever the client raises; the error is logged and
                re-raised, and nothing is cached for the failed lookup.
        """
        key = (secret_name, version)
        if key in self._cache:
            return self._cache[key]

        name = f"projects/{self.project_id}/secrets/{secret_name}/versions/{version}"
        try:
            response = self.client.access_secret_version(request={"name": name})
            value = response.payload.data.decode("UTF-8")
        except Exception as e:
            logger.error(f"無法取得祕密 {secret_name}: {e}")
            raise

        self._cache[key] = value
        return value

    def clear_cache(self) -> None:
        """Forget all cached values, e.g. after a secret has been rotated."""
        self._cache.clear()

    def get_database_url(self, environment: str) -> str:
        """Return the ``database-url-<environment>`` secret."""
        return self.get_secret(f"database-url-{environment}")

    def get_jwt_secret(self, environment: str) -> str:
        """Return the ``jwt-secret-<environment>`` signing key."""
        return self.get_secret(f"jwt-secret-{environment}")

    def get_gemini_api_key(self, environment: str) -> str:
        """Return the ``gemini-api-key-<environment>`` secret."""
        return self.get_secret(f"gemini-api-key-{environment}")
# Usage example
@lru_cache(maxsize=None)
def _secret_manager_for(project_id: str) -> "SecretManager":
    """Build the SecretManager for ``project_id`` once and reuse it."""
    return SecretManager(project_id)


def get_secret_manager() -> "SecretManager":
    """Return the shared SecretManager for the project in ``GCP_PROJECT_ID``.

    The instance is memoised per project id, so repeated calls reuse one
    client (and its cached secrets) instead of constructing a new client
    every time.

    Raises:
        ValueError: if ``GCP_PROJECT_ID`` is unset or empty.
    """
    project_id = os.getenv("GCP_PROJECT_ID")
    if not project_id:
        raise ValueError("GCP_PROJECT_ID 環境變數未設定")
    return _secret_manager_for(project_id)
# cloudbuild.yaml
#
# Pipeline: build -> push -> vulnerability scan -> deploy staging ->
# integration test -> (optional) deploy production.
#
# Substitution rules used below:
#   * $PROJECT_ID, $SHORT_SHA and ${_NAME} are replaced by Cloud Build.
#   * Shell variables inside bash scripts are written $$NAME so Cloud Build
#     passes a literal `$NAME` through to bash.
#   * SHORT_SHA is only populated for triggered builds; manual
#     `gcloud builds submit` runs must pass it via --substitutions.
#
# NOTE(review): the memory services are deployed with
# --no-allow-unauthenticated, so chat-service-sa presumably needs
# roles/run.invoker on them — confirm against the service code.
steps:
  # ==================== Build ====================
  # 1. Build Chat Service
  - name: 'gcr.io/cloud-builders/docker'
    id: 'build-chat-service'
    args: [
      'build',
      '-t', '${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/chat-service:$SHORT_SHA',
      '-t', '${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/chat-service:latest',
      '-f', 'services/chat/Dockerfile',
      '.'
    ]

  # 2. Build Memory Service
  - name: 'gcr.io/cloud-builders/docker'
    id: 'build-memory-service'
    args: [
      'build',
      '-t', '${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/memory-service:$SHORT_SHA',
      '-t', '${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/memory-service:latest',
      '-f', 'services/memory/Dockerfile',
      '.'
    ]

  # 3. Push images to Artifact Registry.
  # Both pushes carry an id so later steps can wait for BOTH of them;
  # otherwise a deploy could start before its image has been uploaded.
  - name: 'gcr.io/cloud-builders/docker'
    id: 'push-chat-service'
    args: ['push', '--all-tags', '${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/chat-service']
    waitFor: ['build-chat-service']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'push-memory-service'
    args: ['push', '--all-tags', '${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/memory-service']
    waitFor: ['build-memory-service']

  # ==================== Test ====================
  # 4. Security scan (On-Demand Scanning; requires ondemandscanning.googleapis.com)
  - name: 'gcr.io/cloud-builders/gcloud'
    id: 'security-scan'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        set -euo pipefail
        echo "🔍 執行安全掃描..."
        for service in chat-service memory-service; do
          gcloud artifacts docker images scan \
            "${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/$$service:$SHORT_SHA" \
            --remote
        done
    waitFor: ['push-chat-service', 'push-memory-service']

  # ==================== Staging deployment ====================
  # 5. Deploy to Staging - Memory Service
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    id: 'deploy-memory-staging'
    entrypoint: 'gcloud'
    args:
      - 'run'
      - 'deploy'
      - 'memory-service-staging'
      - '--image=${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/memory-service:$SHORT_SHA'
      - '--region=${_REGION}'
      - '--platform=managed'
      - '--service-account=memory-service-sa@$PROJECT_ID.iam.gserviceaccount.com'
      - '--set-secrets=DATABASE_URL=database-url-staging:latest'
      - '--set-env-vars=ENVIRONMENT=staging,GCP_PROJECT_ID=$PROJECT_ID'
      - '--min-instances=0'
      - '--max-instances=3'
      - '--cpu=1'
      - '--memory=512Mi'
      - '--concurrency=80'
      - '--timeout=300s'
      - '--add-cloudsql-instances=$PROJECT_ID:${_REGION}:ai-assistant-staging'
      - '--no-allow-unauthenticated'
    waitFor: ['security-scan']

  # 6. Deploy to Staging - Chat Service
  # Runs through bash so the real memory-service URL can be looked up instead
  # of shipping a hard-coded placeholder host name.
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    id: 'deploy-chat-staging'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        set -euo pipefail
        MEMORY_URL=$$(gcloud run services describe memory-service-staging \
          --region=${_REGION} \
          --format="value(status.url)")
        gcloud run deploy chat-service-staging \
          --image=${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/chat-service:$SHORT_SHA \
          --region=${_REGION} \
          --platform=managed \
          --service-account=chat-service-sa@$PROJECT_ID.iam.gserviceaccount.com \
          --set-secrets=JWT_SECRET=jwt-secret-staging:latest,GEMINI_API_KEY=gemini-api-key-staging:latest \
          --set-env-vars="ENVIRONMENT=staging,GCP_PROJECT_ID=$PROJECT_ID,MEMORY_SERVICE_URL=$$MEMORY_URL,VERTEX_LOCATION=${_REGION}" \
          --min-instances=1 \
          --max-instances=5 \
          --cpu=1 \
          --memory=1Gi \
          --concurrency=80 \
          --timeout=300s \
          --allow-unauthenticated
    waitFor: ['deploy-memory-staging']

  # ==================== Integration test ====================
  # 7. Staging integration test.
  # Uses the cloud-sdk image because the script needs both gcloud and curl.
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    id: 'integration-test'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        set -euo pipefail
        echo "🧪 執行整合測試..."

        # Look up the Chat Service URL
        CHAT_URL=$$(gcloud run services describe chat-service-staging \
          --region=${_REGION} \
          --format="value(status.url)")

        # Health check: poll until the new revision answers (max ~60s)
        for attempt in 1 2 3 4 5 6; do
          if curl -fsS --max-time 10 "$$CHAT_URL/health"; then
            break
          fi
          if [ "$$attempt" -eq 6 ]; then
            echo "❌ 健康檢查失敗"
            exit 1
          fi
          sleep 10
        done

        # Basic conversation test
        curl -f --max-time 60 -X POST "$$CHAT_URL/chat" \
          -H "Content-Type: application/json" \
          -d '{"message": "你好", "user_id": "test-user", "processing_mode": "sync"}'

        echo "✅ 整合測試通過"
    waitFor: ['deploy-chat-staging']

  # ==================== Production deployment ====================
  # 8. Production is only deployed when _DEPLOY_TO_PROD=true
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    id: 'deploy-memory-prod'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        set -euo pipefail
        if [ "$_DEPLOY_TO_PROD" = "true" ]; then
          echo "🚀 部署到 Production..."
          gcloud run deploy memory-service-prod \
            --image=${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/memory-service:$SHORT_SHA \
            --region=${_REGION} \
            --platform=managed \
            --service-account=memory-service-sa@$PROJECT_ID.iam.gserviceaccount.com \
            --set-secrets=DATABASE_URL=database-url-prod:latest \
            --set-env-vars=ENVIRONMENT=production,GCP_PROJECT_ID=$PROJECT_ID \
            --min-instances=2 \
            --max-instances=20 \
            --cpu=2 \
            --memory=1Gi \
            --concurrency=100 \
            --timeout=300s \
            --add-cloudsql-instances=$PROJECT_ID:${_REGION}:ai-assistant-prod \
            --no-allow-unauthenticated
        else
          echo "⏸️ 跳過 Production 部署(需要手動觸發)"
        fi
    waitFor: ['integration-test']

  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    id: 'deploy-chat-prod'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        set -euo pipefail
        if [ "$_DEPLOY_TO_PROD" = "true" ]; then
          echo "🚀 部署 Chat Service 到 Production..."
          MEMORY_URL=$$(gcloud run services describe memory-service-prod \
            --region=${_REGION} \
            --format="value(status.url)")
          gcloud run deploy chat-service-prod \
            --image=${_REGION}-docker.pkg.dev/$PROJECT_ID/ai-assistant/chat-service:$SHORT_SHA \
            --region=${_REGION} \
            --platform=managed \
            --service-account=chat-service-sa@$PROJECT_ID.iam.gserviceaccount.com \
            --set-secrets=JWT_SECRET=jwt-secret-prod:latest,GEMINI_API_KEY=gemini-api-key-prod:latest \
            --set-env-vars="ENVIRONMENT=production,GCP_PROJECT_ID=$PROJECT_ID,MEMORY_SERVICE_URL=$$MEMORY_URL,VERTEX_LOCATION=${_REGION}" \
            --min-instances=2 \
            --max-instances=50 \
            --cpu=2 \
            --memory=2Gi \
            --concurrency=100 \
            --timeout=300s \
            --allow-unauthenticated
        else
          echo "⏸️ 跳過 Production 部署(需要手動觸發)"
        fi
    waitFor: ['deploy-memory-prod']

# Substitution variables
substitutions:
  _REGION: 'asia-east1'
  _DEPLOY_TO_PROD: 'false'  # production is not deployed by default

# Build options
options:
  logging: CLOUD_LOGGING_ONLY
  machineType: 'E2_HIGHCPU_8'
  substitution_option: 'ALLOW_LOOSE'

# Overall build timeout (30 minutes)
timeout: '1800s'
#!/bin/bash
# scripts/deploy.sh
#
# Usage: deploy.sh [staging|production] [branch]
#
# Submits the current working tree to Cloud Build. `gcloud builds submit`
# uploads local files and has no --branch option, so the branch argument is
# enforced by checking that it is the branch currently checked out.
# SHORT_SHA is passed explicitly because Cloud Build only fills it in for
# triggered builds; without it the image tag would be empty.
set -euo pipefail

ENVIRONMENT=${1:-staging} # deploy to staging by default
BRANCH=${2:-main}         # deploy from main by default

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Submit the build.
# Arguments: $1 - value for _DEPLOY_TO_PROD ("true" or "false")
#######################################
submit_build() {
  local deploy_to_prod=$1
  local current_branch short_sha

  current_branch=$(git rev-parse --abbrev-ref HEAD) \
    || die "❌ 無法取得目前的 git 分支"
  [[ "$current_branch" == "$BRANCH" ]] \
    || die "❌ 目前分支是 $current_branch,請先切換到 $BRANCH 再部署"
  short_sha=$(git rev-parse --short HEAD) || die "❌ 無法取得 commit SHA"

  gcloud builds submit . \
    --config cloudbuild.yaml \
    --substitutions "_DEPLOY_TO_PROD=${deploy_to_prod},SHORT_SHA=${short_sha}"
}

echo "🚀 開始部署到 $ENVIRONMENT 環境..."
case "$ENVIRONMENT" in
  staging)
    echo "📋 部署到 Staging 環境"
    submit_build false
    ;;
  production)
    echo "⚠️ 部署到 Production 環境需要確認"
    read -r -p "確定要部署到 Production 嗎?(y/N): " confirm
    if [[ "$confirm" == [yY] ]]; then
      submit_build true
    else
      echo "❌ 取消部署"
      exit 1
    fi
    ;;
  *)
    echo "❌ 不支援的環境: $ENVIRONMENT"
    echo "支援的環境: staging, production"
    exit 1
    ;;
esac
echo "✅ 部署完成!"
#!/bin/bash
# scripts/setup-domain.sh
#
# Maps the staging and production domains to the chat services and prints
# the DNS records that must be created at the domain provider.
#
# Required environment: REGION, STAGING_DOMAIN, PROD_DOMAIN
# Both Cloud Run services must already be deployed.
set -euo pipefail

: "${REGION:?REGION 未設定}"
: "${STAGING_DOMAIN:?STAGING_DOMAIN 未設定}"
: "${PROD_DOMAIN:?PROD_DOMAIN 未設定}"

#######################################
# Create the domain mapping unless it already exists.
# Arguments: $1 - Cloud Run service, $2 - domain
#######################################
ensure_domain_mapping() {
  local service=$1 domain=$2
  if gcloud run domain-mappings describe \
    --domain "$domain" --region "$REGION" &>/dev/null; then
    return 0
  fi
  gcloud run domain-mappings create \
    --service "$service" \
    --domain "$domain" \
    --region "$REGION"
}

#######################################
# Print every DNS record of a mapping. A sub-domain gets a single CNAME,
# while an apex domain gets several A/AAAA records, so all are listed
# instead of only the first one.
# Arguments: $1 - domain
#######################################
print_dns_records() {
  local domain=$1
  gcloud run domain-mappings describe \
    --domain "$domain" --region "$REGION" \
    --flatten="status.resourceRecords[]" \
    --format="table[no-heading](status.resourceRecords.name,status.resourceRecords.type,status.resourceRecords.rrdata)"
}

# Staging domain
ensure_domain_mapping chat-service-staging "$STAGING_DOMAIN"
# Production domain
ensure_domain_mapping chat-service-prod "$PROD_DOMAIN"

echo "✅ 網域設定完成"
echo "📝 請將以下 DNS 記錄加入你的網域提供商:"
# Records are read straight from gcloud; no temporary files are needed.
echo "Staging ($STAGING_DOMAIN):"
print_dns_records "$STAGING_DOMAIN"
echo "Production ($PROD_DOMAIN):"
print_dns_records "$PROD_DOMAIN"
# Cloud Run manages TLS certificates for mapped domains automatically; this
# manifest is only for the case where a certificate must be declared by hand.
# NOTE(review): networking.gke.io is the GKE API group, so this resource
# presumably applies to a GKE cluster rather than to Cloud Run — confirm.
apiVersion: networking.gke.io/v1
kind: ManagedCertificate
metadata: {name: ai-assistant-ssl}
spec:
  domains: [yourdomain.com, staging.yourdomain.com]
# shared/logging_config.py
import json
import logging
import sys
from datetime import datetime, timezone
from typing import Any, Dict
class GCPFormatter(logging.Formatter):
    """Formats log records as one-line JSON objects for Google Cloud Logging."""

    # Optional attributes (set through ``extra=``) copied verbatim when present.
    _EXTRA_FIELDS = ("user_id", "chat_id", "duration_ms")

    def format(self, record: logging.LogRecord) -> str:
        """Serialise ``record`` to a JSON string.

        Always emitted: timestamp, severity, message, logger, module,
        function, line. Emitted when available: the trace resource name
        (needs both ``trace_id`` and ``project_id`` on the record), the
        fields in ``_EXTRA_FIELDS`` and an ``error`` object built from
        ``exc_info``.
        """
        # record.created is epoch seconds; convert explicitly to UTC so the
        # trailing "Z" is true regardless of the container's local time zone.
        timestamp = (
            datetime.fromtimestamp(record.created, tz=timezone.utc)
            .isoformat()
            .replace("+00:00", "Z")
        )
        log_entry = {
            "timestamp": timestamp,
            "severity": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
        }

        # The trace resource name needs the project id as well; skip it rather
        # than raising from inside the logging call when either part is missing.
        trace_id = getattr(record, "trace_id", None)
        project_id = getattr(record, "project_id", None)
        if trace_id and project_id:
            log_entry["logging.googleapis.com/trace"] = (
                f"projects/{project_id}/traces/{trace_id}"
            )

        for field in self._EXTRA_FIELDS:
            if hasattr(record, field):
                log_entry[field] = getattr(record, field)

        # exc_info is (None, None, None) when logger.exception() is called
        # outside an ``except`` block; there is nothing to report then.
        exc_info = record.exc_info
        if exc_info and exc_info[0] is not None:
            log_entry["error"] = {
                "type": exc_info[0].__name__,
                "message": str(exc_info[1]),
                "stack_trace": self.formatException(exc_info),
            }

        # default=str keeps non-JSON values passed via ``extra`` from raising.
        return json.dumps(log_entry, ensure_ascii=False, default=str)
def setup_logging(service_name: str, level: str = "INFO"):
    """Route all logging to stdout as GCP-formatted JSON.

    The root logger is set to ``level``, stripped of whatever handlers it
    already had and given a single stdout handler using ``GCPFormatter``.

    Args:
        service_name: Name of the logger to return.
        level: Name of a ``logging`` level constant, case-insensitive.

    Returns:
        The logger named ``service_name``.
    """
    root = logging.getLogger()
    root.setLevel(getattr(logging, level.upper()))

    # Drop every pre-existing handler (iterate over a copy while removing).
    for existing in list(root.handlers):
        root.removeHandler(existing)

    # Single stdout handler emitting JSON lines.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(GCPFormatter())
    root.addHandler(stdout_handler)

    # Service-specific logger
    return logging.getLogger(service_name)
# Usage example
logger = setup_logging("chat-service")


# Used from business code
def log_request(user_id: str, chat_id: str, duration_ms: float):
    """Log one handled request with its user, chat and duration as extras."""
    request_fields = {
        "user_id": user_id,
        "chat_id": chat_id,
        "duration_ms": duration_ms,
    }
    logger.info("處理用戶請求", extra=request_fields)
# shared/metrics.py
from google.cloud import monitoring_v3
import time
from functools import wraps
import os
class MetricsCollector:
    """Writes custom Cloud Monitoring metrics for one service."""

    def __init__(self, project_id: str, service_name: str):
        """Create the Monitoring client for ``project_id``."""
        self.project_id = project_id
        self.service_name = service_name
        self.client = monitoring_v3.MetricServiceClient()
        self.project_name = f"projects/{project_id}"

    def record_request_duration(self, endpoint: str, duration_ms: float, status: str):
        """Write one request-latency point.

        Args:
            endpoint: Endpoint label for the point.
            duration_ms: Latency in milliseconds (stored as ``double_value``).
            status: Status label, e.g. ``"success"`` or ``"error"``.

        NOTE(review): this issues one synchronous API call per request.
        Cloud Monitoring limits how often a single time series may be
        written, so high-traffic services presumably need to batch or
        aggregate before calling this — confirm against current quotas.
        """
        series = monitoring_v3.TimeSeries()
        series.metric.type = "custom.googleapis.com/ai_assistant/request_duration"
        series.metric.labels['service'] = self.service_name
        series.metric.labels['endpoint'] = endpoint
        series.metric.labels['status'] = status

        # User-defined metrics can only be written against a fixed set of
        # monitored-resource types; `cloud_run_revision` is not one of them,
        # so `generic_task` is used with all of its labels filled in.
        series.resource.type = 'generic_task'
        series.resource.labels['project_id'] = self.project_id
        series.resource.labels['location'] = os.getenv('VERTEX_LOCATION', 'asia-east1')
        series.resource.labels['namespace'] = 'ai-assistant'
        series.resource.labels['job'] = self.service_name
        # K_REVISION is set by Cloud Run; fall back to "local" elsewhere.
        series.resource.labels['task_id'] = os.getenv('K_REVISION', 'local')

        # Split the current time into the seconds/nanos pair the API expects.
        now = time.time()
        seconds = int(now)
        nanos = int((now - seconds) * 10 ** 9)
        interval = monitoring_v3.TimeInterval({
            "end_time": {"seconds": seconds, "nanos": nanos}
        })
        point = monitoring_v3.Point({
            "interval": interval,
            "value": {"double_value": duration_ms}
        })
        series.points = [point]
        self.client.create_time_series(name=self.project_name, time_series=[series])

    def record_counter(self, metric_name: str, labels: dict = None):
        """Record a counter metric (placeholder: not implemented, does nothing)."""
        pass
# Decorator: record how long an async function takes
def monitor_performance(metrics_collector: "MetricsCollector", endpoint: str):
    """Build a decorator that reports the latency of an ``async`` function.

    Args:
        metrics_collector: Object whose
            ``record_request_duration(endpoint, duration_ms, status)`` is
            called after every invocation.
        endpoint: Endpoint label attached to each measurement.

    The status is ``"success"``, or ``"error"`` when the wrapped coroutine
    raises an ``Exception`` (which is re-raised unchanged). A failure while
    recording the metric is logged and swallowed: telemetry must never
    replace the handler's result or mask its original exception.
    """
    import logging

    metrics_logger = logging.getLogger(__name__)

    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            # perf_counter is monotonic, so durations are immune to
            # wall-clock adjustments.
            started = time.perf_counter()
            status = "success"
            try:
                return await func(*args, **kwargs)
            except Exception:
                status = "error"
                raise
            finally:
                duration_ms = (time.perf_counter() - started) * 1000
                try:
                    metrics_collector.record_request_duration(endpoint, duration_ms, status)
                except Exception:
                    metrics_logger.warning(
                        "failed to record request duration for %s",
                        endpoint,
                        exc_info=True,
                    )
        return wrapper
    return decorator
# monitoring/alert-policies.yaml
#
# Two alert policies: 5xx error ratio on the Cloud Run services and latency
# of the custom request_duration metric.
# NOTE(review): the `alertPolicy:` wrapper key is kept as in the original;
# `gcloud ... monitoring policies create --policy-from-file` expects the
# policy fields at the top level — confirm what consumes this file.
alertPolicy:
  displayName: "AI Assistant High Error Rate"
  combiner: OR  # required by the AlertPolicy API even with one condition
  conditions:
    - displayName: "Error rate > 5%"
      conditionThreshold:
        # Ratio condition: 5xx request rate divided by total request rate.
        # Monitoring filters need an explicit metric.type and use
        # monitoring.regex.full_match() for regular expressions.
        filter: 'metric.type="run.googleapis.com/request_count" AND resource.type="cloud_run_revision" AND resource.labels.service_name=monitoring.regex.full_match("chat-service.*|memory-service.*") AND metric.labels.response_code_class="5xx"'
        aggregations:
          - alignmentPeriod: "60s"
            perSeriesAligner: ALIGN_RATE
            crossSeriesReducer: REDUCE_SUM
        denominatorFilter: 'metric.type="run.googleapis.com/request_count" AND resource.type="cloud_run_revision" AND resource.labels.service_name=monitoring.regex.full_match("chat-service.*|memory-service.*")'
        denominatorAggregations:
          - alignmentPeriod: "60s"
            perSeriesAligner: ALIGN_RATE
            crossSeriesReducer: REDUCE_SUM
        comparison: COMPARISON_GREATER_THAN
        thresholdValue: 0.05
        duration: "300s"
  notificationChannels:
    - "projects/PROJECT_ID/notificationChannels/CHANNEL_ID"
  alertStrategy:
    autoClose: "1800s"
---
alertPolicy:
  displayName: "AI Assistant High Latency"
  combiner: OR
  conditions:
    - displayName: "P95 latency > 5 seconds"
      conditionThreshold:
        filter: 'metric.type="custom.googleapis.com/ai_assistant/request_duration"'
        comparison: COMPARISON_GREATER_THAN
        thresholdValue: 5000  # milliseconds
        duration: "300s"
        aggregations:
          # The metric is written as a double gauge, so ALIGN_DELTA (valid
          # only for DELTA/CUMULATIVE metrics) cannot be used. Each series
          # is averaged per window, then the 95th percentile is taken
          # across series.
          # NOTE(review): this approximates, but is not, a per-request P95.
          - alignmentPeriod: "300s"
            perSeriesAligner: ALIGN_MEAN
            crossSeriesReducer: REDUCE_PERCENTILE_95
#!/bin/bash
# scripts/setup-network.sh
#
# Creates the private VPC, its subnets, the Serverless VPC Access connector
# used by Cloud Run and an internal-traffic firewall rule. Safe to re-run.
#
# Required environment: PROJECT_ID, REGION
# Optional environment: CONNECTOR_RANGE (default 10.8.0.0/28)
set -euo pipefail

: "${PROJECT_ID:?PROJECT_ID 未設定}"
: "${REGION:?REGION 未設定}"

readonly NETWORK="ai-assistant-vpc"
readonly APP_RANGE="10.0.0.0/24"
# A VPC connector needs a dedicated /28 subnet of its own; the /24
# application subnet cannot be reused for it.
readonly CONNECTOR_RANGE="${CONNECTOR_RANGE:-10.8.0.0/28}"

#######################################
# Create a subnet in $NETWORK unless it already exists.
# Arguments: $1 - subnet name, $2 - CIDR range
#######################################
ensure_subnet() {
  local name=$1 range=$2
  if ! gcloud compute networks subnets describe "$name" \
    --region "$REGION" &>/dev/null; then
    gcloud compute networks subnets create "$name" \
      --network "$NETWORK" \
      --range "$range" \
      --region "$REGION"
  fi
}

echo "🌐 設定網路安全..."

# VPC (only needed when a private network is required)
if ! gcloud compute networks describe "$NETWORK" &>/dev/null; then
  gcloud compute networks create "$NETWORK" \
    --subnet-mode custom
fi

# Subnets
ensure_subnet ai-assistant-subnet "$APP_RANGE"
ensure_subnet ai-assistant-connector-subnet "$CONNECTOR_RANGE"

# VPC connector (lets Cloud Run reach the VPC)
if ! gcloud compute networks vpc-access connectors describe ai-assistant-connector \
  --region "$REGION" &>/dev/null; then
  gcloud compute networks vpc-access connectors create ai-assistant-connector \
    --region "$REGION" \
    --subnet ai-assistant-connector-subnet \
    --subnet-project "$PROJECT_ID" \
    --min-instances 2 \
    --max-instances 3 \
    --machine-type f1-micro
fi

# Firewall rule: allow traffic inside the VPC, including traffic that
# arrives from Cloud Run through the connector subnet.
if ! gcloud compute firewall-rules describe allow-internal &>/dev/null; then
  gcloud compute firewall-rules create allow-internal \
    --network "$NETWORK" \
    --allow tcp,udp,icmp \
    --source-ranges "${APP_RANGE},${CONNECTOR_RANGE}"
fi

echo "✅ 網路設定完成"
# shared/security.py
from typing import Callable

from fastapi import FastAPI, Request, Response
# BaseHTTPMiddleware is provided by Starlette (which FastAPI is built on);
# there is no `fastapi.middleware.base` module, so importing from there
# fails with ModuleNotFoundError.
from starlette.middleware.base import BaseHTTPMiddleware


class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Adds a fixed set of security headers to every HTTP response."""

    async def dispatch(self, request: Request, call_next: Callable) -> Response:
        """Run the downstream handler, then set the headers on its response."""
        response = await call_next(request)

        # Security headers
        response.headers["X-Content-Type-Options"] = "nosniff"
        response.headers["X-Frame-Options"] = "DENY"
        response.headers["X-XSS-Protection"] = "1; mode=block"
        response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"

        # CSP (Content Security Policy)
        # NOTE(review): connect-src only matters for requests issued by the
        # browser; confirm the listed Gemini host is one the front end
        # actually calls.
        response.headers["Content-Security-Policy"] = (
            "default-src 'self'; "
            "script-src 'self' 'unsafe-inline'; "
            "style-src 'self' 'unsafe-inline'; "
            "img-src 'self' data: https:; "
            "connect-src 'self' https://api.gemini.google.com"
        )
        return response


def add_security_middleware(app: FastAPI):
    """Register ``SecurityHeadersMiddleware`` on ``app``."""
    app.add_middleware(SecurityHeadersMiddleware)
#!/bin/bash
# scripts/setup-backup.sh
#
# Takes an on-demand backup of the production database, configures backup
# retention and prepares a Cloud Storage bucket with lifecycle rules.
#
# Required environment: PROJECT_ID, REGION
set -euo pipefail

: "${PROJECT_ID:?PROJECT_ID 未設定}"
: "${REGION:?REGION 未設定}"

readonly BACKUP_BUCKET="gs://${PROJECT_ID}-backups"

echo "💾 設定備份策略..."

# Automated Cloud SQL backups were enabled when the instance was created.
# This takes one additional on-demand backup right now (it is not a schedule).
gcloud sql backups create \
  --instance=ai-assistant-prod \
  --description="Manual backup before deployment"

# Backup retention policy
gcloud sql instances patch ai-assistant-prod \
  --backup-start-time=02:00 \
  --backup-location=asia \
  --retained-backups-count=30

# Cloud Storage bucket for backups (created only when missing, so re-runs work)
if ! gsutil ls -b "$BACKUP_BUCKET" &>/dev/null; then
  gsutil mb -l "$REGION" "$BACKUP_BUCKET"
fi

# Lifecycle management: Coldline after 30 days, delete after one year.
# The rules go into a private temp file that is removed on any exit path
# instead of being left behind in the working directory.
lifecycle_file=$(mktemp)
trap 'rm -f -- "$lifecycle_file"' EXIT
cat > "$lifecycle_file" <<'EOF'
{
  "lifecycle": {
    "rule": [
      {
        "action": {"type": "SetStorageClass", "storageClass": "COLDLINE"},
        "condition": {"age": 30}
      },
      {
        "action": {"type": "Delete"},
        "condition": {"age": 365}
      }
    ]
  }
}
EOF
gsutil lifecycle set "$lifecycle_file" "$BACKUP_BUCKET"

echo "✅ 備份設定完成"
#!/bin/bash
# scripts/disaster-recovery.sh
# Disaster-recovery script (for emergencies).
#
# Usage: disaster-recovery.sh [restore-db|failover]
#   (no argument)  only list the service status
#   restore-db     restore the newest successful production backup into
#                  ai-assistant-prod-recovered (that instance must exist)
#   failover       deploy chat-service to the standby region
#
# Required environment: PROJECT_ID, REGION
set -euo pipefail

: "${PROJECT_ID:?PROJECT_ID 未設定}"
: "${REGION:?REGION 未設定}"

action=${1:-}

echo "🚨 執行災難恢復程序..."

# 1. Check service status
echo "📊 檢查服務狀態..."
gcloud run services list --region="$REGION"

# 2. Restore the database from the most recent successful backup
if [[ "$action" == "restore-db" ]]; then
  echo "🗄️ 恢復資料庫..."
  # Sort explicitly: without it "--limit=1" is not guaranteed to be the
  # newest backup, and failed backups must be excluded.
  BACKUP_ID=$(gcloud sql backups list \
    --instance=ai-assistant-prod \
    --filter="status=SUCCESSFUL" \
    --sort-by="~windowStartTime" \
    --limit=1 \
    --format="value(id)")
  if [[ -z "$BACKUP_ID" ]]; then
    echo "❌ 找不到可用的備份" >&2
    exit 1
  fi
  # --backup-instance names the instance the backup was taken from; it is
  # required because the restore target is a different instance.
  gcloud sql backups restore "$BACKUP_ID" \
    --restore-instance=ai-assistant-prod-recovered \
    --backup-instance=ai-assistant-prod
fi

# 3. Fail over to the standby region
if [[ "$action" == "failover" ]]; then
  echo "🔄 切換到備用區域..."
  # The image lives in the primary region's registry. ${_REGION} only exists
  # as a Cloud Build substitution, so the shell variable REGION is used here.
  gcloud run deploy chat-service-failover \
    --image="${REGION}-docker.pkg.dev/${PROJECT_ID}/ai-assistant/chat-service:latest" \
    --region=asia-northeast1 \
    --min-instances=2
fi

echo "✅ 災難恢復完成"
# Performance-tuning configuration (fragment).
# These annotations are revision-scoped, so they go on
# spec.template.metadata.annotations rather than on the Service's own
# metadata. Merge this fragment into the full service manifest (for example
# one exported with `gcloud run services describe --format export`) before
# applying it.
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: chat-service-prod
spec:
  template:
    metadata:
      annotations:
        # Autoscaling
        autoscaling.knative.dev/minScale: "2"
        autoscaling.knative.dev/maxScale: "100"
        # Target concurrency.
        # NOTE(review): confirm Cloud Run accepts this annotation; request
        # concurrency is normally set through containerConcurrency.
        autoscaling.knative.dev/target: "70"

        # Cold-start optimisation
        run.googleapis.com/cpu-throttling: "false"
        run.googleapis.com/execution-environment: gen2

        # Networking
        run.googleapis.com/vpc-access-connector: "ai-assistant-connector"
        run.googleapis.com/vpc-access-egress: "private-ranges-only"
#!/bin/bash
# scripts/cost-monitoring.sh
#
# Shows the resource configuration that drives cost and creates a monthly
# budget with alert thresholds.
#
# Required environment: REGION, BILLING_ACCOUNT_ID, NOTIFICATION_CHANNEL
# NOTE: every run creates a new budget; run the budget step only once.
set -euo pipefail

: "${REGION:?REGION 未設定}"
: "${BILLING_ACCOUNT_ID:?BILLING_ACCOUNT_ID 未設定}"
: "${NOTIFICATION_CHANNEL:?NOTIFICATION_CHANNEL 未設定}"

echo "💰 檢查服務成本..."

# 1. Cloud Run configuration
gcloud run services describe chat-service-prod \
  --region="$REGION" \
  --format="table(metadata.name,status.traffic,spec.template.spec.containers[0].resources)"

# 2. Cloud SQL configuration
gcloud sql instances describe ai-assistant-prod \
  --format="table(name,settings.tier,settings.diskSize,state)"

# 3. Budget alert.
# Thresholds are given as one --threshold-rule per level, with the percent
# expressed as a fraction (0.5 = 50%); Monitoring channels are attached with
# --notifications-rule-monitoring-notification-channels.
gcloud billing budgets create \
  --billing-account="$BILLING_ACCOUNT_ID" \
  --display-name="AI Assistant Monthly Budget" \
  --budget-amount=500USD \
  --threshold-rule=percent=0.5 \
  --threshold-rule=percent=0.8 \
  --threshold-rule=percent=1.0 \
  --notifications-rule-monitoring-notification-channels="$NOTIFICATION_CHANNEL"

echo "✅ 成本監控設定完成"
#!/bin/bash
# scripts/troubleshoot.sh
#
# Read-only diagnostics: service status, recent error logs, database and
# secret listings, a health probe and quota usage.
#
# Required environment: REGION
#
# `set -e` is intentionally not used: each check should still run when an
# earlier one fails, so as much information as possible is collected.
set -uo pipefail

: "${REGION:?REGION 未設定}"

echo "🔧 執行故障排除..."

# 1. Service status
echo "=== 服務狀態 ==="
gcloud run services list --region="$REGION"

# 2. Logs
echo "=== 最近錯誤日誌 ==="
gcloud logging read "resource.type=cloud_run_revision AND severity>=ERROR" \
  --limit=10 --format="table(timestamp,jsonPayload.message)"

# 3. Database
echo "=== 資料庫狀態 ==="
gcloud sql instances list

# 4. Secrets
echo "=== Secret 權限 ==="
gcloud secrets list

# 5. Health check
echo "=== 健康檢查 ==="
CHAT_URL=$(gcloud run services describe chat-service-prod \
  --region="$REGION" --format="value(status.url)")
# An empty URL means the service could not be described; report that as a
# failed check instead of probing the meaningless URL "/health". The timeout
# keeps a hung service from blocking the remaining checks.
if [[ -z "$CHAT_URL" ]] || ! curl -f --max-time 10 "$CHAT_URL/health"; then
  echo "❌ 健康檢查失敗"
fi

# 6. Quota usage
echo "=== 配額使用量 ==="
gcloud compute project-info describe \
  --format="table(quotas.metric,quotas.usage,quotas.limit)"
## 🚀 部署前檢查清單
### 基礎設施
- [ ] GCP 專案已建立並啟用 API
- [ ] Service Accounts 已建立並設定權限
- [ ] Secret Manager 已設定所有祕密
- [ ] Cloud SQL 實例已建立並初始化
- [ ] Pub/Sub Topics 已建立
### 安全性
- [ ] 所有祕密都使用 Secret Manager
- [ ] Service Account 權限遵循最小權限原則
- [ ] HTTPS 強制執行
- [ ] 安全標頭已設定
### 監控
- [ ] Cloud Logging 已啟用
- [ ] 告警規則已設定
- [ ] 預算警報已設定
- [ ] 健康檢查端點正常
### 測試
- [ ] 單元測試通過
- [ ] 整合測試通過
- [ ] 負載測試完成
- [ ] 安全掃描通過
### 備份
- [ ] 資料庫自動備份已啟用
- [ ] 災難恢復計畫已準備
- [ ] 回滾程序已測試
#!/bin/bash
# deploy-all.sh - one-command deployment of the whole system
#
# Runs every setup script in order and finishes with a staging deployment.
# Each step is a separate process, so configuration only reaches it through
# the environment; that is why the config (and the database credentials
# produced along the way) are exported here.
set -euo pipefail

# Relative paths below assume the repository root, where this file lives.
cd "$(dirname "${BASH_SOURCE[0]}")"

#######################################
# Print a step banner and run the given script.
# Arguments: $1 - banner text, $2.. - command to run
#######################################
run_step() {
  local banner=$1
  shift
  echo "$banner"
  "$@"
}

# Load configuration. `set -a` exports everything the file defines so the
# child scripts can see PROJECT_ID, REGION, etc.
set -a
# shellcheck source=/dev/null
source scripts/config.sh
set +a

echo "🚀 開始一鍵部署 AI Assistant 系統..."

# 1. Prerequisites
run_step "🔍 檢查先決條件..." ./scripts/check-prerequisites.sh

# 2. GCP project
run_step "🏗️ 設定 GCP 專案..." ./scripts/setup-gcp-project.sh

# 3. Database.
# Sourced instead of executed: the script defines the DB passwords and
# connection names as shell variables, and setup-secrets.sh needs them.
# Running it as a child process would throw those values away.
echo "🗄️ 建立資料庫..."
set -a
# shellcheck source=/dev/null
source ./scripts/setup-database.sh
set +a

# 4. Service accounts
run_step "👤 設定服務帳號..." ./scripts/setup-service-accounts.sh

# 5. Secrets
run_step "🔐 建立祕密..." ./scripts/setup-secrets.sh

# 6. Network
run_step "🌐 設定網路..." ./scripts/setup-network.sh

# 7. Deploy to staging
run_step "📦 部署到 Staging..." ./scripts/deploy.sh staging

# 8. Integration tests
run_step "🧪 執行整合測試..." ./scripts/integration-test.sh staging

# 9. Monitoring
run_step "📊 設定監控..." ./scripts/setup-monitoring.sh

# 10. Backups
run_step "💾 設定備份..." ./scripts/setup-backup.sh

echo "✅ 系統部署完成!"
echo ""
echo "🔗 服務連結:"
echo " Staging: https://staging.yourdomain.com"
echo " Production: 請執行 './scripts/deploy.sh production'"
echo ""
echo "📊 監控儀表板:"
echo " Cloud Console: https://console.cloud.google.com"
echo " Logs: https://console.cloud.google.com/logs"
echo ""
echo "📝 下一步:"
echo " 1. 設定網域 DNS 記錄"
echo " 2. 配置告警通知"
echo " 3. 執行負載測試"