Day 13: 錯誤處理與異常管理

17th鐵人賽

yuxiu1212

團隊nutc imac

2025-09-27 00:35:53

88 瀏覽

分享至

Day 13: 錯誤處理與異常管理

在 AI 助理開發中，穩定性和可靠性至關重要。今天我們要建立一個完整的錯誤處理與異常管理系統，確保我們的助理在面對各種意外狀況時仍能優雅地運行。

🚨 為什麼需要完善的錯誤處理？

AI 助理在實際運行中會遇到各種挑戰：

🌐 網路問題：API 呼叫失敗、連線逾時
📊 資料異常：輸入格式錯誤、資料不完整
🔑 認證問題：API 金鑰失效、權限不足
💾 資源限制：記憶體不足、檔案讀取失敗
🤖 模型問題：回應解析錯誤、內容過濾

良好的錯誤處理不僅能提升使用者體驗，更能幫助開發者快速定位和解決問題。

🏗 專案結構

robust_assistant/
├── main.py                          # 主程式
├── core/
│   ├── __init__.py
│   ├── error_handler.py             # 錯誤處理核心
│   ├── exception_manager.py         # 異常管理器
│   └── recovery_system.py           # 恢復系統
├── monitoring/
│   ├── __init__.py
│   ├── health_checker.py            # 健康檢查
│   ├── performance_monitor.py       # 效能監控
│   └── alert_system.py              # 警報系統
├── utils/
│   ├── __init__.py
│   ├── logger.py                    # 日誌系統
│   └── retry_decorator.py           # 重試裝飾器
└── workflows/
    ├── __init__.py
    └── robust_workflow.py           # 強健工作流程

🔧 核心實作

1. 自訂異常類別 (core/exception_manager.py)

class AIAssistantException(Exception):
    """Base class for every error raised by the AI assistant.

    Attributes:
        message: Human-readable description; also forwarded to ``Exception``.
        error_code: Short machine-readable category, or ``None``.
        details: Extra structured data; always a dict, never ``None``.
        timestamp: ``datetime.now()`` captured when the exception is built.
    """

    def __init__(self, message: str, error_code: str = None, details: dict = None):
        # Imported locally because no module-level ``datetime`` import is
        # visible for this file; without it, constructing any assistant
        # exception would fail with NameError.
        from datetime import datetime

        super().__init__(message)
        self.message = message
        self.error_code = error_code
        # A missing (or empty) mapping is normalised to a fresh dict.
        self.details = details or {}
        self.timestamp = datetime.now()

class APIException(AIAssistantException):
    """Failure while calling an external API.

    Always carries the error code ``"API_ERROR"``.

    Attributes:
        service_name: Name of the service that was being called.
        status_code: Status code associated with the failure, or ``None``.
    """

    def __init__(self, message: str, service_name: str, status_code: int = None):
        super().__init__(message, error_code="API_ERROR")
        self.status_code = status_code
        self.service_name = service_name

class ValidationException(AIAssistantException):
    """Input data failed validation.

    Always carries the error code ``"VALIDATION_ERROR"``.

    Attributes:
        field_name: Name of the offending field, or ``None``.
        invalid_value: The rejected value (any type), or ``None``.
    """

    def __init__(self, message: str, field_name: str = None, invalid_value=None):
        super().__init__(message, error_code="VALIDATION_ERROR")
        self.invalid_value = invalid_value
        self.field_name = field_name

class ResourceException(AIAssistantException):
    """A resource (file, permission, memory, ...) could not be used.

    Always carries the error code ``"RESOURCE_ERROR"``.

    Attributes:
        resource_type: Kind of resource involved, e.g. ``"FILE"``.
        resource_path: Location of the resource, or ``None``.
    """

    def __init__(self, message: str, resource_type: str, resource_path: str = None):
        super().__init__(message, error_code="RESOURCE_ERROR")
        self.resource_path = resource_path
        self.resource_type = resource_type

class ModelException(AIAssistantException):
    """The language model failed to process a request.

    Always carries the error code ``"MODEL_ERROR"``.

    Attributes:
        model_name: Identifier of the model that failed, or ``None``.
        input_text: The text that was sent to the model, or ``None``.
    """

    def __init__(self, message: str, model_name: str = None, input_text: str = None):
        super().__init__(message, error_code="MODEL_ERROR")
        self.input_text = input_text
        self.model_name = model_name

class ExceptionManager:
    """Maps arbitrary exceptions onto the assistant's exception hierarchy."""

    @staticmethod
    def classify_exception(error: Exception) -> AIAssistantException:
        """Return the ``AIAssistantException`` that best describes ``error``.

        Args:
            error: Any exception instance.

        Returns:
            ``error`` itself when it already is an ``AIAssistantException``;
            otherwise a new instance of the matching subclass, or a base
            ``AIAssistantException`` with code ``"UNKNOWN"``.
        """
        # Already classified: keep the original error_code and attributes.
        # Re-wrapping would turn every typed error into "UNKNOWN" and hide
        # it from code that dispatches on error_code.
        if isinstance(error, AIAssistantException):
            return error

        # NOTE(review): `requests` and `json` must be importable at module
        # level; no import for them is visible in this snippet.
        if isinstance(error, requests.RequestException):
            return APIException(f"網路請求失敗: {str(error)}", "HTTP_REQUEST")

        if isinstance(error, FileNotFoundError):
            # Prefer the real path; it is None when the exception was built
            # from a bare message, in which case fall back to the message.
            missing = error.filename if error.filename is not None else str(error)
            return ResourceException(f"檔案不存在: {str(error)}", "FILE", str(missing))

        if isinstance(error, json.JSONDecodeError):
            return ValidationException(f"JSON 解析失敗: {str(error)}")

        if isinstance(error, PermissionError):
            return ResourceException(f"權限不足: {str(error)}", "PERMISSION")

        if isinstance(error, MemoryError):
            return ResourceException("記憶體不足", "MEMORY")

        return AIAssistantException(f"未知錯誤: {str(error)}", "UNKNOWN")

2. 錯誤處理核心 (core/error_handler.py)

from typing import Dict, Any, Callable, Optional
from functools import wraps
import logging
import traceback
from datetime import datetime, timedelta

class ErrorHandler:
    """Central error handler: classify, log, count, circuit-break, recover.

    Attributes:
        logger: Logger that receives error and warning records.
        error_stats: Maps error code -> timestamps of recent occurrences.
        recovery_strategies: Maps error code -> recovery callable.
        error_threshold: Occurrences within ``time_window`` that trip the
            circuit breaker.
        time_window: How long an occurrence stays in ``error_stats``.
    """

    def __init__(self, logger: logging.Logger = None):
        self.logger = logger or logging.getLogger(__name__)
        self.error_stats = {}
        self.recovery_strategies = {}
        self.error_threshold = 5
        self.time_window = timedelta(minutes=5)

    def register_recovery_strategy(self, error_type: str, strategy: Callable):
        """Register ``strategy(error, context) -> bool`` for ``error_type``.

        A later registration for the same code replaces the earlier one.
        """
        self.recovery_strategies[error_type] = strategy

    def handle_error(self, error: Exception, context: Dict[str, Any] = None) -> Dict[str, Any]:
        """Process ``error`` and return a result dict describing the outcome.

        Args:
            error: The exception that was caught.
            context: Optional caller-supplied data, logged and passed to the
                recovery strategy.

        Returns:
            A dict with the keys ``success``, ``error_type``, ``message``,
            ``user_message``, ``recovery_attempted``, ``suggestions`` and
            ``timestamp``. When the circuit breaker is tripped the
            ``error_type`` is ``"CIRCUIT_BREAKER"`` and no recovery is tried.
        """
        # Errors that are already typed keep their own error_code, so the
        # strategy registered for that code can actually be found.
        if isinstance(error, AIAssistantException):
            classified_error = error
        else:
            classified_error = ExceptionManager.classify_exception(error)

        self._log_error(classified_error, context)
        self._update_error_stats(classified_error.error_code)

        if self._should_circuit_break(classified_error.error_code):
            return self._create_circuit_breaker_response()

        recovery_result = self._attempt_recovery(classified_error, context)

        return {
            'success': recovery_result['success'],
            'error_type': classified_error.error_code,
            'message': recovery_result['message'],
            'user_message': self._generate_user_friendly_message(classified_error),
            'recovery_attempted': recovery_result['recovery_attempted'],
            'suggestions': self._get_error_suggestions(classified_error),
            'timestamp': datetime.now().isoformat()
        }

    def _log_error(self, error: "AIAssistantException", context: Dict = None):
        """Emit one ERROR record carrying the error's details as extras."""
        log_data = {
            'error_code': error.error_code,
            # Must not be called 'message': logging reserves that key and
            # raises KeyError when it appears in `extra`.
            'error_message': error.message,
            'timestamp': error.timestamp.isoformat(),
            'traceback': traceback.format_exc(),
            'context': context or {}
        }

        self.logger.error(f"錯誤發生: {error.error_code}", extra=log_data)

    def _update_error_stats(self, error_code: str):
        """Record an occurrence of ``error_code`` and drop expired ones."""
        now = datetime.now()
        cutoff_time = now - self.time_window

        occurrences = self.error_stats.setdefault(error_code, [])
        occurrences.append(now)
        self.error_stats[error_code] = [
            seen_at for seen_at in occurrences if seen_at > cutoff_time
        ]

    def _should_circuit_break(self, error_code: str) -> bool:
        """Return True once ``error_code`` reached ``error_threshold``."""
        recent_errors = self.error_stats.get(error_code)
        if recent_errors is None:
            return False
        return len(recent_errors) >= self.error_threshold

    def _create_circuit_breaker_response(self) -> Dict[str, Any]:
        """Build the result dict returned while the breaker is tripped."""
        return {
            'success': False,
            'error_type': 'CIRCUIT_BREAKER',
            'message': '系統檢測到頻繁錯誤，暫時啟用保護模式',
            'user_message': '⚠️ 系統正在恢復中，請稍後再試',
            'recovery_attempted': False,
            'suggestions': ['等待幾分鐘後重試', '檢查網路連線', '聯繫技術支援'],
            # Same key set as the normal handle_error() result, so callers
            # can read 'timestamp' unconditionally.
            'timestamp': datetime.now().isoformat()
        }

    def _attempt_recovery(self, error: "AIAssistantException", context: Dict = None) -> Dict:
        """Run the recovery strategy registered for ``error.error_code``.

        Returns:
            A dict with ``success``, ``recovery_attempted`` and ``message``.
            ``recovery_attempted`` is False only when no strategy exists.
        """
        strategy = self.recovery_strategies.get(error.error_code)
        if strategy is None:
            return {
                'success': False,
                'recovery_attempted': False,
                'message': '無可用的恢復策略'
            }

        try:
            recovered = bool(strategy(error, context))
        except Exception as recovery_error:
            # A crashing strategy is a failed attempt, not a missing one.
            self.logger.warning(f"恢復策略執行失敗: {recovery_error}")
            return {
                'success': False,
                'recovery_attempted': True,
                'message': f"恢復策略執行失敗: {recovery_error}"
            }

        return {
            'success': recovered,
            'recovery_attempted': True,
            'message': '嘗試自動恢復' + ('成功' if recovered else '失敗')
        }

    def _generate_user_friendly_message(self, error: "AIAssistantException") -> str:
        """Return the end-user message for ``error.error_code``."""
        error_messages = {
            'API_ERROR': '🌐 網路服務暫時無法使用，請稍後再試',
            'VALIDATION_ERROR': '📝 輸入格式有誤，請檢查您的輸入',
            'RESOURCE_ERROR': '📁 檔案或資源存取失敗，請檢查路徑',
            'MODEL_ERROR': '🤖 AI 模型處理失敗，正在嘗試恢復',
            'UNKNOWN': '❓ 發生未知錯誤，請重新嘗試'
        }

        return error_messages.get(error.error_code, '❓ 系統發生錯誤，請稍後再試')

    def _get_error_suggestions(self, error: "AIAssistantException") -> list:
        """Return troubleshooting suggestions for ``error.error_code``."""
        suggestions_map = {
            'API_ERROR': [
                '檢查網路連線',
                '確認 API 金鑰有效',
                '等待幾分鐘後重試'
            ],
            'VALIDATION_ERROR': [
                '檢查輸入格式',
                '確認必填欄位已填寫',
                '參考輸入範例'
            ],
            'RESOURCE_ERROR': [
                '確認檔案路徑正確',
                '檢查檔案權限',
                '確保有足夠的磁碟空間'
            ],
            'MODEL_ERROR': [
                '簡化您的問題',
                '嘗試不同的表達方式',
                '稍後重新提問'
            ]
        }

        return suggestions_map.get(error.error_code, ['重新啟動程式', '聯繫技術支援'])

3. 重試裝飾器 (utils/retry_decorator.py)

import time
import random
from functools import wraps
from typing import Tuple, Callable, Type

def retry_with_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
    jitter: bool = True,
    exceptions: Tuple[Type[Exception], ...] = (Exception,)
):
    """Build a decorator that retries a function with exponential backoff.

    Args:
        max_retries: Retries after the first call; total calls are at most
            ``max_retries + 1``. Must be >= 0.
        base_delay: Delay in seconds before the first retry.
        max_delay: Upper bound for the un-jittered delay.
        backoff_factor: Multiplier applied to the delay per failed attempt.
        jitter: When True, scale each delay by a random factor in [0.5, 1.0).
        exceptions: Exception types that trigger a retry; anything else
            propagates immediately.

    Returns:
        A decorator. The wrapped function returns the first successful
        result, or re-raises the last exception once retries are exhausted.

    Raises:
        ValueError: If ``max_retries`` is negative. Previously the wrapped
            function was never called and ``raise None`` produced a
            confusing TypeError.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)

                except exceptions as e:
                    if attempt == max_retries:
                        # Out of retries: bare raise keeps the traceback.
                        raise

                    delay = min(base_delay * (backoff_factor ** attempt), max_delay)

                    # Jitter spreads out retries from concurrent callers.
                    if jitter:
                        delay = delay * (0.5 + random.random() * 0.5)

                    print(f"嘗試 {attempt + 1}/{max_retries + 1} 失敗: {e}")
                    print(f"等待 {delay:.2f} 秒後重試...")
                    time.sleep(delay)
            # Not reachable: the final iteration either returns or raises.

        return wrapper
    return decorator

class CircuitBreaker:
    """Circuit-breaker decorator with CLOSED / OPEN / HALF_OPEN states.

    Use an instance as a decorator. After ``failure_threshold`` consecutive
    failures the breaker opens and calls fail fast; once
    ``recovery_timeout`` seconds have passed since the last failure, one
    call is let through (HALF_OPEN) and its outcome closes or re-opens the
    breaker.

    Attributes:
        failure_threshold: Consecutive failures that open the breaker.
        recovery_timeout: Seconds to wait before probing again.
        failure_count: Failures since the last success.
        last_failure_time: ``time.time()`` of the latest failure, or None.
        state: One of ``'CLOSED'``, ``'OPEN'``, ``'HALF_OPEN'``.
    """

    def __init__(self, failure_threshold: int = 5, recovery_timeout: int = 60):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = 'CLOSED'

    def __call__(self, func: Callable):
        """Wrap ``func`` so every call goes through this breaker.

        The wrapper raises a plain ``Exception`` while the breaker is open,
        and re-raises whatever ``func`` raises otherwise.
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            if self.state == 'OPEN':
                if not self._should_attempt_reset():
                    raise Exception("熔斷器開啟中，服務暫時不可用")
                # Timeout elapsed: allow this call through as a probe.
                self.state = 'HALF_OPEN'

            try:
                result = func(*args, **kwargs)
            except Exception:
                self._on_failure()
                # Bare raise keeps the original traceback intact.
                raise

            self._on_success()
            return result

        return wrapper

    def _should_attempt_reset(self) -> bool:
        """Return True when ``recovery_timeout`` has passed since a failure.

        Always returns a real bool; with no recorded failure it is False.
        """
        if self.last_failure_time is None:
            return False
        return time.time() - self.last_failure_time >= self.recovery_timeout

    def _on_success(self):
        """Reset the failure counter and close the breaker."""
        self.failure_count = 0
        self.state = 'CLOSED'

    def _on_failure(self):
        """Count a failure and open the breaker at the threshold."""
        self.failure_count += 1
        self.last_failure_time = time.time()

        # A failed HALF_OPEN probe re-opens too: the counter is only reset
        # on success, so it is still at or above the threshold here.
        if self.failure_count >= self.failure_threshold:
            self.state = 'OPEN'

4. 強健工作流程 (workflows/robust_workflow.py)

import os
import time
from typing import TypedDict, Dict, Any

import google.generativeai as genai
from langgraph.graph import StateGraph, END

from core.error_handler import ErrorHandler
from core.exception_manager import *
from utils.retry_decorator import retry_with_backoff

# Configure the Gemini client once, at import time. os.getenv returns None
# when GEMINI_API_KEY is unset.
# NOTE(review): confirm how genai.configure treats a None key.
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

# State dictionary that flows through the nodes of the robust workflow.
RobustWorkflowState = TypedDict(
    "RobustWorkflowState",
    {
        "user_input": str,             # text entered by the user
        "processing_attempts": int,    # number of "process" runs so far
        "error_history": list,         # one entry per failed attempt
        "fallback_used": bool,         # True once the fallback node ran
        "final_response": str,         # text shown to the user
        "success": bool,               # outcome of the latest node
        "error_info": Dict[str, Any],  # latest error-handler result
    },
)

# Module-level error handler shared by every workflow node in this file.
# Created without a logger argument, so it uses ErrorHandler's default.
error_handler = ErrorHandler()

# Recovery strategies; they are registered on error_handler further down.
def api_recovery_strategy(error: APIException, context: Dict = None) -> bool:
    """Recovery hook for API errors: wait briefly, then report success.

    Placeholder implementation. Neither ``error`` nor ``context`` is
    inspected; a real strategy could switch to a backup endpoint or a
    degraded service here.

    Returns:
        Always ``True``.
    """
    print("🔄 嘗試 API 恢復策略...")
    pause_seconds = 1  # simple fixed wait
    time.sleep(pause_seconds)
    return True

def model_recovery_strategy(error: ModelException, context: Dict = None) -> bool:
    """Recovery hook for model errors: announce the attempt and give up.

    Placeholder implementation. Neither ``error`` nor ``context`` is
    inspected; a real strategy could retry with a different prompt or
    different model parameters.

    Returns:
        Always ``False``.
    """
    recovered = False
    print("🤖 嘗試模型恢復策略...")
    return recovered

# Bind each strategy to the error code it handles. The keys must match the
# error_code values set by APIException / ModelException.
error_handler.register_recovery_strategy('API_ERROR', api_recovery_strategy)
error_handler.register_recovery_strategy('MODEL_ERROR', model_recovery_strategy)

@retry_with_backoff(max_retries=2, base_delay=1.0)
def safe_gemini_call(prompt: str) -> str:
    """Send ``prompt`` to Gemini and return the response text.

    The call is retried by ``retry_with_backoff`` (up to 2 retries).

    Args:
        prompt: Text forwarded to the ``gemini-pro`` model.

    Returns:
        The non-empty text of the model response.

    Raises:
        APIException: When the underlying error message mentions "quota"
            (status 429) or "key" (status 401).
        ModelException: When the response text is empty, or for any other
            failure raised while calling the model.
    """
    try:
        model = genai.GenerativeModel('gemini-pro')
        response = model.generate_content(prompt)

        if not response.text:
            raise ModelException("模型回應為空", "gemini-pro", prompt)

        return response.text

    except AIAssistantException:
        # Already classified (e.g. the empty-response error above). Re-raise
        # untouched instead of wrapping it in a second ModelException.
        raise

    except Exception as e:
        # Classification is a keyword match on the error text; `from e`
        # keeps the original failure attached as the cause.
        error_text = str(e).lower()
        if "quota" in error_text:
            raise APIException("API 配額不足", "Gemini", 429) from e
        elif "key" in error_text:
            raise APIException("API 金鑰無效", "Gemini", 401) from e
        else:
            raise ModelException(f"模型處理失敗: {str(e)}", "gemini-pro", prompt) from e

def process_with_error_handling(state: RobustWorkflowState) -> RobustWorkflowState:
    """Main workflow node: validate the input, call the model, handle errors.

    Args:
        state: Current workflow state; read-only here. The returned dict is
            a new object and ``state["error_history"]`` is not mutated.

    Returns:
        On success, the state with ``final_response`` set, ``success`` True
        and ``processing_attempts`` incremented. On failure, the state with
        ``success`` False, ``processing_attempts`` incremented,
        ``error_info`` set to the error-handler result and one new entry
        appended to a copy of ``error_history``.
    """
    # Local import: this module does not import datetime at the top.
    from datetime import datetime

    user_input = state["user_input"]
    attempt_number = state["processing_attempts"] + 1

    try:
        # Empty and whitespace-only input are both rejected.
        if not user_input or not user_input.strip():
            raise ValidationException("使用者輸入為空")

        if len(user_input) > 5000:
            raise ValidationException("輸入過長", "user_input", len(user_input))

        response = safe_gemini_call(user_input)

    except Exception as e:
        error_result = error_handler.handle_error(e, {
            "user_input": user_input,
            "attempt": attempt_number
        })

        history_entry = {
            "attempt": attempt_number,
            "error_type": error_result["error_type"],
            "message": error_result["message"],
            # The circuit-breaker result may not carry a timestamp; fall
            # back to "now" instead of failing with KeyError.
            "timestamp": error_result.get("timestamp") or datetime.now().isoformat()
        }

        return {
            **state,
            "processing_attempts": attempt_number,
            # Build a new list so the caller's state is left untouched.
            "error_history": [*state["error_history"], history_entry],
            "error_info": error_result,
            "success": False
        }

    return {
        **state,
        "final_response": response,
        "success": True,
        "processing_attempts": attempt_number
    }

def fallback_processing(state: "RobustWorkflowState") -> "RobustWorkflowState":
    """Fallback node: answer with a canned message chosen by error count.

    The message index is ``len(error_history) - 1`` clamped to the valid
    range, so an empty history yields the first message instead of wrapping
    around to the last one through index ``-1``.

    Returns:
        The state with ``final_response`` set to the chosen message,
        ``fallback_used`` True and ``success`` True.
    """
    fallback_responses = [
        "抱歉，我現在遇到一些技術問題，無法完全處理您的請求。",
        "系統正在維護中，請稍後再試，或者您可以：\n• 重新表達您的問題\n• 檢查網路連線\n• 稍後重試",
        "感謝您的耐心，我正在努力恢復正常服務。"
    ]

    attempt_count = len(state["error_history"])
    last_index = len(fallback_responses) - 1
    fallback_index = max(0, min(attempt_count - 1, last_index))

    return {
        **state,
        "final_response": fallback_responses[fallback_index],
        "fallback_used": True,
        "success": True  # the fallback answer counts as a successful reply
    }

def should_retry(state: "RobustWorkflowState") -> str:
    """Route after the process node: ``"complete"``, ``"retry"`` or ``"fallback"``.

    Returns:
        ``"complete"`` when the last attempt succeeded; ``"fallback"`` when
        the attempt budget (3) is used up or the latest error type cannot
        be fixed by retrying; ``"retry"`` otherwise.
    """
    max_attempts = 3
    # Retrying cannot help here: the breaker stays tripped, and the same
    # input fails validation the same way every time.
    non_retryable = ("CIRCUIT_BREAKER", "VALIDATION_ERROR")

    if state["success"]:
        return "complete"

    if state["processing_attempts"] >= max_attempts:
        return "fallback"

    if state.get("error_info", {}).get("error_type") in non_retryable:
        return "fallback"

    return "retry"

def create_robust_workflow():
    """Assemble and compile the two-node robust workflow graph.

    The graph starts at "process". ``should_retry`` then routes to END, back
    to "process", or to "fallback", which always leads to END.

    Returns:
        The compiled graph produced by ``StateGraph.compile()``.
    """
    graph = StateGraph(RobustWorkflowState)

    # Nodes
    node_table = {
        "process": process_with_error_handling,
        "fallback": fallback_processing,
    }
    for node_name, node_fn in node_table.items():
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("process")

    # Routing: should_retry's return value is looked up in this table.
    routes = {
        "complete": END,
        "retry": "process",
        "fallback": "fallback",
    }
    graph.add_conditional_edges("process", should_retry, routes)
    graph.add_edge("fallback", END)

    return graph.compile()

5. 主程式 (main.py)

from workflows.robust_workflow import create_robust_workflow
import json

def main():
    """Run the interactive console loop of the robust AI assistant.

    Reads questions from stdin until the user types ``quit``, ``exit`` or
    ``退出``, presses Ctrl+C, or stdin reaches end-of-file. Each question is
    run through the compiled workflow and the answer is printed together
    with retry and fallback information when errors occurred.
    """
    print("🛡️ 強健 AI 助理系統")
    print("🔧 具備完整的錯誤處理與恢復機制")
    print("=" * 50)

    app = create_robust_workflow()

    while True:
        try:
            user_input = input("\n💬 請輸入您的問題：").strip()

            if user_input.lower() in ['quit', 'exit', '退出']:
                print("👋 再見！")
                break

            # Fresh state for every question.
            initial_state = {
                "user_input": user_input,
                "processing_attempts": 0,
                "error_history": [],
                "fallback_used": False,
                "final_response": "",
                "success": False,
                "error_info": {}
            }

            print("🔍 處理中...")
            result = app.invoke(initial_state)

            print(f"\n🤖 助理：{result['final_response']}")

            # Attempt details are shown only when something went wrong.
            if result.get('error_history'):
                print(f"⚠️ 處理嘗試：{result['processing_attempts']} 次")
                if result.get('fallback_used'):
                    print("🔄 使用了備用處理方案")

            print("-" * 40)

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop as well: once stdin is closed,
            # input() raises it on every call, and the generic handler
            # below would otherwise spin forever.
            print("\n👋 再見！")
            break
        except Exception as e:
            print(f"❌ 系統級錯誤：{e}")
            print("請重新啟動程式")

# Start the console loop only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()