在 AI 助理開發中,穩定性和可靠性至關重要。今天我們要建立一個完整的錯誤處理與異常管理系統,確保我們的助理在面對各種意外狀況時仍能優雅地運行。
AI 助理在實際運行中會遇到各種挑戰,例如外部 API 呼叫失敗、輸入資料格式錯誤、檔案或資源無法存取,以及模型回應異常。
良好的錯誤處理不僅能提升使用者體驗,更能幫助開發者快速定位和解決問題。
robust_assistant/
├── main.py # 主程式
├── core/
│ ├── __init__.py
│ ├── error_handler.py # 錯誤處理核心
│ ├── exception_manager.py # 異常管理器
│ └── recovery_system.py # 恢復系統
├── monitoring/
│ ├── __init__.py
│ ├── health_checker.py # 健康檢查
│ ├── performance_monitor.py # 效能監控
│ └── alert_system.py # 警報系統
├── utils/
│ ├── __init__.py
│ ├── logger.py # 日誌系統
│ └── retry_decorator.py # 重試裝飾器
└── workflows/
    ├── __init__.py
    └── robust_workflow.py # 強健工作流程
class AIAssistantException(Exception):
    """Root of the assistant's exception hierarchy.

    Attributes:
        message: Human-readable description (also passed to ``Exception``).
        error_code: Short machine-readable category, or ``None``.
        details: Extra structured data; always a dict (empty when omitted).
        timestamp: ``datetime.now()`` captured when the exception is created.
    """

    def __init__(self, message: str, error_code: str = None, details: dict = None):
        super().__init__(message)
        # Record the creation time first; it does not depend on the arguments.
        self.timestamp = datetime.now()
        self.message = message
        self.error_code = error_code
        # A falsy ``details`` (None or an empty dict) is replaced by a new dict.
        self.details = details if details else {}
class APIException(AIAssistantException):
    """Failure while talking to an external API or service.

    Always carries the error code ``"API_ERROR"``.

    Attributes:
        service_name: Name of the service that failed.
        status_code: Status code associated with the failure, or ``None``.
    """

    def __init__(self, message: str, service_name: str, status_code: int = None):
        super().__init__(message, error_code="API_ERROR")
        self.service_name, self.status_code = service_name, status_code
class ValidationException(AIAssistantException):
    """Input or data failed validation.

    Always carries the error code ``"VALIDATION_ERROR"``.

    Attributes:
        field_name: Name of the offending field, or ``None``.
        invalid_value: The rejected value (or a summary of it), or ``None``.
    """

    def __init__(self, message: str, field_name: str = None, invalid_value=None):
        super().__init__(message, error_code="VALIDATION_ERROR")
        self.field_name, self.invalid_value = field_name, invalid_value
class ResourceException(AIAssistantException):
    """A resource (file, permission, memory, ...) could not be used.

    Always carries the error code ``"RESOURCE_ERROR"``.

    Attributes:
        resource_type: Kind of resource involved, e.g. ``"FILE"``.
        resource_path: Location of the resource, or ``None``.
    """

    def __init__(self, message: str, resource_type: str, resource_path: str = None):
        super().__init__(message, error_code="RESOURCE_ERROR")
        self.resource_type, self.resource_path = resource_type, resource_path
class ModelException(AIAssistantException):
    """The AI model failed to process a request.

    Always carries the error code ``"MODEL_ERROR"``.

    Attributes:
        model_name: Name of the model involved, or ``None``.
        input_text: The text that was sent to the model, or ``None``.
    """

    def __init__(self, message: str, model_name: str = None, input_text: str = None):
        super().__init__(message, error_code="MODEL_ERROR")
        self.model_name, self.input_text = model_name, input_text
class ExceptionManager:
    """Maps arbitrary exceptions onto the assistant's own exception hierarchy."""

    @staticmethod
    def classify_exception(error: Exception) -> AIAssistantException:
        """Convert ``error`` into an ``AIAssistantException``.

        Args:
            error: Any exception instance.

        Returns:
            ``error`` itself when it already is an ``AIAssistantException``;
            otherwise a new instance of the best-matching subclass, or a plain
            ``AIAssistantException`` with code ``"UNKNOWN"``.
        """
        # Already classified: return it untouched so its error_code survives.
        # Re-wrapping here would turn every API_ERROR / MODEL_ERROR /
        # VALIDATION_ERROR into "UNKNOWN".
        if isinstance(error, AIAssistantException):
            return error

        # Imported locally so the method does not depend on module-level
        # imports; ``requests`` is optional and only checked when installed.
        import json
        try:
            import requests
        except ImportError:
            requests = None

        if requests is not None and isinstance(error, requests.RequestException):
            return APIException(f"網路請求失敗: {str(error)}", "HTTP_REQUEST")
        if isinstance(error, FileNotFoundError):
            # Prefer the actual path recorded on the OSError; str(error) is the
            # whole "[Errno 2] ..." message and is only used as a fallback.
            path = str(error.filename) if error.filename is not None else str(error)
            return ResourceException(f"檔案不存在: {str(error)}", "FILE", path)
        if isinstance(error, json.JSONDecodeError):
            return ValidationException(f"JSON 解析失敗: {str(error)}")
        if isinstance(error, PermissionError):
            return ResourceException(f"權限不足: {str(error)}", "PERMISSION")
        if isinstance(error, MemoryError):
            return ResourceException("記憶體不足", "MEMORY")
        return AIAssistantException(f"未知錯誤: {str(error)}", "UNKNOWN")
from typing import Dict, Any, Callable, Optional
from functools import wraps
import logging
import traceback
from datetime import datetime, timedelta
class ErrorHandler:
    """Central error handler: classify, log, count, and try to recover.

    Errors are counted per error code inside a sliding time window; once a
    code reaches ``error_threshold`` occurrences within ``time_window`` the
    handler answers with a circuit-breaker response instead of attempting
    recovery.
    """

    def __init__(self, logger: logging.Logger = None):
        """Create a handler.

        Args:
            logger: Logger to write to; defaults to this module's logger.
        """
        self.logger = logger or logging.getLogger(__name__)
        # error_code -> list of datetimes of occurrences inside the window
        self.error_stats = {}
        # error_code -> callable(error, context) returning a truthy value on success
        self.recovery_strategies = {}
        self.error_threshold = 5  # occurrences per window that trip the breaker
        self.time_window = timedelta(minutes=5)  # sliding window length

    def register_recovery_strategy(self, error_type: str, strategy: Callable):
        """Register ``strategy`` as the recovery callable for ``error_type``.

        A later registration for the same code replaces the earlier one.
        """
        self.recovery_strategies[error_type] = strategy

    def handle_error(self, error: Exception, context: Dict[str, Any] = None) -> Dict[str, Any]:
        """Process ``error`` and describe the outcome.

        Args:
            error: The exception to handle.
            context: Optional extra data that is logged and passed to the
                recovery strategy.

        Returns:
            A dict with the keys ``success``, ``error_type``, ``message``,
            ``user_message``, ``recovery_attempted``, ``suggestions`` and
            ``timestamp``.
        """
        classified_error = ExceptionManager.classify_exception(error)
        self._log_error(classified_error, context)
        self._update_error_stats(classified_error.error_code)

        # Too many recent errors of this kind: stop attempting recovery.
        if self._should_circuit_break(classified_error.error_code):
            return self._create_circuit_breaker_response()

        recovery_result = self._attempt_recovery(classified_error, context)
        return {
            'success': recovery_result['success'],
            'error_type': classified_error.error_code,
            'message': recovery_result['message'],
            'user_message': self._generate_user_friendly_message(classified_error),
            'recovery_attempted': recovery_result['recovery_attempted'],
            'suggestions': self._get_error_suggestions(classified_error),
            'timestamp': datetime.now().isoformat()
        }

    def _log_error(self, error: "AIAssistantException", context: Dict = None):
        """Log ``error`` at ERROR level with structured ``extra`` fields."""
        log_data = {
            'error_code': error.error_code,
            # "message" is a reserved LogRecord attribute; passing it through
            # ``extra`` makes logging raise KeyError, hence "error_message".
            'error_message': error.message,
            'timestamp': error.timestamp.isoformat(),
            # Traceback of the exception currently being handled, if any.
            'traceback': traceback.format_exc(),
            'context': context or {}
        }
        self.logger.error("錯誤發生: %s", error.error_code, extra=log_data)

    def _update_error_stats(self, error_code: str):
        """Record one occurrence of ``error_code`` and drop expired entries."""
        now = datetime.now()
        cutoff_time = now - self.time_window
        occurrences = self.error_stats.setdefault(error_code, [])
        occurrences.append(now)
        # Keep only occurrences that are still inside the sliding window.
        self.error_stats[error_code] = [ts for ts in occurrences if ts > cutoff_time]

    def _should_circuit_break(self, error_code: str) -> bool:
        """Return True when ``error_code`` reached the threshold in the window."""
        if error_code not in self.error_stats:
            return False
        return len(self.error_stats[error_code]) >= self.error_threshold

    def _create_circuit_breaker_response(self) -> Dict[str, Any]:
        """Build the response returned while the circuit breaker is tripped.

        Carries the same keys as a normal ``handle_error`` result, including
        ``timestamp``, so callers can treat both shapes uniformly.
        """
        return {
            'success': False,
            'error_type': 'CIRCUIT_BREAKER',
            'message': '系統檢測到頻繁錯誤,暫時啟用保護模式',
            'user_message': '⚠️ 系統正在恢復中,請稍後再試',
            'recovery_attempted': False,
            'suggestions': ['等待幾分鐘後重試', '檢查網路連線', '聯繫技術支援'],
            'timestamp': datetime.now().isoformat()
        }

    def _attempt_recovery(self, error: "AIAssistantException", context: Dict = None) -> Dict:
        """Run the recovery strategy registered for ``error.error_code``.

        Returns:
            A dict with ``success``, ``recovery_attempted`` and ``message``.
            A strategy that raises is logged and reported as an attempted but
            failed recovery.
        """
        error_code = error.error_code
        if error_code not in self.recovery_strategies:
            return {
                'success': False,
                'recovery_attempted': False,
                'message': '無可用的恢復策略'
            }

        strategy = self.recovery_strategies[error_code]
        try:
            recovered = bool(strategy(error, context))
        except Exception as recovery_error:
            # The strategy did run, so report a failed attempt rather than
            # claiming that no strategy exists.
            self.logger.warning("恢復策略執行失敗: %s", recovery_error)
            recovered = False
        return {
            'success': recovered,
            'recovery_attempted': True,
            'message': '嘗試自動恢復' + ('成功' if recovered else '失敗')
        }

    def _generate_user_friendly_message(self, error: "AIAssistantException") -> str:
        """Return the end-user message for ``error.error_code``."""
        error_messages = {
            'API_ERROR': '🌐 網路服務暫時無法使用,請稍後再試',
            'VALIDATION_ERROR': '📝 輸入格式有誤,請檢查您的輸入',
            'RESOURCE_ERROR': '📁 檔案或資源存取失敗,請檢查路徑',
            'MODEL_ERROR': '🤖 AI 模型處理失敗,正在嘗試恢復',
            'UNKNOWN': '❓ 發生未知錯誤,請重新嘗試'
        }
        return error_messages.get(error.error_code, '❓ 系統發生錯誤,請稍後再試')

    def _get_error_suggestions(self, error: "AIAssistantException") -> list:
        """Return a list of suggested next steps for ``error.error_code``."""
        suggestions_map = {
            'API_ERROR': [
                '檢查網路連線',
                '確認 API 金鑰有效',
                '等待幾分鐘後重試'
            ],
            'VALIDATION_ERROR': [
                '檢查輸入格式',
                '確認必填欄位已填寫',
                '參考輸入範例'
            ],
            'RESOURCE_ERROR': [
                '確認檔案路徑正確',
                '檢查檔案權限',
                '確保有足夠的磁碟空間'
            ],
            'MODEL_ERROR': [
                '簡化您的問題',
                '嘗試不同的表達方式',
                '稍後重新提問'
            ]
        }
        return suggestions_map.get(error.error_code, ['重新啟動程式', '聯繫技術支援'])
import time
import random
from functools import wraps
from typing import Tuple, Callable, Type
def retry_with_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
    jitter: bool = True,
    exceptions: Tuple[Type[Exception], ...] = (Exception,)
):
    """Decorator factory that retries a function with exponential backoff.

    Args:
        max_retries: Number of retries after the first attempt (>= 0), so the
            function is called at most ``max_retries + 1`` times.
        base_delay: Delay in seconds before the first retry.
        max_delay: Upper bound in seconds for any single delay.
        backoff_factor: Multiplier applied to the delay for each further retry.
        jitter: When true, scale each delay by a random factor in [0.5, 1.0).
        exceptions: Exception types that trigger a retry; anything else
            propagates immediately.

    Returns:
        A decorator that wraps the target function.

    Raises:
        ValueError: If ``max_retries`` is negative.
    """
    # A negative count would skip the loop entirely: the wrapped function
    # would never run and the wrapper would end up raising ``None``.
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    if attempt == max_retries:
                        # Out of retries: re-raise with the original traceback.
                        raise
                    # Exponential growth, capped at max_delay.
                    delay = min(base_delay * (backoff_factor ** attempt), max_delay)
                    if jitter:
                        # Spread retries out so concurrent callers do not
                        # all wake up at the same moment.
                        delay = delay * (0.5 + random.random() * 0.5)
                    print(f"嘗試 {attempt + 1}/{max_retries + 1} 失敗: {e}")
                    print(f"等待 {delay:.2f} 秒後重試...")
                    time.sleep(delay)
            # Not reached: the final iteration either returns or re-raises.
        return wrapper
    return decorator
class CircuitBreaker:
    """Circuit breaker usable as a decorator.

    States:
        CLOSED: calls pass through; failures are counted.
        OPEN: calls are rejected until ``recovery_timeout`` seconds have
            elapsed since the last failure.
        HALF_OPEN: one trial call is allowed; success closes the breaker.
    """

    def __init__(self, failure_threshold: int = 5, recovery_timeout: int = 60):
        self.state = 'CLOSED'  # CLOSED, OPEN, HALF_OPEN
        self.failure_count = 0
        self.last_failure_time = None
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout

    def __call__(self, func: Callable):
        """Wrap ``func`` so every call goes through the breaker."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            if self.state == 'OPEN':
                # Still cooling down: refuse without calling the function.
                if not self._should_attempt_reset():
                    raise Exception("熔斷器開啟中,服務暫時不可用")
                self.state = 'HALF_OPEN'
            try:
                outcome = func(*args, **kwargs)
                self._on_success()
                return outcome
            except Exception:
                self._on_failure()
                raise
        return wrapper

    def _should_attempt_reset(self) -> bool:
        """Tell whether the recovery timeout has elapsed since the last failure."""
        last_failure = self.last_failure_time
        if not last_failure:
            # No failure recorded: hand back the falsy value unchanged.
            return last_failure
        elapsed = time.time() - last_failure
        return elapsed >= self.recovery_timeout

    def _on_success(self):
        """Reset the breaker after a successful call."""
        self.state = 'CLOSED'
        self.failure_count = 0

    def _on_failure(self):
        """Count a failure and open the breaker once the threshold is reached."""
        self.last_failure_time = time.time()
        self.failure_count += 1
        if self.failure_count >= self.failure_threshold:
            self.state = 'OPEN'
from langgraph.graph import StateGraph, END
from typing import TypedDict, Dict, Any
from core.error_handler import ErrorHandler
from core.exception_manager import *
import google.generativeai as genai
import os
# Configure the Gemini client at import time from the GEMINI_API_KEY
# environment variable (os.getenv yields None when the variable is unset).
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
class RobustWorkflowState(TypedDict):
    """State dict passed between the nodes of the robust workflow."""

    # Raw text entered by the user
    user_input: str
    # Number of times the "process" node has run for this input
    processing_attempts: int
    # One dict per failed attempt (attempt, error_type, message, timestamp)
    error_history: list
    # Set to True by the fallback node
    fallback_used: bool
    # Text shown to the user
    final_response: str
    # Whether processing (or the fallback) produced a response
    success: bool
    # Result dict of the most recent ErrorHandler.handle_error call
    error_info: Dict[str, Any]
# Module-level error handler instance used by the workflow code in this module
error_handler = ErrorHandler()
# Recovery strategies
def api_recovery_strategy(error: APIException, context: Dict = None) -> bool:
    """Recovery strategy for API errors.

    Placeholder implementation: announces the attempt, waits one second and
    reports success. A fallback endpoint or degraded service could be tried
    here instead.
    """
    print("🔄 嘗試 API 恢復策略...")
    pause_seconds = 1  # simple wait-and-continue strategy
    time.sleep(pause_seconds)
    return True
def model_recovery_strategy(error: ModelException, context: Dict = None) -> bool:
    """Recovery strategy for model errors.

    Placeholder implementation: announces the attempt and reports failure.
    A different prompt or different model parameters could be tried here.
    """
    recovered = False
    print("🤖 嘗試模型恢復策略...")
    return recovered
# Register each recovery strategy under the error code it handles
error_handler.register_recovery_strategy('API_ERROR', api_recovery_strategy)
error_handler.register_recovery_strategy('MODEL_ERROR', model_recovery_strategy)
@retry_with_backoff(max_retries=2, base_delay=1.0)
def safe_gemini_call(prompt: str) -> str:
    """Call Gemini and translate failures into the assistant's exceptions.

    Args:
        prompt: Text sent to the ``gemini-pro`` model.

    Returns:
        The non-empty text of the model response.

    Raises:
        APIException: When the underlying error message mentions "quota"
            (status 429) or "key" (status 401).
        ModelException: When the response text is empty or any other error
            occurs.
    """
    try:
        model = genai.GenerativeModel('gemini-pro')
        response = model.generate_content(prompt)
        if not response.text:
            raise ModelException("模型回應為空", "gemini-pro", prompt)
        return response.text
    except AIAssistantException:
        # Already one of our own exceptions (e.g. the empty-response case
        # above): let it through instead of wrapping it a second time.
        raise
    except Exception as e:
        # NOTE(review): classification relies on substrings of the error
        # message, which is fragile; confirm against the SDK's error types.
        detail = str(e).lower()
        if "quota" in detail:
            raise APIException("API 配額不足", "Gemini", 429) from e
        if "key" in detail:
            raise APIException("API 金鑰無效", "Gemini", 401) from e
        raise ModelException(f"模型處理失敗: {str(e)}", "gemini-pro", prompt) from e
def process_with_error_handling(state: RobustWorkflowState) -> RobustWorkflowState:
    """Main processing node: validate the input, call the model, handle errors.

    Args:
        state: Current workflow state; ``user_input``, ``processing_attempts``
            and ``error_history`` are read.

    Returns:
        A new state dict. On success ``final_response`` is set and ``success``
        is True. On failure ``success`` is False, ``error_info`` holds the
        handler result and ``error_history`` gains one entry. The attempt
        counter is incremented in both cases.
    """
    # Local import keeps this node independent of module-level imports.
    from datetime import datetime

    user_input = state["user_input"]
    attempts = state["processing_attempts"]
    try:
        # Validate the input before spending an API call on it.
        if not user_input or not user_input.strip():
            raise ValidationException("使用者輸入為空")
        if len(user_input) > 5000:
            raise ValidationException("輸入過長", "user_input", len(user_input))
        response = safe_gemini_call(user_input)
    except Exception as e:
        error_result = error_handler.handle_error(e, {
            "user_input": user_input,
            "attempt": attempts + 1
        })
        history_entry = {
            "attempt": attempts + 1,
            "error_type": error_result["error_type"],
            "message": error_result["message"],
            # Not every handler response is guaranteed to carry a timestamp
            # (e.g. the circuit-breaker one), so fall back to "now".
            "timestamp": error_result.get("timestamp") or datetime.now().isoformat()
        }
        return {
            **state,
            "processing_attempts": attempts + 1,
            # Build a new list instead of mutating the caller's state in place.
            "error_history": [*state["error_history"], history_entry],
            "error_info": error_result,
            "success": False
        }
    return {
        **state,
        "final_response": response,
        "success": True,
        "processing_attempts": attempts + 1
    }
def fallback_processing(state: "RobustWorkflowState") -> "RobustWorkflowState":
    """Fallback node: answer with a canned message when processing failed.

    The message is chosen by the number of recorded errors: the first message
    for zero or one error, the second for two, the last for three or more.

    Args:
        state: Current workflow state; only ``error_history`` is read.

    Returns:
        A new state dict with ``final_response`` set, ``fallback_used`` True
        and ``success`` True (a fallback answer counts as success).
    """
    fallback_responses = [
        "抱歉,我現在遇到一些技術問題,無法完全處理您的請求。",
        "系統正在維護中,請稍後再試,或者您可以:\n• 重新表達您的問題\n• 檢查網路連線\n• 稍後重試",
        "感謝您的耐心,我正在努力恢復正常服務。"
    ]
    attempt_count = len(state["error_history"])
    # Clamp to a valid index on both sides: an empty history would otherwise
    # give -1 and silently select the last message.
    fallback_index = max(0, min(attempt_count - 1, len(fallback_responses) - 1))
    return {
        **state,
        "final_response": fallback_responses[fallback_index],
        "fallback_used": True,
        "success": True
    }
def should_retry(state: "RobustWorkflowState") -> str:
    """Route after the process node: finish, retry, or fall back.

    Args:
        state: Current workflow state; ``success``, ``processing_attempts``
            and ``error_info`` are read.

    Returns:
        ``"complete"`` when processing succeeded, ``"retry"`` when another
        attempt is worthwhile, otherwise ``"fallback"``.
    """
    max_attempts = 3
    # Retrying cannot help here: the breaker is tripped, or the same input
    # would fail validation again in exactly the same way.
    non_retryable = {"CIRCUIT_BREAKER", "VALIDATION_ERROR"}

    if state["success"]:
        return "complete"
    if state["processing_attempts"] >= max_attempts:
        return "fallback"
    # ``error_info`` may be missing or None; treat both as "no error type".
    error_type = (state.get("error_info") or {}).get("error_type")
    if error_type in non_retryable:
        return "fallback"
    return "retry"
def create_robust_workflow():
    """Assemble and compile the robust workflow graph.

    Graph shape: ``process`` is the entry point; ``should_retry`` routes it to
    END, back to ``process``, or on to ``fallback``, which always ends the run.
    """
    graph = StateGraph(RobustWorkflowState)

    # Nodes
    node_table = (
        ("process", process_with_error_handling),
        ("fallback", fallback_processing),
    )
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)

    # Entry point and routing
    graph.set_entry_point("process")
    routes = {
        "complete": END,
        "retry": "process",
        "fallback": "fallback"
    }
    graph.add_conditional_edges("process", should_retry, routes)
    graph.add_edge("fallback", END)

    return graph.compile()
from workflows.robust_workflow import create_robust_workflow
import json
def main():
    """Interactive entry point: read questions and run them through the workflow.

    The loop ends when the user types ``quit``, ``exit`` or ``退出``, presses
    Ctrl-C, or when standard input is closed (EOF). Any other exception is
    reported and the loop continues.
    """
    print("🛡️ 強健 AI 助理系統")
    print("🔧 具備完整的錯誤處理與恢復機制")
    print("=" * 50)
    app = create_robust_workflow()
    while True:
        try:
            user_input = input("\n💬 請輸入您的問題:").strip()
            if user_input.lower() in ['quit', 'exit', '退出']:
                print("👋 再見!")
                break
            # Fresh state for every question
            initial_state = {
                "user_input": user_input,
                "processing_attempts": 0,
                "error_history": [],
                "fallback_used": False,
                "final_response": "",
                "success": False,
                "error_info": {}
            }
            print("🔍 處理中...")
            result = app.invoke(initial_state)
            print(f"\n🤖 助理:{result['final_response']}")
            # Extra diagnostics only when something went wrong along the way
            if result.get('error_history'):
                print(f"⚠️ 處理嘗試:{result['processing_attempts']} 次")
            if result.get('fallback_used'):
                print("🔄 使用了備用處理方案")
            print("-" * 40)
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed (Ctrl-D or piped input). Without this
            # the generic handler below would catch it on every iteration and
            # the loop would never end.
            print("\n👋 再見!")
            break
        except Exception as e:
            print(f"❌ 系統級錯誤:{e}")
            print("請重新啟動程式")
if __name__ == "__main__":
main()
✅ 完整異常分類:自訂異常類別涵蓋各種錯誤情況
✅ 智能重試機制:指數退避、抖動、熔斷器模式
✅ 優雅降級:備用處理方案確保服務可用性
✅ 詳細錯誤日誌:完整的錯誤追蹤和統計
✅ 使用者友善:將技術錯誤轉換為易懂的訊息
今天我們建立了一個強健的錯誤處理系統,讓 AI 助理在面對各種異常狀況時仍能穩定運行。明天我們將進入本週的總結,學習如何打造一個智能客服系統,整合本週學到的所有功能!