目標先講清楚:
分析 OpenAI SDK、LangGraph 和 Claude 如何處理錯誤(error)。
在 Google 的《Agentic Design Patterns》中提到的錯誤處理策略包含:
策略 | OpenAI SDK | LangGraph/LangSmith | Claude |
---|---|---|---|
error logging | 提供 enable_verbose_stdout_logging() 及 tracing | 主要提供 LangSmith 進行 trace | na |
retries | 有使用類似 retry 的方式,並建議在 function 中處理 | 有使用類似 retry 的方式,並建議在 function 中處理 | 使用max_retries進行設定 |
fallbacks | na | na | na |
graceful degradation | 會把錯誤轉換成模型可讀的訊息 | 會把錯誤轉換成模型可讀的訊息 | na |
notifications | 提供常見的 exception,提供給開發人員錯誤的類型及狀況 | 提供常見的 exception,提供給開發人員錯誤的類型及狀況 | 提供 stop_reason,並有一些範例(可用 prompt 讓 LLM continue) |
在以下整理各個框架相對應的範例。
# Turn on verbose SDK logging to stdout.
from agents import enable_verbose_stdout_logging

enable_verbose_stdout_logging()

# Example: wrap a run in a named trace span (viewable in the dashboard).
from agents import Agent, Runner
from agents.tracing.create import trace

bot = Agent(name="OpsBot", instructions="Trace me", tools=[])
trace_metadata = {"user_id": "acme-123"}

with trace("ops_workflow", metadata=trace_metadata):
    result = Runner.run_sync(bot, "hello")
    print(result.final_output)
import random, time
from agents import function_tool
# Retries a flaky fetch with exponential backoff plus random jitter.
# Once the attempts are used up, the last error is re-raised unchanged.
@function_tool()
def fetch_json(url: str, max_retries: int = 3):
    attempts_left = max_retries
    delay = 1.0
    while attempts_left > 0:
        attempts_left -= 1
        try:
            # e.g., requests.get(url, timeout=5)
            raise ConnectionError("transient")
        except (TimeoutError, ConnectionError):
            if attempts_left == 0:
                # Final attempt failed: let the original exception propagate.
                raise
            # Random jitter keeps concurrent callers from retrying in lockstep.
            time.sleep(delay + random.random())
            delay *= 2
import time
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START
class S(TypedDict):
data: str | None
error: str | None
def fetch_node(state: S) -> S:
backoff = 1
for i in range(3):
try:
# 呼叫外部 API...
raise TimeoutError("transient")
except TimeoutError as e:
if i == 2:
return {"data": None, "error": f"retry_exhausted:{e}"}
time.sleep(backoff); backoff *= 2
return {"data": "ok", "error": None}
# Wire the single "fetch" node directly after START and compile the graph.
builder = StateGraph(S)
builder.add_node("fetch", fetch_node)
builder.add_edge(START, "fetch")
graph = builder.compile()

# Run once from an empty state and show the state the graph ends with.
initial_state = {"data": None, "error": None}
final_state = graph.invoke(initial_state)
print(final_state)
# Retry pattern for the `pause_turn` stop reason.
def handle_paused_conversation(initial_response, max_retries=3, max_tokens=1024):
    """Resume a Claude turn that stopped with ``pause_turn``.

    While the latest response reports ``stop_reason == "pause_turn"``, the
    assistant content received so far is appended to the message history and
    the request is sent again, at most ``max_retries`` times.

    Relies on ``client``, ``original_query`` and ``original_tools`` being
    defined in the enclosing scope.

    Args:
        initial_response: The response that may have been paused.
        max_retries: Maximum number of continuation requests to send.
        max_tokens: Output-token cap passed to every continuation request.

    Returns:
        The first response whose stop reason is not ``pause_turn``, or the
        last response received once ``max_retries`` requests have been sent.
    """
    response = initial_response
    messages = [{"role": "user", "content": original_query}]
    for _ in range(max_retries):
        if response.stop_reason != "pause_turn":
            break
        # Feed the partial assistant turn back so the model continues from it.
        messages.append({"role": "assistant", "content": response.content})
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=max_tokens,  # required by the Messages API
            messages=messages,
            tools=original_tools,
        )
    return response
@function_tool(failure_error_function=...):自訂工具出錯時回給模型的錯誤訊息;若設為 None,則不回訊息而是丟出 Exception 交由你外層攔截。
from agents import function_tool
def tool_err(ctx, err: Exception) -> str:
    """Format a tool failure as a short message for the model.

    Args:
        ctx: Run context supplied by the caller; not used here.
        err: The exception raised by the tool.

    Returns:
        ``"tool_failed: <ExceptionClass>: <message>"`` with the message cut
        to its first 160 characters.
    """
    kind = type(err).__name__
    detail = str(err)[:160]
    return f"tool_failed: {kind}: {detail}"
# Example tool that always fails; `tool_err` is registered as its
# failure_error_function, so the error is formatted by that callback.
@function_tool(failure_error_function=tool_err, strict_mode=True)
def quote_flight(city: str) -> dict:
    raise TimeoutError("upstream timeout")  # → converted to text for the model, which can then choose an alternative strategy
若在 LangGraph 中用 LangChain Tool,工具可以丟 ToolException,然後依 handle_tool_error 設定,把錯誤當作觀測值回給 agent,而不是整個流程炸掉。
from langchain_core.tools import StructuredTool, ToolException


def _fetch_hotel(city: str) -> str:
    """Look up hotel availability for a city."""
    # Simulated upstream failure; ToolException marks it as a tool-level error.
    raise ToolException(f"Upstream error when querying {city}")


# The `@tool` decorator has no `handle_tool_error` keyword, so the tool is
# built explicitly. With handle_tool_error=True the ToolException text is
# handed back to the agent as the tool's observation instead of aborting the
# whole graph, so the agent can degrade gracefully or pick another path.
fetch_hotel = StructuredTool.from_function(
    func=_fetch_hotel,
    name="fetch_hotel",
    handle_tool_error=True,
)
可直接依例外型別(如 ModelBehaviorError、UserError、MaxTurnsExceeded)決定告警嚴重度與通道。
# Stand-in notifier (swap in your Slack / Email / PagerDuty SDK).
def notify(channel: str, title: str, payload: dict):
    """Print one alert line in the form ``[channel] title: payload``.

    Args:
        channel: Name of the destination channel.
        title: Short alert title.
        payload: Extra details, printed using the dict's own representation.
    """
    message = f"[{channel}] {title}: {payload}"
    print(message)
from agents import Agent, Runner
from agents.exceptions import ModelBehaviorError, UserError, MaxTurnsExceeded
bot = Agent(name="OpsBot", instructions="...", tools=[])

try:
    result = Runner.run_sync(bot, "run my task")
    print(result.final_output)
except ModelBehaviorError as exc:
    # Model misbehaved -> page immediately (high priority).
    details = {"detail": str(exc)}
    notify("pager", "ModelBehaviorError", details)
except UserError as exc:
    # Configuration / schema problem -> Slack the developers (medium priority).
    details = {"detail": str(exc)}
    notify("slack", "UserError", details)
except MaxTurnsExceeded as exc:
    # Loop did not converge -> email report / scheduled follow-up (low to medium).
    details = {"detail": str(exc)}
    notify("email", "MaxTurnsExceeded", details)
LangGraph 會拋出一組框架專用例外,用來指示圖執行的典型問題——像是遞迴步數超限、節點回傳值不合法、並發更新衝突等。你可以在 graph.invoke() 外層用 try/except 分流處理。
from langgraph.graph import StateGraph, START
from typing_extensions import TypedDict
from langgraph.errors import GraphRecursionError # 代表迴圈過深
# 其他錯誤還有 INVALID_GRAPH_NODE_RETURN_VALUE / INVALID_CONCURRENT_GRAPH_UPDATE 等
class S(TypedDict):
    # Single counter that grows by one on every pass through the loop node.
    n: int


def loop_node(state: S):
    # Deliberately endless loop for demonstration (do not do this in production).
    return {"n": state["n"] + 1}


builder = StateGraph(S)
builder.add_node("loop", loop_node)
builder.add_edge(START, "loop")
# Route the node back to itself. Without this edge the graph runs the node
# once and stops, so the recursion limit would never be reached.
builder.add_edge("loop", "loop")
graph = builder.compile()

try:
    # The recursion limit is a run-time config key, not a compile() argument.
    graph.invoke({"n": 0}, config={"recursion_limit": 10})  # guard against endless recursion
except GraphRecursionError as e:
    # Comparable to OpenAI's ModelBehaviorError/MaxTurnsExceeded: a cue to
    # rework the design or raise the limit.
    print("Hit recursion limit:", e)
Claude 以 stop_reason(如 pause_turn、max_tokens、refusal)為主,可在應用層判斷是否通知或自動續寫/改寫;真正的 HTTP 失敗(如 429/500)再以例外處理並觸發告警。
try:
    # Placeholder call: fill in model, max_tokens, messages, ...
    response = client.messages.create(...)
    # A completed request can still need attention; inspect why it stopped.
    if response.stop_reason == "refusal":
        notify("slack", "Claude Refusal", {"hint": "consider rephrasing"})
    elif response.stop_reason == "max_tokens":
        notify("email", "Claude Truncated", {"hint": "auto-continue"})
except anthropic.APIStatusError as e:
    # Non-success HTTP response (e.g. 429 / 500): a status code is available.
    notify("pager", f"Claude API {e.status_code}", {"detail": str(e)})
except anthropic.APIError as e:
    # Other API failures (e.g. connection errors) carry no HTTP status code,
    # so reading e.status_code here would raise AttributeError.
    notify("pager", "Claude API error", {"detail": str(e)})
了解tool的設計方式後,接下來會處理agent memory的部分