Today we'll get the smallest runnable version (the MVP) working; over the next few days we'll add things like tool selection, memory, and scoring. To keep the project from turning into a mess, I'll sketch out the directory structure before starting the implementation.
The file layout is an important part of the whole agent, and file names should be clear and self-explanatory. The agent will keep growing, so if we don't organize things well now it will get messy later, and every future extension will turn into patchwork.
Directory structure:
project/
├─ agent/
│  └─ agent_single.py            # today's MVP entry point
├─ rag/
│  └─ connect.py                 # the three query steps: search/build_prompt/ask_ollama
├─ scripts/
│  └─ build_law_db.py            # used earlier to build the vector store
├─ data/
│  ├─ raw/
│  │  └─ 資通安全管理法.pdf       # source PDF
│  ├─ law_chunks_full.csv        # chunking output
│  └─ law_db/                    # ChromaDB vector store
└─ logs/                         # run logs
Next we'll walk through how to build this entry point step by step. If you want the code from the earlier posts, I've put it all at the very end; search for the appendix and you should land right on it.
agent/agent_single.py
First we need this entry point, agent_single.py, to hook into the three functions in connect.py. The sys.path line below adds the project root to the import path, so the rag package can be imported no matter where the script is launched from.
from pathlib import Path
import sys
sys.path.append(str(Path(__file__).resolve().parents[1]))
from rag.connect import search_chunks, build_prompt, ask_ollama
Next we add a CLI interface, so we can ask a question from the command line with --q (the question).
import argparse
def parse_args():
    p = argparse.ArgumentParser(description="Single-Agent MVP 入口")
    p.add_argument("--q", "--query", dest="q", required=True, help="問題內容(必填)")
    return p.parse_args()
Let's first check that everything is wired up:
def main():
    args = parse_args()
    # later we'll call search/build_prompt/ask_ollama here
    print(f"你的問題在這裡:{args.q}")

if __name__ == "__main__":
    main()
Run this command to see whether it starts up:
python agent/agent_single.py --q "什麼是關鍵基礎設施?"
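If the import wiring is correct, the script simply echoes the question back, like this:
你的問題在這裡:什麼是關鍵基礎設施?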
Note: nothing is actually being queried yet!
Now for the real query. We connect to our ChromaDB and fetch the Top-k most relevant chunks; the default here is 4.
# add one line inside parse_args()
p.add_argument("--k", type=int, default=4, help="Top-k(預設 4)")
To verify it, add these lines:
# add inside main():
hits = search_chunks(args.q, k=args.k)
print(f"命中片段數量:{len(hits)}")
Run the same command again to make sure it still works:
python agent/agent_single.py --q "什麼是關鍵基礎設施?"
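Assuming the vector store built earlier returns a full set of four chunks, you should see something like:
命中片段數量:4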
To make the retrieved snippets easier to read, I wrote this bit specifically to prettify the output:
def brief(text: str, limit: int = 120) -> str:
    t = (text or "").replace("\n", " ").strip()
    return t[:limit] + ("..." if len(t) > limit else "")

def pretty_print_hits(hits):
    print("=== Top-k 檢索片段 ===")
    for i, h in enumerate(hits, 1):
        ch = h.get("chapter", "")
        sec = h.get("section_id", "")
        dist = float(h.get("distance", 0.0))
        txt = brief(h.get("text", ""))
        print(f"[{i}] {ch} | {sec} | 距離={dist:.4f}")
        print(f" {txt}")
To check it, add the following line:
# in main(), right after the hits lookup, add:
pretty_print_hits(hits)
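Just to show the format (the chapter, article, distance, and excerpt below are made up for illustration), the output looks like this:
=== Top-k 檢索片段 ===
[1] 第一章 總則 | 第4條 | 距離=0.4321
 關鍵基礎設施:指實體或虛擬資產、系統或網路,其功能一旦停止運作或效能降低...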
Next we feed the retrieved content to the LLM and ask it to answer:
# add one line inside parse_args()
p.add_argument("--model", default="mistral", help="模型名(預設 mistral)")
# in main(), after pretty_print_hits(hits), add:
prompt = build_prompt(args.q, hits)
answer = ask_ollama(prompt, model=args.model).strip()
print("\n=== 最終回答 ===")
print(answer)
Finally, we record the output of each run:
# add one line inside parse_args()
p.add_argument("--json_out", action="store_true", help="是否輸出 JSONL 到 logs/")

# at the very end of main() (after the answer is printed)
from datetime import datetime
from pathlib import Path
import json

if args.json_out:
    logs_dir = Path("logs")
    logs_dir.mkdir(parents=True, exist_ok=True)
    outpath = logs_dir / f"run_{datetime.now():%Y%m%d}.jsonl"
    record = {
        "ts": datetime.now().isoformat(timespec="seconds"),
        "query": args.q,
        "k": args.k,
        "model": args.model,
        "hits": hits,
        "answer": answer,
    }
    with outpath.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"\n[log] 已寫入:{outpath}")
Finally, we can ask it a question the same way as before, adjusting the flags to suit your needs:
python agent/agent_single.py --q "什麼是關鍵基礎設施?" --k 4 --model mistral --json_out
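For reference, here's roughly what agent_single.py looks like once all of the snippets above are stitched together. This is just the pieces from this post assembled in one place (with the imports gathered at the top), not a new implementation:

from pathlib import Path
import sys
import json
import argparse
from datetime import datetime

# make the project root importable so rag.connect resolves
sys.path.append(str(Path(__file__).resolve().parents[1]))
from rag.connect import search_chunks, build_prompt, ask_ollama


def parse_args():
    p = argparse.ArgumentParser(description="Single-Agent MVP 入口")
    p.add_argument("--q", "--query", dest="q", required=True, help="問題內容(必填)")
    p.add_argument("--k", type=int, default=4, help="Top-k(預設 4)")
    p.add_argument("--model", default="mistral", help="模型名(預設 mistral)")
    p.add_argument("--json_out", action="store_true", help="是否輸出 JSONL 到 logs/")
    return p.parse_args()


def brief(text: str, limit: int = 120) -> str:
    t = (text or "").replace("\n", " ").strip()
    return t[:limit] + ("..." if len(t) > limit else "")


def pretty_print_hits(hits):
    print("=== Top-k 檢索片段 ===")
    for i, h in enumerate(hits, 1):
        ch = h.get("chapter", "")
        sec = h.get("section_id", "")
        dist = float(h.get("distance", 0.0))
        txt = brief(h.get("text", ""))
        print(f"[{i}] {ch} | {sec} | 距離={dist:.4f}")
        print(f" {txt}")


def main():
    args = parse_args()
    print(f"你的問題在這裡:{args.q}")

    # retrieve, show, then answer
    hits = search_chunks(args.q, k=args.k)
    print(f"命中片段數量:{len(hits)}")
    pretty_print_hits(hits)

    prompt = build_prompt(args.q, hits)
    answer = ask_ollama(prompt, model=args.model).strip()
    print("\n=== 最終回答 ===")
    print(answer)

    # optionally log the run as one JSONL record
    if args.json_out:
        logs_dir = Path("logs")
        logs_dir.mkdir(parents=True, exist_ok=True)
        outpath = logs_dir / f"run_{datetime.now():%Y%m%d}.jsonl"
        record = {
            "ts": datetime.now().isoformat(timespec="seconds"),
            "query": args.q,
            "k": args.k,
            "model": args.model,
            "hits": hits,
            "answer": answer,
        }
        with outpath.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
        print(f"\n[log] 已寫入:{outpath}")


if __name__ == "__main__":
    main()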
That's it for today's tutorial. I hope it wasn't too messy; I feel like I didn't organize it as well as I would have liked.
Everything below is just an appendix that collects the code from the earlier posts, so you can skip it and come back only if you need it.
Here's that earlier code. I've left in a lot of comments that I wrote for my own sanity checks, so feel free to delete them.
rag/connect.py
from pathlib import Path
import textwrap
import chromadb
import requests
from sentence_transformers import SentenceTransformer

# ---- Chroma connection ----
BASE_DIR = Path(__file__).resolve().parents[1]   # project/
CHROMA_PATH = BASE_DIR / "data" / "law_db"       # project/data/law_db
client = chromadb.PersistentClient(path=str(CHROMA_PATH))
coll = client.get_or_create_collection("laws")

# ---- always use the 384-dim embedding model ----
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
def embed(texts):
    vec = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    return vec.tolist()

def search_chunks(query, k=4):
    q_vec = embed([query])
    res = coll.query(
        query_embeddings=q_vec,
        n_results=k,
        include=["documents", "metadatas", "distances"],
    )
    docs = res["documents"][0]
    metas = res["metadatas"][0]
    dists = res["distances"][0]
    items = []
    for doc, meta, dist in zip(docs, metas, dists):
        items.append({
            "chapter": meta.get("chapter", ""),
            "section_id": meta.get("section_id", ""),
            "distance": float(dist),
            "text": doc
        })
    return items
def build_prompt(query, hits):
    blocks = []
    # pull out the fields and do some light cleanup
    for i, h in enumerate(hits, 1):
        chapter = h.get("chapter", "")
        section = h.get("section_id", "")
        distance = float(h.get("distance", 0.0))
        text = (h.get("text", "") or "").replace("\n", " ")
        header = f"[{i}] {chapter} | {section} | 距離={distance:.4f}"  # source + score
        body = f"```text\n{text}\n```"  # the chunk text itself
        blocks.append(header + "\n" + body)
    context = "\n\n".join(blocks) if blocks else "(本次查無相關段落)"
    prompt = textwrap.dedent(f"""
你是一位專業的台灣資安法規顧問。可以參考相關文件內容回答;
若文件中沒有明確資訊,請回答「文件中沒有相關內容」,不要亂猜。
回答請用中文條列式,並在每條末尾標註對應來源索引。
【相關文件內容】
{context}
【問題】
{query}
""").strip()
    return prompt
def ask_ollama(prompt, model="mistral"):
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "stream": False},
        timeout=120
    )
    resp.raise_for_status()
    data = resp.json()
    return data.get("response", "").strip()
if __name__ == "__main__":
    query = "什麼是關鍵基礎設施?"
    hits = search_chunks(query, k=4)

    # debug: peek at the retrieved chunks
    # print("=== 檢索到的段落(前 4 筆)===")
    # for h in hits:
    #     print(f"{h['chapter']} | {h['section_id']} | 距離={h['distance']:.4f}")
    #     print(h["text"][:160].replace("\n", " "), "\n---")

    prompt = build_prompt(query, hits)
    # debug: preview the prompt sent to the LLM
    # print("\n=== 準備送入 LLM 的 Prompt(節錄)===")
    # print(prompt[:600], "...\n")

    answer = ask_ollama(prompt, model="mistral")
    # print("=== LLM 回答 ===")
    # print(answer)

    # ---- RAGAS evaluation ----
    contexts = [h["text"] for h in hits]  # the retrieved chunk texts
    # reference (ground-truth) answer
    reference = "關鍵基礎設施是指實體或虛擬資產、系統或網路,其功能一旦停止運作或效能降低,對國家安全、社會公共利益、國民生活或經濟活動有重大影響之虞,經主管機關定期檢視並公告之領域。"
    # print(contexts)

    from datasets import Dataset

    # build the evaluation dataset
    data = {
        "question": [query],
        "contexts": [contexts],
        "answer": [answer],
        "reference": [reference],
    }
    dataset = Dataset.from_dict(data)

    from ragas.metrics import context_precision, context_recall
    from ragas.metrics import faithfulness, answer_relevancy
    from ragas import evaluate
    from ragas.llms import LangchainLLMWrapper
    from langchain_ollama import OllamaLLM

    # wrap Ollama so RAGAS can use it as the judge LLM
    ollama_raw = OllamaLLM(model="mistral", temperature=0, num_ctx=4096)
    ollama_llm = LangchainLLMWrapper(ollama_raw)

    # install once if needed: pip install langchain-huggingface
    from langchain_huggingface import HuggingFaceEmbeddings as LCHuggingFaceEmbeddings
    from ragas.embeddings import LangchainEmbeddingsWrapper

    lc_hf = LCHuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    hf_embeddings = LangchainEmbeddingsWrapper(lc_hf)  # this wrapper provides embed_query()

    result = evaluate(
        dataset,
        metrics=[context_precision, context_recall, answer_relevancy, faithfulness],
        llm=ollama_llm,
        embeddings=hf_embeddings
    )
    print(result)
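If you want to exercise this file on its own, running it directly triggers the __main__ block above: one retrieval and answer for the sample question, followed by the RAGAS scores printed at the end.
python rag/connect.py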
scripts/build_law_db.py
import re
import pdfplumber
import pandas as pd
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parents[1]               # project/
PDF_PATH = BASE_DIR / "data" / "raw" / "資通安全管理法.pdf"   # input
CSV_OUT = BASE_DIR / "data" / "law_chunks_full.csv"          # output
CHROMA_DIR = BASE_DIR / "data" / "law_db"                    # DB directory

CHAPTER_RE = re.compile(r"(?m)^\s*(第\s*[一二三四五六七八九十百千0-90-9]+\s*章[^\n]*)$")
# ★ anchored at the start of a line, and captures only "第N條"
ARTICLE_RE = re.compile(r"(?m)^\s*(第\s*(?:[0-90-9]+|[一二三四五六七八九十百千]+)\s*條)")
ITEM_RE = re.compile(r"(?<!\S)[一二三四五六七八九十百千]+、")
NUM_RE = re.compile(r"(?<!\S)[0-90-9]{1,3}[.)、]")
def extract_text_plumber(path):
    with pdfplumber.open(path) as pdf:
        return "\n".join([(p.extract_text() or "") for p in pdf.pages])

def normalize_text(s):
    s = s.replace("\r", "")
    # protect chapter headings first (surround with blank lines so they don't stick to the previous sentence)
    s = CHAPTER_RE.sub(lambda m: "\n" + m.group(1).strip() + "\n", s)
    # if "第N條" is immediately followed by a paragraph number (1/一), insert a newline so it doesn't stick to the article head
    s = re.sub(r"(第\s*(?:[0-90-9]+|[一二三四五六七八九十百千]+)\s*條)\s*(?=[0-90-9一二三四五六七八九十百千])", r"\1\n", s)
    # merge newlines that are not sentence endings
    s = re.sub(r"(?<![。!?;:…])\n(?!\n)", "", s)
    # strip layout line numbers at the start of lines (e.g. "1  ")
    s = re.sub(r"(?m)^\s*[0-90-9]+\s{2,}", "", s)
    # collapse whitespace
    s = re.sub(r"[ \t]+", " ", s)
    return s.strip()
def split_by_chapter(full_text):
    parts = []
    ms = list(CHAPTER_RE.finditer(full_text))
    if not ms:
        return [("(未分章)", full_text)]
    for i, m in enumerate(ms):
        start = m.start()
        end = ms[i+1].start() if i+1 < len(ms) else len(full_text)
        parts.append((m.group(1).strip(), full_text[start:end].strip()))
    return parts

def split_by_article_in_chapter(chapter_text):
    matches = list(ARTICLE_RE.finditer(chapter_text))
    out = []
    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i+1].start() if i+1 < len(matches) else len(chapter_text)
        head = m.group(1).strip()  # ★ keep only "第N條"
        body = chapter_text[start:end].strip()
        out.append((head, body))
    return out

def window_chunk(text, size=800, overlap=400):
    chunks, i, n = [], 0, len(text)
    while i < n:
        j = min(i + size, n)
        chunks.append(text[i:j])
        if j == n:
            break
        i = j - overlap
    return chunks
def split_items_in_article(chapter_name, art_head, art_text):
    body = art_text[len(art_head):].strip()
    if ITEM_RE.search(body):
        idxs = [m.start() for m in ITEM_RE.finditer(body)] + [len(body)]
    elif NUM_RE.search(body):
        idxs = [m.start() for m in NUM_RE.finditer(body)] + [len(body)]
    else:
        idxs = [0, len(body)]
    parts = []
    for i in range(len(idxs) - 1):
        seg = body[idxs[i]:idxs[i+1]].strip()
        if not seg:
            continue
        if len(seg) > 1200:
            for w in window_chunk(seg, size=800, overlap=400):
                parts.append({"chapter": chapter_name, "section_id": art_head, "text": w})
        else:
            parts.append({
                "chapter": chapter_name,
                "section_id": art_head.replace(" ", ""),
                "text": seg
            })
    return parts

def law_chunks(full_text):
    out = []
    for chap_name, chap_text in split_by_chapter(full_text):
        for head, body in split_by_article_in_chapter(chap_text):
            out.extend(split_items_in_article(chap_name, head, body))
    return out
# ---- run and write outputs ----
raw = extract_text_plumber(PDF_PATH)
normalized = normalize_text(raw)
chunks = law_chunks(normalized)
df = pd.DataFrame(chunks)
df.to_csv(CSV_OUT, index=False, encoding="utf-8-sig")

# sanity check: does Article 1 show up?
mask = df["section_id"].str.contains(r"第\s*(?:1|1|一)\s*條", na=False)
print("第 1 條筆數:", mask.sum())
# === save to Chroma + query (replaces the earlier version of the section below) ===
import hashlib
import chromadb
from sentence_transformers import SentenceTransformer

# 1) create the database (persisted on disk)
client = chromadb.PersistentClient(path=str(CHROMA_DIR))
coll = client.get_or_create_collection("laws")

# 2) prepare the (multilingual) embedding model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# 3) generate stable ids to avoid duplicate writes
def make_id(c):
    base = c["chapter"] + "|" + c["section_id"] + "|" + " ".join(c["text"].split())
    return hashlib.sha1(base.encode("utf-8")).hexdigest()

# 4) upsert in batches (avoid pushing too much at once)
batch = 3
buf_ids, buf_docs, buf_meta = [], [], []
for c in chunks:
    buf_ids.append(make_id(c))
    buf_docs.append(c["text"])
    buf_meta.append({"chapter": c["chapter"], "section_id": c["section_id"]})
    if len(buf_ids) == batch:
        emb = model.encode(buf_docs, convert_to_numpy=True, normalize_embeddings=True)
        coll.upsert(ids=buf_ids, documents=buf_docs, metadatas=buf_meta, embeddings=emb)
        buf_ids, buf_docs, buf_meta = [], [], []

# final partial batch
if buf_ids:
    emb = model.encode(buf_docs, convert_to_numpy=True, normalize_embeddings=True)
    coll.upsert(ids=buf_ids, documents=buf_docs, metadatas=buf_meta, embeddings=emb)
print("寫入完成。")
# 5) quick query demo
q = "什麼是關鍵基礎設施?"
q_emb = model.encode([q], normalize_embeddings=True)
res = coll.query(
    query_embeddings=q_emb,
    n_results=5,
    include=["documents", "metadatas", "distances"]
)
for i, (doc, meta, dist) in enumerate(zip(res["documents"][0], res["metadatas"][0], res["distances"][0]), 1):
    print(f"[{i}] {meta['chapter']} | {meta['section_id']} | 距離={dist:.4f}")
    print(doc[:160].replace("\n", " "), "\n---")
print("模型向量維度 =", model.get_sentence_embedding_dimension())