章節錨點 + 章節優先檢索（Section-first Retrieval）

2025 iThome 鐵人賽

DAY 23

生成式 AI

練習AI系列第 24 篇

17th鐵人賽

frankfrank8785

2025-09-29 19:06:39

93 瀏覽

分享至

🆕 程式碼

src/utils/md_ast.js（新增）

輕量 Markdown 區塊解析：抓出 H1~H6 標題、slug、章節文字範圍。

// src/utils/md_ast.js
/**
 * Build a simple, safe slug from a heading title.
 *
 * The input is trimmed and lower-cased, every character that is not a
 * Unicode letter, a Unicode number, whitespace or "-" is removed, whitespace
 * runs become a single "-", repeated "-" are collapsed, and the result is
 * capped at 80 UTF-16 code units. (Romanizing Chinese titles is left for later.)
 *
 * @param {string} [s=""] - Heading text.
 * @returns {string} The slug, or "section" when nothing usable remains.
 */
export function slugify(s = "") {
  const source = s || "";
  const lettersOnly = source
    .trim()
    .toLowerCase()
    .replace(/[^\p{Letter}\p{Number}\s-]/gu, "");
  const dashed = lettersOnly.replace(/\s+/g, "-").replace(/-+/g, "-");
  const clipped = dashed.slice(0, 80);
  return clipped || "section";
}

/**
 * Parse Markdown into a flat, ordered list of sections.
 *
 * Every ATX heading (`#` .. `######`) starts a new section; the lines up to
 * the next heading become that section's `text`. Lines that appear before the
 * first heading are collected into a synthetic level-1 "導言" section with
 * slug "intro". Lines inside fenced code blocks (``` or ~~~) are never
 * treated as headings, so a `# comment` inside a code sample stays in the
 * surrounding section's text.
 *
 * @param {string} [md=""] - Markdown source (CRLF line endings are accepted).
 * @returns {Array<{level:number,title:string,slug:string,text:string,parent:(string|null)}>}
 *   Sections in document order; `parent` is the slug of the nearest preceding
 *   section with a smaller level, or null for top-level sections.
 */
export function parseMarkdownToSections(md = "") {
  const lines = (md || "").replace(/\r\n/g, "\n").split("\n");
  const nodes = [];
  let cur = null;
  // Marker (e.g. "```" or "~~~~") of the fence we are currently inside, or null.
  let openFence = null;

  // Finalize the section being accumulated and append it to the result.
  const flush = () => {
    if (!cur) return;
    cur.text = cur.textParts.join("\n").trim();
    delete cur.textParts;
    nodes.push(cur);
    cur = null;
  };

  for (const line of lines) {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    const wasInFence = openFence !== null;
    if (fence) {
      const marker = fence[1];
      if (!wasInFence) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        fence[2].trim() === ""
      ) {
        // A closing fence uses the same character, is at least as long as the
        // opener, and carries no info string.
        openFence = null;
      }
    }

    // Headings are only recognized outside fenced code.
    const h = wasInFence || fence ? null : line.match(/^(#{1,6})\s+(.+?)\s*$/);
    if (h) {
      flush();
      const title = h[2].trim();
      cur = {
        level: h[1].length,
        title,
        slug: slugify(title),
        textParts: [],
      };
      continue;
    }

    if (!cur) {
      // Content before the first heading goes into an "introduction" section.
      cur = { level: 1, title: "導言", slug: "intro", textParts: [] };
    }
    cur.textParts.push(line);
  }
  flush();

  // Link each section to its parent: the closest earlier section with a smaller level.
  const stack = [];
  for (const n of nodes) {
    while (stack.length && stack[stack.length - 1].level >= n.level) stack.pop();
    n.parent = stack.length ? stack[stack.length - 1].slug : null;
    stack.push(n);
  }
  return nodes;
}

/**
 * Build a breadcrumb resolver over a list of sections.
 *
 * Sections are indexed by slug (when several sections share a slug, the last
 * one wins). `getTrail(slug)` walks the `parent` links upward and returns the
 * chain ordered from the outermost ancestor down to the requested section.
 *
 * The walk stops as soon as it would revisit a section. Parents are looked up
 * by slug, so a child whose slug equals its parent's slug would otherwise
 * resolve to itself and loop forever.
 *
 * @param {Array<{slug:string,title:string,level:number,parent?:(string|null)}>} sections
 * @returns {{getTrail:(slug:string)=>Array<{slug:string,title:string,level:number}>}}
 *   `getTrail` returns an empty array for an unknown slug.
 */
export function buildBreadcrumb(sections) {
  const bySlug = new Map(sections.map((s) => [s.slug, s]));

  const getTrail = (slug) => {
    const trail = [];
    const visited = new Set();
    let node = bySlug.get(slug);
    while (node && !visited.has(node)) {
      visited.add(node);
      trail.push({ slug: node.slug, title: node.title, level: node.level });
      node = node.parent ? bySlug.get(node.parent) : undefined;
    }
    return trail.reverse();
  };

  return { getTrail };
}

src/day23_sections.js（新增：章節索引 & 檢索）

章節索引獨立存檔：data/rag/<tenant>/<ns>.sections.json。
注意：請先有 chunk 索引（Day 18/21/22）再建章節索引，因為要把 chunk 映射到章節。

// src/day23_sections.js
import fs from "fs";
import path from "path";
import { openai } from "./aiClient.js";
import { ensureTenantNS } from "./utils/tenantfs.js";
import { parseMarkdownToSections, buildBreadcrumb } from "./utils/md_ast.js";

// Embedding model name: taken from the OPENAI_EMBEDDING_MODEL environment variable,
// falling back to "text-embedding-3-small" when it is unset or empty.
const EMBED_MODEL = process.env.OPENAI_EMBEDDING_MODEL || "text-embedding-3-small";

/**
 * Embed a batch of texts with the configured embedding model.
 *
 * @param {string[]} [texts=[]] - Texts to embed, sent in a single request.
 * @returns {Promise<Array>} One embedding per entry of the response's `data`
 *   array; an empty array (and no API call) when `texts` is empty.
 */
async function embedMany(texts = []) {
  if (!texts.length) {
    return [];
  }
  const request = { model: EMBED_MODEL, input: texts };
  const { data } = await openai.embeddings.create(request);
  return data.map(({ embedding }) => embedding);
}

/**
 * Cosine similarity of two numeric vectors.
 *
 * Iterates over the length of `a`. A small epsilon (1e-9) is added to the
 * denominator so that zero vectors yield 0 instead of dividing by zero.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} dot(a, b) / (|a| * |b| + 1e-9)
 */
function cosine(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  let idx = 0;
  while (idx < a.length) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    normA += x * x;
    normB += y * y;
    idx += 1;
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) + 1e-9);
}

/**
 * Read the chunk index file and return its `index` array.
 *
 * @param {string} idxFile - Path to the chunk index JSON file.
 * @returns {Array} The parsed `index` value, or an empty array when it is missing/falsy.
 * @throws {Error} When the file does not exist (the chunk index must be built first).
 */
function loadChunkIndex(idxFile) {
  const present = fs.existsSync(idxFile);
  if (present === false) {
    throw new Error("找不到 chunk 索引，請先重建全文索引");
  }
  const raw = fs.readFileSync(idxFile, "utf-8");
  const { index } = JSON.parse(raw);
  return index || [];
}

/**
 * List the knowledge-base documents in a directory.
 *
 * Only entries whose name ends with the `.md` or `.txt` extension
 * (case-insensitive) are returned. The dot is matched literally, so names
 * such as "amd" or "xtxt" are not mistaken for documents.
 *
 * @param {string} kbDir - Directory to scan (not recursive).
 * @returns {string[]} Paths formed by joining `kbDir` with each matching entry name.
 */
function listDocs(kbDir) {
  return fs
    .readdirSync(kbDir)
    .filter((name) => /\.(md|txt)$/i.test(name))
    .map((name) => path.join(kbDir, name));
}

/**
 * Build and persist the section index for a tenant namespace.
 *
 * Requires the chunk index to exist already, because every section is linked
 * to the chunks that belong to it. The result is written next to the chunk
 * index as `<ns>.sections.json` with the shape
 * `{ builtAt, model, sections, vectors }`, where `vectors[i]` is the embedding
 * of `sections[i]`.
 *
 * @param {{tenant:string, ns:string}} params
 * @returns {Promise<{secFile:string, sections:number}>} Path of the written
 *   file and the number of indexed sections.
 * @throws {Error} When the chunk index is missing, or when the embedding call
 *   does not return exactly one vector per section.
 */
export async function buildSectionIndex({ tenant, ns }) {
  const { kbDir, idxFile } = ensureTenantNS(tenant, ns);
  const secFile = path.join(path.dirname(idxFile), `${ns}.sections.json`);
  const chunkIndex = loadChunkIndex(idxFile); // entries are read as { id, docId, text, ... }

  // 1) Parse every document into its list of sections.
  const sections = [];
  for (const fp of listDocs(kbDir)) {
    const docId = path.basename(fp);
    const md = fs.readFileSync(fp, "utf-8");
    for (const s of parseMarkdownToSections(md)) {
      sections.push({ ...s, docId });
    }
  }

  // 2) Attach chunkIds to each section. A chunk belongs to a section when the
  //    first 48 characters of the chunk text occur in the section text.
  //    Chunks are grouped by docId once so each section only scans its own document.
  const probesByDoc = new Map();
  for (const ch of chunkIndex) {
    if (!ch?.text) continue;
    const probe = { id: ch.id, head: ch.text.slice(0, 48) };
    const list = probesByDoc.get(ch.docId);
    if (list) list.push(probe);
    else probesByDoc.set(ch.docId, [probe]);
  }
  for (const s of sections) {
    const probes = probesByDoc.get(s.docId) ?? [];
    s.chunkIds = s.text
      ? probes.filter((p) => s.text.includes(p.head)).map((p) => p.id)
      : [];
  }

  // 3) Embed each section as "<title>\n\n<text>".
  const vecs = await embedMany(sections.map((s) => `${s.title}\n\n${s.text}`));
  if (vecs.length !== sections.length) {
    throw new Error(`Expected ${sections.length} section embeddings, got ${vecs.length}`);
  }

  // Vectors are stored in the same file as a parallel array (index-aligned with
  // `sections`) instead of being embedded in each section object.
  fs.writeFileSync(
    secFile,
    JSON.stringify({ builtAt: Date.now(), model: EMBED_MODEL, sections, vectors: vecs }, null, 2),
    "utf-8",
  );

  return { secFile, sections: sections.length };
}

/**
 * Load the persisted section index.
 *
 * @param {string} secFile - Path to the `*.sections.json` file.
 * @returns {*} The parsed JSON content of the file.
 * @throws {Error} When the file does not exist (the section index must be rebuilt first).
 */
function loadSectionIndex(secFile) {
  const present = fs.existsSync(secFile);
  if (present === false) {
    throw new Error("章節索引不存在，請先重建");
  }
  return JSON.parse(fs.readFileSync(secFile, "utf-8"));
}

/**
 * Section-first retrieval: rank sections against the query, collect the chunks
 * attached to the best sections, then re-rank those chunks against the query.
 *
 * @param {object} params
 * @param {string} params.tenant
 * @param {string} params.ns
 * @param {string} params.query - User question; embedded once and reused for both rankings.
 * @param {number} [params.topS=5] - Number of top sections to keep.
 * @param {number} [params.topK=8] - Number of top chunks to return.
 * @returns {Promise<{sections:Array, chunks:Array}>} The top sections and the
 *   top chunks, each entry extended with a cosine `score`, sorted descending.
 * @throws {Error} When the chunk index or the section index file is missing.
 */
export async function retrieveBySections({ tenant, ns, query, topS = 5, topK = 8 }) {
  const { idxFile } = ensureTenantNS(tenant, ns);
  const secFile = path.join(path.dirname(idxFile), `${ns}.sections.json`);
  const chunkIndex = loadChunkIndex(idxFile);
  const secIndex = loadSectionIndex(secFile); // { sections, vectors } as parallel arrays

  // 1) Rank sections by similarity between the query and each section vector.
  const qv = (await embedMany([query]))[0];
  const scoredSecs = secIndex.sections
    .map((s, i) => ({ ...s, score: cosine(qv, secIndex.vectors[i]) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, topS);

  // 2) Candidate chunks are those referenced by the selected sections.
  const candChunkIds = new Set(scoredSecs.flatMap((s) => s.chunkIds || []));
  const cands = chunkIndex.filter((ch) => candChunkIds.has(ch.id));

  // 3) Re-rank the candidates with the same query vector; each chunk's own
  //    vector is read from the chunk index entry (`ch.vector`).
  const reranked = cands
    .map((ch) => ({ ...ch, score: cosine(qv, ch.vector) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, topK);

  return { sections: scoredSecs, chunks: reranked };
}

/**
 * Resolve the breadcrumb trail of one section, for display in the front end.
 *
 * @param {{tenant:string, ns:string, sectionSlug:string}} params
 * @returns {Array<{slug:string,title:string,level:number}>} Trail from the
 *   outermost ancestor to the requested section, as produced by
 *   `buildBreadcrumb(...).getTrail`.
 * @throws {Error} When the section index file is missing.
 */
export function sectionBreadcrumbs({ tenant, ns, sectionSlug }) {
  const { idxFile } = ensureTenantNS(tenant, ns);
  const secFile = path.join(path.dirname(idxFile), `${ns}.sections.json`);
  const { sections } = loadSectionIndex(secFile);
  const { getTrail } = buildBreadcrumb(sections);
  return getTrail(sectionSlug);
}

app/api/kb/[tenant]/[ns]/reindex-sections/route.js（新增）
import { NextResponse } from "next/server";
import { withAuth } from "../../../../../src/api/withAuth.js";
import { buildSectionIndex } from "../../../../../src/day23_sections.js";

// Exported runtime setting for this route: "nodejs".
// NOTE(review): presumably required because buildSectionIndex relies on Node's fs module — confirm.
export const runtime = "nodejs";

/**
 * POST handler: rebuild the section index for the tenant/namespace taken from
 * the route params. Viewers receive a 403 response; the handler is registered
 * with `withAuth` for the "editor" and "admin" roles.
 */
export const POST = withAuth(async (req, ctx) => {
  const isViewer = ctx.user.role === "viewer";
  if (isViewer) {
    return NextResponse.json({ ok:false, error:"Editor/Admin only" }, { status:403 });
  }

  const { tenant, ns } = ctx.params;
  const result = await buildSectionIndex({ tenant, ns });
  return NextResponse.json({ ok:true, ...result });
}, ["editor","admin"]);

app/api/kb/[tenant]/[ns]/ask/route.js（修改）

加入 strategy：default（沿用 Day 19 高亮）或 section（章節優先）。

import { NextResponse } from "next/server";
import { withAuth } from "../../../../../src/api/withAuth.js";
import { answerWithRAG, answerWithRAGHighlighted } from "../../../../../src/day16_rag_store.js";
import { retrieveBySections } from "../../../../../src/day23_sections.js";
import { openai } from "../../../../../src/aiClient.js";
import { alignAnswerToSources } from "../../../../../src/day19_highlight.js";

// Exported runtime setting for this route: "nodejs".
// NOTE(review): presumably required because retrieveBySections relies on Node's fs module — confirm.
export const runtime = "nodejs";

/**
 * POST handler for knowledge-base questions.
 *
 * Request body (JSON): `{ q, highlight = true, strategy = "default" }`.
 * - `strategy: "section"` uses section-first retrieval and answers from the
 *   retrieved chunks; with `highlight` it also returns aligned spans and
 *   section metadata.
 * - any other strategy falls through to the existing default RAG path.
 *
 * Responds with 400 when `q` is missing, not a string, or blank.
 */
export const POST = withAuth(async (req, ctx) => {
  const { tenant, ns } = ctx.params;
  // Tolerate a literal `null` JSON body instead of failing on destructuring.
  const { q, highlight = true, strategy = "default" } = (await req.json()) ?? {};

  // `q` must be a non-blank string; a non-string value would otherwise throw on `.trim()`.
  if (typeof q !== "string" || !q.trim()) {
    return NextResponse.json({ ok:false, error:"q 必填" }, { status:400 });
  }

  if (strategy === "section") {
    // Section-first retrieval.
    const { sections, chunks } = await retrieveBySections({ tenant, ns, query: q, topS: 5, topK: 8 });

    // Use the retrieved chunks as context (same answer prompt as the default strategy).
    const ctxText = chunks.map((h,i)=>`# 片段${i+1}（${h.docId}｜${h.id.split("#")[1]}，score=${h.score.toFixed(3)}）\n${h.text}`).join("\n\n");
    const res = await openai.chat.completions.create({
      model:"gpt-4o-mini", temperature:0.2,
      messages:[
        { role:"system", content:"你是嚴謹的客服知識庫助理。僅依據提供片段回答；不足請明確說明。" },
        { role:"user", content:`問題：${q}\n\n片段：\n${ctxText}\n\n請用繁體中文回答，先結論、再步驟、最後列注意事項。` }
      ]
    });
    const answer = res.choices?.[0]?.message?.content?.trim() || "目前找不到足夠資訊。";

    if (highlight) {
      // Alignment is given the chunks as loaded from the index (including their vectors).
      const aligned = await alignAnswerToSources(answer, chunks, 0.27);
      // Section metadata for breadcrumb display (slug/title per ranked section).
      const secMeta = sections.map((s, idx) => ({
        rank: idx + 1,
        docId: s.docId,
        slug: s.slug,
        title: s.title,
        level: s.level,
        score: s.score,
      }));

      const sourceChunks = chunks.map((h, i) => ({
        displayIndex: i + 1, docId: h.docId, id: h.id, score: h.score, text: h.text
      }));

      return NextResponse.json({
        ok: true,
        strategy: "section",
        answer,
        answerHtml: aligned.html,
        spans: aligned.spans,
        sources: aligned.sources,
        sourceChunks,
        sections: secMeta
      });
    }

    // Compact response without highlighting.
    return NextResponse.json({
      ok: true, strategy: "section",
      answer,
      sources: chunks.map(h=>({ id:h.id, docId:h.docId, score:h.score })),
      sections: sections.map(s=>({ docId:s.docId, slug:s.slug, title:s.title, score:s.score }))
    });
  }

  // === default: keep the existing (Day 19) highlighted / plain RAG path ===
  if (highlight) {
    const out = await answerWithRAGHighlighted({ tenant, ns, query: q, topK: 4 });
    return NextResponse.json({ ok:true, strategy: "default", ...out });
  }
  const { answer, sources } = await answerWithRAG({ tenant, ns, query: q, topK: 4 });
  return NextResponse.json({ ok:true, strategy: "default", answer, sources });
}, ["viewer","editor","admin"]);

app/studio/page.tsx（修改：策略切換＋章節 breadcrumb）

僅列需要新增/替換的片段；其餘維持 Day 19 版本。

(A) 新增 state

const [strategy, setStrategy] = useState<"default"|"section">("default");
const [secMeta, setSecMeta] = useState<Array<{rank:number;docId:string;slug:string;title:string;level:number;score:number}>>([]);

(B) 問答請求改帶 strategy

(D) 在回答下方顯示章節 Breadcrumb（僅當 strategy=section）

{strategy === "section" && secMeta.length > 0 && (

新增 API（Studio 可綁一顆按鈕）
package.json（可加便捷指令）
{
"scripts": {
"day23:reindex:sections": "curl -s -X POST http://localhost:3000/api/kb/acme/faq/reindex-sections -H 'Authorization: Bearer <YOUR_TOKEN>'"
}
}