iT邦幫忙

2025 iThome 鐵人賽

DAY 22
0
生成式 AI

練習AI系列 第 23

智慧切塊(語義/段落)+穩定 Chunk ID(降低重算抖動)

  • 分享至 

  • xImage
  •  

🆕 新增:src/utils/chunkers.js
// src/utils/chunkers.js
import { sha256String } from "../utils/hash.js";

/**
 * Split Markdown / plain text into retrieval chunks whose IDs are derived
 * from their content.
 *
 * Pipeline:
 *   1. `normalize()` the text and `splitBlocks()` it into coarse blocks
 *      (headings, list items, code fences, paragraphs).
 *   2. `flattenOversizeBlocks()` re-splits blocks longer than `maxChars`
 *      at sentence level.
 *   3. Pieces are packed greedily into a buffer; when the next piece would
 *      push the buffer past `targetChars` the buffer is emitted and the next
 *      buffer is seeded with the last `parOverlap` characters of it.
 *   4. A buffer that is still longer than `maxChars` is hard-sliced,
 *      repeatedly, so no text handed to `makeChunk()` exceeds `maxChars`.
 *
 * @param {string} markdown - Source text (.md or .txt).
 * @param {object} [opts]
 * @param {number} [opts.targetChars=900] - Preferred chunk size in characters;
 *   values above `maxChars` are clamped to `maxChars`.
 * @param {number} [opts.maxChars=1300] - Hard upper bound per chunk (>= 1).
 * @param {number} [opts.parOverlap=120] - Characters of the previous chunk
 *   repeated at the start of the next one; clamped to `[0, maxChars - 1]`.
 * @param {string} [opts.docPrefix=""] - Optional prefix forwarded to
 *   `makeChunk()` for the chunk ID.
 * @returns {Array<{chunkId:string, content:string, len:number}>}
 * @throws {RangeError} If `maxChars` is not a number >= 1.
 */
export function semanticChunk(markdown, {
  targetChars = 900,
  maxChars = 1300,
  parOverlap = 120,
  docPrefix = ""
} = {}) {
  if (!markdown || !markdown.trim()) return [];

  // Options may arrive from a JSON request body, so coerce before using them.
  const limit = Math.floor(Number(maxChars));
  if (!(limit >= 1)) {
    throw new RangeError(`maxChars must be a number >= 1, got ${maxChars}`);
  }
  // A target above the hard limit would let packed buffers exceed the limit.
  const target = Math.min(Number(targetChars), limit);
  // The overlap must stay below the limit, otherwise the hard-cut loop below
  // could not advance through the buffer.
  const overlap = Math.min(
    Math.max(Math.floor(Number(parOverlap)) || 0, 0),
    limit - 1
  );

  const text = normalize(markdown);
  const blocks = splitBlocks(text);                    // coarse split
  const pieces = flattenOversizeBlocks(blocks, limit); // sentence-level split

  const chunks = [];
  let buf = "";

  for (const piece of pieces) {
    const candidate = buf ? `${buf}\n${piece}` : piece;
    if (candidate.length <= target) {
      buf = candidate;
      continue;
    }

    // The buffer is full: emit it, then start the next one with its tail.
    if (buf.trim()) chunks.push(makeChunk(buf, docPrefix));
    const tail = buf.slice(Math.max(0, buf.length - overlap));
    buf = tail + (tail && !tail.endsWith("\n") ? "\n" : "") + piece;

    // Safety net: keep slicing until the remainder fits. A single pass is not
    // enough when one piece is several times longer than the limit.
    // NOTE: slice() counts UTF-16 code units, so a cut can fall inside a
    // surrogate pair.
    while (buf.length > limit) {
      const head = buf.slice(0, limit);
      if (head.trim()) chunks.push(makeChunk(head, docPrefix));
      buf = buf.slice(limit - overlap);
    }
  }

  if (buf.trim()) chunks.push(makeChunk(buf, docPrefix));
  return chunks;
}

// === helpers ===
/** Return the newline separator to place after `s`, or "" when `s` is empty/falsy. */
function sep(s) {
  if (s) {
    return "\n";
  }
  return "";
}
/** Join `a` and `b` with a single newline; when `a` is empty/falsy return `b` unchanged. */
function concat(a, b) {
  if (!a) {
    return b;
  }
  return `${a}\n${b}`;
}

/**
 * Canonicalise whitespace before chunking.
 *
 * Applies, in order: CRLF -> LF, tab -> single space, removal of trailing
 * ASCII / ideographic (U+3000) spaces on every line, then trims both ends.
 *
 * @param {string} [s=""] - Raw text.
 * @returns {string} Normalised text.
 */
function normalize(s = "") {
  const rewrites = [
    [/\r\n/g, "\n"],
    [/\t/g, " "],
    [/[ \u3000]+$/gm, ""], // trailing blanks at the end of each line
  ];
  const cleaned = rewrites.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    s
  );
  return cleaned.trim();
}

/**
 * Coarsely split normalised Markdown into blocks.
 *
 * Block boundaries:
 *   - a fenced code block (``` ... ```) is kept whole, fence lines included;
 *   - every heading line (`#`..`######`) and every list-item line
 *     (`-`, `*`, `+`, or `1.`) becomes a block of its own;
 *   - blank lines separate paragraphs.
 *
 * Fence handling: an opening fence is a run of three or more backticks
 * followed by any info string that contains no backtick (so ```c++ or
 * ```js title="x" are recognised). A fence is closed only by a line made of
 * backticks, at least as many as the opener, so a longer fence can contain
 * shorter ones. An unclosed fence runs to the end of the input.
 *
 * @param {string} s - Text with "\n" line endings.
 * @returns {string[]} Trimmed, non-empty blocks in document order.
 */
function splitBlocks(s) {
  const blocks = [];
  let cur = [];
  // Backtick run that opened the current code block; "" while outside code.
  let fence = "";

  // Flush the pending lines as one block (skipping whitespace-only content).
  const pushCur = () => {
    if (!cur.length) return;
    const txt = cur.join("\n").trim();
    if (txt) blocks.push(txt);
    cur = [];
  };

  for (const line of s.split("\n")) {
    if (fence) {
      // Inside code: keep every line verbatim, blank lines included.
      cur.push(line);
      const closing = line.match(/^(`{3,})\s*$/);
      if (closing && closing[1].length >= fence.length) {
        pushCur();
        fence = "";
      }
      continue;
    }

    const opening = line.match(/^(`{3,})[^`]*$/);
    if (opening) {
      pushCur();
      fence = opening[1];
      cur.push(line);
      continue;
    }

    // Headings and list items are emitted as single-line blocks.
    if (/^#{1,6}\s+/.test(line) || /^[-*+]\s+/.test(line) || /^\d+\.\s+/.test(line)) {
      pushCur();
      cur.push(line);
      pushCur();
      continue;
    }

    // A blank line ends the current paragraph.
    if (/^\s*$/.test(line)) {
      pushCur();
      continue;
    }

    cur.push(line);
  }

  pushCur();
  return blocks;
}

/**
 * Conservatively split a mixed Chinese/English paragraph into sentences.
 *
 * A boundary is placed directly after:
 *   - the CJK full stop (U+3002),
 *   - `!`, `?`, `;` in either full-width (U+FF01, U+FF1F, U+FF1B) or ASCII form,
 *   - an ASCII period that is followed by whitespace (so "3.14" and
 *     "file.txt" are not split).
 *
 * The terminator stays attached to its sentence.
 *
 * @param {string} paragraph - Text to split.
 * @returns {string[]} Trimmed, non-empty sentences in original order.
 */
function splitSentences(paragraph) {
  // Zero-width lookbehind: split() cuts after the terminator without
  // consuming it. Metacharacters must be escaped - a bare `.` would match any
  // character and a bare `?` is a syntax error.
  const boundary = /(?<=[\u3002\uFF01\uFF1F\uFF1B!?;]|\.\s)/;
  return paragraph
    .split(boundary)
    .map((sentence) => sentence.trim())
    .filter(Boolean);
}

/**
 * Re-split blocks that are longer than `maxChars`.
 *
 * Blocks within the limit pass through untouched. A longer block is cut into
 * sentences with `splitSentences()`, and consecutive sentences are re-joined
 * with "\n" for as long as the joined text stays within `maxChars`.
 * A single sentence that is itself longer than `maxChars` is emitted as is.
 *
 * @param {string[]} blocks - Blocks in document order.
 * @param {number} maxChars - Length limit per output piece.
 * @returns {string[]} Pieces in document order.
 */
function flattenOversizeBlocks(blocks, maxChars) {
  return blocks.flatMap((block) => {
    if (block.length <= maxChars) {
      return [block];
    }

    const packed = [];
    let acc = "";
    splitSentences(block).forEach((sentence) => {
      const merged = acc ? acc + "\n" + sentence : sentence;
      if (merged.length <= maxChars) {
        acc = merged;
        return;
      }
      // The sentence does not fit: close the current piece and start a new one.
      if (acc) packed.push(acc);
      acc = sentence;
    });
    if (acc) packed.push(acc);
    return packed;
  });
}

/**
 * Build a chunk record with an ID derived from the trimmed content.
 *
 * The ID is the first 24 characters of `sha256String(trimmedContent)`, so
 * identical content always gets the same ID. A non-empty `prefix` is
 * prepended as `<prefix>_<id>`.
 *
 * @param {string} content - Raw chunk text; surrounding whitespace is trimmed.
 * @param {string} [prefix=""] - Optional readable prefix (e.g. a doc ID).
 * @returns {{chunkId:string, content:string, len:number}} `len` is the length
 *   of the trimmed content.
 */
function makeChunk(content, prefix = "") {
  const norm = content.trim();
  const id = sha256String(norm).slice(0, 24); // shortened for readability
  const chunkId = prefix ? `${prefix}_${id}` : id;
  return { chunkId, content: norm, len: norm.length };
}

chunkId 使用內容雜湊(content hash):內容不變則 ID 不變。小改一段通常只影響該塊,以及帶有其尾端重疊(parOverlap)的下一塊;其餘 chunk ID 不動。
你也能把 prefix 設成 docId 或「首個 H1 標題 slug」增加可讀性。

♻️ 修改:src/day21_indexer.js(用智慧切塊+穩定 ID)

只貼「需要替換」的段落(其餘保留 Day 21 版本不動):

-import { chunkText, clean } from "./utils/text.js";
+import { clean } from "./utils/text.js";
+import { semanticChunk } from "./utils/chunkers.js";

@@
-export async function buildIndexIncremental({
-  tenant, ns, chunkSize = 800, overlap = 80, concurrency = 4
-}) {
+export async function buildIndexIncremental({
+  tenant, ns,
+  chunkSize = 900, // 目標大小(近似 tokens)
+  overlap = 120, // 段落級重疊字元
+  maxChars = 1300, // 單塊上限
+  concurrency = 4
+}) {

@@

-  const raw = clean(fs.readFileSync(fp, "utf-8"));
-  const chunks = chunkText(raw, chunkSize, overlap).map(c => ({
-    chunkId: c.id,
-    text: c.content,
-    hash: sha256String(c.content),
-    textLen: c.content.length,
-    docId,
-  }));
+  const raw = clean(fs.readFileSync(fp, "utf-8"));
+  const chunks = semanticChunk(raw, {
+    targetChars: chunkSize, maxChars, parOverlap: overlap, docPrefix: "" // prefix 可選
+  }).map(c => ({
+    chunkId: c.chunkId,
+    text: c.content,
+    hash: sha256String(c.content),
+    textLen: c.len,
+    docId,
+  }));

其他合併邏輯(依 chunkId 比對、向量重用、移除刪除檔)原封不動即可正常工作,
因為新 chunkId 已經是「內容穩定 ID」。

♻️ 修改:src/day16_rag_store.js(全量索引也換智慧切塊)

替換 buildIndex 中切塊段落即可:

-import { chunkText, clean } from "./utils/text.js";
+import { clean } from "./utils/text.js";
+import { semanticChunk } from "./utils/chunkers.js";

@@
-export async function buildIndex({ tenant, ns, chunkSize=800, overlap=80 }) {
+export async function buildIndex({ tenant, ns, chunkSize=900, overlap=120, maxChars=1300 }) {
const { kbDir, idxFile } = ensureTenantNS(tenant, ns);
const files = listDocs(kbDir);
if (!files.length) throw new Error("此命名空間沒有 .md/.txt 檔案");

const docs=[];
for (const fp of files) {
const raw = clean(fs.readFileSync(fp,"utf-8"));

-  const chs = chunkText(raw, chunkSize, overlap);
-  for (const c of chs) docs.push({ docId: path.basename(fp), chunkId: c.id, text: c.content });
+  const chs = semanticChunk(raw, { targetChars: chunkSize, maxChars, parOverlap: overlap });
+  for (const c of chs) docs.push({ docId: path.basename(fp), chunkId: c.chunkId, text: c.content });
    }

♻️ 修改:增量/全量 API 支援參數
app/api/kb/[tenant]/[ns]/reindex-incremental/route.js
-export const POST = withAuth(async (req, ctx) => {
+export const POST = withAuth(async (req, ctx) => {
if (ctx.user.role === "viewer") return NextResponse.json({ ok:false, error:"Editor/Admin only" }, { status:403 });
const { tenant, ns } = ctx.params;

-  const { chunkSize = 800, overlap = 80, concurrency = 4 } = await req.json().catch(()=>({}));
-  const out = await buildIndexIncremental({ tenant, ns, chunkSize, overlap, concurrency });
+  const { chunkSize = 900, overlap = 120, maxChars = 1300, concurrency = 4 } = await req.json().catch(()=>({}));
+  const out = await buildIndexIncremental({ tenant, ns, chunkSize, overlap, maxChars, concurrency });
    return NextResponse.json({ ok:true, ...out });
    }, ["editor","admin"]);

app/api/kb/[tenant]/[ns]/reindex/route.js

(Day 18 已有的全量重建 route,改參數沿用即可)

-  const out = await buildIndex({ tenant, ns });
+  const body = await req.json().catch(()=>({}));
+  const out = await buildIndex({
+    tenant, ns,
+    chunkSize: body?.chunkSize ?? 900,
+    overlap: body?.overlap ?? 120,
+    maxChars: body?.maxChars ?? 1300,
+  });

package.json(腳本補充)
{
"scripts": {
"day22:reindex": "curl -s -X POST http://localhost:3000/api/kb/acme/faq/reindex -H 'Authorization: Bearer <YOUR_TOKEN>' -H 'Content-Type: application/json' -d '{\"chunkSize\":900,\"overlap\":120,\"maxChars\":1300}'",
"day22:reindex:inc": "curl -s -X POST http://localhost:3000/api/kb/acme/faq/reindex-incremental -H 'Authorization: Bearer <YOUR_TOKEN>' -H 'Content-Type: application/json' -d '{\"chunkSize\":900,\"overlap\":120,\"maxChars\":1300,\"concurrency\":4}'"
}
}


上一篇
RAG「增量索引」+ 併發控制 + 刪除檢測
下一篇
章節錨點 + 章節優先檢索(Section-first Retrieval)
系列文
練習AI24
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言