🆕 新增:src/utils/chunkers.js
// src/utils/chunkers.js
import { sha256String } from "../utils/hash.js";
/**
 * Split a markdown document into size-bounded chunks with a small overlap
 * between consecutive chunks.
 *
 * Pipeline: normalize the text, cut it into coarse blocks (headings, list
 * items, fenced code, paragraphs), break oversize blocks at sentence level,
 * then greedily pack the pieces into chunks of roughly `targetChars`.
 *
 * NOTE(review): the function header was missing from this file; the name and
 * option names are taken from how the indexers import and call it. The default
 * values mirror the buildIndex defaults — confirm against callers.
 *
 * @param {string} markdown - Raw document text.
 * @param {object} [options]
 * @param {number} [options.targetChars=900] - Preferred chunk size; pieces are
 *   merged while the buffer stays at or below this length.
 * @param {number} [options.maxChars=1300] - Hard upper bound used both for the
 *   sentence-level split and for the last-resort slice.
 * @param {number} [options.parOverlap=120] - Number of trailing characters of
 *   the previous chunk carried into the next one.
 * @param {string} [options.docPrefix=""] - Optional prefix passed to makeChunk.
 * @returns {Array<{chunkId: string, content: string, len: number}>}
 */
export function semanticChunk(markdown, {
  targetChars = 900,
  maxChars = 1300,
  parOverlap = 120,
  docPrefix = "",
} = {}) {
  const text = normalize(markdown);
  const blocks = splitBlocks(text); // coarse split: headings / lists / code / paragraphs
  const pieces = flattenOversizeBlocks(blocks, maxChars); // sentence-level split

  // The overlap must stay below maxChars, otherwise the hard split below
  // would never make progress.
  const overlap = Math.max(0, Math.min(parOverlap, maxChars - 1));

  const chunks = [];
  let buf = "";
  for (const cur of pieces) {
    if ((buf + sep(buf) + cur).length <= targetChars) {
      buf = concat(buf, cur);
      continue;
    }
    // Emit the chunk accumulated so far.
    if (buf.trim()) chunks.push(makeChunk(buf, docPrefix));
    // Overlap: start the next buffer with the tail of the previous one.
    const tail = buf.slice(Math.max(0, buf.length - overlap));
    buf = tail + (tail && !tail.endsWith("\n") ? "\n" : "") + cur;
    // Last resort: keep slicing until the buffer fits. A single slice is not
    // enough when one piece is several times longer than maxChars.
    while (maxChars > 0 && buf.length > maxChars) {
      const head = buf.slice(0, maxChars);
      if (head.trim()) chunks.push(makeChunk(head, docPrefix));
      buf = buf.slice(maxChars - overlap);
    }
  }
  if (buf.trim()) chunks.push(makeChunk(buf, docPrefix));
  return chunks;
}
// === helpers ===
/**
 * Separator to place before the next piece: a newline when `s` already has
 * content, otherwise an empty string.
 */
function sep(s) {
  if (s) {
    return "\n";
  }
  return "";
}
/**
 * Join two pieces with a single newline; when `a` is empty the result is just `b`.
 */
function concat(a, b) {
  if (!a) {
    return b;
  }
  return `${a}\n${b}`;
}
/**
 * Normalize raw text before chunking.
 *
 * - Converts CRLF and lone CR line endings to LF.
 * - Replaces each tab with a single space.
 * - Strips trailing ASCII and ideographic (U+3000) spaces from every line.
 * - Trims the whole string.
 *
 * @param {string} [s=""] - Input text; null/undefined are treated as empty.
 * @returns {string} The normalized text.
 */
function normalize(s = "") {
  // A default parameter only covers undefined; guard null explicitly.
  const text = s == null ? "" : String(s);
  return text
    .replace(/\r\n?/g, "\n") // CRLF and bare CR both become LF
    .replace(/\t/g, " ")
    .replace(/[ \u3000]+$/gm, "") // trailing whitespace on each line
    .trim();
}
/**
 * Coarsely split normalized markdown into blocks.
 *
 * Block boundaries:
 * - a fenced code block (``` ... ```) is kept as one block, fences included;
 * - every heading line and every list-item line becomes its own block;
 * - blank lines separate paragraphs.
 *
 * Each block is trimmed and empty blocks are dropped. An unclosed fence runs
 * to the end of the input.
 *
 * @param {string} s - Text with LF line endings.
 * @returns {string[]} Blocks in document order.
 */
function splitBlocks(s) {
  const OPEN_FENCE = /^```([a-zA-Z0-9_-]*)\s*$/;
  // A closing fence carries no info string, so a line such as "```py" inside
  // an open code block is content, not a terminator.
  const CLOSE_FENCE = /^```\s*$/;
  const HEADING_OR_LIST = /^(?:#{1,6}|[-*+]|\d+\.)\s+/;
  const BLANK = /^\s*$/;

  const blocks = [];
  let cur = [];
  let inCode = false;

  const pushCur = () => {
    if (!cur.length) return;
    const txt = cur.join("\n").trim();
    if (txt) blocks.push(txt);
    cur = [];
  };

  for (const line of s.split("\n")) {
    if (inCode) {
      cur.push(line);
      if (CLOSE_FENCE.test(line)) {
        pushCur();
        inCode = false;
      }
      continue;
    }
    if (OPEN_FENCE.test(line)) {
      // Flush the pending paragraph, then start collecting the code block.
      pushCur();
      cur.push(line);
      inCode = true;
      continue;
    }
    if (HEADING_OR_LIST.test(line)) {
      // Headings and list items stand alone as single-line blocks.
      pushCur();
      cur.push(line);
      pushCur();
      continue;
    }
    if (BLANK.test(line)) {
      pushCur();
      continue;
    }
    cur.push(line);
  }
  pushCur();
  return blocks;
}
/**
 * Conservative sentence splitter for mixed Chinese/English text.
 *
 * Splits immediately after CJK/fullwidth terminators (。!?;) and after
 * ASCII `!`, `?`, `;`. An ASCII period only ends a sentence when it is
 * followed by whitespace, so "3.14" or "v1.2" stay intact.
 *
 * @param {string} paragraph - Text to split.
 * @returns {string[]} Trimmed, non-empty sentences in order.
 */
function splitSentences(paragraph) {
  // Fullwidth characters are written as escapes so the pattern survives
  // encoding/normalization changes to this file.
  const boundary = /(?<=[\u3002\uFF01\uFF1F\uFF1B!?;]|\.\s)/;
  return paragraph
    .split(boundary)
    .map((s) => s.trim())
    .filter(Boolean);
}
/**
 * Break blocks longer than `maxChars` into smaller pieces at sentence
 * boundaries; blocks that already fit are passed through untouched.
 *
 * Sentences of an oversize block are re-packed greedily, joined with
 * newlines, for as long as the packed text stays within `maxChars`. A single
 * sentence that is itself longer than `maxChars` is emitted as-is.
 *
 * @param {string[]} blocks - Blocks in document order.
 * @param {number} maxChars - Size limit for a packed piece.
 * @returns {string[]} Pieces in document order.
 */
function flattenOversizeBlocks(blocks, maxChars) {
  const pieces = [];
  for (const block of blocks) {
    const fits = block.length <= maxChars;
    if (fits) {
      pieces.push(block);
      continue;
    }

    let pending = "";
    for (const sentence of splitSentences(block)) {
      const candidate = pending + sep(pending) + sentence;
      if (candidate.length > maxChars) {
        // Adding this sentence would overflow: flush and start over with it.
        if (pending) pieces.push(pending);
        pending = sentence;
      } else {
        pending = concat(pending, sentence);
      }
    }
    if (pending) pieces.push(pending);
  }
  return pieces;
}
/**
 * Build a chunk record whose ID is derived from its content.
 *
 * The ID is the first 24 characters of `sha256String` applied to the trimmed
 * content, so an unchanged chunk keeps the same ID across re-indexing.
 *
 * @param {string} content - Chunk text; it is trimmed before hashing.
 * @param {string} [prefix=""] - Optional prefix; when non-empty the ID becomes
 *   `<prefix>_<hash>`.
 * @returns {{chunkId: string, content: string, len: number}}
 */
function makeChunk(content, prefix = "") {
  const norm = content.trim();
  const id = sha256String(norm).slice(0, 24); // shortened for readability
  const chunkId = prefix ? `${prefix}_${id}` : id;
  return { chunkId, content: norm, len: norm.length };
}
chunkId 使用 內容哈希,小改一段只影響那一塊;其餘 chunk ID 不動。
你也能把 prefix 設成 docId 或「首個 H1 標題 slug」增加可讀性。
♻️ 修改:src/day21_indexer.js(用智慧切塊+穩定 ID)
只貼「需要替換」的段落(其餘保留 Day 21 版本不動):
@@
-export async function buildIndexIncremental({
@@
- chunkId: c.id,
- text: c.content,
- hash: sha256String(c.content),
- textLen: c.content.length,
- docId,
+ targetChars: chunkSize, maxChars, parOverlap: overlap, docPrefix: "" // prefix 可選
+ chunkId: c.chunkId,
+ text: c.content,
+ hash: sha256String(c.content),
+ textLen: c.len,
+ docId,
其他合併邏輯(依 chunkId 比對、向量重用、移除刪除檔)原封不動即可正常工作,
因為新 chunkId 已經是「內容穩定 ID」。
♻️ 修改:src/day16_rag_store.js(全量索引也換智慧切塊)
替換 buildIndex 中切塊段落即可:
-import { chunkText, clean } from "./utils/text.js";
+import { clean } from "./utils/text.js";
+import { semanticChunk } from "./utils/chunkers.js";
@@
-export async function buildIndex({ tenant, ns, chunkSize=800, overlap=80 }) {
+export async function buildIndex({ tenant, ns, chunkSize=900, overlap=120, maxChars=1300 }) {
const { kbDir, idxFile } = ensureTenantNS(tenant, ns);
const files = listDocs(kbDir);
if (!files.length) throw new Error("此命名空間沒有 .md/.txt 檔案");
const docs=[];
for (const fp of files) {
const raw = clean(fs.readFileSync(fp,"utf-8"));
♻️ 修改:增量/全量 API 支援參數
app/api/kb/[tenant]/[ns]/reindex-incremental/route.js
-export const POST = withAuth(async (req, ctx) => {
+export const POST = withAuth(async (req, ctx) => {
if (ctx.user.role === "viewer") return NextResponse.json({ ok:false, error:"Editor/Admin only" }, { status:403 });
const { tenant, ns } = ctx.params;
app/api/kb/[tenant]/[ns]/reindex/route.js
(Day 18 已有的全量重建 route,改參數沿用即可)
package.json(腳本補充)
{
"scripts": {
"day22:reindex": "curl -s -X POST http://localhost:3000/api/kb/acme/faq/reindex -H 'Authorization: Bearer <TOKEN>' -H 'Content-Type: application/json' -d '{\"chunkSize\":900,\"overlap\":120,\"maxChars\":1300}'",
"day22:reindex:inc": "curl -s -X POST http://localhost:3000/api/kb/acme/faq/reindex-incremental -H 'Authorization: Bearer <TOKEN>' -H 'Content-Type: application/json' -d '{\"chunkSize\":900,\"overlap\":120,\"maxChars\":1300,\"concurrency\":4}'"
}
}