iT邦幫忙

2025 iThome 鐵人賽

DAY 25
0
生成式 AI

練習AI系列 第 26

Hybrid Retrieval(向量 + 關鍵字 BM25)

  • 分享至 

  • xImage
  •  

🆕 程式碼

  1. src/day25_hybrid.js
// src/day25_hybrid.js
import fs from "fs";
import path from "path";
import { openai } from "./aiClient.js";
import { ensureTenantNS } from "./utils/tenantfs.js";
import { embedText } from "./day16_rag_store.js"; // 你已有的封裝
import { cosine } from "./utils/math.js";

/**
 * Minimal BM25 keyword scorer (intended for small knowledge bases only).
 *
 * Tokenization is "lowercase, then split on whitespace", so text that has no
 * spaces between words is indexed as a handful of long tokens.
 */
class BM25 {
  /**
   * Split text into lowercase, non-empty, whitespace-delimited tokens.
   *
   * Empty tokens (produced by leading/trailing whitespace) are dropped so they
   * neither inflate document length nor match an empty query token.
   *
   * @param {string} text - Raw text; null/undefined is treated as "".
   * @returns {string[]} Tokens in order of appearance.
   */
  static tokenize(text) {
    return String(text ?? "").toLowerCase().split(/\s+/).filter(Boolean);
  }

  /**
   * Build per-document term frequencies and corpus document frequencies.
   *
   * @param {Array<{text: string}>} docs - Documents to index; only `text` is read here.
   */
  constructor(docs) {
    this.docs = docs;
    this.k1 = 1.5;
    this.b = 0.75;
    this.N = docs.length;

    const tokenized = docs.map((d) => BM25.tokenize(d.text));
    this.docLens = tokenized.map((tokens) => tokens.length);
    const totalLen = this.docLens.reduce((a, b) => a + b, 0);
    // Guard the empty-corpus case so avgLen is never NaN.
    this.avgLen = this.N > 0 ? totalLen / this.N : 0;

    // Null-prototype dictionaries: tokens such as "constructor" or "__proto__"
    // must not resolve to members inherited from Object.prototype.
    this.termFreqs = tokenized.map((tokens) => {
      const freq = Object.create(null);
      for (const t of tokens) {
        freq[t] = (freq[t] ?? 0) + 1;
      }
      return freq;
    });

    this.df = Object.create(null);
    for (const freq of this.termFreqs) {
      for (const t of Object.keys(freq)) {
        this.df[t] = (this.df[t] ?? 0) + 1;
      }
    }
  }

  /**
   * Score every indexed document against a query.
   *
   * @param {string} query - Query text; tokenized the same way as the documents.
   * @returns {Array<{score: number, doc: object}>} One entry per document, in the
   *   same order as the `docs` passed to the constructor. Documents that share
   *   no term with the query get a score of 0.
   */
  score(query) {
    const scores = Array(this.docs.length).fill(0);
    for (const term of BM25.tokenize(query)) {
      const df = this.df[term];
      if (df === undefined) continue; // term absent from the whole corpus
      const idf = Math.log(1 + (this.N - df + 0.5) / (df + 0.5));
      for (let i = 0; i < this.docs.length; i++) {
        const tf = this.termFreqs[i][term];
        if (tf === undefined) continue;
        // tf > 0 implies this document (and hence avgLen) is non-empty,
        // so the length ratio below is always finite.
        const lenNorm = 1 - this.b + this.b * (this.docLens[i] / this.avgLen);
        const denom = tf + this.k1 * lenNorm;
        scores[i] += idf * (tf * (this.k1 + 1)) / denom;
      }
    }
    return scores.map((s, i) => ({ score: s, doc: this.docs[i] }));
  }
}

/**
 * Hybrid retrieval: ranks every entry of a tenant/namespace index by a weighted
 * sum of min-max-normalized vector similarity and BM25 keyword score.
 *
 * @param {object} params
 * @param {string} params.tenant - Tenant identifier, passed to `ensureTenantNS`.
 * @param {string} params.ns - Namespace identifier, passed to `ensureTenantNS`.
 * @param {string} params.query - User query; embedded for the vector score and
 *   tokenized for the BM25 score.
 * @param {number} [params.topK=6] - Maximum number of results to return.
 * @param {number} [params.vecWeight=0.6] - Weight of the normalized vector score.
 * @param {number} [params.kwWeight=0.4] - Weight of the normalized keyword score.
 * @returns {Promise<object[]>} Up to `topK` index entries, best first, each a
 *   copy of the stored entry plus `score_vec`, `score_kw` and `hybridScore`.
 * @throws {Error} When the index file does not exist.
 */
export async function retrieveHybrid({
  tenant,
  ns,
  query,
  topK = 6,
  vecWeight = 0.6,
  kwWeight = 0.4,
}) {
  const { idxFile } = ensureTenantNS(tenant, ns);
  if (!fs.existsSync(idxFile)) throw new Error("索引不存在");
  const data = JSON.parse(fs.readFileSync(idxFile, "utf-8"));
  const docs = data.index || [];

  // Vector retrieval: similarity between the query embedding and each stored vector.
  const qv = await embedText(query);

  // BM25 retrieval: `score()` returns one entry per document in `docs` order,
  // so both signals can be joined by position. Joining by `id` would collapse
  // entries whose `id` is missing or duplicated.
  const kwScores = new BM25(docs).score(query);

  const scored = docs.map((d, i) => ({
    ...d,
    score_vec: cosine(qv, d.vector),
    score_kw: kwScores[i].score,
  }));

  // Normalize each signal to [0, 1) independently, then fuse.
  const normVec = minMaxNormalize(scored.map((x) => x.score_vec));
  const normKw = minMaxNormalize(scored.map((x) => x.score_kw));
  scored.forEach((entry, i) => {
    entry.hybridScore = vecWeight * normVec[i] + kwWeight * normKw[i];
  });

  return scored.sort((a, b) => b.hybridScore - a.hybridScore).slice(0, topK);
}

/**
 * Min-max normalize a list of scores using the true minimum and maximum.
 *
 * Non-finite inputs (NaN, undefined, ±Infinity) are treated as 0. A small
 * epsilon in the denominator keeps the result finite when all values are equal,
 * in which case every normalized value is 0.
 *
 * @param {number[]} values - Raw scores.
 * @returns {number[]} Normalized scores, same length and order as `values`.
 */
function minMaxNormalize(values) {
  const clean = values.map((v) => (Number.isFinite(v) ? v : 0));
  // Plain loop instead of Math.min(...clean): spreading a very large array
  // can exceed the engine's argument limit.
  let min = Infinity;
  let max = -Infinity;
  for (const v of clean) {
    if (v < min) min = v;
    if (v > max) max = v;
  }
  const span = max - min + 1e-9;
  return clean.map((v) => (v - min) / span);
}
  2. app/api/kb/[tenant]/[ns]/ask/route.js

新增 strategy=hybrid 分支:

import { retrieveHybrid } from "../../../../../src/day25_hybrid.js";

...

  if (strategy === "hybrid") {
    const chunks = await retrieveHybrid({ tenant, ns, query: q, topK: 6 });
    const ctxText = chunks.map((h,i)=>`# 片段${i+1}(${h.docId})\n${h.text}`).join("\n\n");

    const res = await openai.chat.completions.create({
      model:"gpt-4o-mini", temperature:0.2,
      messages:[
        { role:"system", content:"你是嚴謹的知識庫助理,根據片段回答,若不足請說明。" },
        { role:"user", content:`問題:${q}\n\n片段:\n${ctxText}` }
      ]
    });
    const answer = res.choices?.[0]?.message?.content?.trim() || "沒有足夠資訊。";

    return NextResponse.json({ ok:true, strategy:"hybrid", answer, sources: chunks });
  }
  3. app/studio/page.tsx

在下拉選單加一項:

<select ... value={strategy} onChange={e=>setStrategy(e.target.value as any)}>
   <option value="default">Default(Top-K)</option>
   <option value="section">Section-first</option>
   <option value="qrewrite">Query Rewrite</option>
+  <option value="hybrid">Hybrid(向量+關鍵字)</option>
</select>

上一篇
查詢重寫(Query Rewriting,多路召回 + 去重)
下一篇
索引壓縮 + 向量量化(Index Compression & Vector Quantization)
系列文
練習AI29
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言