Implementation:
import math
import json
import os
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
# tokenization & embedding
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
# FAISS (optional)
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
# Chroma (optional)
try:
    import chromadb
    from chromadb import PersistentClient
    CHROMA_AVAILABLE = True
except ImportError:
    CHROMA_AVAILABLE = False
# ------------------------------
# Chunking: char-based
# ------------------------------
def chunk_by_chars(text: str, max_chars: int = 500, overlap_chars: int = 50) -> List[str]:
    """
    Split text by characters with a simple sliding window; returns a list of chunk texts.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be > 0")
    if overlap_chars >= max_chars:
        raise ValueError("overlap_chars must be smaller than max_chars")
    chunks = []
    start = 0
    N = len(text)
    while start < N:
        end = start + max_chars
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= N:
            break  # reached the end of the text; avoid a redundant tail chunk
        start += (max_chars - overlap_chars)
    return chunks
# ------------------------------
# Chunking: token-based (requires a fast tokenizer to get offsets)
# ------------------------------
def chunk_by_tokens(text: str, tokenizer: AutoTokenizer, max_tokens: int = 200, overlap_tokens: int = 40) -> List[Dict]:
    """
    Token-based chunking of `text` using a fast tokenizer (use_fast=True).
    Returns a list of dicts with keys: chunk_id, start_token, end_token, start_char, end_char, text (decoded).
    """
    if not getattr(tokenizer, "is_fast", False):
        raise ValueError("Please use a fast tokenizer (use_fast=True); otherwise the offset mapping is unavailable.")
    if max_tokens <= 0 or overlap_tokens >= max_tokens:
        raise ValueError("max_tokens must be > 0 and overlap_tokens must be < max_tokens")
    # 1) tokenize with offsets
    enc = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    token_ids = enc["input_ids"]
    offsets = enc["offset_mapping"]  # list of (start_char, end_char) per token
    n_tokens = len(token_ids)
    chunks = []
    step = max_tokens - overlap_tokens
    start_idx = 0
    chunk_idx = 0
    while start_idx < n_tokens:
        end_idx = min(start_idx + max_tokens, n_tokens)
        # decode tokens[start_idx:end_idx] back to text
        sub_ids = token_ids[start_idx:end_idx]
        chunk_text = tokenizer.decode(sub_ids, skip_special_tokens=True).strip()
        # compute char offsets from the offset mapping
        start_char = offsets[start_idx][0] if start_idx < len(offsets) else None
        end_char = offsets[end_idx - 1][1] if (end_idx - 1) < len(offsets) else None
        chunks.append({
            "chunk_id": f"c{chunk_idx}",
            "start_token": start_idx,
            "end_token": end_idx,
            "start_char": start_char,
            "end_char": end_char,
            "text": chunk_text
        })
        chunk_idx += 1
        if end_idx >= n_tokens:
            break  # the last chunk already reaches the end; avoid a redundant tail chunk
        start_idx += step
    return chunks
# ------------------------------
# High-level helper: split one document into chunks (returns a DataFrame)
# ------------------------------
def chunk_document(doc_id: str, text: str,
                   method: str = "token",
                   tokenizer: Optional[AutoTokenizer] = None,
                   max_tokens: int = 200, overlap_tokens: int = 40,
                   max_chars: int = 1000, overlap_chars: int = 200,
                   source: Optional[str] = None) -> pd.DataFrame:
    """
    doc_id, text -> DataFrame of chunks with metadata.
    method: "token" or "char"
    For method="token", a fast tokenizer (use_fast=True) must be provided.
    """
    records = []
    if method == "token":
        if tokenizer is None:
            raise ValueError("Token-based splitting requires a tokenizer (use_fast=True)")
        token_chunks = chunk_by_tokens(text, tokenizer, max_tokens=max_tokens, overlap_tokens=overlap_tokens)
        for c in token_chunks:
            records.append({
                "doc_id": doc_id,
                "chunk_id": f"{doc_id}_{c['chunk_id']}",
                "text": c["text"],
                "start_char": c["start_char"],
                "end_char": c["end_char"],
                "start_token": c["start_token"],
                "end_token": c["end_token"],
                "source": source
            })
    elif method == "char":
        char_chunks = chunk_by_chars(text, max_chars=max_chars, overlap_chars=overlap_chars)
        for i, ctext in enumerate(char_chunks):
            # locate start/end offsets (best effort; duplicated substrings may match the wrong occurrence)
            start = text.find(ctext)
            if start == -1:
                start = None
            end = (start + len(ctext)) if start is not None else None
            records.append({
                "doc_id": doc_id,
                "chunk_id": f"{doc_id}_c{i}",
                "text": ctext,
                "start_char": start,
                "end_char": end,
                "start_token": None,
                "end_token": None,
                "source": source
            })
    else:
        raise ValueError("method must be 'token' or 'char'")
    return pd.DataFrame.from_records(records)
# ------------------------------
# Bulk chunking (multiple documents)
# ------------------------------
def chunk_documents_bulk(docs: List[Dict], method="token", tokenizer=None, **kwargs) -> pd.DataFrame:
    """
    docs: list of dicts {doc_id, text, source (optional)}
    Returns a concatenated DataFrame.
    """
    all_recs = []
    for d in docs:
        dfc = chunk_document(d["doc_id"], d["text"], method=method, tokenizer=tokenizer, source=d.get("source"), **kwargs)
        all_recs.append(dfc)
    if all_recs:
        return pd.concat(all_recs, ignore_index=True)
    else:
        return pd.DataFrame(columns=["doc_id", "chunk_id", "text", "start_char", "end_char", "start_token", "end_token", "source"])
# ------------------------------
# Embedding & upsert into FAISS / Chroma
# ------------------------------
def embed_chunks_and_index(df_chunks: pd.DataFrame, embedder: SentenceTransformer,
                           faiss_index: Optional[object] = None, faiss_ids: Optional[List[str]] = None,
                           chroma_collection: Optional[object] = None,
                           batch_size: int = 64,
                           normalize_for_faiss: bool = True,
                           faiss_index_path: Optional[str] = None,
                           faiss_id_map_path: Optional[str] = None):
    """
    - df_chunks: DataFrame that must include chunk_id & text
    - embedder: SentenceTransformer instance
    - if faiss_index is provided: add the embeddings to it (and return the updated index and id list)
    - if chroma_collection is provided: upsert the chunks into Chroma (ids, documents, metadatas, embeddings)
    - returns: embeddings ndarray, updated faiss_index, updated faiss_ids
    """
    texts = df_chunks["text"].astype(str).tolist()
    ids = df_chunks["chunk_id"].astype(str).tolist()
    N = len(texts)
    print(f"Embedding {N} chunks (batch {batch_size}) ...")
    emb_list = []
    for i in range(0, N, batch_size):
        batch_texts = texts[i:i + batch_size]
        emb = embedder.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
        emb_list.append(emb)
    embeddings = np.vstack(emb_list).astype("float32")
    print("embeddings shape:", embeddings.shape)
    # 1) FAISS: add
    if faiss_index is not None:
        emb_to_add = embeddings.copy()
        if normalize_for_faiss:
            faiss.normalize_L2(emb_to_add)
        faiss_index.add(emb_to_add)
        if faiss_ids is None:
            faiss_ids = []
        faiss_ids.extend(ids)
        print("FAISS ntotal:", faiss_index.ntotal)
        if faiss_index_path:
            faiss.write_index(faiss_index, faiss_index_path)
            print("Saved FAISS index to", faiss_index_path)
        if faiss_id_map_path:
            with open(faiss_id_map_path, "w", encoding="utf-8") as f:
                json.dump(faiss_ids, f, ensure_ascii=False, indent=2)
            print("Saved FAISS id map to", faiss_id_map_path)
    # 2) Chroma: upsert
    if chroma_collection is not None:
        # build metadatas from df_chunks (note: Chroma expects str/int/float/bool metadata values)
        metadatas = df_chunks.drop(columns=["text"]).to_dict(orient="records")
        chroma_collection.upsert(ids=ids, documents=texts, metadatas=metadatas, embeddings=embeddings.tolist())
        print("Upserted into Chroma collection.")
    return embeddings, faiss_index, faiss_ids
# ------------------------------
# Example: token-chunk a long article -> embed -> insert into FAISS & Chroma
# ------------------------------
if __name__ == "__main__":
    # Sample long text (replace with text read from a file or extracted from a PDF)
    long_text = (
        "2025年9月28日,某城市宣布新的公共自行車專案,"
        "將增設200個停車樁,改善共享單車管理系統,"
        "預計降低交通壅塞並提升綠色出行的比例。市長說,"
        "此計畫如果取得中央補助,將可拓展至周邊鄉鎮,"
        "同時也會改善夜間照明與安全監控。專案預算約三千萬元。"
    ) * 6  # repeat to make the text longer
    docs = [{"doc_id": "news_001", "text": long_text, "source": "news_sample"}]
    # build the (fast) tokenizer and the embedder first
    tk_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    tokenizer = AutoTokenizer.from_pretrained(tk_model, use_fast=True)
    embedder = SentenceTransformer(tk_model)
    # chunk (token-based)
    df_chunks = chunk_documents_bulk(docs, method="token", tokenizer=tokenizer, max_tokens=120, overlap_tokens=24)
    print("Total chunks:", len(df_chunks))
    print(df_chunks.head())
    # choose which index to use (FAISS and/or Chroma)
    # FAISS index example (if you want to use FAISS)
    faiss_index = None
    faiss_ids = None
    if FAISS_AVAILABLE:
        d = embedder.get_sentence_embedding_dimension()
        faiss_index = faiss.IndexFlatIP(d)  # inner product (requires normalized vectors)
        faiss_ids = []
        print("Created FAISS IndexFlatIP (d=%d)" % d)
    # Chroma collection example (if you want to use Chroma)
    chroma_collection = None
    if CHROMA_AVAILABLE:
        client = PersistentClient(path="./chroma_day25")
        try:
            chroma_collection = client.get_collection("chunks_collection")
        except Exception:
            chroma_collection = client.create_collection(name="chunks_collection")
    # embed and add to the index(es)
    embeddings, faiss_index, faiss_ids = embed_chunks_and_index(df_chunks, embedder,
                                                                faiss_index=faiss_index, faiss_ids=faiss_ids,
                                                                chroma_collection=chroma_collection,
                                                                batch_size=32,
                                                                faiss_index_path="faiss_news.index",
                                                                faiss_id_map_path="faiss_id_map.json")
    print("Done. You can now query FAISS or Chroma with the same embedder.")
Results:
Total chunks: 5
doc_id chunk_id text
0 news_001 news_001_c0 2025年9月28日,某城市宣布新的公共自行車專案,將增設200個停車樁,改善共享單車管理系...
1 news_001 news_001_c1 樁,改善共享單車管理系統,預計降低交通壅塞並提升綠色出行的比例。市長說,此計畫如果取得中央補...
2 news_001 news_001_c2 說,此計畫如果取得中央補助,將可拓展至周邊鄉鎮,同時也會改善夜間照明與安全監控。專案預算約三...
3 news_001 news_001_c3 照明與安全監控。專案預算約三千萬元。2025年9月28日,某城市宣布新的公共自行車專案,將增...
4 news_001 news_001_c4 宣布新的公共自行車專案,將增設200個停車樁,改善共享單車管理系統,預計降低交通壅塞並提升綠...
start_char end_char start_token end_token source
0 0 191 0 120 news_sample
1 154 344 96 216 news_sample
2 307 496 192 312 news_sample
3 458 649 288 408 news_sample
4 609 714 384 450 news_sample
Embedding 5 chunks (batch 32) ...
embeddings shape: (5, 384)
Done. You can now query FAISS or Chroma with the same embedder.
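To close the loop, here is a minimal query sketch that reuses the same embedder against both stores. The query string, the top-k value, and the variable names are illustrative; it assumes the faiss_index, faiss_ids, and chroma_collection objects built above, and it L2-normalizes the query vector because the FAISS index is an inner-product IndexFlatIP:
# Sketch: query FAISS and Chroma with the same SentenceTransformer embedder
query = "共享單車 停車樁 擴建"  # illustrative query text
q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")

if faiss_index is not None:
    q_faiss = q_emb.copy()
    faiss.normalize_L2(q_faiss)  # match the normalization applied at add time
    scores, positions = faiss_index.search(q_faiss, 3)
    hits = [(faiss_ids[p], float(s)) for p, s in zip(positions[0], scores[0]) if p != -1]
    print("FAISS top-3:", hits)

if chroma_collection is not None:
    res = chroma_collection.query(query_embeddings=q_emb.tolist(), n_results=3)
    print("Chroma top-3 ids:", res["ids"][0])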
Explanation: