六、把資料轉成 Hugging Face Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import load_dataset

# Source CSV is expected to have the columns: id,text,label
df = pd.read_csv("sentiment.csv")

# Hold out 10% as the test set, stratified on the label column.
train_val, test = train_test_split(df, test_size=0.1, stratify=df["label"], random_state=42)
# 0.1111 of the remaining 90% is ~10% of the full data -> 0.8/0.1/0.1 overall.
train, val = train_test_split(train_val, test_size=0.1111, stratify=train_val["label"], random_state=42)

# Persist each split: load_dataset below reads these files from disk, so they
# must exist first. index=False keeps the pandas index out of the CSV columns.
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
test.to_csv("test.csv", index=False)

data = load_dataset('csv', data_files={'train':'train.csv','validation':'val.csv','test':'test.csv'})
print(data)
七、Tokenize / chunk / 存 embeddings(RAG 或檢索)
# Load the pretrained tokenizer identified by model_name.
from transformers import AutoTokenizer
model_name = "bert-base-chinese" # or a Chinese variant of DistilBERT / RoBERTa
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess_fn(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    Inputs are truncated and padded to a fixed length of 128 tokens
    (``truncation=True``, ``padding="max_length"``, ``max_length=128``).

    Args:
        examples: Mapping with a ``"text"`` entry; passed straight to the
            tokenizer call.

    Returns:
        Whatever the tokenizer call returns for ``examples["text"]``.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
# Apply preprocess_fn over `data`; batched=True hands it batches of examples rather than single rows.
tokenized = data.map(preprocess_fn, batched=True)
def label_fn(example, label2id=None):
    """Encode ``example["label"]`` as an integer, in place.

    Args:
        example: Mapping with a ``"label"`` entry; it is modified and returned.
        label2id: Optional mapping from raw label value (e.g. ``"positive"``)
            to its integer id. When omitted, the label is converted with
            ``int()``, which only works for ints and numeric strings.

    Returns:
        The same ``example`` object with ``"label"`` replaced by an int.

    Raises:
        ValueError: If ``label2id`` is None and the label is not int-convertible.
        KeyError: If ``label2id`` is given and does not contain the label.
    """
    raw_label = example["label"]
    if label2id is None:
        example["label"] = int(raw_label)
    else:
        # Textual labels cannot go through int(); look them up instead.
        example["label"] = label2id[raw_label]
    return example
# Apply label_fn over the tokenized data (no batched flag, so it receives one example at a time).
tokenized = tokenized.map(label_fn)
def chunk_text(text, tokenizer, chunk_size=512, overlap=50):
    """Split ``text`` into overlapping chunks measured in tokens.

    The text is encoded without special tokens, cut into windows of at most
    ``chunk_size`` token ids that advance by ``chunk_size - overlap`` ids,
    and each window is decoded back to a string.

    Args:
        text: The document string to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...,
            clean_up_tokenization_spaces=...)``.
        chunk_size: Maximum number of token ids per chunk; must be positive.
        overlap: Number of token ids shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of decoded chunk strings in document order; empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for start in range(0, len(ids), step):
        window = ids[start:start + chunk_size]
        chunks.append(
            tokenizer.decode(window, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        )
        # Once a window reaches the end of the ids, any further window would
        # lie entirely inside this one's overlap region, so stop here.
        if start + chunk_size >= len(ids):
            break
    return chunks
# Usage example (long_doc_text is a placeholder for the document string; it is not defined in this snippet)
chunks = chunk_text(long_doc_text, tokenizer, chunk_size=512, overlap=50)
# Next, write each chunk to a JSONL file together with the original doc id and the chunk index
# Encode the chunked documents into embeddings and save them as a NumPy array.
from sentence_transformers import SentenceTransformer
import numpy as np
embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
docs = [d["text"] for d in docs_list] # docs_list holds the chunked documents (not defined in this snippet)
embeddings = embedder.encode(docs, convert_to_numpy=True, show_progress_bar=True)
# Save to disk
np.save("doc_embeddings.npy", embeddings)
# Alternatively, write each embedding with its doc metadata to sqlite/json, or upload to a vector DB (FAISS/Chroma/Weaviate)
八、評估
# Print scikit-learn's classification report for the predictions, with 4 decimal digits.
from sklearn.metrics import classification_report
y_true = [...]  # placeholder: replace with the reference labels
y_pred = [...]  # placeholder: replace with the predicted labels, in the same order as y_true
print(classification_report(y_true, y_pred, digits=4))