In HCI, we want to know not just what the user said, but also the context and emotion behind it. BERT (Bidirectional Encoder Representations from Transformers) uses bidirectional attention to read a sentence together with the words before and after each token, which makes it especially good at sentiment/polarity (positive/negative) tasks that depend on contextual understanding.
This post builds a minimal working example on the Hugging Face GLUE/SST-2 binary classification dataset: download the data → standard preprocessing → fine-tune BERT in pure PyTorch → reporting and inference. At the end there is a small inference helper you can use directly for real-time sentiment classification of UI comments or customer-service conversations.
Why this design:
- bert-base-uncased: broad language coverage, plenty of community resources, and stable transfer learning.
- TRANSFORMERS_NO_TF=1 forces the PyTorch-only code path.
- Required packages: pip install torch datasets transformers scikit-learn
- By default we take 4,000 training examples for a quick experiment; for full training, change N_TRAIN = 4000 to None (the code below handles this case) or drop the .select(...) call entirely.
Make sure this runs before importing anything from transformers:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
import random
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (
BertTokenizerFast,
BertForSequenceClassification,
DataCollatorWithPadding,
get_linear_schedule_with_warmup,
)
Prefer MPS, then CUDA, then CPU:
if torch.backends.mps.is_available():
device = torch.device("mps")
elif torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
print("Device:", device)
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
raw = load_dataset("glue", "sst2")
N_TRAIN = 4000  # set to None to train on the full SST-2 training set
train_ds = raw["train"].shuffle(seed=SEED)
if N_TRAIN is not None:
    train_ds = train_ds.select(range(N_TRAIN))
val_ds = raw["validation"]
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
def tokenize_fn(batch):
return tokenizer(
batch["sentence"],
truncation=True,
padding=False,
max_length=128,
)
train_enc = train_ds.map(tokenize_fn, batched=True)
val_enc = val_ds.map(tokenize_fn, batched=True)
train_enc = train_enc.remove_columns(["sentence", "idx"])
val_enc = val_enc.remove_columns(["sentence", "idx"])
train_enc = train_enc.rename_column("label", "labels")
val_enc = val_enc.rename_column("label", "labels")
Make sure the columns line up with what the model expects:
cols = ["input_ids", "attention_mask", "labels"]
train_enc.set_format(type="torch", columns=cols)
val_enc.set_format(type="torch", columns=cols)
Key point: let the collator pad each batch dynamically to its longest sequence:
from torch.utils.data import DataLoader
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
BATCH_TRAIN = 16
BATCH_EVAL = 32
train_loader = DataLoader(train_enc, batch_size=BATCH_TRAIN, shuffle=True,
collate_fn=collator)
val_loader = DataLoader(val_enc, batch_size=BATCH_EVAL, shuffle=False,
collate_fn=collator)
id2label = {0: "negative", 1: "positive"}
label2id = {"negative": 0, "positive": 1}
model = BertForSequenceClassification.from_pretrained(
"bert-base-uncased",
num_labels=2,
id2label=id2label,
label2id=label2id,
).to(device)
from torch.optim import AdamW
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_training_steps = EPOCHS * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)
Mixed-precision support for MPS/CUDA:
# autocast from torch.amp accepts a device_type for cuda, mps, and cpu;
# a GradScaler is only needed on CUDA.
use_amp = device.type in ["cuda", "mps"]
from torch.amp import autocast
if device.type == "cuda":
    from torch.cuda.amp import GradScaler
    scaler = GradScaler(enabled=True)
else:
    scaler = None
def evaluate(m, loader):
m.eval()
total_loss = 0.0
all_y, all_p = [], []
with torch.no_grad():
for batch in loader:
batch = {k: v.to(device) for k, v in batch.items()}
with autocast(device_type=device.type, enabled=use_amp):
out = m(**batch)
loss = out.loss
total_loss += loss.item()
preds = out.logits.argmax(-1)
all_y.extend(batch["labels"].cpu().tolist())
all_p.extend(preds.cpu().tolist())
avg_loss = total_loss / max(1, len(loader))
return avg_loss, np.array(all_y), np.array(all_p)
best_val = float("inf")
for epoch in range(1, EPOCHS + 1):
model.train()
running = 0.0
for batch in train_loader:
batch = {k: v.to(device) for k, v in batch.items()}
optimizer.zero_grad(set_to_none=True)
with autocast(device_type=device.type, enabled=use_amp):
out = model(**batch)
loss = out.loss
if scaler is not None:
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
else:
loss.backward()
optimizer.step()
scheduler.step()
running += loss.item()
train_loss = running / max(1, len(train_loader))
val_loss, y_true, y_pred = evaluate(model, val_loader)
val_acc = (y_true == y_pred).mean()
print(f"Epoch {epoch} | Train {train_loss:.4f} | Val {val_loss:.4f} | Val Acc {val_acc:.4f}")
if val_loss < best_val:
best_val = val_loss
model.save_pretrained("./bert_sst2_pt/best")
tokenizer.save_pretrained("./bert_sst2_pt/best")
print("\n=== Validation report ===")
print(classification_report(y_true, y_pred, target_names=["negative","positive"], digits=4))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
Inference without using pipeline:
def predict_texts(texts, path="./bert_sst2_pt/best"):
tok = BertTokenizerFast.from_pretrained(path)
mdl = BertForSequenceClassification.from_pretrained(path).to(device)
mdl.eval()
enc = tok(texts, truncation=True, padding="longest",
max_length=128, return_tensors="pt")
enc = {k: v.to(device) for k, v in enc.items()}
with torch.no_grad():
logits = mdl(**enc).logits
probs = torch.softmax(logits, dim=-1).cpu().numpy()
preds = probs.argmax(axis=1)
return preds, probs
samples = [
"I absolutely loved this!",
"This is terrible and disappointing.",
"It was okay, not great, not terrible."
]
preds, probs = predict_texts(samples)
for s, p, pr in zip(samples, preds, probs):
print(f"{s} => {id2label[int(p)]} (pos={pr[1]:.3f}, neg={pr[0]:.3f})")
This minimal working example shows that BERT, with its bidirectional context, can pick up sentiment together with the surrounding context and quickly reach high accuracy on a standard dataset. From here you can:
- Swap in distilbert-base-uncased (faster) or roberta-base (stronger); a minimal sketch follows this list.
- Raise max_length to 256 to accommodate longer sentences.
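A minimal sketch of the model swap, assuming the rest of the script stays unchanged; with the Auto* classes from transformers only the checkpoint name needs to change (MODEL_NAME is an illustrative variable, not from the original script):
from transformers import AutoTokenizer, AutoModelForSequenceClassification
MODEL_NAME = "distilbert-base-uncased"  # or "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
).to(device)
The tokenization function, DataLoaders, and training loop above work as-is; AutoTokenizer and AutoModelForSequenceClassification pick the right classes for whichever checkpoint you name.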