Facial expressions are the most immediate and natural signal of human emotion. Turning a face's expression into features a model can understand enables applications such as customer-service triage, driver alerts, classroom interaction, and assistive-device design. This post uses the FER2013 dataset with ResNet18 transfer learning to build a solid entry-level expression classifier, covering data loading, preprocessing, training, and evaluation (including a confusion matrix and Flip-TTA).
FER2013 (Facial Expression Recognition 2013) is a classic facial expression recognition dataset, commonly used as an introduction to emotion classification and as a benchmark.
How to get it: the FER2013 dataset on Kaggle.
Scale and source: 35,887 grayscale face images at 48×48 pixels (28,709 training, 3,589 public test, 3,589 private test), collected via web image search.
Classes (7 emotions); the common index mapping is as follows (as defined by the emotion column of the official CSV):
0=angry, 1=disgust, 2=fear, 3=happy, 4=sad, 5=surprise, 6=neutral
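If you ever work from the original CSV instead of image folders, it helps to keep this mapping explicit in code. A minimal sketch (the EMOTIONS name is illustrative, not part of the dataset):
EMOTIONS = {  # hypothetical lookup table; indices follow the CSV emotion column above
    0: "angry", 1: "disgust", 2: "fear", 3: "happy",
    4: "sad", 5: "surprise", 6: "neutral",
}
print(EMOTIONS[3])  # -> happy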
import os, numpy as np, torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms as T, models
from torchvision.models import ResNet18_Weights
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
ROOT = "FER2013" # ←← 改成自己的資料夾路徑
TRAIN_DIR = os.path.join(ROOT, "train")
TEST_DIR = os.path.join(ROOT, "test")
BATCH_TRAIN = 128
BATCH_EVAL = 256
EPOCHS = 20
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED)
Hard-code the ImageNet normalization statistics, to avoid meta differences across torchvision versions:
IMNET_MEAN = [0.485, 0.456, 0.406]
IMNET_STD = [0.229, 0.224, 0.225]
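As an optional cross-check (assuming torchvision >= 0.13, where pretrained weights bundle a preprocessing preset), the same statistics can be read from the weights themselves:
preset = ResNet18_Weights.DEFAULT.transforms()  # ImageClassification preset bundled with the weights
print(preset.mean, preset.std)                  # should match IMNET_MEAN / IMNET_STD above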
train_tf = T.Compose([
    T.Grayscale(3),  # FER2013 is grayscale; replicate to 3 channels for the ImageNet-pretrained backbone
    T.RandomResizedCrop(224, scale=(0.9, 1.0)),
    T.RandomHorizontalFlip(0.5),
    T.RandomRotation(10, fill=0),
    T.ToTensor(),
    T.Normalize(IMNET_MEAN, IMNET_STD),
])
eval_tf = T.Compose([
    T.Grayscale(3),
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(IMNET_MEAN, IMNET_STD),
])
train_full = datasets.ImageFolder(TRAIN_DIR, transform=train_tf)
test_set = datasets.ImageFolder(TEST_DIR, transform=eval_tf)
class_names = train_full.classes  # note: ImageFolder sorts class folders alphabetically,
num_classes = len(class_names)    # so this order may differ from the CSV index order above
print("Classes:", class_names)
Do a stratified split on the directory labels (preserving class proportions):
targets = [lbl for _, lbl in train_full.samples]  # class index of each image
idx_all = np.arange(len(targets))
train_idx, val_idx = train_test_split(
    idx_all, test_size=0.2, stratify=targets, random_state=SEED
)
train_set = Subset(train_full, train_idx)  # train_full already carries train_tf
val_set = Subset(datasets.ImageFolder(TRAIN_DIR, transform=eval_tf), val_idx)  # validation uses the deterministic eval transform
Automatically detect the compute device:
device = (torch.device("cuda") if torch.cuda.is_available()
          else torch.device("mps") if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
          else torch.device("cpu"))
PIN = device.type == "cuda"
NUM_WORKERS = min(4, os.cpu_count() or 2)
common_kwargs = dict(num_workers=NUM_WORKERS, pin_memory=PIN, persistent_workers=NUM_WORKERS > 0)
prefetch_factor is only available when num_workers > 0:
if NUM_WORKERS > 0:
    train_loader = DataLoader(train_set, batch_size=BATCH_TRAIN, shuffle=True,
                              drop_last=True, prefetch_factor=2, **common_kwargs)
else:
    train_loader = DataLoader(train_set, batch_size=BATCH_TRAIN, shuffle=True,
                              drop_last=True, **common_kwargs)
val_loader = DataLoader(val_set, batch_size=BATCH_EVAL, shuffle=False, **common_kwargs)
test_loader = DataLoader(test_set, batch_size=BATCH_EVAL, shuffle=False, **common_kwargs)
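Before launching training, a quick one-batch smoke test catches path or transform mistakes early (a minimal check using the loaders defined above):
xb, yb = next(iter(train_loader))  # one augmented batch
print(xb.shape, yb.shape)          # expect torch.Size([128, 3, 224, 224]) and torch.Size([128])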
weights = ResNet18_Weights.DEFAULT
model = models.resnet18(weights=weights)
model.fc = nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)
Optimizer / loss / LR scheduler:
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2, threshold=1e-4, cooldown=0, min_lr=1e-6
)
Keep track of the best weights (with early stopping):
best_val_acc, no_improve, patience_es = 0.0, 0, 5
for epoch in range(1, EPOCHS + 1):
    # Train
    model.train()
    running = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        running += loss.item()
    train_loss = running / max(1, len(train_loader))

    # Validate
    model.eval()
    vloss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            vloss += criterion(out, yb).item()
            pred = out.argmax(1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)
    val_loss = vloss / max(1, len(val_loader))
    val_acc = correct / max(1, total)

    scheduler.step(val_loss)
    print(f"Epoch {epoch}/{EPOCHS} TrainLoss:{train_loss:.4f} ValLoss:{val_loss:.4f} ValAcc:{val_acc:.4f}")

    # Save the best weights; stop early if validation accuracy stalls
    if val_acc > best_val_acc:
        best_val_acc, no_improve = val_acc, 0
        torch.save(model.state_dict(), "fer_resnet18_best.pth")
    else:
        no_improve += 1
        if no_improve >= patience_es:
            print("Early stopping triggered.")
            break
try:
    import seaborn as sns
    USE_SNS = True
except Exception:
    USE_SNS = False
model.load_state_dict(torch.load("fer_resnet18_best.pth", map_location=device))
model.eval()
def predict_tta(xb: torch.Tensor) -> torch.Tensor:
    with torch.no_grad():
        l1 = model(xb)
        l2 = model(torch.flip(xb, dims=[3]))  # horizontal flip
    return (l1 + l2) / 2  # average the logits of the two views
all_y, all_p = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds = predict_tta(xb).argmax(1).cpu().numpy().tolist()
        all_p += preds
        all_y += yb.numpy().tolist()
print("\n=== Classification Report (Test, Flip-TTA) ===")
print(classification_report(all_y, all_p, target_names=class_names, digits=4))
cm = confusion_matrix(all_y, all_p)
plt.figure(figsize=(7,6))
if USE_SNS:
    sns.heatmap(cm, annot=False, cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
else:
    plt.imshow(cm, cmap="Blues")
    plt.xticks(range(len(class_names)), class_names, rotation=45, ha="right")
    plt.yticks(range(len(class_names)), class_names)
plt.title("FER2013 Confusion Matrix (ResNet18 + Flip-TTA)")
plt.xlabel("Predicted"); plt.ylabel("True")
plt.tight_layout(); plt.show()
Under this setup, about 69.2% overall test accuracy (weighted avg ≈ 0.691) is reached within 20 epochs; happy / surprise perform best, while fear / sad / neutral still have room for improvement.
At this point you have turned FER2013 into a usable, tunable baseline: sensible augmentation, a stratified split, transfer learning, an LR scheduler, and TTA. This recipe works well as a starting point for any facial-expression or emotion-detection project.
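As a usage example, here is a sketch of single-image inference with the trained pipeline (predict_image and the file path are placeholders introduced here, not part of the code above):
from PIL import Image

def predict_image(path: str) -> str:
    # Hypothetical helper: classify one face image with the trained model + Flip-TTA
    img = Image.open(path)
    xb = eval_tf(img).unsqueeze(0).to(device)  # (1, 3, 224, 224)
    idx = predict_tta(xb).argmax(1).item()
    return class_names[idx]

print(predict_image("some_face.png"))  # placeholder path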