In the previous articles, we trained a CNN on the Sign Language MNIST dataset to classify hand signs. In this post, we combine that model with MediaPipe Hands and OpenCV so the computer can recognize signs in a live video stream. To bring the system closer to a real-world application, outputting a single inference result is not enough; we also need real-time feedback mechanisms, which the complete script below puts together:
import cv2
import mediapipe as mp
import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn

# Same CNN architecture used during training
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=24):
        super(SimpleCNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 3 * 3, 256), nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        return self.fc(self.cnn(x))

# Load the trained weights and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleCNN().to(device)
model.load_state_dict(torch.load("model.pth", map_location=device))
model.eval()

# MediaPipe Hands for hand tracking
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# Webcam capture, plus the same 28x28 grayscale preprocessing as training
cap = cv2.VideoCapture(0)
transform = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.Grayscale(),
    transforms.ToTensor(),
])

while True:
    ret, frame = cap.read()
    if not ret:
        break
    # MediaPipe expects RGB; OpenCV delivers BGR
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Bounding box from the normalized landmark coordinates
            h, w, _ = frame.shape
            x_list = [lm.x for lm in hand_landmarks.landmark]
            y_list = [lm.y for lm in hand_landmarks.landmark]
            xmin, xmax = int(min(x_list) * w), int(max(x_list) * w)
            ymin, ymax = int(min(y_list) * h), int(max(y_list) * h)
            # Add a margin around the hand, clamped to the frame
            margin = 20
            xmin, ymin = max(xmin - margin, 0), max(ymin - margin, 0)
            xmax, ymax = min(xmax + margin, w), min(ymax + margin, h)
            # ===== Skip ROIs that are too small =====
            if xmax - xmin < 50 or ymax - ymin < 50:
                continue
            # Crop the hand ROI
            hand_roi = frame[ymin:ymax, xmin:xmax]
            hand_pil = Image.fromarray(cv2.cvtColor(hand_roi, cv2.COLOR_BGR2RGB))
            input_tensor = transform(hand_pil).unsqueeze(0).to(device)
            # CNN inference
            with torch.no_grad():
                output = model(input_tensor)
                probs = torch.softmax(output, dim=1)
                conf, pred = torch.max(probs, dim=1)
                pred, conf = pred.item(), conf.item()
            # ===== Skip low-confidence predictions =====
            if conf < 0.7:
                continue
            # Map the class index to a letter (J is skipped)
            label_char = chr(pred + 65 if pred < 9 else pred + 66)
            # ===== Draw the result =====
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
            text = f"{label_char} ({conf*100:.1f}%)"
            cv2.putText(frame, text, (xmin, ymin - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    cv2.imshow("Hand Sign Prediction", frame)
    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
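One detail in the script worth calling out is the class-index-to-letter mapping. Sign Language MNIST contains no examples for J or Z, since signing those letters requires motion, so every index from 9 onward shifts by one letter. The following snippet simply re-evaluates the chr(pred + 65 if pred < 9 else pred + 66) expression from the script for all 24 indices as a sanity check:

# Verify the mapping used above: 24 class indices -> A..Y with J skipped
letters = [chr(i + 65 if i < 9 else i + 66) for i in range(24)]
print(letters)  # ['A', 'B', ..., 'I', 'K', ..., 'Y'] (no 'J', no 'Z')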
Real-time recognition: MediaPipe tracks the hand precisely, while the CNN model classifies the sign.
Real-time feedback: the predicted letter and its confidence are drawn directly on the frame.
Filtering mechanisms: ROIs smaller than 50 pixels on a side and predictions below 70% confidence are skipped.
User-friendliness: these checks make the whole system more stable and keep misclassified letters from jumping around on screen; a sketch of one further smoothing step follows this list.
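Even with the two filters above, the displayed letter can still flicker when the model wavers between two classes on consecutive frames. A common remedy is to take a majority vote over a short window of recent predictions and only show a letter once it dominates that window. Below is a minimal sketch of the idea; PredictionSmoother is a hypothetical helper (not part of the script above), and the window size and vote threshold are arbitrary starting points to tune:

from collections import deque, Counter

class PredictionSmoother:
    """Hypothetical helper: majority vote over the last few predictions."""
    def __init__(self, window=10, min_votes=6):
        self.history = deque(maxlen=window)  # rolling window of recent letters
        self.min_votes = min_votes

    def update(self, label_char):
        self.history.append(label_char)
        letter, votes = Counter(self.history).most_common(1)[0]
        # Only report a letter once it clearly dominates the window
        return letter if votes >= self.min_votes else None

smoother = PredictionSmoother()
# Inside the main loop, after the confidence check, one could display the
# smoothed letter instead of the raw per-frame prediction:
#     stable = smoother.update(label_char)
#     if stable is not None:
#         text = f"{stable} ({conf*100:.1f}%)"

This trades a few frames of latency for a much steadier on-screen label.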
The sign recognition system can now not only recognize in real time, but also use real-time feedback mechanisms to improve its stability and reliability. With these designs in place, we are one step closer to a truly practical real-time gesture interaction system.