Yesterday we broke "perception → understanding → response" down into a buildable skeleton. Today's task is simple: get that skeleton running. We'll use Gradio to assemble a clickable, playable multimodal demo that chains facial expression × gesture × voice into a single pipeline, then check that the data flow is smooth, the fusion rules are stable, and the interface feedback is easy to read.

Goal: use Gradio Blocks to build a local interactive prototype with "webcam + microphone + status panel + feedback bar", and verify that yesterday's system design actually holds up in real interaction.
Dependencies: `gradio`, `numpy`.
```
day23_app/
  app.py         # Gradio UI and event loop
  preprocess.py  # image/audio preprocessing (minimal viable)
  fusion.py      # sliding window + weighted fusion
  states.py      # state machine + cooldowns
```
`preprocess.py`: preprocessing and "minimal viable features".

```python
from __future__ import annotations

import numpy as np
```

Audio preprocessing:

```python
def _as_wave(audio):
    """Normalize the several shapes Gradio may hand us into (sr, mono float32)."""
    if audio is None:
        return 16000, np.zeros(16000, dtype=np.float32)
    if isinstance(audio, tuple) and len(audio) == 2:  # (sr, np.ndarray)
        sr, wav = audio
    elif isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
        sr, wav = int(audio["sampling_rate"]), np.array(audio["data"])
    else:
        sr, wav = 16000, np.array(audio, dtype=np.float32)
    wav = np.asarray(wav, dtype=np.float32).squeeze()
    if wav.ndim > 1:  # stereo -> mono
        wav = wav.mean(axis=1)
    return int(sr), wav


def preprocess_audio(audio) -> dict:
    """Minimal viable audio features: RMS energy, zero-crossing rate, spectral centroid."""
    sr, wav = _as_wave(audio)
    if len(wav) == 0:
        return {"sr": sr, "energy": 0.0, "zcr": 0.0, "centroid": 0.0}
    energy = float(np.sqrt(np.mean(np.square(wav)) + 1e-9))       # RMS energy
    zcr = float(np.mean(np.abs(np.diff(np.sign(wav))) > 0))       # zero-crossing rate
    spec = np.abs(np.fft.rfft(wav))
    freqs = np.fft.rfftfreq(len(wav), d=1.0 / sr)
    centroid = float((freqs * spec).sum() / (spec.sum() + 1e-9))  # spectral centroid (Hz)
    return {"sr": sr, "energy": energy, "zcr": zcr, "centroid": centroid}


def kws_scores_from_feats(feats: dict) -> dict:
    """Heuristic placeholder for keyword spotting: map features to pseudo-logits."""
    e = feats.get("energy", 0.0)
    c = feats.get("centroid", 0.0)
    logit_on = 2.0 * e            # louder leans "on"
    logit_next = 0.003 * c        # brighter spectrum leans "next"
    logit_stop = 1.0 - 0.002 * c  # darker spectrum leans "stop"
    logit_none = 0.2
    logits = np.array([logit_on, logit_next, logit_stop, logit_none], dtype=np.float32)
    probs = softmax(logits)
    return {k: float(v) for k, v in zip(["on", "next", "stop", "none"], probs)}


def softmax(x: np.ndarray) -> np.ndarray:
    x = x - np.max(x)  # subtract max for numerical stability
    ex = np.exp(x)
    return ex / (np.sum(ex) + 1e-9)
```
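A quick sanity check for the audio path, assuming a synthetic 1-second 440 Hz sine at 16 kHz (the exact scores are meaningless here; we only care that keys and shapes come out right):

```python
import numpy as np
from preprocess import preprocess_audio, kws_scores_from_feats

sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
wav = 0.3 * np.sin(2 * np.pi * 440 * t).astype(np.float32)

feats = preprocess_audio((sr, wav))  # same (sr, ndarray) tuple Gradio passes
print(feats)                         # energy ~0.21 (0.3 / sqrt(2)), centroid ~440 Hz
print(kws_scores_from_feats(feats))  # probs over ['on', 'next', 'stop', 'none'], sum ~1
```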
Image preprocessing:

```python
def preprocess_frame(img: np.ndarray) -> dict:
    """Minimal viable image features: mean brightness and contrast of the grayscale frame."""
    if img is None:
        return {"bright": 0.0, "contrast": 0.0}
    imgf = img.astype(np.float32) / 255.0
    gray = imgf.mean(axis=2) if imgf.ndim == 3 else imgf  # guard against grayscale input
    bright = float(gray.mean())
    contrast = float(gray.std())
    return {"bright": bright, "contrast": contrast}


def gesture_scores_from_feats(feats: dict) -> dict:
    """Heuristic placeholder for gesture recognition: map brightness/contrast to pseudo-logits."""
    b = feats.get("bright", 0.0)
    ct = feats.get("contrast", 0.0)
    logit_thumbs = 1.5 * b + 0.2 * ct   # bright, textured frames lean "thumbs_up"
    logit_wave = 1.0 * ct               # high contrast leans "wave"
    logit_stop = 0.8 * (1.0 - b) + 0.1  # dark frames lean "stop"
    logit_none = 0.2
    logits = np.array([logit_thumbs, logit_wave, logit_stop, logit_none], dtype=np.float32)
    probs = softmax(logits)
    return {k: float(v) for k, v in zip(["thumbs_up", "wave", "stop", "none"], probs)}
```
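Same idea for the image path, assuming a random RGB frame as a stand-in for the webcam (with real frames the heuristic will drift toward whichever label your lighting favors):

```python
import numpy as np
from preprocess import preprocess_frame, gesture_scores_from_feats

frame = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)  # fake webcam frame
feats = preprocess_frame(frame)
print(feats)                             # bright ~0.5, contrast ~0.29 for uniform noise
print(gesture_scores_from_feats(feats))  # probs over ['thumbs_up', 'wave', 'stop', 'none']
```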
`fusion.py`: sliding window + weighted fusion.

```python
from __future__ import annotations

import time
from collections import deque


def _top1(d: dict) -> tuple[str, float]:
    """Return the (label, score) pair with the highest score, or ('none', 0.0) if empty."""
    if not d:
        return "none", 0.0
    k = max(d, key=d.get)
    return k, float(d[k])


class FusionWindow:
    """Keep the last `win_len` per-modality score dicts and fuse them with fixed weights."""

    def __init__(self, win_len: int = 10, w_gesture=0.5, w_emotion=0.3, w_kws=0.2):
        self.win_len = win_len
        self.w = {"gesture": w_gesture, "emotion": w_emotion, "kws": w_kws}
        self.buf = deque(maxlen=win_len)

    def feed(self, gesture: dict | None = None, emotion: dict | None = None, kws: dict | None = None):
        self.buf.append({"gesture": gesture or {}, "emotion": emotion or {},
                         "kws": kws or {}, "t": time.time()})

    def _avg_scores(self, key: str) -> dict:
        """Average one modality's scores over the window, skipping empty entries."""
        acc: dict = {}
        cnt = 0
        for item in self.buf:
            sc = item.get(key, {})
            if not sc:
                continue
            for k, v in sc.items():
                acc[k] = acc.get(k, 0.0) + float(v)
            cnt += 1
        if cnt == 0:
            return {}
        for k in acc:
            acc[k] /= cnt
        return acc

    def fuse(self) -> dict:
        g = self._avg_scores("gesture")
        e = self._avg_scores("emotion")
        k = self._avg_scores("kws")
        labels = set(g) | set(e) | set(k) | {"none"}
        fused = {}
        for lab in labels:
            fused[lab] = (self.w["gesture"] * g.get(lab, 0.0)
                          + self.w["emotion"] * e.get(lab, 0.0)
                          + self.w["kws"] * k.get(lab, 0.0))
        s = sum(fused.values()) or 1.0  # renormalize so scores sum to 1
        for lab in fused:
            fused[lab] /= s
        top, score = _top1(fused)
        return {"scores": fused, "top": top, "top_score": score}
`states.py`: state machine + cooldowns.

```python
from __future__ import annotations

import time
from dataclasses import dataclass, field


@dataclass
class Cooldown:
    """Per-event cooldown: `ready()` is True once `secs` have passed since the last `tick()`."""
    secs: float
    last: float = field(default=0.0)

    def ready(self) -> bool:
        return (time.time() - self.last) >= self.secs

    def tick(self):
        self.last = time.time()


class StateMachine:
    def __init__(self):
        self.state = "IDLE"
        self.cool = {
            "start": Cooldown(5.0),
            "encourage": Cooldown(8.0),
            "next": Cooldown(2.0),
        }

    def _emit_hint(self) -> str:
        if self.state == "IDLE":
            return "Waiting to start (show 👍 or say 'on')"
        if self.state == "LISTEN":
            return "Keep the rhythm (say 'next' or wave 👋 when needed)"
        if self.state == "COACH":
            return "I'm here, you've got this! (say 'stop' / ✋ to end)"
        if self.state == "END":
            return "Great work today!"
        return ""

    def step(self, fused_top: str, voice_energy: float | None = None) -> dict:
        # Map the fused top label to an event.
        ev = None
        if fused_top in ("on", "thumbs_up"):
            ev = "start"
        elif fused_top in ("next", "wave"):
            ev = "next"
        elif fused_top in ("stop",):
            ev = "stop"
        # Near-silent audio falls back to an "encourage" event.
        if voice_energy is not None and voice_energy < 0.01:
            ev = ev or "encourage"
        # Transitions, gated by cooldowns.
        if self.state == "IDLE" and ev == "start" and self.cool["start"].ready():
            self.state = "LISTEN"; self.cool["start"].tick()
        elif self.state in ("LISTEN", "COACH") and ev == "next" and self.cool["next"].ready():
            self.state = "COACH"; self.cool["next"].tick()
        elif self.state in ("LISTEN", "COACH") and ev == "stop":
            self.state = "END"
        elif self.state == "COACH" and ev == "encourage" and self.cool["encourage"].ready():
            self.cool["encourage"].tick()
        return {"state": self.state, "hint": self._emit_hint(), "event": ev}
```
`app.py`: Gradio UI and the 300 ms event loop.

```python
from __future__ import annotations

import gradio as gr

from preprocess import preprocess_audio, kws_scores_from_feats, preprocess_frame, gesture_scores_from_feats
from fusion import FusionWindow
from states import StateMachine

FUSION = FusionWindow(win_len=10, w_gesture=0.5, w_emotion=0.3, w_kws=0.2)
STATE = StateMachine()

# Latest per-modality results, written by the input callbacks and read by the timer tick.
_last_gesture = {}
_last_kws = {}
_last_voice_feats = {"energy": 0.0}
```
Event handlers:

```python
def on_image(img):
    """Webcam stream callback: update gesture scores from the latest frame."""
    global _last_gesture
    feats = preprocess_frame(img)
    _last_gesture = gesture_scores_from_feats(feats)
    return [
        gr.update(value=render_scores(_last_gesture, ["thumbs_up", "wave", "stop", "none"])),  # Gesture Scores
        max(_last_gesture, key=_last_gesture.get) if _last_gesture else "none",                # Gesture Top
    ]


def on_audio(audio):
    """Mic callback: update KWS scores and cache voice energy for the state machine."""
    global _last_kws, _last_voice_feats
    feats = preprocess_audio(audio)
    _last_voice_feats = {"energy": feats["energy"]}
    _last_kws = kws_scores_from_feats(feats)
    top = max(_last_kws, key=_last_kws.get) if _last_kws else "none"
    return [
        gr.update(value=render_scores(_last_kws, ["on", "next", "stop", "none"])),
        top,
        feats,
    ]


def tick():
    """Timer callback (every 300 ms): feed the window, fuse, and step the state machine."""
    FUSION.feed(gesture=_last_gesture, kws=_last_kws)
    fused = FUSION.fuse()
    sm = STATE.step(fused_top=fused["top"], voice_energy=_last_voice_feats.get("energy", 0.0))
    system_msg = f"State={sm['state']} | FusionTop={fused['top']} | Hint={sm['hint']}"
    return sm["state"], system_msg
```
Render the score dicts as text bars (no front-end plotting needed):

```python
def render_scores(scores: dict, order: list[str]) -> str:
    parts = []
    for k in order:
        v = scores.get(k, 0.0)
        bar = "▰" * int(v * 20) + "▱" * (20 - int(v * 20))
        parts.append(f"{k:<10} {bar} {int(v*100)}%")
    return "\n".join(parts)
```
The Gradio UI:

```python
with gr.Blocks(title="Multimodal Emotion Interaction Demo — Expression × Gesture × Voice",
               theme=gr.themes.Soft()) as demo:
    gr.Markdown("### Multimodal Emotion Interaction Demo — Expression × Gesture × Voice")
    with gr.Row():
        with gr.Column(scale=1):
            webcam = gr.Image(sources=["webcam"], streaming=True, label="Webcam")
            gesture_scores = gr.Textbox(label="Gesture Scores", value="(waiting for video)", lines=6)
            system_state = gr.Textbox(label="System State")
            gesture_top = gr.Textbox(label="Gesture Top")
        with gr.Column(scale=1):
            mic = gr.Audio(sources=["microphone"], type="numpy", label="Mic (press to record)")
            kws_scores = gr.Textbox(label="KWS Scores", value="(waiting for audio)", lines=6)
            kws_top = gr.Textbox(label="KWS Top")
            voice_feats = gr.JSON(label="Voice Features")
    system_log = gr.Textbox(label="System Feedback",
                            value="State=IDLE | FusionTop=none | Hint=Waiting to start (show 👍 or say 'on')")

    webcam.stream(fn=on_image, inputs=webcam, outputs=[gesture_scores, gesture_top])
    mic.change(fn=on_audio, inputs=mic, outputs=[kws_scores, kws_top, voice_feats])

    t = gr.Timer(0.3, active=True)
    t.tick(fn=tick, outputs=[system_state, system_log])


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, share=True)
```
Run `python app.py` and allow the camera: the Gesture Scores / Gesture Top fields in the lower left start updating (driven by the `preprocess_frame()` heuristics). From there, swapping in real models and tuning behavior comes down to a few touch points (the logging hook is sketched after the table below):

- Gesture: in `on_image()`, replace `gesture_scores_from_feats(...)` with your model's output (`{label: prob}`).
- KWS: replace `kws_scores_from_feats(...)` with the `{label: prob}` from your model's logits/softmax.
- Loop timing: `gr.Timer(0.3)` calls `tick()` every 300 ms, so `FusionWindow(win_len=10)` is roughly a 2–3 s sliding window; the weights (0.5 / 0.3 / 0.2) live in the constructor, so tuning is just `FusionWindow(w_gesture=..., w_emotion=..., w_kws=...)`.
- State machine: `StateMachine` maps `start/next/stop` events onto IDLE → LISTEN → COACH → END, with cooldowns; it lives in `states.py` and can be adjusted against your Day22 event table.
- Feedback: the system log line always reads `State=... | FusionTop=... | Hint=...`.
- Logging: to record `events.jsonl`, add a `with open(..., "a")` in `tick()` that writes the fused scores, state, and event.

The Day22 event table, for reference:

| Event | Trigger | Model/Threshold | Transition | Feedback | Cooldown |
|---|---|---|---|---|---|
| Start | 👍 or "on" | conf > 0.8 | IDLE→LISTEN | Ready to start! | 5 s |
| Encourage | low energy or sustained sadness > 2 s | RMS < θ or emo=sad | stay in COACH | I'm here, you've got this! | 8 s |
| Next | 👋 or "next" | either trigger | stay in COACH | Next set! | 2 s |
| End | ✋ or "stop" | either trigger | → END | Great work today! | — |
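A minimal sketch of the logging hook mentioned above, assuming it replaces `tick()` in `app.py` (the `events.jsonl` name and the logged fields are just the ones suggested in the notes):

```python
import json
import time

def tick():
    FUSION.feed(gesture=_last_gesture, kws=_last_kws)
    fused = FUSION.fuse()
    sm = STATE.step(fused_top=fused["top"], voice_energy=_last_voice_feats.get("energy", 0.0))
    # Append one JSON line per tick so a session can be replayed and debugged offline.
    with open("events.jsonl", "a", encoding="utf-8") as f:
        record = {"t": time.time(), "scores": fused["scores"],
                  "state": sm["state"], "event": sm["event"]}
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
    return sm["state"], f"State={sm['state']} | FusionTop={fused['top']} | Hint={sm['hint']}"
```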
Today's point is not "how powerful the model is" but whether the whole interaction chain is stable and explainable. First make the system stop jumping between states, speak plainly, and respect privacy; only then chase smarter perception.