iT邦幫忙

2025 iThome 鐵人賽

DAY 8
0
生成式 AI

練習AI系列 第 9

語音轉文字(STT, Speech-to-Text)

  • 分享至 

  • xImage
  •  

🆕 程式碼實作

  1. src/day8_speech_to_text.js(新增)
    // src/day8_speech_to_text.js
    import fs from "fs";
    import path from "path";
    import { openai } from "./aiClient.js";

/**
 * Download a remote file into the local `outputs/audio` directory
 * (uses the global `fetch` that ships with Node 18+).
 *
 * The whole response body is buffered in memory before being written to disk.
 * The file extension is derived from the response's Content-Type header and
 * falls back to ".tmp" when the type is not recognised.
 *
 * @param {string} url - Remote URL of the audio file.
 * @returns {Promise<string>} Path of the downloaded local file.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function downloadToTemp(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`下載失敗:${res.status} ${res.statusText}`);
  }

  const buf = Buffer.from(await res.arrayBuffer());
  const ext = guessExtFromContentType(res.headers.get("content-type")) || ".tmp";

  const tmpDir = path.join(process.cwd(), "outputs", "audio");
  // `recursive: true` makes mkdirSync a no-op when the directory already exists.
  fs.mkdirSync(tmpDir, { recursive: true });

  // Timestamp-based name so repeated downloads do not overwrite each other.
  const filePath = path.join(tmpDir, `audio_${Date.now()}${ext}`);
  fs.writeFileSync(filePath, buf);
  return filePath;
}

/**
 * Ordered [MIME substring, file extension] pairs; the first match wins.
 * `video/mp4` is included because some audio recordings are delivered in an
 * mp4 container.
 */
const EXT_BY_CONTENT_TYPE = Object.freeze([
  ["audio/mpeg", ".mp3"],
  ["audio/mp4", ".m4a"],
  ["audio/x-m4a", ".m4a"],
  ["audio/wav", ".wav"],
  ["audio/x-wav", ".wav"],
  ["audio/webm", ".webm"],
  ["audio/ogg", ".ogg"],
  ["audio/flac", ".flac"],
  ["video/mp4", ".mp4"],
]);

/**
 * Guess a file extension from an HTTP Content-Type header value.
 *
 * Matching is by substring, so parameters such as "; charset=..." are
 * tolerated, and it is case-insensitive because MIME type names are
 * case-insensitive.
 *
 * @param {string|null|undefined} ct - Raw Content-Type header value.
 * @returns {string} Extension including the leading dot, or "" when the
 *   value is missing, not a string, or not a recognised audio type.
 */
function guessExtFromContentType(ct) {
  if (typeof ct !== "string" || ct === "") return "";
  const normalized = ct.toLowerCase();
  const match = EXT_BY_CONTENT_TYPE.find(([mime]) => normalized.includes(mime));
  return match ? match[1] : "";
}

/**
 * Transcribe an audio file to text and save the result under
 * `outputs/transcripts` as `<name>.txt` and `<name>.json`.
 *
 * Exactly one source is needed: a local `filePath`, or a remote `url` that is
 * downloaded first. When both are given, `filePath` takes precedence.
 *
 * @param {Object} [opts]
 * @param {string} [opts.filePath] - Path of a local audio file.
 * @param {string} [opts.url] - Remote URL of an audio file (alternative to filePath).
 * @param {string} [opts.language] - Language hint, e.g. "zh", "en", "ja".
 * @param {string} [opts.prompt] - Vocabulary/domain hint, e.g. proper nouns.
 * @param {boolean} [opts.detailed=false] - Also request a `verbose_json`
 *   transcription (best effort; ignored when the request fails).
 * @returns {Promise<{ text: string, raw?: any, saved: {txt: string, json: string} }>}
 *   The trimmed transcript, the verbose response (or null) and the paths of
 *   the two files that were written.
 * @throws {Error} When neither source is given, the local file does not
 *   exist, or the primary transcription request fails.
 */
export async function transcribe(opts = {}) {
  const { filePath, url, language, prompt, detailed = false } = opts;
  if (!filePath && !url) {
    throw new Error("請提供 filePath(本地檔)或 url(遠端)其一。");
  }

  // Download first when only a URL was supplied.
  const localPath = filePath || (await downloadToTemp(url));
  if (!fs.existsSync(localPath)) {
    throw new Error(`找不到檔案:${localPath}`);
  }

  // Build a request with a FRESH read stream every time: a stream that has
  // already been consumed by one upload cannot be replayed for a second one.
  // `language` and `prompt` are optional and only sent when non-empty.
  const buildRequest = (extra = {}) => ({
    file: fs.createReadStream(localPath),
    model: "gpt-4o-transcribe",
    ...(language ? { language } : {}),
    ...(prompt ? { prompt } : {}),
    ...extra,
  });

  // Primary request with the default response format; errors propagate to the
  // caller. NOTE(review): a retry or a fallback model could be added here.
  const result = await openai.audio.transcriptions.create(buildRequest());
  const text = (result?.text || "").trim();

  // Optional second request for detailed output. Support for `verbose_json`
  // depends on the model, so a failure here is deliberately non-fatal.
  let detailedJson = null;
  if (detailed) {
    try {
      const verbose = await openai.audio.transcriptions.create(
        buildRequest({ response_format: "verbose_json" })
      );
      detailedJson = verbose || null;
    } catch (err) {
      console.warn(`詳細轉寫(verbose_json)失敗,已略過:${err?.message ?? err}`);
      detailedJson = null;
    }
  }

  // Persist the results next to each other, named after the audio file.
  const outDir = path.join(process.cwd(), "outputs", "transcripts");
  fs.mkdirSync(outDir, { recursive: true });
  // File name without its extension (a name without any extension is kept as is).
  const base = path.parse(localPath).name;
  const txtPath = path.join(outDir, `${base}.txt`);
  const jsonPath = path.join(outDir, `${base}.json`);
  fs.writeFileSync(txtPath, text, "utf-8");
  fs.writeFileSync(
    jsonPath,
    JSON.stringify({ text, detailed: detailedJson }, null, 2),
    "utf-8"
  );

  return { text, raw: detailedJson, saved: { txt: txtPath, json: jsonPath } };
}

  2. index.js(修改:加入 STT 入口)
    // index.js
    import { englishTeacher, codeReview, sentimentClassify } from "./src/day3_prompt_engineering.js";
    import { newsToJson } from "./src/day4_text_to_json.js";
    import { chatOnce, resetSession } from "./src/day5_chat_history.js";
    import { textToImage } from "./src/day6_text_to_image.js";
    import { imageToJson } from "./src/day7_image_to_text.js";
    import { transcribe } from "./src/day8_speech_to_text.js";

// Turn CLI tokens into an options object: every `--key` becomes a property.
// Its value is the following token, unless that token is missing, empty, or
// itself starts with `--`, in which case the option is a boolean flag (true).
// Tokens that do not start with `--` never become keys.
const args = Object.fromEntries(
  process.argv.slice(2).flatMap((token, idx, argv) => {
    if (!token.startsWith("--")) return [];
    const next = argv[idx + 1];
    const hasValue = Boolean(next) && !next.startsWith("--");
    return [[token.slice(2), hasValue ? next : true]];
  })
);

/**
 * CLI entry point: run the task selected with `--task` (default "chat") and
 * print its result to stdout.
 *
 * Supported tasks: stt, vision, image, chat, teacher, review, sentiment,
 * json_summary. An unknown task only prints a usage hint.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const task = args.task || "chat";

  if (task === "stt") {
    const filePath = args.filePath || null;
    const url = args.url || null;
    const language = args.lang || ""; // e.g. "zh" | "en" | "ja"
    const prompt = args.prompt || ""; // hint for proper nouns
    // `--detailed` alone parses to true; `--detailed true` parses to "true".
    const detailed = args.detailed === "true" || args.detailed === true;

    const { text, saved } = await transcribe({ filePath, url, language, prompt, detailed });
    console.log("\n=== 語音轉文字(STT) ===\n");
    console.log(text);
    console.log("\n已儲存:");
    console.log("- TXT:", saved.txt);
    console.log("- JSON:", saved.json);
  } else if (task === "vision") {
    const imagePath = args.imagePath || null;
    const imageUrl = args.imageUrl || null;
    const wantOCR = args.ocr === "true" || args.ocr === true;
    const length = args.length || "medium";
    const out = await imageToJson({ imagePath, imageUrl, wantOCR, length });
    console.log("\n=== 圖片 → JSON 描述 ===\n");
    console.log(JSON.stringify(out, null, 2));
  } else if (task === "image") {
    const prompt = args.text || "一隻戴著太空頭盔的柴犬,漂浮在月球上,插著台灣國旗";
    const size = args.size || "512x512";
    const n = args.n ? Number(args.n) : 1;
    const urls = await textToImage(prompt, { size, n });
    console.log("\n=== 生成圖片 ===\n");
    urls.forEach((f) => console.log("已儲存:" + f));
  } else if (task === "chat") {
    const sessionId = args.session || "default";
    // `--reset` clears the stored session and stops without chatting.
    if (args.reset) {
      resetSession(sessionId);
      console.log(`已重設會話:${sessionId}`);
      return;
    }
    const input = args.text || "嗨,我想規劃 3 天 2 夜的台中旅遊行程。";
    const { reply } = await chatOnce(input, { sessionId });
    console.log(`\n[${sessionId}] AI:\n${reply}\n`);
  } else if (task === "teacher") {
    const out = await englishTeacher(args.text || "He go to school every day.");
    console.log("\n=== 英文老師 ===\n");
    console.log(out);
  } else if (task === "review") {
    const out = await codeReview("function sum(arr){ return arr.reduce((a,b)=>a+b,0) }");
    console.log("\n=== 程式碼審查 ===\n");
    console.log(out);
  } else if (task === "sentiment") {
    const out = await sentimentClassify(args.text || "今天心情糟透了,事情一團亂。");
    console.log("\n=== 情緒分類(JSON) ===\n");
    console.log(out);
  } else if (task === "json_summary") {
    const out = await newsToJson(args.text || "OpenAI 發布新模型,效能大幅提升。");
    console.log("\n=== 新聞 JSON 摘要 ===\n");
    console.log(out);
  } else {
    console.log("未知任務,請使用 --task stt | vision | image | chat | teacher | review | sentiment | json_summary");
  }
}

// Start the CLI. Any rejection from main() is reported on stderr (message
// only) and the process exits with status 1.
const reportFailure = (err) => {
  console.error("發生錯誤:", err.message);
  process.exit(1);
};

main().catch(reportFailure);

  3. package.json(新增 Script)
    {
    "scripts": {
    "day8:stt:file": "node index.js --task stt --filePath sample/audio_zh.mp3 --lang zh --detailed true",
    "day8:stt:url": "node index.js --task stt --url https://example.com/demo.m4a --lang zh"
    }
    }

▶️ CLI 操作範例

轉寫本地檔,中文,輸出詳細段落(若模型支援)

npm run day8:stt:file --silent

轉寫遠端 URL(下載到暫存再轉寫)

npm run day8:stt:url --silent

自定領域提示(提升專有名詞正確率)

node index.js --task stt --filePath sample/meeting.m4a --lang zh --prompt "專案名:SmartGo Plus;人名:小王、小美;術語:RAG、LIFF、IIS"

產出檔案示例:

outputs/transcripts/meeting.txt
outputs/transcripts/meeting.json // { text, detailed?: {...} }


上一篇
圖片描述 (Image-to-Text)
下一篇
文字轉語音(TTS, Text-to-Speech)
系列文
練習AI11
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言