iT邦幫忙

2025 iThome 鐵人賽

DAY 9
0
生成式 AI

練習AI系列 第 10

文字轉語音(TTS, Text-to-Speech)

  • 分享至 

  • xImage
  •  

🆕 新增/修改的程式碼

  1. src/day9_text_to_speech.js(新增)

封裝成 speak(),可被 CLI 與 Next.js API 重用。

// src/day9_text_to_speech.js
import fs from "fs";
import path from "path";
import { openai } from "./aiClient.js";

/**
 * Convert text to speech through the OpenAI Audio Speech API and save the
 * resulting audio to disk.
 *
 * @param {Object} opts
 * @param {string} opts.text - Text to synthesize (required, must not be blank).
 * @param {string} [opts.model] - TTS model; defaults to the OPENAI_TTS_MODEL
 *   environment variable, then "gpt-4o-mini-tts".
 * @param {string} [opts.voice="alloy"] - Voice name, forwarded to the API unchanged
 *   (which names are accepted depends on the model).
 * @param {string} [opts.format="mp3"] - Output container: mp3 | wav | opus.
 * @param {number} [opts.speed=1.0] - Playback speed; this function accepts 0.5 to 1.5 inclusive.
 * @param {string} [opts.outputDir="outputs/tts"] - Directory to write into (created if missing).
 * @param {string} [opts.filename] - Output file name; a trailing ".ext" is stripped
 *   and replaced with the chosen format. Defaults to `tts_<timestamp>`.
 * @returns {Promise<{filepath: string, bytes: number}>} Path of the written file and its size.
 * @throws {Error} When text is missing/blank, speed is not a number in range,
 *   or format is unsupported.
 */
export async function speak(opts = {}) {
  const {
    text,
    model = process.env.OPENAI_TTS_MODEL || "gpt-4o-mini-tts",
    voice = "alloy",
    format = "mp3",
    speed = 1.0,
    outputDir = "outputs/tts",
    filename,
  } = opts;

  // Guard the type first: a bare `--text` CLI flag arrives as boolean `true`.
  if (typeof text !== "string" || !text.trim()) throw new Error("文字內容 text 為必填。");

  // Coerce once so numeric strings keep working, and reject NaN explicitly:
  // NaN compares false against both bounds and would otherwise pass the range check.
  const rate = Number(speed);
  if (!Number.isFinite(rate) || rate < 0.5 || rate > 1.5) {
    throw new Error("speed 建議介於 0.5 ~ 1.5 之間。");
  }
  if (!["mp3", "wav", "opus"].includes(format)) throw new Error("format 僅支援 mp3|wav|opus。");

  // The speech endpoint names the container parameter `response_format`;
  // sending it as `format` would leave the file extension out of sync with the audio.
  const res = await openai.audio.speech.create({
    model,
    voice,
    input: text,
    response_format: format,
    speed: rate,
  });

  const buffer = Buffer.from(await res.arrayBuffer());

  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(outputDir, { recursive: true });

  // Strip only a real trailing extension (escaped dot), so "my_demo" stays intact.
  const base = filename ? filename.replace(/\.[^.]+$/, "") : `tts_${Date.now()}`;
  const filepath = path.join(outputDir, `${base}.${format}`);
  fs.writeFileSync(filepath, buffer);

  return { filepath, bytes: buffer.length };
}

/**
 * Convenience helper: read a UTF-8 text file and synthesize its contents
 * (e.g. turn a Day 8 transcript .txt straight into speech).
 *
 * @param {string} filePath - Path of the text file to read.
 * @param {Object} [opts] - Options forwarded to speak(); any `text` in opts is
 *   overridden by the file contents.
 * @returns {Promise<{filepath: string, bytes: number}>} Whatever speak() resolves to.
 * @throws {Error} When the file does not exist.
 */
export async function speakFromFile(filePath, opts = {}) {
  if (!fs.existsSync(filePath)) throw new Error(`找不到檔案:${filePath}`);
  const text = fs.readFileSync(filePath, "utf-8");
  return speak({ ...opts, text });
}

  2. index.js(修改:加入 TTS 入口)
    // index.js
    import { englishTeacher, codeReview, sentimentClassify } from "./src/day3_prompt_engineering.js";
    import { newsToJson } from "./src/day4_text_to_json.js";
    import { chatOnce, resetSession } from "./src/day5_chat_history.js";
    import { textToImage } from "./src/day6_text_to_image.js";
    import { imageToJson } from "./src/day7_image_to_text.js";
    import { transcribe } from "./src/day8_speech_to_text.js";
    import { speak, speakFromFile } from "./src/day9_text_to_speech.js";

// Parse `--key value` and bare `--flag` tokens from the command line into a plain object.
// A flag is stored as `true` when nothing follows it, or when the next token is
// empty or is itself another `--option`. Tokens that do not start with `--` are
// only ever consumed as the value of the flag before them.
const args = Object.fromEntries(
  process.argv.slice(2).flatMap((token, index, tokens) => {
    if (!token.startsWith("--")) return [];
    const next = tokens[index + 1];
    const hasValue = Boolean(next) && !next.startsWith("--");
    return [[token.slice(2), hasValue ? next : true]];
  })
);

/**
 * CLI dispatcher: reads the parsed `args` object and runs the task selected by
 * `--task` (defaults to "chat"), printing the result to stdout.
 *
 * Supported tasks: tts, stt, vision, image, chat, teacher, review, sentiment,
 * json_summary. An unrecognized task prints a usage hint.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const task = args.task || "chat";

  if (task === "tts") {
    const text = args.text || "";
    const file = args.file || ""; // when a text file (.txt) is given, synthesize its contents
    const model = args.model || process.env.OPENAI_TTS_MODEL || "gpt-4o-mini-tts";
    const voice = args.voice || "alloy";
    const format = args.format || "mp3";
    const speed = args.speed ? Number(args.speed) : 1.0;
    const filename = args.out || undefined;

    if (file) {
      const { filepath, bytes } = await speakFromFile(file, { model, voice, format, speed, filename });
      console.log("\n=== 文字檔 → 語音 ===");
      console.log("輸出:", filepath, `(${bytes} bytes)`);
    } else {
      // Fall back to a built-in sample sentence when no --text was given.
      const content = text || "這是一段測試用的語音。";
      const { filepath, bytes } = await speak({ text: content, model, voice, format, speed, filename });
      console.log("\n=== 文字 → 語音 ===");
      console.log("輸出:", filepath, `(${bytes} bytes)`);
    }
  } else if (task === "stt") {
    const filePath = args.filePath || null;
    const url = args.url || null;
    const language = args.lang || "";
    const prompt = args.prompt || "";
    // Accept both `--detailed true` (string) and a bare `--detailed` (boolean).
    const detailed = args.detailed === "true" || args.detailed === true;
    const { text, saved } = await transcribe({ filePath, url, language, prompt, detailed });
    console.log("\n=== 語音轉文字(STT) ===\n");
    console.log(text);
    console.log("\n已儲存:", saved);
  } else if (task === "vision") {
    const imagePath = args.imagePath || null;
    const imageUrl = args.imageUrl || null;
    // Accept both `--ocr true` (string) and a bare `--ocr` (boolean).
    const wantOCR = args.ocr === "true" || args.ocr === true;
    const length = args.length || "medium";
    const out = await imageToJson({ imagePath, imageUrl, wantOCR, length });
    console.log("\n=== 圖片 → JSON 描述 ===\n");
    console.log(JSON.stringify(out, null, 2));
  } else if (task === "image") {
    const prompt = args.text || "一隻戴著太空頭盔的柴犬,漂浮在月球上,插著台灣國旗";
    const size = args.size || "512x512";
    const n = args.n ? Number(args.n) : 1;
    const urls = await textToImage(prompt, { size, n });
    console.log("\n=== 生成圖片 ===\n");
    urls.forEach((f) => console.log("已儲存:" + f));
  } else if (task === "chat") {
    const sessionId = args.session || "default";
    if (args.reset) {
      // `--reset` only clears the session; no chat request is sent.
      resetSession(sessionId);
      console.log(`已重設會話:${sessionId}`);
      return;
    }
    const input = args.text || "嗨,我想規劃 3 天 2 夜的台中旅遊行程。";
    const { reply } = await chatOnce(input, { sessionId });
    console.log(`\n[${sessionId}] AI:\n${reply}\n`);
  } else if (task === "teacher") {
    const out = await englishTeacher(args.text || "He go to school every day.");
    console.log("\n=== 英文老師 ===\n");
    console.log(out);
  } else if (task === "review") {
    const out = await codeReview("function sum(arr){ return arr.reduce((a,b)=>a+b,0) }");
    console.log("\n=== 程式碼審查 ===\n");
    console.log(out);
  } else if (task === "sentiment") {
    const out = await sentimentClassify(args.text || "今天心情糟透了,事情一團亂。");
    console.log("\n=== 情緒分類(JSON) ===\n");
    console.log(out);
  } else if (task === "json_summary") {
    const out = await newsToJson(args.text || "OpenAI 發布新模型,效能大幅提升。");
    console.log("\n=== 新聞 JSON 摘要 ===\n");
    console.log(out);
  } else {
    console.log("未知任務,請使用 --task tts | stt | vision | image | chat | teacher | review | sentiment | json_summary");
  }
}

// Entry point: run the dispatcher; on any rejection print the error's message
// and terminate the process with exit status 1.
main().catch((err) => {
  const { message } = err;
  console.error("發生錯誤:", message);
  process.exit(1);
});

  3. package.json(新增 Script)
    {
    "scripts": {
    "day9:tts": "node index.js --task tts --text \"這是 Day 9 的 TTS 測試,祝開發順利!\" --voice alloy --format mp3 --speed 1.0",
    "day9:tts:file": "node index.js --task tts --file outputs/transcripts/meeting.txt --voice aria --format mp3 --speed 0.95"
    }
    }

你可以用 --model tts-1 測試不同模型(若你的帳戶有開啟)。
也能用 --out my_demo 自訂檔名:會輸出 outputs/tts/my_demo.mp3。

▶️ CLI 測試

直接把字串轉成 mp3

npm run day9:tts --silent

把 Day 8 產出的逐字稿 .txt 轉成語音(aria 聲線、微調慢速)

npm run day9:tts:file --silent

指定輸出格式與速度

node index.js --task tts --text "早安,今天開始第九天的挑戰。" --format wav --speed 0.9

輸出範例:

outputs/tts/tts_17264xxxxx.mp3


上一篇
語音轉文字(STT, Speech-to-Text)
下一篇
多模態應用整合!
系列文
練習AI11
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言