iT邦幫忙

2025 iThome 鐵人賽

DAY 19
0
Software Development

Vibe Unity - AI時代的遊戲開發工作流系列 第 19

Day 19 - 在 Unity 中使用 AI TTS 文字生成聲音模型

  • 分享至 

  • xImage
  •  

這一章要來介紹文字生成語音的模型 - Eleven Labs

image.png

https://elevenlabs.io/

Eleven Labs 是目前市面上最強大的 AI 語音模型工具

第二可能是 Azure TTS, 便宜又好用, 但我們今天不討論這個

Eleven Labs 免費版允許你保存3個不同的語音, 也可以自己錄製自己的聲音

image.png

也有生成音效, 生成背景音樂的選項


那我們今天會專注在 文字轉語音 的 API 功能

我們先在 Unity 中擺放 UI

會需要用到的元件有: 輸入框, Slider, 和一顆生成的按鈕

image.png

Eleven Labs Core:

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Core;
using Newtonsoft.Json;
using UnityEngine;
using UnityEngine.Networking;
using UnityEngine.UI;

namespace PolarAI.Scripts.AICore.ElevenLabs
{
    /// <summary>
    /// Minimal client for the ElevenLabs text-to-speech HTTP endpoint.
    /// It posts the text as JSON, stores the returned audio bytes in a file under
    /// <c>Application.persistentDataPath</c>, and then loads that file as an <see cref="AudioClip"/>.
    /// The class is not a MonoBehaviour, so coroutines are started through
    /// <c>CoroutineManager.Instance</c> (declared outside this file).
    /// </summary>
    public class ElevenLabsCore
    {
        public string ApiKey = "ELEVENLABS_API_KEY";
        public string VoiceId = "YOUR_VOICE_ID";
        public string ModelId = "eleven_multilingual_v2";
        private string OutputFormat = "mp3_44100_128";

        // Not read or written anywhere in this class.
        private AudioClip _recording;

        /// <summary>Stores the API key and the default voice id used by later requests.</summary>
        /// <param name="apiKey">Value sent in the <c>xi-api-key</c> request header.</param>
        /// <param name="voiceId">Voice id used when a call does not supply its own.</param>
        public void Initialize(string apiKey, string voiceId)
        {
            ApiKey = apiKey;
            VoiceId = voiceId;
        }


        /// <summary>Replaces the default voice id without touching the API key.</summary>
        /// <param name="voiceId">New default voice id.</param>
        public void SetDefaultVoiceId(string voiceId)
        {
            VoiceId = voiceId;
        }


        /// <summary>
        /// Starts an asynchronous text-to-speech request.
        /// </summary>
        /// <param name="text">Text to synthesize. Empty or whitespace-only text is reported as a failure.</param>
        /// <param name="onComplete">
        /// Invoked exactly once per call: <c>(clip, true)</c> on success, <c>(null, false)</c> on any failure.
        /// </param>
        /// <param name="voiceId">Optional voice id; when null, <see cref="VoiceId"/> is used.</param>
        public void TextToSound(string text,  Action<AudioClip, bool> onComplete, string voiceId=null)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                Debug.Log("TTS Text is empty.");
                // Report the failure so a caller waiting on the callback is not left hanging.
                onComplete?.Invoke(null, false);
                return;
            }

            CoroutineManager.Instance.StartCoroutine(DoTTS(text,onComplete, voiceId));
        }

        /// <summary>
        /// Coroutine that performs the HTTP request, writes the audio to disk and loads it as a clip.
        /// Every exit path that does not produce a clip invokes <paramref name="onComplete"/> with <c>false</c>.
        /// </summary>
        private IEnumerator DoTTS(string text,Action<AudioClip, bool> onComplete,  string voiceId =null)
        {
            Debug.Log("Requesting ElevenLabs TTS...");
            voiceId ??= VoiceId;
            if (string.IsNullOrWhiteSpace(voiceId))
            {
                Debug.LogError("TTS voice id is empty.");
                onComplete?.Invoke(null, false);
                yield break;
            }

            // Escape both values so characters such as '/', '?' or '#' cannot alter the request URL.
            var url = "https://api.elevenlabs.io/v1/text-to-speech/" + Uri.EscapeDataString(voiceId)
                      + "?output_format=" + Uri.EscapeDataString(OutputFormat);
            var payload = new ElevenTTS { text = text, model_id = ModelId };
            var json = JsonConvert.SerializeObject(payload);

            using var req = new UnityWebRequest(url, "POST");
            byte[] body = Encoding.UTF8.GetBytes(json);
            req.uploadHandler = new UploadHandlerRaw(body);
            req.downloadHandler = new DownloadHandlerBuffer();
            req.SetRequestHeader("Content-Type", "application/json");
            req.SetRequestHeader("xi-api-key", ApiKey);
            yield return req.SendWebRequest();

            if (req.result != UnityWebRequest.Result.Success)
            {
                string err = req.error + " | " + req.downloadHandler.text;
                Debug.LogError(err);
                onComplete?.Invoke(null, false);
                yield break;
            }

            byte[] audioData = req.downloadHandler.data;
            if (audioData == null || audioData.Length == 0)
            {
                Debug.LogError("TTS response contained no audio data.");
                onComplete?.Invoke(null, false);
                yield break;
            }

            // Ordinal comparison: the format id is a machine identifier, not user-facing text.
            string ext = OutputFormat.StartsWith("mp3", StringComparison.Ordinal) ? "mp3" : "wav";
            string tempPath = Path.Combine(Application.persistentDataPath, "tts." + ext);
            if (!TryWriteAudioFile(tempPath, audioData))
            {
                onComplete?.Invoke(null, false);
                yield break;
            }

            yield return CoroutineManager.Instance.StartCoroutine(
                LoadCLip(tempPath, ext == "mp3" ? AudioType.MPEG : AudioType.WAV, onComplete));
        }

        /// <summary>
        /// Loads an audio file from disk as an <see cref="AudioClip"/>.
        /// </summary>
        /// <param name="path">File system path (or file URI) of the audio file.</param>
        /// <param name="type">Audio format passed to the loader.</param>
        /// <param name="onLoad">
        /// Invoked once: <c>(clip, true)</c> when a clip was produced, <c>(null, false)</c> otherwise.
        /// </param>
        public IEnumerator LoadCLip(string path, AudioType type, Action<AudioClip,bool> onLoad)
        {
            using var www = UnityWebRequestMultimedia.GetAudioClip(ToFileUrl(path), type);
            yield return www.SendWebRequest();
            if (www.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError("Audio load failed: " + www.error);
                // Without this the caller would never learn that loading failed.
                onLoad?.Invoke(null, false);
                yield break;
            }

            AudioClip clip = DownloadHandlerAudioClip.GetContent(www);
            onLoad?.Invoke(clip, clip != null);
        }

        /// <summary>
        /// Writes the audio bytes to <paramref name="path"/>; logs and returns false on I/O or permission errors.
        /// Kept outside the coroutine body so the try/catch does not wrap any yield statement.
        /// </summary>
        private static bool TryWriteAudioFile(string path, byte[] data)
        {
            try
            {
                File.WriteAllBytes(path, data);
                return true;
            }
            catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
            {
                Debug.LogError("Failed to write TTS audio file: " + e.Message);
                return false;
            }
        }

        /// <summary>
        /// Converts a path to a file URL. Absolute paths go through <see cref="Uri"/> so that spaces,
        /// '#' and '%' are percent-encoded; anything else falls back to a plain "file://" prefix.
        /// </summary>
        private static string ToFileUrl(string path)
        {
            if (Uri.TryCreate(path, UriKind.Absolute, out Uri uri) && uri.IsFile)
            {
                return uri.AbsoluteUri;
            }

            return "file://" + path;
        }

        /// <summary>JSON request body: the text to speak and the model id.</summary>
        [Serializable]
        private class ElevenTTS
        {
            public string text;
            public string model_id;
        }
    }

}

UI 使用方法:

using TMPro;
using UnityEngine;
using UnityEngine.UI;

namespace PolarAI.Scripts.AICore.ElevenLabs
{
    /// <summary>
    /// Demo UI controller for ElevenLabs text-to-speech: reads text from an input field,
    /// requests a clip through <see cref="ElevenLabsCore"/>, plays it on an AudioSource and
    /// drives a progress slider, a play/stop button and optional status/time labels.
    /// </summary>
    public class ElevenLabsExample : MonoBehaviour
    {
        // Smallest gap (seconds) kept between a seek target and the end of the clip.
        // NOTE(review): seeking exactly to the clip length may be rejected as out of range — confirm.
        private const float SeekEndMargin = 0.01f;

        [Header("ElevenLabs API 设置")] public string ElevenLabsApiKey = "";
        public string VoiceId = "";

        [Header("UI 组件")] public TMP_InputField TextInput;
        public Button GenBtn;
        public Button PlayStopBtn;
        public Slider VoiceSlider;
        public AudioSource AudioSource;

        [Header("可选:状态文本")] public TMP_Text StatusText;
        public TMP_Text TimeText; // Shows "current time / total time".

        private ElevenLabsCore ElevenLabsCore = new ElevenLabsCore();
        private AudioClip _currentClip;
        private SliderDragHandler _sliderDragHandler;
        private bool _isGenerating = false;
        private bool _isPlaying = false;
        private bool _isDraggingSlider = false;

        /// <summary>Initializes the core client, wires the UI events and resets the UI state.</summary>
        private void Start()
        {
            ElevenLabsCore.Initialize(ElevenLabsApiKey, VoiceId);

            SetupUIEvents();

            UpdateButtonStates();
            UpdateStatusText("就绪");

            // The slider is a normalized 0..1 progress bar; disabled until a clip exists.
            if (VoiceSlider != null)
            {
                VoiceSlider.minValue = 0f;
                VoiceSlider.maxValue = 1f;
                VoiceSlider.value = 0f;
                VoiceSlider.interactable = false;
            }
        }

        /// <summary>Subscribes to the button and slider events and attaches the drag helper to the slider.</summary>
        private void SetupUIEvents()
        {
            if (GenBtn != null)
            {
                GenBtn.onClick.AddListener(OnGenerateButtonClicked);
            }

            if (PlayStopBtn != null)
            {
                PlayStopBtn.onClick.AddListener(OnPlayStopButtonClicked);
            }

            // The slider also acts as a seek bar: dragging it changes the playback position.
            if (VoiceSlider != null)
            {
                VoiceSlider.onValueChanged.AddListener(OnProgressChanged);

                // Keep a reference so the helper can be detached again in OnDestroy.
                _sliderDragHandler = VoiceSlider.gameObject.AddComponent<SliderDragHandler>();
                _sliderDragHandler.onBeginDrag = OnSliderDragBegan;
                _sliderDragHandler.onEndDrag = OnSliderDragEnded;
            }
        }

        /// <summary>Marks the slider as being dragged so Update stops overwriting it.</summary>
        private void OnSliderDragBegan()
        {
            _isDraggingSlider = true;
        }

        /// <summary>Applies the slider position as the new playback time once the drag ends.</summary>
        private void OnSliderDragEnded()
        {
            _isDraggingSlider = false;
            if (AudioSource == null || _currentClip == null || VoiceSlider == null)
            {
                return;
            }

            // Clamp just short of the end so the seek target always lies inside the clip.
            float maxSeek = Mathf.Max(0f, _currentClip.length - SeekEndMargin);
            AudioSource.time = Mathf.Clamp(VoiceSlider.value * _currentClip.length, 0f, maxSeek);
        }

        /// <summary>Validates the input text, API key and voice id, then starts generation.</summary>
        private void OnGenerateButtonClicked()
        {
            if (_isGenerating)
            {
                UpdateStatusText("正在生成中,请稍候...");
                return;
            }

            if (TextInput == null || string.IsNullOrWhiteSpace(TextInput.text))
            {
                UpdateStatusText("请输入要转换的文本");
                return;
            }

            if (string.IsNullOrWhiteSpace(ElevenLabsApiKey))
            {
                UpdateStatusText("错误:请设置 ElevenLabs API Key");
                Debug.LogError("ElevenLabs API Key 未设置!");
                return;
            }

            if (string.IsNullOrWhiteSpace(VoiceId))
            {
                UpdateStatusText("错误:请设置 Voice ID");
                Debug.LogError("Voice ID 未设置!");
                return;
            }

            GenerateSpeech(TextInput.text);
        }

        /// <summary>Stops playback, resets the progress UI and asks the core client for a new clip.</summary>
        private void GenerateSpeech(string text)
        {
            _isGenerating = true;
            UpdateButtonStates();
            UpdateStatusText("正在生成语音...");

            // Stop whatever is currently playing.
            if (_isPlaying && AudioSource != null)
            {
                AudioSource.Stop();
                _isPlaying = false;
            }

            // Reset the progress bar.
            if (VoiceSlider != null)
            {
                VoiceSlider.value = 0f;
                VoiceSlider.interactable = false;
            }

            UpdateTimeText(0f, 0f);

            ElevenLabsCore.TextToSound(text, OnSpeechGenerated);
        }

        /// <summary>
        /// Completion callback for generation. On success, swaps in the new clip (destroying the
        /// previous one) and starts playback; on failure, only the status is updated.
        /// </summary>
        private void OnSpeechGenerated(AudioClip clip, bool success)
        {
            // The request may finish after this component was destroyed; drop the clip instead of touching dead UI.
            if (this == null)
            {
                if (clip != null)
                {
                    Destroy(clip);
                }

                return;
            }

            _isGenerating = false;

            if (success && clip != null)
            {
                if (_currentClip != clip)
                {
                    // Destroy the previous clip so generated clips do not pile up.
                    ReleaseCurrentClip();
                }

                _currentClip = clip;
                UpdateStatusText("语音生成成功!");
                Debug.Log("语音生成成功,时长: " + clip.length + " 秒");

                // Enable the progress bar.
                if (VoiceSlider != null)
                {
                    VoiceSlider.interactable = true;
                }

                // Auto-play the new clip.
                PlayAudio();
            }
            else
            {
                UpdateStatusText("语音生成失败");
                Debug.LogError("语音生成失败");
            }

            UpdateButtonStates();
        }

        /// <summary>Detaches the current clip from the AudioSource (if attached) and destroys it.</summary>
        private void ReleaseCurrentClip()
        {
            if (_currentClip == null)
            {
                return;
            }

            if (AudioSource != null && AudioSource.clip == _currentClip)
            {
                AudioSource.Stop();
                AudioSource.clip = null;
                _isPlaying = false;
            }

            Destroy(_currentClip);
            _currentClip = null;
        }

        /// <summary>Toggles between playing and stopped when a clip is available.</summary>
        private void OnPlayStopButtonClicked()
        {
            if (_currentClip == null)
            {
                UpdateStatusText("请先生成语音");
                return;
            }

            if (_isPlaying)
            {
                StopAudio();
            }
            else
            {
                PlayAudio();
            }
        }

        /// <summary>Assigns the current clip to the AudioSource and starts playback.</summary>
        private void PlayAudio()
        {
            if (AudioSource == null)
            {
                Debug.LogError("AudioSource 未设置!");
                return;
            }

            if (_currentClip == null)
            {
                UpdateStatusText("没有可播放的音频");
                return;
            }

            AudioSource.clip = _currentClip;
            AudioSource.Play();
            _isPlaying = true;
            UpdateStatusText("正在播放...");
            UpdateButtonStates();
        }

        /// <summary>Stops playback and refreshes the status and buttons.</summary>
        private void StopAudio()
        {
            if (AudioSource != null)
            {
                AudioSource.Stop();
            }

            _isPlaying = false;
            UpdateStatusText("已停止播放");
            UpdateButtonStates();
        }

        /// <summary>While the user drags the slider, previews the time that the slider position maps to.</summary>
        private void OnProgressChanged(float value)
        {
            if (_isDraggingSlider && _currentClip != null)
            {
                float currentTime = value * _currentClip.length;
                UpdateTimeText(currentTime, _currentClip.length);
            }
        }

        /// <summary>Per-frame sync of the slider and time label with playback; also detects playback end.</summary>
        private void Update()
        {
            // Mirror playback progress into the UI, but leave both the slider and the time label
            // alone during a drag so the preview written by OnProgressChanged is not overwritten.
            if (_isPlaying && AudioSource != null && AudioSource.isPlaying && _currentClip != null
                && !_isDraggingSlider)
            {
                // Skip the division for a zero-length clip, which would yield NaN.
                if (VoiceSlider != null && _currentClip.length > 0f)
                {
                    VoiceSlider.value = AudioSource.time / _currentClip.length;
                }

                UpdateTimeText(AudioSource.time, _currentClip.length);
            }

            // Detect that playback has ended.
            if (_isPlaying && AudioSource != null && !AudioSource.isPlaying)
            {
                _isPlaying = false;
                UpdateStatusText("播放完成");
                UpdateButtonStates();

                // Rewind the progress bar to the start.
                if (VoiceSlider != null)
                {
                    VoiceSlider.value = 0f;
                }

                if (_currentClip != null)
                {
                    UpdateTimeText(0f, _currentClip.length);
                }
            }
        }

        /// <summary>Refreshes the interactable state and the label of both buttons.</summary>
        private void UpdateButtonStates()
        {
            // Generate button.
            if (GenBtn != null)
            {
                GenBtn.interactable = !_isGenerating;
                var btnText = GenBtn.GetComponentInChildren<TMP_Text>();
                if (btnText != null)
                {
                    btnText.text = _isGenerating ? "生成中..." : "生成语音";
                }
            }

            // Play/stop button.
            if (PlayStopBtn != null)
            {
                PlayStopBtn.interactable = _currentClip != null && !_isGenerating;
                var btnText = PlayStopBtn.GetComponentInChildren<TMP_Text>();
                if (btnText != null)
                {
                    btnText.text = _isPlaying ? "停止" : "播放";
                }
            }
        }

        /// <summary>Writes the message to the optional status label and to the log.</summary>
        private void UpdateStatusText(string message)
        {
            if (StatusText != null)
            {
                StatusText.text = message;
            }

            Debug.Log($"[ElevenLabs] {message}");
        }

        /// <summary>Shows "current / total" in mm:ss form on the optional time label.</summary>
        private void UpdateTimeText(float currentTime, float totalTime)
        {
            if (TimeText != null)
            {
                TimeText.text = $"{FormatTime(currentTime)} / {FormatTime(totalTime)}";
            }
        }

        /// <summary>Formats a duration in seconds as zero-padded minutes and seconds.</summary>
        private string FormatTime(float seconds)
        {
            int minutes = Mathf.FloorToInt(seconds / 60f);
            int secs = Mathf.FloorToInt(seconds % 60f);
            return $"{minutes:00}:{secs:00}";
        }

        /// <summary>Unsubscribes from UI events, detaches the drag helper, stops playback and frees the clip.</summary>
        private void OnDestroy()
        {
            if (GenBtn != null)
            {
                GenBtn.onClick.RemoveListener(OnGenerateButtonClicked);
            }

            if (PlayStopBtn != null)
            {
                PlayStopBtn.onClick.RemoveListener(OnPlayStopButtonClicked);
            }

            if (VoiceSlider != null)
            {
                VoiceSlider.onValueChanged.RemoveListener(OnProgressChanged);
            }

            // The helper was added to the slider object at runtime; clear its callbacks and remove it
            // so it cannot call back into this destroyed component.
            if (_sliderDragHandler != null)
            {
                _sliderDragHandler.onBeginDrag = null;
                _sliderDragHandler.onEndDrag = null;
                Destroy(_sliderDragHandler);
                _sliderDragHandler = null;
            }

            // Stop playback.
            if (AudioSource != null && AudioSource.isPlaying)
            {
                AudioSource.Stop();
            }

            ReleaseCurrentClip();
        }

        /// <summary>
        /// Public entry point: generates speech for the given text.
        /// Blank text is ignored, and so are calls made while a generation is already running.
        /// </summary>
        public void GenerateSpeechFromText(string text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            // Avoid overlapping requests whose callbacks would race on the shared UI state.
            if (_isGenerating)
            {
                UpdateStatusText("正在生成中,请稍候...");
                return;
            }

            GenerateSpeech(text);
        }

        /// <summary>Public entry point: stores the API key and re-initializes the core client.</summary>
        public void SetApiKey(string apiKey)
        {
            ElevenLabsApiKey = apiKey;
            ElevenLabsCore.Initialize(ElevenLabsApiKey, VoiceId);
        }

        /// <summary>Public entry point: stores the voice id and re-initializes the core client.</summary>
        public void SetVoiceId(string voiceId)
        {
            VoiceId = voiceId;
            ElevenLabsCore.Initialize(ElevenLabsApiKey, VoiceId);
        }
    }

    /// <summary>
    /// Helper component that forwards the begin-drag and end-drag pointer events
    /// it receives to two optional callbacks.
    /// </summary>
    public class SliderDragHandler : MonoBehaviour,
        UnityEngine.EventSystems.IBeginDragHandler,
        UnityEngine.EventSystems.IEndDragHandler
    {
        /// <summary>Called from <see cref="OnBeginDrag"/>; may be left unassigned.</summary>
        public System.Action onBeginDrag;

        /// <summary>Called from <see cref="OnEndDrag"/>; may be left unassigned.</summary>
        public System.Action onEndDrag;

        /// <summary>Begin-drag event: runs <see cref="onBeginDrag"/> if one is assigned.</summary>
        public void OnBeginDrag(UnityEngine.EventSystems.PointerEventData eventData) => Raise(onBeginDrag);

        /// <summary>End-drag event: runs <see cref="onEndDrag"/> if one is assigned.</summary>
        public void OnEndDrag(UnityEngine.EventSystems.PointerEventData eventData) => Raise(onEndDrag);

        // Invokes the callback only when it has been assigned.
        private static void Raise(System.Action callback)
        {
            if (callback != null)
            {
                callback();
            }
        }
    }
}

把對應的 API Key 和 UI 欄位填上去即可

Demo 效果:

TTS_Demo.mp4


上一篇
Day 18 - ComfyUI 本地化的圖片生成 API
下一篇
Day 20 - Unity RAG 開發技術介紹
系列文
Vibe Unity - AI時代的遊戲開發工作流25
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言