上一篇討論到SeamlessM4T在轉譯過程中最重要的是Translator以及predict這兩個函式,發現這兩支主程式又分別引用了UnitYModel類別及get_prediction函式。
class UnitYModel(EncoderDecoderModel):
    """Represents a UnitY model as described in
    :cite:t:`https://doi.org/10.48550/arxiv.2212.08055`.

    Note that this implementation is augmented with a text encoder to enable
    translating from text.
    """

    # Attribute declarations for the submodules and settings used below.
    model_dim: int
    input_modality: str
    speech_encoder_frontend: TransformerFrontend
    speech_encoder: TransformerEncoder
    text_encoder_frontend: Optional[TransformerFrontend]
    text_encoder: Optional[TransformerEncoder]
    text_decoder_frontend: TransformerFrontend
    text_decoder: TransformerDecoder
    final_proj: Projection
    t2u_model: Optional["UnitYT2UModel"]
    pad_idx: Optional[int]

    def __init__(
        self,
        speech_encoder_frontend: TransformerFrontend,
        speech_encoder: TransformerEncoder,
        text_encoder_frontend: Optional[TransformerFrontend],
        text_encoder: Optional[TransformerEncoder],
        text_decoder_frontend: TransformerFrontend,
        text_decoder: TransformerDecoder,
        final_proj: Projection,
        t2u_model: Optional["UnitYT2UModel"],
        pad_idx: Optional[int],
        input_modality: str = "speech",
    ) -> None:
        """Initialize the model.

        :param text_encoder_frontend, text_encoder:
            Must be specified together (both set or both ``None``);
            a ``ValueError`` is raised otherwise.
        :param t2u_model:
            Optional text-to-unit model; ``None`` for text-only output.
        :param input_modality:
            Either ``"speech"`` or ``"text"``; selects the encoder used
            by :meth:`encode`.
        """
        # The model dimension is taken from the speech encoder.
        model_dim = speech_encoder.model_dim

        super().__init__(model_dim)

        self.input_modality = input_modality

        self.speech_encoder_frontend = speech_encoder_frontend
        self.speech_encoder = speech_encoder

        if text_encoder is not None:
            if text_encoder_frontend is None:
                raise ValueError(
                    "Both `text_encoder` and `text_encoder_frontend` must be "
                    "specified, but `text_encoder_frontend` is `None`."
                )

            self.text_encoder_frontend = text_encoder_frontend
            self.text_encoder = text_encoder
        else:
            if text_encoder_frontend is not None:
                raise ValueError(
                    "Both `text_encoder` and `text_encoder_frontend` must be "
                    "specified, but `text_encoder` is `None`."
                )

            # Register the absent submodules as `None` so they still appear
            # in the module tree.
            self.register_module("text_encoder_frontend", None)
            self.register_module("text_encoder", None)

        self.text_decoder_frontend = text_decoder_frontend
        self.text_decoder = text_decoder

        self.final_proj = final_proj

        if t2u_model is not None:
            self.t2u_model = t2u_model
        else:
            self.register_module("t2u_model", None)

        self.pad_idx = pad_idx

        check_model_dim(self)

    @finaloverride
    def encode(
        self, seqs: Tensor, seq_lens: Optional[Tensor]
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Dispatch to the speech or text encoder based on ``input_modality``."""
        if self.input_modality == "speech":
            return self.encode_speech(seqs, seq_lens)

        if self.input_modality == "text":
            return self.encode_text(seqs, seq_lens)

        raise RuntimeError(
            f"`input_modality` must be 'speech' or 'text', but is "
            f"'{self.input_modality}' instead."
        )

    def encode_speech(
        self, seqs: Tensor, seq_lens: Optional[Tensor]
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Run the speech frontend, then the speech encoder.

        Returns the encoded sequences and their padding mask.
        """
        seqs, padding_mask = self.speech_encoder_frontend(seqs, seq_lens)

        return self.speech_encoder(seqs, padding_mask)  # type: ignore[no-any-return]

    def encode_text(
        self, seqs: Tensor, seq_lens: Optional[Tensor]
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Run the text frontend, then the text encoder.

        :raises ValueError: If the model was built without a text encoder.
        """
        if self.text_encoder is None:
            raise ValueError(
                "`encode_text()` requires a text encoder, but the current "
                "UnitY model does not have one."
            )

        # `__init__` guarantees the frontend exists whenever the encoder does.
        assert self.text_encoder_frontend is not None

        seqs, padding_mask = self.text_encoder_frontend(seqs, seq_lens)

        return self.text_encoder(seqs, padding_mask)  # type: ignore[no-any-return]

    @finaloverride
    def decode(
        self,
        seqs: Tensor,
        seq_lens: Optional[Tensor],
        encoder_output: Tensor,
        encoder_padding_mask: Optional[Tensor],
        state_bag: Optional[IncrementalStateBag] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Decode target sequences against the encoder output.

        ``state_bag`` holds incremental decoding state so context is
        preserved across generation steps.
        """
        seqs, padding_mask = self.text_decoder_frontend(seqs, seq_lens, state_bag)

        return self.text_decoder(  # type: ignore[no-any-return]
            seqs, padding_mask, encoder_output, encoder_padding_mask, state_bag
        )

    @finaloverride
    def project(
        self, decoder_output: Tensor, decoder_padding_mask: Optional[Tensor]
    ) -> SequenceModelOutput:
        """Project decoder output to unnormalized vocabulary logits and wrap
        them in a :class:`SequenceModelOutput`."""
        logits = self.final_proj(decoder_output)

        return SequenceModelOutput(logits, self.pad_idx)
@classmethod
def get_prediction(
    cls,
    model: UnitYModel,
    text_tokenizer: TextTokenizer,
    unit_tokenizer: UnitTokenizer,
    src: Dict[str, Tensor],
    input_modality: Modality,
    output_modality: Modality,
    tgt_lang: str,
    ngram_filtering: bool = False,
    text_max_len_a: int = 1,
    text_max_len_b: int = 200,
    unit_max_len_a: Optional[int] = None,
    unit_max_len_b: Optional[int] = None,
) -> Tuple[SequenceToTextOutput, Optional[SequenceToUnitOutput]]:
    """Generate a text translation and, for speech output, speech units.

    :param src:
        Source batch; must contain the keys ``"seqs"`` and ``"seq_lens"``.
    :param ngram_filtering:
        When ``True``, block 4-gram repetitions during both text and unit
        generation.
    :returns:
        The text generation output and, when ``output_modality`` is speech,
        the unit generation output (otherwise ``None``).
    """
    if unit_max_len_a is None:
        # For T2ST the source is text, which is far shorter than speech,
        # so the length multiplier must be larger.
        unit_max_len_a = 25 if input_modality == Modality.TEXT else 1

    text_opts = SequenceGeneratorOptions(
        beam_size=5, soft_max_seq_len=(text_max_len_a, text_max_len_b)
    )
    unit_opts = SequenceGeneratorOptions(
        beam_size=5, soft_max_seq_len=(unit_max_len_a, unit_max_len_b or 50)
    )

    if ngram_filtering:
        # Apply the same 4-gram repeat blocker to both generation passes.
        for opts in (text_opts, unit_opts):
            opts.logits_processor = NGramRepeatBlockProcessor(
                no_repeat_ngram_size=4
            )

    # The unit tokenizer is only needed when speech output was requested.
    wants_speech = output_modality == Modality.SPEECH

    # UnitYGenerator produces the text translation and speech units from
    # the UnitY model.
    generator = UnitYGenerator(
        model,
        text_tokenizer,
        tgt_lang,
        unit_tokenizer if wants_speech else None,
        text_opts=text_opts,
        unit_opts=unit_opts,
    )

    return generator(
        src["seqs"],
        src["seq_lens"],
        input_modality.value,
        output_modality.value,
        ngram_filtering=ngram_filtering,
    )
從本篇的程式碼結構研究,可以看出真正在做文本或語音處理的是UnitY模型,做大量文本或語音的編碼及解碼,解碼後的東西是一堆向量矩陣,經過get_prediction作語句過濾及生成後,才能得到人看得懂的文本及語音單元,而語音單元要再傳入前篇主程式介紹到的Vocoder才會生成語音訊號。如此看來UnitY模型至關重要,接下來要仔細研究UnitY模型。