Introduction
語音識別(speech recognition)是將語音內容轉換為對應文字的技術。
連結時間序列分類(Connectionist Temporal Classification, CTC),是時間序列(seq2seq)的一種訓練方法,使用遞歸神經網路標記序列資料。
Tasks
匯入所需的模組。
# Standard-library and third-party imports for the CTC speech tutorial.
import os
import cntk as C
import numpy as np
import cntk.tests.test_utils
# CNTK test helper: pick CPU or GPU according to the pytest environment.
cntk.tests.test_utils.set_device_from_pytest_env()
1.資料讀取(Data reading):
下載資料至本地端資料夾。
# Locate the CNTK speech test data locally; if absent, download the files
# from the CNTK release-2.3.1 GitHub tree into the current directory.
data_dir = os.path.join("..", "Tests", "EndToEndTests", "Speech", "Data")
print("Current directory {0}".format(os.getcwd()))

if os.path.exists(data_dir):
    if os.path.realpath(data_dir) != os.path.realpath(os.getcwd()):
        os.chdir(data_dir)
        print("Changed to data directory {0}".format(data_dir))
else:
    print("Data directory not available locally. Downloading data.")
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2 fallback
    # `folder`/`filename` instead of `dir`/`file`: avoid shadowing builtins.
    for folder in ['GlobalStats', 'Features']:
        if not os.path.exists(folder):
            os.mkdir(folder)
    for filename in ['glob_0000.scp', 'glob_0000.write.scp', 'glob_0000.mlf',
                     'state_ctc.list', 'GlobalStats/mean.363',
                     'GlobalStats/var.363', 'Features/000000000.chunk']:
        if os.path.exists(filename):
            print('Already downloaded %s' % filename)
        else:
            print('Downloading %s' % filename)
            urlretrieve('https://github.com/Microsoft/CNTK/raw/release/2.3.1/Tests/EndToEndTests/Speech/Data/%s' % filename, filename)
HTK/MLF格式的聲學模型(Acoustic Model, AM)訓練資料集:
讀取資料,使用 CNTK 的 HTK 資料讀取器,HTKFeatureDeserializer、HTKMLFDeserializer。
# Input variables: 33-dimensional acoustic feature sequences and
# 133-dimensional label sequences (132 real labels + the CTC blank).
feature_dimension = 33
feature = C.sequence.input((feature_dimension))
label_dimension = 133
label = C.sequence.input((label_dimension))

# HTK/MLF training data: feature script file (.scp), master label file
# (.mlf), and the state list mapping label names to output indices.
train_feature_filepath = "glob_0000.scp"
train_label_filepath = "glob_0000.mlf"
mapping_filepath = "state_ctc.list"
try:
    train_feature_stream = C.io.HTKFeatureDeserializer(
        C.io.StreamDefs(speech_feature=C.io.StreamDef(shape=feature_dimension, scp=train_feature_filepath)))
    train_label_stream = C.io.HTKMLFDeserializer(
        mapping_filepath,
        C.io.StreamDefs(speech_label=C.io.StreamDef(shape=label_dimension, mlf=train_label_filepath)),
        True)  # phoneBoundaries=True — presumably needed for CTC; confirm against CNTK docs
    # frame_mode=False keeps whole utterances together (required for CTC).
    train_data_reader = C.io.MinibatchSource([train_feature_stream, train_label_stream], frame_mode=False)
    train_input_map = {feature: train_data_reader.streams.speech_feature,
                       label: train_data_reader.streams.speech_label}
except RuntimeError:
    # Re-raise after logging: swallowing the error would leave the reader
    # names undefined and fail later with a confusing NameError.
    print("ERROR: not able to read features or labels")
    raise
2.資料處理(Data preprocessing):
將輸入的特徵值減去平均值,乘以標準差的倒數(reciprocal),將其標準化。
# Normalize the input features: subtract the precomputed mean and multiply
# by the precomputed inverse standard deviation (read from GlobalStats).
mean_path = os.path.join("GlobalStats", "mean.363")
inv_stddev_path = os.path.join("GlobalStats", "var.363")
feature_mean = np.fromfile(mean_path, dtype=float, count=feature_dimension)
feature_inverse_stddev = np.fromfile(inv_stddev_path, dtype=float, count=feature_dimension)
feature_normalized = (feature - feature_mean) * feature_inverse_stddev

# Acoustic model: three stacked recurrent LSTM layers (1024 units each)
# followed by a dense layer over the label set. All layers are constructed
# inside default_options so they pick up sigmoid as the default activation.
with C.default_options(activation=C.sigmoid):
    lstm_stack = C.layers.For(range(3),
                              lambda: C.layers.Recurrence(C.layers.LSTM(1024)))
    model = C.layers.Sequential([lstm_stack, C.layers.Dense(label_dimension)])
z = model(feature_normalized)
3.建立模型(Model creation):
CTC準則函數通過結合labels_to_graph與forward_backward兩個函數來實作。
定義超參數。
labels_to_graph,轉換輸入的標籤序列,成為圖形結構資料。
forward_backward,實作CTC訓練所使用的前向-後向(forward-backward)演算法,依據標籤圖形計算準則值。
# Training hyper-parameters.
mbsize = 1024        # frames per minibatch
mbs_per_epoch = 10   # minibatches per epoch
max_epochs = 5

# CTC criterion: labels_to_graph converts the label sequence into a graph
# structure, and forward_backward scores the network output against it.
# Token id 132 is the CTC blank.
criteria = C.forward_backward(C.labels_to_graph(label), z,
                              blankTokenId=132, delayConstraint=3)

# Evaluation metric: edit distance, ignoring the blank token.
err = C.edit_distance_error(z, label, squashInputs=True, tokensToIgnore=[132])

# Momentum SGD with per-sample learning-rate and momentum schedules.
lr = C.learning_parameter_schedule_per_sample([(3, 0.01), (1, 0.001)])
mm = C.momentum_schedule([(1000, 0.9), (0, 0.99)], mbsize)
learner = C.momentum_sgd(z.parameters, lr, mm)

trainer = C.Trainer(z, (criteria, err), learner)
4.訓練模型(Learning the model):
# Train for max_epochs epochs of mbs_per_epoch minibatches each, reporting
# progress after every epoch.
C.logging.log_number_of_parameters(z)
progress_printer = C.logging.progress_print.ProgressPrinter(tag='Training',
                                                            num_epochs=max_epochs)
for epoch in range(max_epochs):
    for mb in range(mbs_per_epoch):
        minibatch = train_data_reader.next_minibatch(mbsize, input_map=train_input_map)
        trainer.train_minibatch(minibatch)
        progress_printer.update_with_trainer(trainer, with_metric=True)
    print('Trained on a total of ' + str(trainer.total_number_of_samples_seen) + ' frames')
    progress_printer.epoch_summary(with_metric=True)

# Persist the trained model (uncomment to enable):
# z.save('CTC_' + str(max_epochs) + 'epochs_' + str(mbsize) + 'mbsize_' + str(mbs_per_epoch) + 'mbs.model')
5.評估模型(Evaluation):
# Evaluate on the test feature set.
# NOTE(review): the test reader reuses the *training* label stream
# (glob_0000.mlf) — no separate test MLF exists in this dataset; verify if
# a held-out label file becomes available.
test_feature_filepath = "glob_0000.write.scp"
test_feature_stream = C.io.HTKFeatureDeserializer(
    C.io.StreamDefs(speech_feature=C.io.StreamDef(shape=feature_dimension, scp=test_feature_filepath)))
test_data_reader = C.io.MinibatchSource([test_feature_stream, train_label_stream], frame_mode=False)
test_input_map = {feature: test_data_reader.streams.speech_feature,
                  label: test_data_reader.streams.speech_label}

num_test_minibatches = 2
test_result = 0.0
for i in range(num_test_minibatches):
    test_minibatch = test_data_reader.next_minibatch(mbsize, input_map=test_input_map)
    eval_error = trainer.test_minibatch(test_minibatch)
    test_result += eval_error

# Average evaluation error over the test minibatches. The original bare
# expression only displays in a notebook; print it so a script run reports it.
print(round(test_result / num_test_minibatches, 2))