The data preprocessing is the same as in the PyTorch article, so I won't repeat it here; this post only covers the model and the training parts.
I later found that Keras's BatchNormalization trains fine too, so I ended up not training with my own BatchNormalization implementation.
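As a side note: if you want Keras's BatchNormalization to normalize over the channel axis the way PyTorch's BatchNorm1d does on (batch, channels, time) tensors, you can point it at that axis directly instead of transposing around the layer as the code below does. A minimal sketch (not part of the original code):

import tensorflow as tf

bn = tf.keras.layers.BatchNormalization(axis=1)  # normalize the channel axis
y = bn(tf.random.normal((2, 512, 176)), training=True)
print(y.shape)  # (2, 512, 176)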
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, BatchNormalization, ReLU,
                                     Activation, LSTM, Bidirectional, Dense)
from tensorflow.keras.models import Model
from IPython.display import clear_output

def Encoder(input_shape, dim_neck=32, dim_emb=256, freq=22):
    # Mirrors PyTorch's xavier_uniform_ with gain sqrt(2). GlorotUniform's
    # argument is a seed, not a gain, so VarianceScaling with
    # scale = gain**2 is the correct equivalent.
    initializer = tf.keras.initializers.VarianceScaling(
        scale=2.0, mode="fan_avg", distribution="uniform")
    inp = Input(shape=input_shape)
    ### Turn the 336 (dim_emb + 80) dimension into 512
    ###
    x = tf.transpose(inp, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = ReLU()(x)
    ###
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = ReLU()(x)
    ###
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = ReLU()(x)
    ##############################
    # LSTM over the 176 (time) dim #
    ##############################
    x = tf.transpose(x, (0, 2, 1))
    # was LSTM(32); use dim_neck so the parameter actually takes effect
    lstm_tf_1 = LSTM(dim_neck, return_sequences=True)
    lstm_tf_2 = LSTM(dim_neck, return_sequences=True)
    x = Bidirectional(lstm_tf_1)(x)
    x = Bidirectional(lstm_tf_2)(x)
    ## Note the slicing: this is the downsampling step
    x_up = x[:, :, :dim_neck]    # forward half of the BiLSTM output
    x_down = x[:, :, dim_neck:]  # backward half
    codes = []
    for i in range(0, LEN_CROP, freq):  # was the FREQ global; use the freq argument
        codes.append(tf.concat((x_up[:, i + freq - 1, :], x_down[:, i, :]), axis=-1))
    return Model(inputs=inp, outputs=codes, name="content_encoder")
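A quick shape check for the encoder (a sketch, not part of the original code; LEN_CROP is the global used in the loop above, so it has to exist before the model is built):

LEN_CROP = 176
enc = Encoder((336, LEN_CROP))  # 336 = dim_emb (256) + 80 mel bins
dummy = np.zeros((2, 336, LEN_CROP), dtype=np.float32)
codes = enc(dummy)
print(len(codes), codes[0].shape)  # 176 / 22 = 8 codes, each (2, 2 * dim_neck) = (2, 64)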
def Decoder(encoder_input_shape):
    # same xavier_uniform_-with-gain-sqrt(2) substitute as in the encoder
    initializer = tf.keras.initializers.VarianceScaling(
        scale=2.0, mode="fan_avg", distribution="uniform")
    inputs = Input(shape=encoder_input_shape)
    # shape entering the LSTM is (batch, 176, 320) with the default dim_neck = 32
    x = LSTM(512, return_sequences=True, kernel_initializer=initializer)(inputs)
    """
    Three 5x1 Conv + BN + ReLU blocks
    """
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = tf.transpose(x, (0, 2, 1))
    #################################
    # LSTMs over the 512 (feature) dim #
    #################################
    x = LSTM(1024, return_sequences=True, kernel_initializer=initializer)(x)
    x = LSTM(1024, return_sequences=True, kernel_initializer=initializer)(x)
    """
    Linear
    """
    x = Dense(80)(x)
    return Model(inputs=inputs, outputs=x, name="decoder")
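Same idea for the decoder (a sketch; 320 = 2 * dim_neck + dim_emb with the default dim_neck = 32):

dec = Decoder((176, 320))
print(dec(np.zeros((2, 176, 320), dtype=np.float32)).shape)  # (2, 176, 80)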
def Postnet(input_shape):
    # PyTorch's xavier_uniform_ with gain 5/3 (the tanh gain), via VarianceScaling
    initializer = tf.keras.initializers.VarianceScaling(
        scale=(5 / 3) ** 2, mode="fan_avg", distribution="uniform")
    # plain Glorot (gain 1) for the final, linear layer
    linear_initializer = tf.keras.initializers.GlorotUniform()
    inp = Input(shape=input_shape)
    """
    This is the second output;
    turn the 80 dimension into 512
    """
    """
    Four 5x1 Conv + BN + tanh blocks
    """
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(inp)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = Activation("tanh")(x)
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = Activation("tanh")(x)
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = Activation("tanh")(x)
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(512, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    x = Activation("tanh")(x)
    x = tf.transpose(x, (0, 2, 1))
    x = Conv1D(80, kernel_size=5, strides=1, padding="same", dilation_rate=1, kernel_initializer=linear_initializer)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    return Model(inputs=inp, outputs=x, name="postnet")
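And the postnet, which predicts a residual on top of the decoder's mel output (a sketch; note the output comes back as (batch, 80, time) and is transposed before the add in call() below):

post = Postnet((176, 80))
print(post(np.zeros((2, 176, 80), dtype=np.float32)).shape)  # (2, 80, 176)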
class Autovc(tf.keras.Model):
    def __init__(self, dim_neck=32, dim_emb=256, len_crop=176, freq=22):
        super(Autovc, self).__init__()
        # keep the hyperparameters around instead of relying on globals in call()
        self.dim_neck = dim_neck
        self.dim_emb = dim_emb
        self.len_crop = len_crop
        self.encoder = Encoder((dim_emb + 80, len_crop), dim_neck, dim_emb, freq)
        # decoder width is 2 * dim_neck + dim_emb (320 when dim_neck = 32);
        # it was hard-coded to 320 before, which breaks for other dim_neck values
        self.decoder = Decoder((len_crop, 2 * dim_neck + dim_emb))
        self.postnet = Postnet((len_crop, 80))

    def call(self, inputs):
        x = inputs[0]
        c_org = inputs[1]
        c_trg = inputs[-1]
        batch_size = tf.shape(x)[0]
        x = tf.transpose(x, (0, 2, 1))
        c_org = tf.expand_dims(c_org, axis=1)
        c_org = tf.transpose(
            tf.broadcast_to(c_org, (tf.shape(c_org)[0], self.len_crop, tf.shape(c_org)[-1])),
            (0, 2, 1))
        # concatenate along axis 1 (the 80-channel dimension)
        x = tf.concat([x, c_org], axis=1)
        codes = self.encoder(x)
        if c_trg is None:
            return tf.concat(codes, axis=-1)
        tmp = []
        for code in codes:
            tc = tf.expand_dims(code, axis=1)
            # upsample each code back to len_crop frames; the code width is
            # 2 * dim_neck (it was hard-coded to 64, i.e. dim_neck = 32, before)
            tmp.append(tf.broadcast_to(tc, (batch_size, self.len_crop // len(codes), 2 * self.dim_neck)))
        code_exp = tf.concat(tmp, axis=1)
        c_trg = tf.expand_dims(c_trg, axis=1)
        c_trg = tf.broadcast_to(c_trg, (batch_size, tf.shape(x)[-1], self.dim_emb))
        # concatenate along the last (feature) dimension
        encoder_outputs = tf.concat((code_exp, c_trg), axis=-1)
        mel_outputs = self.decoder(encoder_outputs)
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = tf.transpose(mel_outputs_postnet, (0, 2, 1))
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        return mel_outputs, mel_outputs_postnet, tf.concat(codes, axis=-1)
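Putting it all together, a quick forward-pass shape check (a sketch; assumes the LEN_CROP = 176 global defined earlier, and autovc_test is just an illustrative name):

autovc_test = Autovc(dim_neck=44, dim_emb=256, len_crop=176, freq=22)
x = np.zeros((2, 176, 80), dtype=np.float32)   # a batch of mel spectrograms
emb = np.zeros((2, 256), dtype=np.float32)     # speaker embeddings
mel, mel_psnt, codes = autovc_test([x, emb, emb])
print(mel.shape, mel_psnt.shape, codes.shape)  # (2, 176, 80), (2, 176, 80), (2, 704)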
Very oddly, the TensorFlow version only works with DIM_NECK set to 44 (at 32 it produces a voice that doesn't sound like anyone in particular). The cause is unknown at the moment; my guess is some numerical precision issue?
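For reference, these are the width relationships that have to stay consistent whenever DIM_NECK changes (just the arithmetic, not part of the original code):

DIM_NECK, DIM_EMB, LEN_CROP, FREQ = 44, 256, 176, 22
code_width = 2 * DIM_NECK              # forward + backward BiLSTM halves = 88
decoder_width = code_width + DIM_EMB   # per-frame decoder input = 344 (320 at DIM_NECK = 32)
num_codes = LEN_CROP // FREQ           # codes per utterance after downsampling = 8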
def train(step=30000, batch_size=2):
    print(".....Start.....")
    data_iter = iter(vcc_loader)
    for j in range(step):
        # same as on the PyTorch side; this keeps the input
        # identical to the PyTorch version
        try:
            x_real, emb_org = next(data_iter)
        except StopIteration:
            data_iter = iter(vcc_loader)
            x_real, emb_org = next(data_iter)
        # the loader yields torch tensors, so remember to convert back to numpy
        x_real = x_real.detach().cpu().numpy().astype(np.float32)
        emb_org = emb_org.detach().cpu().numpy().astype(np.float32)
        # train_step is defined below
        g_loss_id, g_loss_id_psnt, g_loss_cd = train_step(x_real, emb_org, emb_org)
        if (j + 1) % 10 == 0:
            print(f"Step:{j}")
            print(f"G_loss_id:{g_loss_id}")
            print(f"G_loss_id_psnt:{g_loss_id_psnt}")
            print(f"G_loss_cd:{g_loss_cd}")
        if (j + 2) % 10 == 0:
            clear_output(wait=True)
        # save whenever you like; to use the weights later, just load them with
        # autovc.encoder.load_weights("encoder_weights")
        if (j + 1) % 10000 == 0:
            autovc.encoder.save_weights(f"model/encoder_weights_step_{j+1}.h5")
            autovc.decoder.save_weights(f"model/decoder_weights_step_{j+1}.h5")
            autovc.postnet.save_weights(f"model/postnet_weights_step_{j+1}.h5")
autovc_optimizer = tf.keras.optimizers.Adam(0.0001)
@tf.function
def train_step(x_real, emb_org, emb_trg):
    # tf.GradientTape records the forward ops so that tape.gradient() below
    # plays the role of loss.backward() in PyTorch
    with tf.GradientTape() as autovc_tape:
        x_identic, x_identic_psnt, code_real = autovc([x_real, emb_org, emb_trg])
        # see yesterday's post for how these losses are derived
        g_loss_id = mse_loss(x_real, x_identic)
        g_loss_id_psnt = mse_loss(x_real, x_identic_psnt)
        # passing None as the target embedding makes call() return only the codes
        code_reconst = autovc([x_identic_psnt, emb_org, None])
        g_loss_cd = l1_loss(code_real, code_reconst)
        g_loss = g_loss_id + g_loss_id_psnt + g_loss_cd
    gradients_of_autovc = autovc_tape.gradient(g_loss, autovc.trainable_variables)
    autovc_optimizer.apply_gradients(zip(gradients_of_autovc, autovc.trainable_variables))
    return g_loss_id, g_loss_id_psnt, g_loss_cd
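The mse_loss and l1_loss helpers come from yesterday's loss post. For completeness, a minimal sketch assuming plain MSE and L1 as in the paper, plus the module-scope wiring needed before kicking things off (vcc_loader is the data loader from the preprocessing post, assumed here):

mse_loss = tf.keras.losses.MeanSquaredError()
l1_loss = tf.keras.losses.MeanAbsoluteError()

autovc = Autovc(dim_neck=44, dim_emb=256, len_crop=176, freq=22)
train(step=30000)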
And with that, the happy training can finally begin ~~~
At this point we've built AutoVC twice, and the TF version sounds the same as the PyTorch one. The only puzzling part is the dim_neck parameter: why does PyTorch work at 32 while TF doesn't? In any case, with freq = 22 and dim_neck = 44, the converted voices from the two versions sound about the same to me.
Update (10/1):
I've found the cause: something went wrong in the down/upsampling step. It's fixed now, and both versions produce the same results.
Update (10/2):
The refactored, latest version of the code is done; you can download it here.
Now that we have a model, the next step is figuring out how to make it better. The LSTMs here could plausibly be replaced with a Transformer, or the training procedure could be tweaked (a few fellow participants in this year's Ironman challenge are writing dedicated Transformer series). For my part, I'm planning to switch over to sharing some experience with GAN-based music generation and the related topic of emotional responses to music XD. So the voice conversion part wraps up here. We've made it past the halfway point; keep going, everyone!