資料前處理部分跟 PyTorch 篇一樣,就不重複寫了,這邊只寫 model 跟 training 的部分。
後來發現 keras 的 BatchNormalization 也訓練得起來,就沒有用自己的 BatchNormalization 訓練了
def Encoder(input_shape,dim_neck = 32 , dim_emb=256 , freq = 22):
    """Build the content encoder as a Keras functional ``Model``.

    Three Conv1D(512, k=5) + BatchNormalization + ReLU stages are followed by
    two stacked bidirectional LSTMs. The LSTM output is then down-sampled to
    one code per ``freq`` time steps.

    Args:
        input_shape: Per-sample shape given to ``Input``. The two axes are
            swapped before the first Conv1D, so the last entry is used as the
            number of time steps.
        dim_neck: Channel index at which the bidirectional LSTM output is
            split into the two parts used for down-sampling.
        dim_emb: Not used inside this function; kept so existing callers
            keep working.
        freq: Down-sampling period, in time steps.

    Returns:
        A ``Model`` named ``"content_encoder"`` whose output is a list with
        one code tensor per ``freq``-sized window.

    Raises:
        ValueError: If the number of time steps is not a multiple of ``freq``.
    """
    # Xavier/Glorot-uniform with ReLU gain sqrt(2). GlorotUniform only accepts
    # a seed, so the gain is expressed through VarianceScaling, whose uniform
    # fan_avg limit is sqrt(3 * scale / fan_avg), i.e. scale = gain ** 2.
    initializer = tf.keras.initializers.VarianceScaling(
        scale=2.0, mode="fan_avg", distribution="uniform")

    # Use the model's own time length and the `freq` argument rather than
    # module-level constants, so the loop below always agrees with the
    # `i + freq - 1` index it produces.
    seq_len = input_shape[-1]
    if seq_len % freq != 0:
        raise ValueError(
            f"time length {seq_len} must be a multiple of freq {freq}")

    inp = Input(shape=input_shape)
    x = inp
    # Three conv stages; the first one maps the input feature axis to 512.
    # NOTE(review): BatchNormalization() is applied after the transpose, so
    # with its default axis it works on the last (time) axis rather than on
    # the 512 conv channels -- confirm this is intended.
    for _ in range(3):
        x = tf.transpose(x, (0, 2, 1))
        x = Conv1D(512, kernel_size=5, strides=1, padding="same",
                   dilation_rate=1, kernel_initializer=initializer)(x)
        x = tf.transpose(x, (0, 2, 1))
        x = BatchNormalization()(x)
        x = ReLU()(x)

    # Run the recurrent layers along the time axis.
    # NOTE(review): the LSTM width is fixed at 32 while the split below uses
    # dim_neck; for dim_neck != 32 the two parts have widths dim_neck and
    # 64 - dim_neck. Confirm whether LSTM(dim_neck) was intended (callers
    # that assume a 64-wide code would have to change with it).
    x = tf.transpose(x, (0, 2, 1))
    x = Bidirectional(LSTM(32, return_sequences=True))(x)
    x = Bidirectional(LSTM(32, return_sequences=True))(x)

    # Down-sampling: mind the slice colons. For every window take the first
    # part at the window's last step and the second part at its first step.
    x_up = x[:, :, :dim_neck]
    x_down = x[:, :, dim_neck:]
    codes = [
        tf.concat((x_up[:, i + freq - 1, :], x_down[:, i, :]), axis=-1)
        for i in range(0, seq_len, freq)
    ]
    return Model(inputs=inp, outputs=codes, name="content_encoder")
    
def Decoder(encoder_input_shape):
    """Build the decoder as a Keras functional ``Model``.

    Pipeline: LSTM(512) -> 3 x (Conv1D(512, k=5) + BatchNormalization + ReLU)
    -> 2 x LSTM(1024) -> Dense(80).

    Args:
        encoder_input_shape: Per-sample shape given to ``Input``; the tensor
            is fed straight into an LSTM with ``return_sequences=True``.

    Returns:
        A ``Model`` named ``"decoder"`` that emits 80 values per step.
    """
    # Xavier/Glorot-uniform with ReLU gain sqrt(2). GlorotUniform only accepts
    # a seed, so the gain is expressed through VarianceScaling, whose uniform
    # fan_avg limit is sqrt(3 * scale / fan_avg), i.e. scale = gain ** 2.
    initializer = tf.keras.initializers.VarianceScaling(
        scale=2.0, mode="fan_avg", distribution="uniform")

    inputs = Input(shape=encoder_input_shape)
    # The original author noted shape (2, 176, 320) entering this LSTM.
    x = LSTM(512, return_sequences=True, kernel_initializer=initializer)(inputs)

    # Three 5x1 Conv + BN + ReLU stages.
    # NOTE(review): BatchNormalization() is applied after the transpose, so
    # with its default axis it works on the last (time) axis rather than on
    # the 512 conv channels -- confirm this is intended.
    for _ in range(3):
        x = Conv1D(512, kernel_size=5, strides=1, padding="same",
                   dilation_rate=1, kernel_initializer=initializer)(x)
        x = tf.transpose(x, (0, 2, 1))
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = tf.transpose(x, (0, 2, 1))

    # Two stacked LSTMs over the 512-wide conv output.
    x = LSTM(1024, return_sequences=True, kernel_initializer=initializer)(x)
    x = LSTM(1024, return_sequences=True, kernel_initializer=initializer)(x)

    # Final linear projection.
    x = Dense(80)(x)
    return Model(inputs=inputs, outputs=x, name="decoder")
    
def Posnet(input_shape):
    """Build the post-net as a Keras functional ``Model``.

    Four Conv1D(512, k=5) + BatchNormalization + tanh stages are followed by
    a Conv1D(80, k=5) + BatchNormalization stage without activation.

    Args:
        input_shape: Per-sample shape given to ``Input``; it is fed directly
            into Conv1D.

    Returns:
        A ``Model`` named ``"posnet"``. The result is left in the transposed
        layout produced by the last ``tf.transpose``, so callers must swap
        the last two axes back before combining it with the input.
    """
    # Xavier/Glorot-uniform with tanh gain 5/3. GlorotUniform only accepts a
    # seed, so the gain is expressed through VarianceScaling, whose uniform
    # fan_avg limit is sqrt(3 * scale / fan_avg), i.e. scale = gain ** 2.
    tanh_init = tf.keras.initializers.VarianceScaling(
        scale=(5.0 / 3.0) ** 2, mode="fan_avg", distribution="uniform")
    # Gain 1 ("linear") for the last conv, which has no activation. The
    # original defined this initializer but never applied it.
    linear_init = tf.keras.initializers.GlorotUniform()

    inp = Input(shape=input_shape)
    x = inp
    # Four 5x1 Conv + BN + tanh stages; the first maps 80 features to 512.
    # NOTE(review): BatchNormalization() is applied after the transpose, so
    # with its default axis it works on the last (time) axis rather than on
    # the conv channels -- confirm this is intended.
    for _ in range(4):
        x = Conv1D(512, kernel_size=5, strides=1, padding="same",
                   dilation_rate=1, kernel_initializer=tanh_init)(x)
        x = tf.transpose(x, (0, 2, 1))
        x = BatchNormalization()(x)
        x = Activation("tanh")(x)
        x = tf.transpose(x, (0, 2, 1))

    # Project back to 80 features; no activation on the final stage.
    x = Conv1D(80, kernel_size=5, strides=1, padding="same",
               dilation_rate=1, kernel_initializer=linear_init)(x)
    x = tf.transpose(x, (0, 2, 1))
    x = BatchNormalization()(x)
    return Model(inputs=inp, outputs=x, name="posnet")
    
class Autovc(tf.keras.Model):
    """AutoVC generator: content encoder + decoder + post-net.

    ``call`` takes a list ``[x, c_org, c_trg]``:

    * ``x`` is transposed on its last two axes and concatenated with the
      broadcast ``c_org`` before entering the encoder;
    * ``c_org`` is the source embedding, repeated along the time axis;
    * ``c_trg`` is the target embedding, or ``None`` to get only the codes.

    When ``c_trg`` is ``None`` the concatenated encoder codes are returned.
    Otherwise the result is ``(mel_outputs, mel_outputs_postnet, codes)``
    where ``mel_outputs_postnet`` is the decoder output plus the post-net
    residual.
    """

    def __init__(self,dim_neck = 32,dim_emb=256,len_crop=176,freq=22):
        super(Autovc, self).__init__()
        # Keep the sizes used in call() on the instance so the forward pass
        # follows the constructor arguments instead of module-level constants.
        self._dim_emb = dim_emb
        self._len_crop = len_crop
        self.encoder = Encoder((dim_emb+80,len_crop),dim_neck,dim_emb,freq)
        # Width of one encoder code, read from the built encoder so the
        # decoder input always matches what the encoder actually emits.
        code_dim = int(self.encoder.outputs[0].shape[-1])
        self.decoder = Decoder((len_crop, code_dim + dim_emb))
        self.postnet = Posnet((len_crop,80))

    def call(self, inputs, training=None):
        x = inputs[0]
        c_org = inputs[1]
        c_trg = inputs[-1]
        batch_size = tf.shape(x)[0]
        x = tf.transpose(x,(0,2,1))

        # Repeat the source embedding along time, then move it to the same
        # layout as x so both can be joined on axis 1.
        c_org = tf.expand_dims(c_org, axis=1)
        c_org = tf.broadcast_to(
            c_org, (tf.shape(c_org)[0], self._len_crop, tf.shape(c_org)[-1]))
        c_org = tf.transpose(c_org, (0,2,1))
        x = tf.concat([x, c_org],axis=1)

        codes = self.encoder(x, training=training)
        if c_trg is None:
            return tf.concat(codes,axis=-1)

        # Up-sample: every code is held for an equal share of the time axis.
        seg_len = self._len_crop // len(codes)
        tmp = [
            tf.broadcast_to(
                tf.expand_dims(code, axis=1),
                (batch_size, seg_len, code.shape[-1]))
            for code in codes
        ]
        code_exp = tf.concat(tmp, axis=1)

        # Repeat the target embedding along time and append it to the codes
        # on the last axis.
        c_trg = tf.expand_dims(c_trg, axis=1)
        c_trg = tf.broadcast_to(
            c_trg, (batch_size, tf.shape(x)[-1], self._dim_emb))
        encoder_outputs = tf.concat((code_exp, c_trg), axis=-1)

        mel_outputs = self.decoder(encoder_outputs, training=training)
        # The post-net returns the transposed layout; swap back before the
        # residual addition.
        mel_outputs_postnet = self.postnet(mel_outputs, training=training)
        mel_outputs_postnet = tf.transpose(mel_outputs_postnet,(0,2,1))
        mel_outputs_postnet = mel_outputs +  mel_outputs_postnet
        return mel_outputs, mel_outputs_postnet, tf.concat(codes, axis=-1)
非常神奇的是,TensorFlow 版要把 DIM_NECK 設成 44 才會 work (設成 32 會生出不知道是誰的聲音),目前原因不明,我猜是計算精度的問題?
def train(step =30000 ,batch_size = 2):
    """Run the training loop for ``step`` iterations.

    Batches are pulled from the module-level ``vcc_loader``; when it is
    exhausted a fresh iterator is started. Losses are printed every 10 steps
    and the three sub-model weights are saved every 10000 steps.

    Args:
        step: Number of training iterations.
        batch_size: Not used inside this function; kept so existing callers
            keep working.
    """
    import os  # local import: only needed for the checkpoint directory

    print(".....Strat.....")
    # Create the iterator up front instead of relying on a bare `except:`
    # catching the unbound-variable error on the first step.
    data_iter = iter(vcc_loader)
    for j in range(step):
        # Same batching as the PyTorch version, so both see the same inputs.
        try:
            x_real, emb_org = next(data_iter)
        except StopIteration:
            # Loader exhausted: start another pass over the data.
            data_iter = iter(vcc_loader)
            x_real, emb_org = next(data_iter)

        # The loader yields torch tensors; convert them to numpy first.
        x_real = x_real.detach().cpu().numpy().astype(np.float32)
        emb_org = emb_org.detach().cpu().numpy().astype(np.float32)
        # Source and target embeddings are the same tensor here.
        g_loss_id, g_loss_id_psnt, g_loss_cd = train_step(x_real, emb_org,emb_org)
        if (j+1)%10 == 0:
            print(f"Step:{j}")
            print(f"G_loss_id:{g_loss_id}")
            print(f"G_loss_id_psnet:{g_loss_id_psnt}")
            print(f"G_loss_cd:{g_loss_cd}")
        if (j+2)%10 == 0:
            # Clear the notebook output one step before the next report.
            clear_output(wait=True)
        # Save whenever you like; to reuse, call e.g.
        # autovc.encoder.load_weights(<path>) on the saved file.
        if (j+1)%10000 == 0:
            # Make sure the target directory exists so a long run is not
            # lost to a missing folder at the first checkpoint.
            os.makedirs("model", exist_ok=True)
            autovc.encoder.save_weights(f"model/encoder_weights_step_{j+1}.h5")
            autovc.decoder.save_weights(f"model/decoder_weights_step_{j+1}.h5")
            autovc.postnet.save_weights(f"model/postnet_weights_step_{j+1}.h5")
            
# Adam optimizer shared by every call to train_step.
autovc_optimizer = tf.keras.optimizers.Adam(0.0001)
@tf.function
def train_step(x_real,emb_org,emb_trg):
    """Run one optimisation step on the module-level ``autovc`` model.

    Args:
        x_real: Batch of real inputs.
        emb_org: Source embeddings.
        emb_trg: Target embeddings.

    Returns:
        The tuple ``(g_loss_id, g_loss_id_psnt, g_loss_cd)``: reconstruction
        loss before the post-net, reconstruction loss after the post-net,
        and the code-consistency loss. Their sum is what gets optimised.
    """
    # GradientTape records the forward pass; tape.gradient() below plays the
    # role of PyTorch's loss.backward().
    with tf.GradientTape() as autovc_tape:
        # training=True so BatchNormalization layers run in training mode
        # inside this custom loop instead of falling back to inference mode.
        x_identic, x_identic_psnt, code_real = autovc(
            [x_real, emb_org, emb_trg], training=True)
        g_loss_id = mse_loss(x_real, x_identic)
        g_loss_id_psnt = mse_loss(x_real, x_identic_psnt)
        # Re-encode the reconstruction (no target embedding -> codes only)
        # and compare against the codes of the real input.
        code_reconst = autovc([x_identic_psnt, emb_org, None], training=True)
        g_loss_cd = l1_loss(code_real, code_reconst)
        g_loss = g_loss_id + g_loss_id_psnt + g_loss_cd
    gradients_of_autovc = autovc_tape.gradient(g_loss,autovc.trainable_variables)
    autovc_optimizer.apply_gradients(zip(gradients_of_autovc,autovc.trainable_variables))
    return g_loss_id, g_loss_id_psnt, g_loss_cd
最後就開始愉快地訓練啦 ~~~
到這邊我們已經把 AutoVC 做過兩遍了,TF 做出來的效果跟 PyTorch 的是一樣的,只是 dim_neck 這個參數比較令人疑惑:為什麼 PyTorch 的可以在 32 上成功但 TF 的不行?不過兩邊在 freq = 22, dim_neck = 44 的情況下轉出來的聲音效果,我聽起來是差不多的。
10/1 號更新:
已經找到原因了,在上下採樣的時候出了一些問題,現在解決了,兩邊都可以得到一樣的結果。
10/2 號更新:
最新版的程式碼重構完了,你可以在這裡下載
有了 model 之後就是要想辦法讓它變得更好,像 LSTM 這邊是有機會用 Transformer 來取代它的,或是改變一下訓練的方法之類的 (這次的鐵人賽看到有幾位邦友在專門寫 Transformer 的介紹)。這邊我想說改來分享一些 GAN 的音樂生成經驗與音樂情緒反應的相關話題好了 XD,那聲音轉換的部分就先告一段落了,終於挺過一半了,大家繼續加油!




