iT邦幫忙

2021 iThome 鐵人賽

DAY 15
1
AI & Data

AI Voice Conversion系列 第 15

【Day15】AutoVC 實作 - Tensorflow 篇

資料前處理部分跟 Pytorch 篇一樣,就不重複寫了,這邊只寫 model 跟 Training 部分。

後來發現 keras 的 BatchNormalization 也訓練得起來,就沒有用自己的 BatchNormalization 訓練了

Content-Encoder

def Encoder(input_shape,dim_neck = 32 , dim_emb=256 , freq = 22):
    """Build the AutoVC content encoder as a functional Keras model.

    Three Conv1D(512, kernel 5) + BatchNormalization + ReLU stages are
    followed by two bidirectional LSTMs. The LSTM sequence output is then
    down-sampled once every ``freq`` frames into a list of content codes.

    Args:
        input_shape: Per-sample input shape ``(channels, frames)``. The last
            entry is used as the number of frames to down-sample over.
        dim_neck: Units of each LSTM direction (bottleneck width). Every
            emitted code has ``2 * dim_neck`` features.
        dim_emb: Speaker-embedding size. Not used inside this function; kept
            so existing callers keep working.
        freq: Down-sampling period, in frames.

    Returns:
        A Keras ``Model`` named ``"content_encoder"`` whose outputs are a list
        of ``frames // freq`` tensors, one code per down-sampled position.

    Raises:
        ValueError: If the frame count is unknown or is not a positive
            multiple of ``freq`` (the last window would index out of range).
    """
    n_frames = input_shape[-1]
    if n_frames is None or freq <= 0 or n_frames % freq != 0:
        raise ValueError(
            f"number of frames ({n_frames}) must be a positive multiple of freq ({freq})"
        )

    def relu_xavier():
        # Xavier-uniform with gain sqrt(2): limit = gain * sqrt(6 / (fan_in + fan_out)),
        # which is VarianceScaling(scale=gain**2, "fan_avg", "uniform").
        # GlorotUniform's only positional argument is a seed, not a gain.
        # A fresh instance per layer avoids sharing one initializer object.
        return tf.keras.initializers.VarianceScaling(
            scale=2.0, mode="fan_avg", distribution="uniform"
        )

    inp = Input(shape=input_shape)

    # Map the channel axis (e.g. 336 = dim_emb + 80) to 512 features.
    x = inp
    for _ in range(3):
        # Conv1D convolves over axis 1, so put frames there first.
        x = tf.transpose(x,(0,2,1))
        x = Conv1D(512, kernel_size=5, strides=1, padding="same",
                   dilation_rate=1, kernel_initializer=relu_xavier())(x)
        x = tf.transpose(x,(0,2,1))
        # NOTE(review): BatchNormalization uses its default axis here, which
        # after the transpose above is the frame axis rather than the 512
        # channels - confirm this is intended (changing it alters weight shapes).
        x = BatchNormalization()(x)
        x = ReLU()(x)

    # Run the LSTMs along the frame axis.
    x = tf.transpose(x,(0,2,1))
    # Each direction must be dim_neck wide so the split below separates the
    # forward half from the backward half exactly.
    x = Bidirectional(LSTM(dim_neck, return_sequences=True))(x)
    x = Bidirectional(LSTM(dim_neck, return_sequences=True))(x)

    # Down-sampling: the first dim_neck features are taken at the END of each
    # freq-frame window, the remaining features at the START of the window.
    x_up = x[:, :, :dim_neck]
    x_down = x[:, :, dim_neck:]
    codes = [
        tf.concat((x_up[:, i + freq - 1, :], x_down[:, i, :]), axis=-1)
        for i in range(0, n_frames, freq)
    ]

    return Model(inputs=inp, outputs = codes , name="content_encoder")
    

Decoder

def Decoder(encoder_input_shape):
    """Build the AutoVC decoder as a functional Keras model.

    Pipeline: LSTM(512) -> 3 x (Conv1D(512, kernel 5) + BatchNormalization +
    ReLU) -> 2 x LSTM(1024) -> Dense(80).

    Args:
        encoder_input_shape: Per-sample input shape passed to ``Input``
            (presumably ``(frames, code_dim + emb_dim)``, e.g. ``(176, 320)``
            - confirm against the caller).

    Returns:
        A Keras ``Model`` named ``"decoder"`` whose output has 80 features on
        its last axis.
    """

    def relu_xavier():
        # Xavier-uniform with gain sqrt(2), expressed as
        # VarianceScaling(scale=gain**2, "fan_avg", "uniform").
        # GlorotUniform's only positional argument is a seed, not a gain.
        # A fresh instance per layer avoids sharing one initializer object.
        return tf.keras.initializers.VarianceScaling(
            scale=2.0, mode="fan_avg", distribution="uniform"
        )

    inputs = Input(shape = encoder_input_shape)

    # Input to this LSTM is e.g. (batch, 176, 320).
    x = LSTM(512, return_sequences=True, kernel_initializer=relu_xavier())(inputs)

    # 3 x (5x1 Conv + BN + ReLU)
    for _ in range(3):
        x = Conv1D(512, kernel_size=5, strides=1, padding="same",
                   dilation_rate=1, kernel_initializer=relu_xavier())(x)
        x = tf.transpose(x,(0,2,1))
        # NOTE(review): BatchNormalization uses its default axis, which after
        # the transpose above is the frame axis rather than the 512 channels
        # - confirm this is intended.
        x = BatchNormalization()(x)
        x = ReLU()(x)
        # Back to the layout Conv1D / LSTM consume.
        x = tf.transpose(x,(0,2,1))

    # Two stacked LSTMs over the 512-feature sequence.
    x = LSTM(1024, return_sequences=True, kernel_initializer=relu_xavier())(x)
    x = LSTM(1024, return_sequences=True, kernel_initializer=relu_xavier())(x)

    # Linear projection to 80 output features.
    x = Dense(80)(x)

    return Model(inputs = inputs, outputs = x, name="decoder")
    

Decoder 的第二個輸出

def Posnet(input_shape):
    """Build the post-net that produces the decoder's second (refined) output.

    Pipeline: 4 x (Conv1D(512, kernel 5) + BatchNormalization + tanh),
    then Conv1D(80, kernel 5) + BatchNormalization.

    Args:
        input_shape: Per-sample input shape passed to ``Input``
            (presumably ``(frames, 80)`` - confirm against the caller).

    Returns:
        A Keras ``Model`` named ``"posnet"``. The output is left in the
        transposed layout produced by the final ``tf.transpose``, so the
        80-feature axis is axis 1, not the last axis.
    """

    def tanh_xavier():
        # Xavier-uniform with the tanh gain 5/3, expressed as
        # VarianceScaling(scale=gain**2, "fan_avg", "uniform").
        # GlorotUniform's only positional argument is a seed, not a gain.
        # A fresh instance per layer avoids sharing one initializer object.
        return tf.keras.initializers.VarianceScaling(
            scale=(5.0 / 3.0) ** 2, mode="fan_avg", distribution="uniform"
        )

    inp = Input(shape = input_shape)

    # Expand the 80-feature axis to 512: 4 x (5x1 Conv + BN + tanh).
    x = inp
    for _ in range(4):
        x = Conv1D(512, kernel_size=5, strides=1, padding="same",
                   dilation_rate=1, kernel_initializer=tanh_xavier())(x)
        x = tf.transpose(x,(0,2,1))
        # NOTE(review): BatchNormalization uses its default axis, which after
        # the transpose above is the frame axis rather than the 512 channels
        # - confirm this is intended.
        x = BatchNormalization()(x)
        x = Activation("tanh")(x)
        # Back to the layout Conv1D consumes.
        x = tf.transpose(x,(0,2,1))

    # Project back down to 80 features; no activation on the last stage.
    # NOTE(review): the original defined an unused unit-gain initializer,
    # presumably meant for this final projection - confirm before switching.
    x = Conv1D(80, kernel_size=5, strides=1, padding="same",
               dilation_rate=1, kernel_initializer=tanh_xavier())(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)

    return Model(inputs = inp, outputs = x, name="posnet")
    

把上面三個組合起來,這邊對應的是 model_vc 裡的 Generator

class Autovc(tf.keras.Model):
    """AutoVC generator: content encoder + decoder + post-net.

    Args:
        dim_neck: Bottleneck width forwarded to ``Encoder``.
        dim_emb: Speaker-embedding size.
        len_crop: Number of frames per training crop.
        freq: Down-sampling period forwarded to ``Encoder``.
    """

    def __init__(self,dim_neck = 32,dim_emb=256,len_crop=176,freq=22):
        super(Autovc, self).__init__()
        # Keep the sizes the sub-models were built with so call() does not
        # depend on module-level globals.
        self._dim_emb = dim_emb
        self._len_crop = len_crop
        self.encoder = Encoder((dim_emb+80,len_crop),dim_neck,dim_emb,freq)
        # Size the decoder input from what the encoder actually emits instead
        # of hard-coding 320 (= 64 + 256).
        code_dim = self.encoder.outputs[0].shape[-1]
        self.decoder = Decoder((len_crop, code_dim + dim_emb))
        self.postnet = Posnet((len_crop,80))

    def call(self, inputs, training=None):
        """Run the generator.

        Args:
            inputs: Sequence ``[x, c_org, c_trg]``: the input features, the
                source speaker embedding and the target speaker embedding.
                ``c_trg`` may be ``None`` to request only the content codes.
            training: Optional Keras training flag, forwarded to the
                sub-models.

        Returns:
            If ``c_trg`` is ``None``: the content codes concatenated on the
            last axis. Otherwise a tuple ``(mel_outputs, mel_outputs_postnet,
            codes)`` where ``mel_outputs_postnet`` is ``mel_outputs`` plus the
            post-net residual.
        """
        x = inputs[0]
        c_org = inputs[1]
        c_trg = inputs[-1]
        batch_size = tf.shape(x)[0]

        # Put frames on the last axis, then stack the source embedding
        # (repeated for every frame) under the feature axis.
        x = tf.transpose(x,(0,2,1))
        c_org = tf.expand_dims(c_org, axis=1)
        c_org = tf.broadcast_to(c_org, (batch_size, self._len_crop, self._dim_emb))
        c_org = tf.transpose(c_org, (0,2,1))
        x = tf.concat([x, c_org],axis=1)

        codes = self.encoder(x, training=training)
        if c_trg is None:
            return tf.concat(codes,axis=-1)

        # Up-sample: repeat every code for the frames of the window it
        # summarises, then lay the windows out along the frame axis.
        frames_per_code = self._len_crop // len(codes)
        code_exp = tf.concat(
            [
                tf.broadcast_to(
                    tf.expand_dims(code, axis=1),
                    (batch_size, frames_per_code, code.shape[-1]),
                )
                for code in codes
            ],
            axis=1,
        )

        c_trg = tf.expand_dims(c_trg, axis=1)
        c_trg = tf.broadcast_to(c_trg, (batch_size, self._len_crop, self._dim_emb))

        # Append the target embedding to the code features of every frame.
        encoder_outputs = tf.concat((code_exp, c_trg), axis=-1)
        mel_outputs  = self.decoder(encoder_outputs, training=training)

        # The post-net returns its result transposed; flip it back before
        # adding it as a residual.
        mel_outputs_postnet = self.postnet(mel_outputs, training=training)
        mel_outputs_postnet = tf.transpose(mel_outputs_postnet,(0,2,1))
        mel_outputs_postnet = mel_outputs +  mel_outputs_postnet

        return mel_outputs, mel_outputs_postnet, tf.concat(codes, axis=-1)

Train-Loop

  • 驗證的話至少要訓練 30,000 次才聽得出來,在 2080Ti 上要跑約 6 小時,要聽到不錯的結果要訓練約 1,000,000 次,loss 會收斂到 0.0001

非常神奇的是,Tensorflow 版的要 DIM_NECK 設 44 才會 work (32 的會生出不知道誰的聲音),目前原因不明,我在猜是計算的精度問題?

def train(step =30000 ,batch_size = 2):
    """Run the training loop for ``step`` iterations.

    Batches are pulled from the module-level ``vcc_loader``; when the loader
    is exhausted it is restarted. Losses are printed every 10 steps and the
    weights of the three sub-models are saved every 10,000 steps.

    Args:
        step: Number of training iterations to run.
        batch_size: Not used inside this function; kept so existing callers
            keep working.
    """
    import os

    print(".....Strat.....")
    # Make sure the checkpoint directory exists up front, so the first save
    # (hours into training) cannot fail on a missing directory.
    os.makedirs("model", exist_ok=True)

    # Create the iterator before the loop. The original read `data_iter`
    # before assigning it and relied on a bare `except:` to recover, which
    # also hid genuine data-loading errors.
    data_iter = iter(vcc_loader)
    for j in range(step):

        # Same batching as the PyTorch version, so both see identical input.
        try:
            x_real, emb_org = next(data_iter)
        except StopIteration:
            # Loader exhausted: start another pass over the data.
            data_iter = iter(vcc_loader)
            x_real, emb_org = next(data_iter)

        # The loader yields torch tensors; convert them to NumPy float32.
        x_real = x_real.detach().cpu().numpy().astype(np.float32)
        emb_org = emb_org.detach().cpu().numpy().astype(np.float32)

        # Source and target embeddings are the same (self-reconstruction).
        g_loss_id, g_loss_id_psnt, g_loss_cd = train_step(x_real, emb_org,emb_org)

        if (j+1)%10 == 0:
            print(f"Step:{j}")
            print(f"G_loss_id:{g_loss_id}")
            print(f"G_loss_id_psnet:{g_loss_id_psnt}")
            print(f"G_loss_cd:{g_loss_cd}")
        if (j+2)%10 == 0:
            # Clear the notebook output one step before the next report.
            clear_output(wait=True)
        # Save whenever you like; to reuse a checkpoint, call e.g.
        # autovc.encoder.load_weights(<path>).
        if (j+1)%10000 == 0:
            autovc.encoder.save_weights(f"model/encoder_weights_step_{j+1}.h5")
            autovc.decoder.save_weights(f"model/decoder_weights_step_{j+1}.h5")
            autovc.postnet.save_weights(f"model/postnet_weights_step_{j+1}.h5")
            

Train-Step

autovc_optimizer = tf.keras.optimizers.Adam(0.0001)

@tf.function
def train_step(x_real,emb_org,emb_trg):
    """Run one optimisation step of the AutoVC generator.

    Args:
        x_real: Batch of input features.
        emb_org: Source speaker embeddings.
        emb_trg: Target speaker embeddings.

    Returns:
        Tuple ``(g_loss_id, g_loss_id_psnt, g_loss_cd)``: the reconstruction
        loss before the post-net, the reconstruction loss after the post-net,
        and the content-code consistency loss.
    """
    # The GradientTape records the forward pass so gradients can be taken
    # afterwards (the counterpart of loss.backward() in PyTorch).
    with tf.GradientTape() as autovc_tape:
        # training=True: in a custom loop Keras does not enable training mode
        # by itself, so without it BatchNormalization would run in inference
        # mode (moving statistics, never updated).
        x_identic, x_identic_psnt, code_real = autovc(
            [x_real, emb_org, emb_trg], training=True
        )
        g_loss_id = mse_loss(x_real, x_identic)
        g_loss_id_psnt = mse_loss(x_real, x_identic_psnt)

        # Re-encode the reconstruction; a None target makes the model return
        # only the content codes.
        code_reconst = autovc([x_identic_psnt, emb_org, None], training=True)

        g_loss_cd = l1_loss(code_real, code_reconst)
        g_loss = g_loss_id + g_loss_id_psnt + g_loss_cd

    trainable = autovc.trainable_variables
    gradients_of_autovc = autovc_tape.gradient(g_loss, trainable)
    autovc_optimizer.apply_gradients(zip(gradients_of_autovc, trainable))

    return g_loss_id, g_loss_id_psnt, g_loss_cd

最後就開始愉快的訓練啦 ~~~

小結

到這邊我們已經把 AutoVC 做過兩遍了,TF 做出來的效果跟 Pytorch 的是一樣的,只是 dim_neck 這個參數比較令人疑惑,為什麼 pytorch 的可以在 32 上成功但 TF 的不行,但兩邊在 freq = 22, dim_neck = 44 的情況下轉出來的聲音效果我聽起來是差不多的。

10/1 號更新:
已經找到原因了,在上下採樣的時候出了一些問題,現在解決了,兩邊都可以得到一樣的結果。

10/2 號更新:
最新版的程式碼重構完了,你可以在這裡下載

接下來的路

有了 model 之後就是要想辦法讓它變得更好,像 LSTM 這邊是有機會用 Transformer 去取代它的,或是改變一下訓練的方法之類的 (這次的鐵人賽看到有幾位邦友在專門寫 Transformer 的介紹)。這邊我想說改來分享一些 GAN 的音樂生成經驗與音樂情緒反應的相關話題好了 XD,那聲音轉換的部分就先告一段落了,終於挺過一半了,大家繼續加油!

/images/emoticon/emoticon09.gif/images/emoticon/emoticon13.gif/images/emoticon/emoticon14.gif/images/emoticon/emoticon22.gif/images/emoticon/emoticon28.gif


上一篇
【Day14】 Pytorch 轉 Tensorflow
下一篇
【Day16】音樂生成,我是要生成什麼?
系列文
AI Voice Conversion30

尚未有邦友留言

立即登入留言