iT邦幫忙

2021 iThome 鐵人賽

DAY 20
0
AI & Data

AI Voice Conversion系列 第 20

【Day20】 WavenetGan, BidirectionalLSTMGAN, WaveGan 鋼琴音樂生成

  • 因為之後想花一點時間分享一下 Transformer 閱讀跟實作的經驗,所以這篇就沒寫 Transformer 的部分,但它生成的音樂還是會貼在下方跟大家分享

  • 書接昨日,我們就開始寫吧!

WavenetGan

Generator

Discriminator

def build_discriminator():
    """WavenetGan critic: maps a (SEQ_LEN, 1) sequence to one raw logit.

    Three strided 1-D convolutions downsample the sequence, then a single
    Dense unit produces an unbounded score (paired with a from_logits loss).
    """
    stack = [
        Conv1D(32, 3, strides=2, input_shape=(SEQ_LEN, 1)),
        LeakyReLU(),
    ]
    # Two identical conv blocks: bias-free conv -> batch-norm -> leaky ReLU.
    for _ in range(2):
        stack.append(Conv1D(64, 3, strides=2, padding='same', use_bias=False))
        stack.append(BatchNormalization())
        stack.append(LeakyReLU())
    stack.append(Flatten())
    stack.append(Dense(1))  # raw logit, no activation
    return Sequential(stack)

BidirectionalLSTMGAN

Generator

def build_generator():
    """BidirectionalLSTMGAN generator.

    Takes a (gen_len, gen_len) noise matrix and produces a (seq_len, 1)
    sequence squashed into [0, 1] by a final sigmoid.
    """
    stack = [
        Bidirectional(LSTM(128, return_sequences=True), input_shape=(gen_len, gen_len)),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128, return_sequences=True)),
        LeakyReLU(alpha=0.2),
        # Last encoder LSTM collapses the sequence to a single feature vector.
        Bidirectional(LSTM(128)),
        LeakyReLU(alpha=0.2),
        # Expand back out so the decoder emits seq_len timesteps.
        RepeatVector(seq_len),
    ]
    # Three dropout-regularised decoder BiLSTM blocks.
    for _ in range(3):
        stack.append(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)))
        stack.append(LeakyReLU(alpha=0.2))
    stack.extend([
        Dropout(0.3),
        TimeDistributed(Dense(128)),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(128)),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        # One feature per timestep, squashed back to 0 ~ 1.
        TimeDistributed(Dense(1)),
        Activation("sigmoid"),
    ])
    body = Sequential(stack)
    noise = Input(shape=(gen_len, gen_len))
    return Model(noise, body(noise))

Discriminator

def build_discriminator():
    """BidirectionalLSTMGAN discriminator: (seq_len, 1) sequence -> one validity score.

    NOTE(review): each ``Activation("relu")`` directly before a ``LeakyReLU``
    makes the leak a no-op (the inputs are already non-negative). Kept as-is
    to preserve the original behaviour.
    """
    core = Sequential([
        Bidirectional(LSTM(128, return_sequences=True), input_shape=(seq_len, 1)),
        Activation("relu"),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128)),
        Activation("relu"),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        # Single output timestep for the TimeDistributed head below.
        RepeatVector(1),
        TimeDistributed(Dense(128, activation='sigmoid')),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(128, activation='relu')),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(1, activation='linear')),
    ])
    sequence = Input(shape=(seq_len, 1))
    return Model(sequence, core(sequence))

WaveGan

Generator

def WaveGANGenerator():
    """WaveGan generator: (seq_len,) noise vector -> (seq_len,) values in [0, 1].

    NOTE(review): ``Reshape((1, seq_len))`` gives the convolutions a time
    axis of length 1, so the strided convs act across a single step —
    presumably intentional in the original post; confirm before reuse.
    """
    def conv_block(filters):
        # Shared conv hyper-parameters: wide kernel, stride 4, BN + ReLU.
        return [
            Conv1D(filters, kernel_size=25, strides=4, padding='same'),
            BatchNormalization(momentum=0.8),
            ReLU(),
        ]

    stack = [
        Dense(seq_len, activation='relu', input_shape=(seq_len,)),
        Reshape((1, seq_len)),
    ]
    for filters in (64, 128, seq_len, seq_len):
        stack.extend(conv_block(filters))
    stack.append(Flatten())
    stack.append(Dense(seq_len, activation='sigmoid'))  # outputs in [0, 1]
    return tf.keras.Sequential(stack)

Discriminator

def WaveGANDiscriminator():
    """WaveGan discriminator: (seq_len,) sample -> a single raw logit.

    Bug fixed vs. the original: the head was
    ``Dense(seq_len, activation='sigmoid')`` while the shared training loss
    is ``BinaryCrossentropy(from_logits=True)``. Feeding sigmoid outputs to a
    from-logits loss double-squashes the score and weakens gradients. The
    head is now one unactivated unit, consistent with the WavenetGan
    discriminator above.
    """
    model = tf.keras.Sequential([
        Dense(seq_len, activation='relu', input_shape=(seq_len,)),
        Reshape((1, seq_len)),

        Conv1D(64, kernel_size=25, strides=4, padding="same"),
        BatchNormalization(momentum=0.8),
        ReLU(),

        Conv1D(seq_len, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),

        Conv1D(seq_len, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),

        Flatten(),
        # Raw logit: sigmoid is applied inside the from_logits=True loss.
        Dense(1),
    ])
    return model
    

對於不同的模型,訓練的方式都是一樣的

定義 loss

# BCE on raw logits: the sigmoid is folded into the loss for numerical stability.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)


def generator_loss(fake_output):
    """Generator objective: the critic should label fakes as real (all ones)."""
    return cross_entropy(tf.ones_like(fake_output), fake_output)


def discriminator_loss(real_output, fake_output):
    """Discriminator objective: real -> 1 and fake -> 0, summed."""
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

Train Loop

import time

# Per-epoch average losses, collected for plotting after training.
total_Gloss = []
total_Dloss = []


def train(dataset, epochs):
    """Run the GAN training loop, printing per-step and per-epoch losses."""
    for epoch in range(epochs):
        tic = time.time()
        g_sum = 0
        d_sum = 0
        for step, batch in enumerate(dataset):
            # train_step is defined below
            g, d = train_step(batch)
            print(f"Step:{step} | G_loss:{g} D_loss:{d}|")
            g_sum += g
            d_sum += d
        clear_output(wait=True)
        batches = len(dataset)
        print(f'Time for epoch {epoch + 1} is {time.time()-tic} sec\n')
        print(f'G_AVE_Loss:{g_sum/batches}')
        print(f'D_AVE_loss:{d_sum/batches}')
        total_Gloss.append(g_sum / batches)
        total_Dloss.append(d_sum / batches)

Train step

@tf.function
def train_step(music):
    """One simultaneous generator/discriminator update on a batch of music.

    Returns (gen_loss, disc_loss) for logging.

    Bug fixed vs. the original: ``discriminator_loss`` was called as
    ``discriminator_loss(fake_output, real_output)`` although its signature
    is ``(real_output, fake_output)`` — the labels were swapped, training the
    critic to score fakes as real and reals as fake. Also removed the unused
    ``LAMBDA = 10`` local (a leftover, presumably from a WGAN-GP gradient
    penalty that is not implemented here).
    """
    noise = tf.random.normal([BATCH_SIZE, seq_len])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_music = generator(noise, training=True)
        real_output = discriminator(music, training=True)
        fake_output = discriminator(generated_music, training=True)
        gen_loss = generator_loss(fake_output)
        # Argument order matches the signature: (real_output, fake_output).
        disc_loss = discriminator_loss(real_output, fake_output)
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

Loss 圖

生成 MIDI

import random
from mido import MidiFile, MidiTrack, Message

# Sample one noise vector and decode it; the sigmoid output (0~1) is scaled
# to the MIDI note range 0-127.
noise = np.random.normal(0, 1, (1, seq_len))
predict = generator.predict(noise)
predict = predict * 127

midler = MidiFile()
track = MidiTrack()
midler.tracks.append(track)

track.append(Message('program_change', program=2, time=0))
for x in range(seq_len):
    # As noted earlier, only the note ordering was trained — timing and
    # control changes were not — so those are generated at random here.
    on_interval = random.randint(0, 127)
    off_interval = random.randint(0, 127)
    change_interval = random.randint(0, 127)
    change_value = random.randint(0, 127)
    isControl = random.randint(0, 1)
    note = int(predict[0][x])
    track.append(Message('note_on', channel=1, note=note, velocity=64, time=on_interval))
    if isControl:
        track.append(Message('control_change', channel=1, control=64, value=change_value, time=change_interval))
    track.append(Message('note_off', channel=1, note=note, velocity=64, time=off_interval))

# Bug fixed vs. the original: the file was saved inside the loop, rewriting
# it seq_len times. One save after the track is complete is sufficient.
midler.save('WaveGan.mid')
    

小結

雖然在訓練的時候沒有用到節奏跟控制來訓練,我們只有訓練音符的排列,但結果跟隨機亂生的聽起來就是不一樣,評斷音樂的標準定義還是相當模糊的(至少對像我這樣不懂音樂的麻瓜來說),還是要實際聽看看比較能夠體會,所以最後附上我生成的結果XD。

WavenetGan

BidirectionalLSTMGAN

WaveGan

TransformerGan

/images/emoticon/emoticon09.gif/images/emoticon/emoticon13.gif/images/emoticon/emoticon14.gif/images/emoticon/emoticon22.gif/images/emoticon/emoticon28.gif


上一篇
【Day19】 用 4 種不同的 GAN 模型生成音樂簡介
下一篇
【Day21】 Transformer 新手包 (一)
系列文
AI Voice Conversion30

尚未有邦友留言

立即登入留言