Introduction
多對多神經網路(sequence-to-sequence networks)的應用範圍很廣,常用於機器翻譯、文章自動摘要、語法分析樹(parse tree)等,輸入與輸出皆是動態長度的序列。
sequence-to-sequence
主要組成為:
1.編碼器(encoder)
2.解碼器(decoder)
兩者都是遞歸神經網路(recurrent neural network, RNN),編碼器將文字資料處理成統一格式資料,解碼器將資料轉換為序列。
`<s>`:序列開始。
`</s>`:序列結束。
注意力模型(Attention model)的概念是指在處理文字資料時,可以聚焦於其中某一個部分,對各個字詞的關注權重並不相同。
這也用來解決上述 RNN 處理文字資料時會遇到的問題:當文字資料的字詞數量過多時,先前步驟中取得的狀態值會在處理過程中逐漸衰減。
透過注意力(attention)機制,解碼器(decoder)能夠依權重關注每個字詞,而不是只對越晚出現的字詞印象比較深。
Tasks
物件引用
# 相容性需求:若使用舊版 Python 時,可使用新版 Python 的 print 函式
from __future__ import print_function
import numpy as np
import os
import cntk as C
import cntk.tests.test_utils
# Detect and select CPU or GPU as the device for the current test environment
# (NOTE(review): per the helper's name the choice comes from the pytest environment — confirm)
cntk.tests.test_utils.set_device_from_pytest_env()
# Set CNTK's fixed random seed to 1
C.cntk_py.set_fixed_random_seed(1)
宣告函式:isTest 檢查是否為測試環境。
def isTest():
    """Report whether the code is running in the test environment.

    Returns:
        bool: True when the ``TEST_DEVICE`` environment variable is set
        (to any value), False otherwise.
    """
    return 'TEST_DEVICE' in os.environ
1.資料讀取(Data reading):
CMUDict 資料集是一個發音辭典資料集。
S0:輸入序列
S1:輸出序列
0 |S0 3:1 |# |S1 3:1 |#
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# |S1 1:1 |#
宣告函式:download 下載所需資料集。
import requests
def download(url, filename, chunk_size=64 * 1024):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the local file to create (overwritten if present).
        chunk_size: Number of bytes requested per read from the response
            stream. ``iter_content`` defaults to 1 byte per chunk, which
            makes large downloads needlessly slow.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Checked before the file is opened so an error page is never
            stored as if it were data.
    """
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, "wb") as handle:
            for data in response.iter_content(chunk_size=chunk_size):
                handle.write(data)
MODEL_DIR = "."
DATA_DIR = os.path.join('..', 'Examples', 'SequenceToSequence', 'CMUDict', 'Data')
# Fall back to the current directory when the examples directory is absent
if not os.path.exists(DATA_DIR):
    DATA_DIR = '.'

# Logical data-set name -> file name; each value is replaced below by the
# full local path once the file is known to exist on disk.
dataPath = {
    'validation': 'tiny.ctf',
    'training': 'cmudict-0.7b.train-dev-20-21.ctf',
    'testing': 'cmudict-0.7b.test.ctf',
    'vocab_file': 'cmudict-0.7b.mapping',
}

for k in sorted(dataPath):
    path = os.path.join(DATA_DIR, dataPath[k])
    if os.path.exists(path):
        print("Reusing locally cached:", path)
    else:
        # File is missing locally: fetch it from the CNTK 2.3.1 release tree
        print("Starting download:", dataPath[k])
        url = "https://github.com/Microsoft/CNTK/blob/release/2.3.1/Examples/SequenceToSequence/CMUDict/Data/%s?raw=true"%dataPath[k]
        download(url, path)
        print("Download completed")
    dataPath[k] = path
宣告函式:get_vocab 讀取詞彙資料。
def get_vocab(path):
    """Read a vocabulary file and build index lookup tables for it.

    Args:
        path: Path of a text file holding one vocabulary entry per line.

    Returns:
        tuple: ``(vocab, i2w, w2i)`` where ``vocab`` is the list of entries
        with surrounding whitespace stripped, ``i2w`` maps each line index
        to its entry, and ``w2i`` maps each entry to its line index (for a
        repeated entry the last index wins).
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path) as vocab_file:
        vocab = [line.strip() for line in vocab_file]
    i2w = dict(enumerate(vocab))
    w2i = {w: i for i, w in enumerate(vocab)}
    return (vocab, i2w, w2i)
# Load the vocabulary list together with its index->word and word->index tables
vocab, i2w, w2i = get_vocab(dataPath['vocab_file'])
2.資料處理(Data preprocessing):
變數設定。
# Dimension of the input vocabulary (used as the shape of the 'S0' stream)
input_vocab_dim = 69
# Dimension of the label vocabulary (used as the shape of the 'S1' stream)
label_vocab_dim = 69
宣告函式:create_reader 讀取資料。
def create_reader(path, is_training):
    """Create a minibatch source over a CTF-formatted data file.

    The reader classes are referenced through ``C.io`` because the file only
    imports ``cntk as C``; the bare names would raise ``NameError``.

    Args:
        path: Path of the CTF file to read.
        is_training: When true the data is randomized and repeated
            indefinitely; otherwise it is read in order for a single sweep.

    Returns:
        A ``C.io.MinibatchSource`` exposing two sparse streams: ``features``
        (field ``S0``, shape ``input_vocab_dim``) and ``labels`` (field
        ``S1``, shape ``label_vocab_dim``).
    """
    stream_defs = C.io.StreamDefs(
        features=C.io.StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=C.io.StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True),
    )
    return C.io.MinibatchSource(
        C.io.CTFDeserializer(path, stream_defs),
        randomize=is_training,
        max_sweeps=C.io.INFINITELY_REPEAT if is_training else 1,
    )
# Reader over the training data
train_reader = create_reader(dataPath['training'], True)
# Reader over the validation data (note: also created with is_training=True)
valid_reader = create_reader(dataPath['validation'], True)
3.建立模型(Model creation):
設定超參數。
# Size passed to every LSTM block in the encoder and decoder
hidden_dim = 512
# Number of stacked recurrent layers in both encoder and decoder
num_layers = 2
# Size passed to the attention model
attention_dim = 128
# Whether the decoder attends over the encoder states
use_attention = True
# Whether inputs go through an embedding layer (identity otherwise)
use_embedding = True
# Size of the embedding layer
embedding_dim = 200
# Vocabulary entries, one per line of the mapping file, whitespace-stripped.
# Read inside a context manager so the file handle is closed afterwards.
with open(dataPath['vocab_file']) as vocab_fh:
    vocab = [w.strip() for w in vocab_fh]
# NOTE(review): not used in the visible code; presumably bounds the decoded
# output length relative to the input length — confirm against its users.
length_increase = 1.5
設定變數。
# 序列資料開始的標記
sentence_start =C.Constant(np.array([w=='<s>' for w in vocab], dtype=np.float32))
# 序列資料結束的標記
sentence_end_index = vocab.index('</s>')
設定變數:動態軸(Dynamic axes)具有未知大小,直到程式碼運行時,依指定的資料大小而變化。
# 輸入序列的動態軸
inputAxis = C.Axis('inputAxis')
# 輸出序列的動態軸
labelAxis = C.Axis('labelAxis')
InputSequence = C.layers.SequenceOver[inputAxis]
LabelSequence = C.layers.SequenceOver[labelAxis]
宣告函式:建立模型
def create_model():
    """Build the sequence-to-sequence model (optionally with attention).

    The structure is driven by the module-level settings ``use_embedding``,
    ``use_attention``, ``num_layers``, ``hidden_dim``, ``embedding_dim``,
    ``attention_dim`` and ``label_vocab_dim``.

    Returns:
        A CNTK function ``decode(history, input)`` that encodes ``input``,
        runs ``history`` through the decoder's recurrent layers and returns
        the output of a dense projection of size ``label_vocab_dim``
        (labelled ``'out_proj_out'``).
    """
    # Input embedding; falls back to the identity layer when disabled.
    # ``identity`` is referenced through C.layers because the bare name is
    # never imported by this file.
    embed = C.layers.Embedding(embedding_dim, name='embed') if use_embedding else C.layers.identity

    # Encoder: a stack of LSTM layers. The recurrences run backwards only
    # when attention is disabled.
    with C.layers.default_options(enable_self_stabilization=True, go_backwards=not use_attention):
        # Without attention the last layer is a Fold; with attention it is a
        # Recurrence, whose per-step outputs the attention model consumes.
        LastRecurrence = C.layers.Fold if not use_attention else C.layers.Recurrence
        encode = C.layers.Sequential([
            embed,
            C.layers.Stabilizer(),
            C.layers.For(range(num_layers-1), lambda:
                C.layers.Recurrence(C.layers.LSTM(hidden_dim))),
            LastRecurrence(C.layers.LSTM(hidden_dim), return_full_state=True),
            (C.layers.Label('encoded_h'), C.layers.Label('encoded_c')),
        ])

    # Decoder
    with C.layers.default_options(enable_self_stabilization=True):
        # Sub-layers shared by every invocation of ``decode``
        stab_in = C.layers.Stabilizer()
        rec_blocks = [C.layers.LSTM(hidden_dim) for _ in range(num_layers)]
        stab_out = C.layers.Stabilizer()
        proj_out = C.layers.Dense(label_vocab_dim, name='out_proj')
        # Attention model: maps a decoder hidden state and all encoder
        # states to an augmented state
        if use_attention:
            attention_model = C.layers.AttentionModel(attention_dim,
                                                      name='attention_model')

        # Layer function
        @C.Function
        def decode(history, input):
            encoded_input = encode(input)
            r = history
            r = embed(r)
            r = stab_in(r)
            for i, rec_block in enumerate(rec_blocks):
                # Hidden layer: LSTM
                if use_attention:
                    if i == 0:
                        # Only the first decoder layer attends: the attention
                        # output is spliced onto the step input before the
                        # LSTM block is applied.
                        @C.Function
                        def lstm_with_attention(dh, dc, x):
                            h_att = attention_model(encoded_input.outputs[0], dh)
                            x = C.splice(x, h_att)
                            return rec_block(dh, dc, x)
                        r = C.layers.Recurrence(lstm_with_attention)(r)
                    else:
                        r = C.layers.Recurrence(rec_block)(r)
                else:
                    # Without attention the encoder outputs are passed to
                    # RecurrenceFrom as inputs ahead of the data input.
                    r = C.layers.RecurrenceFrom(rec_block)(*(encoded_input.outputs + (r,)))
            r = stab_out(r)
            r = proj_out(r)
            r = C.layers.Label('out_proj_out')(r)
            return r

    return decode