pip3 install torch torchvision torchaudio
Or, in a Conda environment, you can use the following command:
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
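To confirm that the installation succeeded, a quick check such as the following can be run (whether CUDA is reported as available depends on your local GPU and driver setup):

import torch

# Print the installed PyTorch version and whether a CUDA-capable GPU is visible.
print(torch.__version__)
print(torch.cuda.is_available())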
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads):
'''
The initialization checks if d_model is divisible by num_heads,
and then defines the transformation weights for query, key, value, and output.
'''
super(MultiHeadAttention, self).__init__()
# Ensure that the model dimension (d_model) is divisible by the number of heads
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
# Initialize dimensions
self.d_model = d_model # Model's dimension
self.num_heads = num_heads # Number of attention heads
self.d_k = d_model // num_heads # Dimension of each head's key, query, and value
# Linear layers for transforming inputs
self.W_q = nn.Linear(d_model, d_model) # Query transformation
self.W_k = nn.Linear(d_model, d_model) # Key transformation
self.W_v = nn.Linear(d_model, d_model) # Value transformation
self.W_o = nn.Linear(d_model, d_model) # Output transformation
def scaled_dot_product_attention(self, Q, K, V, mask=None):
# Calculate attention scores
# Calculating Attention Scores. Here, the attention scores are calculated by taking
# the dot product of queries (Q) and keys (K), and then scaling by the square root
# of the key dimension (d_k).
attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
# Apply mask if provided (useful for preventing attention to certain parts like padding)
# If a mask is provided, it is applied to the attention scores to mask out specific
# values.
if mask is not None:
attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
# Softmax is applied to obtain attention probabilities
# Calculating Attention Weights: The attention scores are passed through a softmax
# function
# to convert them into probabilities that sum to 1.
attn_probs = torch.softmax(attn_scores, dim=-1)
# Multiply by values to obtain the final output
# Calculating Output: The final output of the attention is calculated by multiplying the
# attention weights by the values (V).
output = torch.matmul(attn_probs, V)
return output
def split_heads(self, x):
'''
Reshape the input to have num_heads for multi-head attention. This method reshapes the
input x into the shape (batch_size, num_heads, seq_length, d_k). It enables the model
to process multiple attention heads concurrently, allowing for parallel computation.
'''
batch_size, seq_length, d_model = x.size()
return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
def combine_heads(self, x):
'''
Combine the multiple heads back to the original shape. After applying attention to each
head separately, this method combines the results back into a single tensor of shape
(batch_size, seq_length, d_model). This prepares the result for further processing.
'''
batch_size, _, seq_length, d_k = x.size()
return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
def forward(self, Q, K, V, mask=None):
'''
The forward method is where the actual computation happens
'''
# Apply linear transformations and split heads
Q = self.split_heads(self.W_q(Q))
K = self.split_heads(self.W_k(K))
V = self.split_heads(self.W_v(V))
# Perform scaled dot-product attention
attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
# Combine heads and apply output transformation
output = self.W_o(self.combine_heads(attn_output))
return output
In summary, the MultiHeadAttention class encapsulates the multi-head attention mechanism commonly used in Transformer models. It splits the input into multiple attention heads, applies attention to each head, and then combines the results. This lets the model capture relationships in the input data at different scales, improving its expressive power.
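As a quick sanity check, the layer can be exercised with random tensors. This is a minimal sketch that assumes the MultiHeadAttention class defined above; the batch size, sequence length, and dimensions are arbitrary illustrative values:

# Illustrative sizes only.
mha = MultiHeadAttention(d_model=512, num_heads=8)
x = torch.randn(2, 10, 512)   # (batch_size, seq_length, d_model)
out = mha(x, x, x)            # self-attention: Q, K, and V are the same tensor
print(out.shape)              # expected: torch.Size([2, 10, 512])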
class PositionWiseFeedForward(nn.Module):
'''
The class is a subclass of PyTorch's nn.Module, which means it will inherit all
functionalities required to work with neural network layers.
'''
def __init__(self, d_model, d_ff):
'''
d_model: Dimensionality of the model's input and output.
d_ff: Dimensionality of the inner layer in the feed-forward network.
self.fc1 and self.fc2: Two fully connected (linear) layers with input and output
dimensions as defined by d_model and d_ff.
self.relu: ReLU (Rectified Linear Unit) activation function, which introduces non-
linearity between the two linear layers.
'''
super(PositionWiseFeedForward, self).__init__()
self.fc1 = nn.Linear(d_model, d_ff)
self.fc2 = nn.Linear(d_ff, d_model)
self.relu = nn.ReLU()
def forward(self, x):
'''
x: The input to the feed-forward network.
self.fc1(x): The input is first passed through the first linear layer (fc1).
self.relu(...): The output of fc1 is then passed through a ReLU activation function.
ReLU replaces all negative values with zeros, introducing non-linearity into the model.
self.fc2(...): The activated output is then passed through the second linear layer
(fc2), producing the final output.
'''
return self.fc2(self.relu(self.fc1(x)))
The PositionWiseFeedForward class defines a position-wise feed-forward neural network consisting of two linear layers with a ReLU activation in between. In the context of the Transformer model, this feed-forward network is applied to each position separately and identically. It helps transform the features learned by the attention mechanism within the Transformer, acting as an additional processing step on the attention output.
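A short usage sketch (with arbitrary example dimensions, assuming the class above) shows that the block preserves the (batch_size, seq_length, d_model) shape while expanding to d_ff internally:

# Illustrative sizes only.
ffn = PositionWiseFeedForward(d_model=512, d_ff=2048)
x = torch.randn(2, 10, 512)
print(ffn(x).shape)   # expected: torch.Size([2, 10, 512])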
class PositionalEncoding(nn.Module):
'''
The class is defined as a subclass of PyTorch's nn.Module, allowing it to be used
as a standard PyTorch layer.
'''
def __init__(self, d_model, max_seq_length):
'''
d_model: The dimension of the model's input.
max_seq_length: The maximum length of the sequence for which positional encodings are
pre-computed.
pe: A tensor filled with zeros, which will be populated with positional encodings.
position: A tensor containing the position indices for each position in the sequence.
div_term: A term used to scale the position indices in a specific way.
The sine function is applied to the even indices and the cosine function to the odd
indices of pe.
Finally, pe is registered as a buffer, which means it will be part of the module's state
but will not be considered a trainable parameter.
'''
super(PositionalEncoding, self).__init__()
pe = torch.zeros(max_seq_length, d_model)
position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) /
d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe.unsqueeze(0))
def forward(self, x):
'''
The forward method simply adds the positional encodings to the input x.
It uses the first x.size(1) elements of pe to ensure that the positional
encodings match the actual sequence length of x.
'''
return x + self.pe[:, :x.size(1)]
The PositionalEncoding class adds information about the position of each token in the sequence. Because the Transformer model has no inherent notion of token order (a consequence of its self-attention mechanism), positional encodings help the model take a token's position in the sequence into account. The sinusoidal functions are chosen so that the model can easily learn to attend to relative positions, since they produce a unique and smooth encoding for every position in the sequence.
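The encoding can also be inspected directly. The sketch below (with illustrative sizes, assuming the class above) shows that the registered buffer has shape (1, max_seq_length, d_model) and that the forward pass only consumes the first seq_length positions:

pos_enc = PositionalEncoding(d_model=512, max_seq_length=100)
print(pos_enc.pe.shape)   # torch.Size([1, 100, 512])
x = torch.randn(2, 10, 512)
print(pos_enc(x).shape)   # torch.Size([2, 10, 512]); encodings for the first 10 positions are added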
class EncoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout):
'''
d_model: The dimensionality of the input.
num_heads: The number of attention heads in the multi-head attention.
d_ff: The dimensionality of the inner layer in the position-wise feed-forward network.
dropout: The dropout rate used for regularization.
self.self_attn: Multi-head attention mechanism.
self.feed_forward: Position-wise feed-forward neural network.
self.norm1 and self.norm2: Layer normalization, applied to smooth the layer's input.
self.dropout: Dropout layer, used to prevent overfitting by randomly setting some
activations to zero during training.
'''
super(EncoderLayer, self).__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask):
'''
x: The input to the encoder layer.
mask: Optional mask to ignore certain parts of the input.
Self-Attention: The input x is passed through the multi-head self-attention mechanism.
Add & Normalize (after Attention): The attention output is added to the original input
(residual connection), followed by dropout and normalization using norm1.
Feed-Forward Network: The output from the previous step is passed through the position-
wise feed-forward network.
Add & Normalize (after Feed-Forward): Similar to step 2, the feed-forward output is
added to the input of this stage (residual connection), followed by dropout and
normalization using norm2.
Output: The processed tensor is returned as the output of the encoder layer.
'''
attn_output = self.self_attn(x, x, x, mask)
x = self.norm1(x + self.dropout(attn_output))
ff_output = self.feed_forward(x)
x = self.norm2(x + self.dropout(ff_output))
return x
The EncoderLayer class defines a single encoder layer of the Transformer. It encapsulates a multi-head self-attention mechanism and a position-wise feed-forward network, applying residual connections, layer normalization, and dropout as needed. Together, these components allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks. Typically, several such encoder layers are stacked to form the complete encoder part of the Transformer model.
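A single encoder layer can be tried in isolation. In this minimal sketch (illustrative sizes, assuming the class above), the mask argument is passed as None since there is no padding to ignore:

enc_layer = EncoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.1)
x = torch.randn(2, 10, 512)
print(enc_layer(x, mask=None).shape)   # expected: torch.Size([2, 10, 512])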
class DecoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout):
'''
d_model: The dimensionality of the input.
num_heads: The number of attention heads in the multi-head attention.
d_ff: The dimensionality of the inner layer in the feed-forward network.
dropout: The dropout rate for regularization.
self.self_attn: Multi-head self-attention mechanism for the target sequence.
self.cross_attn: Multi-head attention mechanism that attends to the encoder's output.
self.feed_forward: Position-wise feed-forward neural network.
self.norm1, self.norm2, self.norm3: Layer normalization components.
self.dropout: Dropout layer for regularization.
'''
super(DecoderLayer, self).__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.cross_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, enc_output, src_mask, tgt_mask):
'''
x: The input to the decoder layer.
enc_output: The output from the corresponding encoder (used in the cross-attention
step).
src_mask: Source mask to ignore certain parts of the encoder's output.
tgt_mask: Target mask to ignore certain parts of the decoder's input.
Self-Attention on Target Sequence: The input x is processed through a self-attention
mechanism.
Add & Normalize (after Self-Attention): The output from self-attention is added to the
original x, followed by dropout and normalization using norm1.
Cross-Attention with Encoder Output: The normalized output from the previous step is
processed through a cross-attention mechanism that attends to the encoder's output
enc_output.
Add & Normalize (after Cross-Attention): The output from cross-attention is added to the
input of this stage, followed by dropout and normalization using norm2.
Feed-Forward Network: The output from the previous step is passed through the feed-
forward network.
Add & Normalize (after Feed-Forward): The feed-forward output is added to the input of
this stage, followed by dropout and normalization using norm3.
Output: The processed tensor is returned as the output of the decoder layer.
'''
attn_output = self.self_attn(x, x, x, tgt_mask)
x = self.norm1(x + self.dropout(attn_output))
attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
x = self.norm2(x + self.dropout(attn_output))
ff_output = self.feed_forward(x)
x = self.norm3(x + self.dropout(ff_output))
return x
The DecoderLayer class defines a single decoder layer of the Transformer. It consists of a multi-head self-attention mechanism, a multi-head cross-attention mechanism (which attends to the encoder's output), a position-wise feed-forward network, and the corresponding residual connections, layer normalization, and dropout layers. This combination enables the decoder to generate meaningful output based on the encoder's representation while taking both the target and source sequences into account. As with the encoder, multiple decoder layers are typically stacked to form the complete decoder part of the Transformer model.
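The decoder layer needs an encoder output in addition to its own input. The sketch below (illustrative sizes, assuming the class above) feeds random tensors and passes None for both masks:

dec_layer = DecoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.1)
tgt = torch.randn(2, 12, 512)       # decoder input
enc_out = torch.randn(2, 10, 512)   # stand-in for the encoder stack's output
print(dec_layer(tgt, enc_out, src_mask=None, tgt_mask=None).shape)   # torch.Size([2, 12, 512])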
class Transformer(nn.Module):
def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff,
max_seq_length, dropout):
'''
src_vocab_size: Source vocabulary size.
tgt_vocab_size: Target vocabulary size.
d_model: The dimensionality of the model's embeddings.
num_heads: Number of attention heads in the multi-head attention mechanism.
num_layers: Number of layers for both the encoder and the decoder.
d_ff: Dimensionality of the inner layer in the feed-forward network.
max_seq_length: Maximum sequence length for positional encoding.
dropout: Dropout rate for regularization.
self.encoder_embedding: Embedding layer for the source sequence.
self.decoder_embedding: Embedding layer for the target sequence.
self.positional_encoding: Positional encoding component.
self.encoder_layers: A list of encoder layers.
self.decoder_layers: A list of decoder layers.
self.fc: Final fully connected (linear) layer mapping to target vocabulary size.
self.dropout: Dropout layer.
'''
super(Transformer, self).__init__()
self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for
_ in range(num_layers)])
self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for
_ in range(num_layers)])
self.fc = nn.Linear(d_model, tgt_vocab_size)
self.dropout = nn.Dropout(dropout)
def generate_mask(self, src, tgt):
'''
This method is used to create masks for the source and target sequences,
ensuring that padding tokens are ignored and that future tokens are not
visible during training for the target sequence.
'''
src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
seq_length = tgt.size(1)
nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
tgt_mask = tgt_mask & nopeak_mask
return src_mask, tgt_mask
def forward(self, src, tgt):
'''
This method defines the forward pass for the Transformer, taking source and target
sequences and producing the output predictions.
Input Embedding and Positional Encoding: The source and target sequences are first
embedded using their respective embedding layers and then added to their positional
encodings.
Encoder Layers: The source sequence is passed through the encoder layers, with the
final encoder output representing the processed source sequence.
Decoder Layers: The target sequence and the encoder's output are passed through the
decoder layers, resulting in the decoder's output.
Final Linear Layer: The decoder's output is mapped to the target vocabulary size
using a fully connected (linear) layer.
'''
src_mask, tgt_mask = self.generate_mask(src, tgt)
src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
enc_output = src_embedded
for enc_layer in self.encoder_layers:
enc_output = enc_layer(enc_output, src_mask)
dec_output = tgt_embedded
for dec_layer in self.decoder_layers:
dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
output = self.fc(dec_output)
return output
The Transformer class brings together the components of the Transformer model, including the embeddings, positional encoding, encoder layers, and decoder layers. It provides a convenient interface for training and inference, encapsulating the complexities of multi-head attention, feed-forward networks, and layer normalization. This implementation follows the standard Transformer architecture, making it suitable for sequence-to-sequence tasks such as machine translation and text summarization. The masking ensures that the model respects causal dependencies within a sequence, ignoring padding tokens and preventing information from leaking in from future tokens. Taken together, these steps allow the Transformer model to process an input sequence and produce the corresponding output sequence.
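To see the full model in action, here is a minimal training sketch on random token data, assuming the Transformer class above. The hyperparameters, the shift-by-one teacher forcing, and the use of CrossEntropyLoss with ignore_index=0 (so the loss skips padding positions) are illustrative choices, not requirements of the code above:

# Illustrative hyperparameters only; kept small so the sketch runs quickly on CPU.
src_vocab_size, tgt_vocab_size = 1000, 1000
model = Transformer(src_vocab_size, tgt_vocab_size, d_model=128, num_heads=8,
                    num_layers=2, d_ff=512, max_seq_length=32, dropout=0.1)

# Random integer "sentences"; index 0 is treated as padding by generate_mask.
src_data = torch.randint(1, src_vocab_size, (8, 32))   # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (8, 32))

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

model.train()
for epoch in range(3):   # a few steps just to see the loss move
    optimizer.zero_grad()
    # Teacher forcing: the decoder input is the target shifted right by one position.
    output = model(src_data, tgt_data[:, :-1])
    loss = criterion(output.reshape(-1, tgt_vocab_size), tgt_data[:, 1:].reshape(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, loss: {loss.item():.4f}")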