Transformer Training Code


Training Process

from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import default_data_collator
from transformers import get_scheduler

# set the batch size to 32; use a larger batch size on a more powerful GPU
batch_size = 32

# use the DistilBERT checkpoint
model_checkpoint = "distilbert-base-uncased"
# WordPiece tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# define a tokenize function to tokenize the dataset
def tokenize_function(data):
    result = tokenizer(data["text"])
    return result

# load the IMDB dataset
imdb_data = load_dataset("imdb")

# batched=True lets the fast tokenizer process whole batches at once
tokenize_dataset = imdb_data.map(tokenize_function, batched=True,
                                 remove_columns=["text", "label"])

# data collator that randomly masks 15% of the tokens for masked language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# concat_chunk_dataset (defined in an earlier part of this series) concatenates the
# tokenized texts and splits them into fixed-length chunks
processed_dataset = tokenize_dataset.map(concat_chunk_dataset, batched=True)

# train_size and test_size are assumed to be defined earlier in the series
downsampled_dataset = processed_dataset["train"].train_test_split(train_size=train_size, test_size=test_size, seed=42)

# load the train dataset for training
train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)

# apply random masking once on the whole test set with insert_random_mask (also assumed
# to be defined in an earlier part), then use the default data collator to batch the
# fixed, already-masked evaluation examples
eval_dataset = downsampled_dataset["test"].map(insert_random_mask, batched=True, remove_columns=downsampled_dataset["test"].column_names)
eval_dataset = eval_dataset.rename_columns({"masked_input_ids": "input_ids", "masked_attention_mask": "attention_mask", "masked_labels": "labels"})
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=default_data_collator)

# load the pretrained DistilBERT model with a masked language modeling head
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# set the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# initialize accelerator for training
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# number of training epochs
num_train_epochs = 30
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# define the learning rate scheduler for training
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
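
The code above calls two helper functions, concat_chunk_dataset and insert_random_mask, that are defined earlier in this series and not shown here. The following is a minimal sketch of what they are assumed to do; the chunk length of 128 and the exact key names are assumptions for illustration, not taken from the original post.

chunk_size = 128  # assumed block length used when chunking the corpus

def concat_chunk_dataset(examples):
    # concatenate all tokenized texts in the batch into one long sequence
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    # drop the final partial chunk so every block has exactly chunk_size tokens
    total_length = (total_length // chunk_size) * chunk_size
    # split every field into blocks of chunk_size tokens
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated.items()
    }
    # for masked language modeling the labels start as a copy of the inputs
    result["labels"] = result["input_ids"].copy()
    return result

def insert_random_mask(batch):
    # run the masking collator once so the evaluation set gets a fixed mask
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # prefix the keys with "masked_" so they can be renamed back afterwards
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}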

We set the batch size to 32 and use PyTorch's built-in DataLoader to load the training and test sets. We load the pretrained DistilBERT model and use the AdamW optimizer. We then call the Hugging Face Accelerate library, which takes the pretrained model, the optimizer, and the training and evaluation data loaders and prepares them for training. We set the number of epochs to 30, take the length of the training dataloader, and compute the total number of training steps. Finally, we define the learning rate scheduler, which takes the optimizer, the number of warmup steps, and the total number of training steps.
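
The snippet stops at the scheduler, so the optimization loop itself is not shown. Below is a hedged sketch of the training loop that typically follows this Accelerate setup; the tqdm progress bar and the perplexity evaluation are additions for illustration, not part of the original post.

import math
import torch
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Accelerate handles device placement and gradient scaling
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # evaluation
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        # gather the per-batch losses from every process
        losses.append(accelerator.gather(outputs.loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    # perplexity is the exponential of the mean cross-entropy loss on the held-out set
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")
    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")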

