Yesterday we finished implementing tokenizer training. With the foundations laid over the past few days, today we finally get to train a real LLM!
Our goal is to build a mini LLaMA2 of roughly 215 million parameters (the same size as the base_model_215M checkpoint used later in this post). It is tiny compared with GPT-4 or LLaMA2-70B, but it is enough to walk through the complete training pipeline of a large model.
We use two open-source Chinese datasets:
pip install datasets modelscope
import os

# Pre-training data (the Seq-Monkey general corpus)
os.system("modelscope download --dataset ddzhu123/seq-monkey --local_dir data")
os.system("tar -xvf data/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 -C data")

# SFT data (BelleGroup's 3.5M Chinese instruction conversations)
os.system("huggingface-cli download --repo-type dataset --resume-download BelleGroup/train_3.5M_CN --local-dir BelleGroup")
Split the long texts into small chunks of about 512 characters, so that each sample roughly fits the 512-token context we will train with.
import json

def split_text(text, chunk_size=512):
    """Split a long string into fixed-size character chunks."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

input_file = 'data/mobvoi_seq_monkey_general_open_corpus.jsonl'

with open('seq_monkey_datawhale.jsonl', 'a', encoding='utf-8') as pretrain:
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            text = json.loads(line)['text']
            for chunk in split_text(text):
                pretrain.write(json.dumps({'text': chunk}, ensure_ascii=False) + '\n')
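A quick check of the helper's behaviour on a made-up 1,200-character string (purely for illustration):
# A 1,200-character string becomes chunks of 512, 512 and 176 characters.
demo = "甲" * 1200
print([len(chunk) for chunk in split_text(demo)])  # [512, 512, 176]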
Convert the records into standard multi-turn conversation JSON so they can be fed straight into the SFT stage.
def convert_message(data):
    """Convert BelleGroup-style conversations into role/content messages."""
    message = [{"role": "system", "content": "你是一個AI助理"}]
    for item in data:
        if item['from'] == 'human':
            message.append({'role': 'user', 'content': item['value']})
        elif item['from'] == 'assistant':
            message.append({'role': 'assistant', 'content': item['value']})
    return message

with open('BelleGroup_sft.jsonl', 'a', encoding='utf-8') as sft:
    with open('BelleGroup/train_3.5M_CN.json', 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            message = convert_message(item['conversations'])
            sft.write(json.dumps(message, ensure_ascii=False) + '\n')
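For example, a single human/assistant exchange (a toy record I made up in the BelleGroup format) comes out as a three-message conversation:
example = [
    {"from": "human", "value": "你好"},
    {"from": "assistant", "value": "你好,有什麼可以幫你的嗎?"},
]
print(convert_message(example))
# [{'role': 'system', 'content': '你是一個AI助理'},
#  {'role': 'user', 'content': '你好'},
#  {'role': 'assistant', 'content': '你好,有什麼可以幫你的嗎?'}]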
import json
import torch
from torch.utils.data import Dataset

class PretrainDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_length=512):
        self.data = open(data_path, 'r', encoding='utf-8').readlines()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_id = tokenizer.eos_token_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = json.loads(self.data[index])
        # Prepend BOS, tokenize, and truncate to the maximum context length.
        text = f"{self.tokenizer.bos_token}{sample['text']}"
        ids = self.tokenizer(text).data['input_ids'][:self.max_length]
        # Pad to max_length; padded positions are excluded from the loss.
        pad_len = self.max_length - len(ids)
        ids = ids + [self.pad_id] * pad_len
        loss_mask = [1] * (len(ids) - pad_len) + [0] * pad_len
        # Next-token prediction: Y is X shifted left by one position.
        X = ids[:-1]
        Y = ids[1:]
        loss_mask = loss_mask[1:]
        return (
            torch.tensor(X),
            torch.tensor(Y),
            torch.tensor(loss_mask)
        )
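A minimal sanity check, assuming the tokenizer from ./tokenizer_k/ (loaded again further below) and the preprocessed JSONL are available: every sample is a fixed-length shifted pair plus its loss mask.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer_k/")
ds = PretrainDataset("seq_monkey_datawhale.jsonl", tokenizer, max_length=512)
X, Y, loss_mask = ds[0]
print(X.shape, Y.shape, loss_mask.shape)  # each is torch.Size([511])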
class SFTDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_length=512):
        self.data = open(data_path, 'r', encoding='utf-8').readlines()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_id = tokenizer.eos_token_id

    def __len__(self):
        return len(self.data)

    def generate_loss_mask(self, ids):
        # Only the assistant's replies contribute to the loss.
        mask = [0] * len(ids)
        # Token ids of "<|im_start|>assistant\n" in our tokenizer
        # (adjust these to match your actual tokenizer).
        a_sequence = [3, 1074, 537, 500, 203]
        end_id = 4  # <|im_end|>
        n = len(ids)
        i = 0
        while i <= n - len(a_sequence):
            if ids[i:i + len(a_sequence)] == a_sequence:
                start = i + len(a_sequence)
                # Mask up to and including the next <|im_end|>; if the reply was
                # truncated and has no end token, mask to the end of the sequence.
                j = ids.index(end_id, start) if end_id in ids[start:] else n - 1
                for pos in range(start, j + 1):
                    mask[pos] = 1
                i = j + 1
            else:
                i += 1
        return mask

    def __getitem__(self, index):
        sample = json.loads(self.data[index])
        # Render the conversation with the chat template, then tokenize and truncate.
        text = self.tokenizer.apply_chat_template(sample, tokenize=False)
        ids = self.tokenizer(text).data['input_ids'][:self.max_length]
        # Build the loss mask before padding so padded positions stay at 0.
        loss_mask = self.generate_loss_mask(ids)
        pad_len = self.max_length - len(ids)
        ids = ids + [self.pad_id] * pad_len
        loss_mask = loss_mask + [0] * pad_len
        X = ids[:-1]
        Y = ids[1:]
        loss_mask = loss_mask[1:]
        return (
            torch.tensor(X),
            torch.tensor(Y),
            torch.tensor(loss_mask)
        )
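To see what the mask does, here is a check on a hand-made id sequence (using the hypothetical special-token ids from the comment above, not real vocabulary entries): only the tokens between the assistant marker and <|im_end|> get a 1.
# 3,1074,537,500,203 = "<|im_start|>assistant\n", 4 = "<|im_end|>", 9 = filler
fake_ids = [9, 9, 3, 1074, 537, 500, 203, 42, 43, 4, 9]
print(SFTDataset.generate_loss_mask(None, fake_ids))  # self is unused by this method
# [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]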
import math
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from k_model import Transformer, ModelConfig

# 1. Initialize the tokenizer we trained earlier
tokenizer = AutoTokenizer.from_pretrained("./tokenizer_k/")

# 2. Prepare the Dataset & DataLoader
train_ds = PretrainDataset("seq_monkey_datawhale.jsonl", tokenizer, max_length=512)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True)

# 3. Initialize the model
lm_config = ModelConfig(
    dim=1024,
    n_layers=18,
    vocab_size=len(tokenizer)  # keep the vocabulary size in sync with the tokenizer
)
model = Transformer(lm_config)
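# Sanity check on the size claim: count the parameters of the instantiated model.
num_params = sum(p.numel() for p in model.parameters())
print(f"model parameters: {num_params / 1e6:.1f}M")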
# 4. Pick the training device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# 5. Optimizer
optimizer = optim.AdamW(model.parameters(), lr=2e-4)
# Learning-rate schedule: linear warmup followed by cosine decay down to lr / 10.
def get_lr(it, all_iters, lr=2e-4, warmup_iters=1000):
    if it < warmup_iters:
        return lr * it / warmup_iters
    if it > all_iters:
        return lr / 10
    ratio = (it - warmup_iters) / (all_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * ratio))
    return lr / 10 + coeff * (lr - lr / 10)
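# A quick look at the schedule: over 10,000 steps the learning rate warms up
# linearly to 2e-4 by step 1,000, then decays along a cosine curve to 2e-5.
print([round(get_lr(s, 10000), 6) for s in (0, 500, 1000, 5000, 10000)])
# -> approximately [0.0, 0.0001, 0.0002, 0.000126, 2e-05]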
# Single-epoch training loop with the warmup + cosine schedule applied each step.
total_iters = len(train_loader)
for step, (X, Y, loss_mask) in enumerate(train_loader):
    X, Y, loss_mask = X.to(device), Y.to(device), loss_mask.to(device)
    # Update the learning rate according to the schedule defined above.
    lr = get_lr(step, total_iters)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    out = model(X, Y)
    # Average the per-token loss over the unmasked (non-padding) positions.
    loss = (out.last_loss.view(-1) * loss_mask.view(-1)).sum() / loss_mask.sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if step % 100 == 0:
        print(f"step {step}, loss {loss.item():.4f}")
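The loop above never writes the weights to disk, so before moving on to generation we save a checkpoint. The directory and file name here simply mirror the checkpoint path that TextGenerator loads below; adjust them if you save elsewhere.
import os

os.makedirs('./base_model_215M', exist_ok=True)
torch.save(model.state_dict(), './base_model_215M/pretrain_1024_18_6144.pth')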
from transformers import AutoTokenizer
import torch
from k_model import Transformer, ModelConfig

class TextGenerator:
    def __init__(self, checkpoint, tokenizer_model_path="./tokenizer_k/", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_path)
        # Initialize the model with the same config used for training
        lm_config = ModelConfig(
            dim=1024,
            n_layers=18,
            vocab_size=len(self.tokenizer)
        )
        self.model = Transformer(lm_config).to(self.device)
        self.model.load_state_dict(torch.load(checkpoint, map_location=self.device), strict=False)
        self.model.eval()

    def pretrain_sample(self, prompt, max_new_tokens=100, temperature=0.7):
        # Plain continuation from a raw text prompt (base-model behaviour).
        ids = self.tokenizer(prompt).input_ids
        x = torch.tensor([ids], dtype=torch.long, device=self.device)
        with torch.no_grad():
            y = self.model.generate(x, max_new_tokens=max_new_tokens, temperature=temperature)
        return self.tokenizer.decode(y[0].tolist())

    def sft_sample(self, prompt, max_new_tokens=100, temperature=0.7):
        # Wrap the prompt with the chat template before generating (chat behaviour).
        message = [
            {"role": "system", "content": "你是一個AI助理"},
            {"role": "user", "content": prompt}
        ]
        text = self.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
        ids = self.tokenizer(text).input_ids
        x = torch.tensor([ids], dtype=torch.long, device=self.device)
        with torch.no_grad():
            y = self.model.generate(x, max_new_tokens=max_new_tokens, temperature=temperature)
        return self.tokenizer.decode(y[0].tolist())
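Before sampling, it can help to look at the exact prompt string the chat template produces; the template was set up when we built the tokenizer and uses ChatML-style <|im_start|> / <|im_end|> markers, as assumed in the SFT loss mask earlier.
tok = AutoTokenizer.from_pretrained("./tokenizer_k/")
messages = [
    {"role": "system", "content": "你是一個AI助理"},
    {"role": "user", "content": "你好呀"},
]
# Prints the rendered prompt, ending with the assistant header the model should continue.
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))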
generator = TextGenerator(checkpoint='./base_model_215M/pretrain_1024_18_6144.pth')
# Test the pre-trained base model with a raw text prompt
print(generator.pretrain_sample("<|im_start|>大學是", max_new_tokens=100))
# Test the SFT model (load an SFT checkpoint instead of the base one for this)
print(generator.sft_sample("你好呀", max_new_tokens=50))
One thing to keep in mind: the original author trained this on 8 RTX 4090s and it took about 46 hours. The author also suggests lowering the batch size if you are short on VRAM (in their test, batch size 4 needed only about 7 GB), so scale the training to whatever your hardware can handle!
Reference:
https://datawhalechina.github.io/happy-llm/#/