import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
from model import Transformer, ModelArgs
from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer
from dataset import PretrainDataset
import logging
from tqdm import tqdm
#To run with DDP on 4 gpus on 1 node, example:
# torchrun --standalone --nproc_per_node=4 pretrain.py OR python -m torch.distributed.launch --nproc_per_node=4 pretrain.py
def get_logger(filename, verbosity=1, name=None):
    """Create a logger that writes to both a log file and the console.

    Args:
        filename: Path of the log file. It is opened in ``"w"`` mode, so an
            existing file is truncated.
        verbosity: 0 for DEBUG, 1 for INFO, 2 for WARNING.
        name: Logger name passed to ``logging.getLogger``; ``None`` selects
            the root logger.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        KeyError: If ``verbosity`` is not 0, 1 or 2.
    """
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    # File handler: persists the run log next to the checkpoints.
    fh = logging.FileHandler(filename, "w")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Stream handler: mirrors the same records to the console.
    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    return logger
# -----------------------------------------------------------------------------
def get_lr(it):
    """Return the learning rate for global iteration ``it``.

    Reads the module-level settings ``warmup_iters``, ``lr_decay_iters``,
    ``learning_rate`` and ``min_lr``. The schedule has three phases:
    linear warmup, cosine decay, then a constant floor at ``min_lr``.
    """
    # Phase 1: linear ramp from 0 up to learning_rate.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters

    # Phase 3: past the decay horizon the rate stays at the floor.
    if it > lr_decay_iters:
        return min_lr

    # Phase 2: cosine interpolation between learning_rate and min_lr.
    decay_span = lr_decay_iters - warmup_iters
    decay_ratio = (it - warmup_iters) / decay_span
    assert 0 <= decay_ratio <= 1
    cosine = math.cos(math.pi * decay_ratio)
    coeff = 0.5 * (1.0 + cosine)  # goes from 1 down to 0 over the decay span
    return min_lr + coeff * (learning_rate - min_lr)
def train_epoch(epoch, start_step, ckpt_path):
    """Run one training epoch over ``train_loader``.

    Works on module-level state created in the ``__main__`` section:
    ``train_loader``, ``model``, ``raw_model``, ``optimizer``, ``scaler``,
    ``ctx``, ``device``, ``logger``, ``model_args``, ``master_process`` and
    the hyperparameters.

    Args:
        epoch: Index of the epoch being trained; combined with
            ``iter_per_epoch`` to form the global iteration passed to
            ``get_lr``.
        start_step: Batches whose index is ``<= start_step`` are skipped, so
            a run resumed from a checkpoint does not repeat them.
        ckpt_path: File the periodic checkpoint is written to.
    """
    # Progress bar over the training batches.
    train_loader_progress = tqdm(train_loader, dynamic_ncols=True, mininterval=log_interval)
    for step, (X, Y) in enumerate(train_loader_progress):
        # NOTE(review): with start_step == 0 this also skips batch 0 of a
        # fresh epoch; the checkpoint format cannot tell the two cases apart.
        if step <= start_step:
            continue

        X = X.to(device)
        Y = Y.to(device)

        # Set the learning rate for this iteration on every param group.
        lr = get_lr(epoch * iter_per_epoch + step) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        if ddp:
            # In DDP training gradients only need syncing at the last micro
            # step. The official way is the model.no_sync() context manager;
            # that context manager just toggles this attribute.
            model.require_backward_grad_sync = 0 == gradient_accumulation_steps - 1

        with ctx:
            logits = model(X, Y)
            loss = raw_model.last_loss

        # Backward pass, with gradient scaling if training in fp16.
        scaler.scale(loss).backward()

        # Clip the gradient; unscale first so the threshold applies to the
        # true gradient magnitudes rather than the scaled ones.
        if grad_clip != 0.0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Step the optimizer and the scaler (the scaler is a no-op unless fp16).
        scaler.step(optimizer)
        scaler.update()

        # Flush the gradients as soon as possible; the memory is no longer needed.
        optimizer.zero_grad(set_to_none=True)

        # Only the master process logs and checkpoints, so DDP ranks do not
        # write the same files concurrently.
        if step % log_interval == 0 and master_process:
            logger.info(
                f'Epoch:[{epoch}/{max_epoch}] Step:[{step}/{iter_per_epoch}] '
                f'Loss: {loss.item():.3f} LR: {optimizer.param_groups[-1]["lr"]:.7f}'
            )
            save_ckpt(model, model_args, step, 1, epoch, ckpt_path)
def valid_epoch(epoch):
    """Evaluate the model on ``val_loader`` and track the best loss so far.

    Uses the module-level ``model``, ``raw_model``, ``val_loader``, ``ctx``,
    ``device`` and ``logger``, and updates the global ``best_val_loss`` when
    the mean validation loss improves.

    Args:
        epoch: Index of the epoch being evaluated (used only for logging).

    Returns:
        The mean loss over all validation batches.
    """
    global best_val_loss
    losses = []

    model.eval()
    with torch.no_grad():  # no gradients are needed for evaluation
        for X, Y in val_loader:
            X = X.to(device)
            Y = Y.to(device)
            with ctx:
                # Same convention as train_epoch: the forward call returns the
                # logits and the loss is exposed as ``last_loss``.
                # NOTE(review): confirm against model.Transformer.forward.
                model(X, Y)
                loss = raw_model.last_loss
            losses.append(loss.item())
    model.train()

    val_loss = np.mean(losses)
    logger.info('valid loss = {:.4f}'.format(val_loss))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        logger.info('best val_loss: {} best_epoch: {} '.format(best_val_loss,epoch))
    return val_loss
def init_model(init_from, ckpt_path=None):
    """Build the Transformer, freshly initialised or restored from a checkpoint.

    Args:
        init_from: ``"scratch"`` to create a new model, ``"resume"`` to
            restore weights and progress from ``ckpt_path``.
        ckpt_path: Checkpoint file to load; required when
            ``init_from == "resume"``.

    Returns:
        A ``(model, model_args, step)`` tuple: the model, the keyword
        arguments it was built with, and the step to resume from (0 for a
        fresh model).

    Raises:
        ValueError: If ``init_from`` is neither ``"scratch"`` nor ``"resume"``.
    """
    # Start with the model hyperparameters from the module-level configuration.
    # NOTE(review): ``n_kv_heads`` and ``vocab_size`` are not visible in this
    # excerpt; the values below are assumptions to be confirmed against the
    # tokenizer and ModelArgs.
    model_args = dict(
        dim=dim,
        n_layers=n_layers,
        n_heads=n_heads,
        n_kv_heads=n_heads,
        vocab_size=64793,
        multiple_of=multiple_of,
        max_seq_len=max_seq_len,
        dropout=dropout,
    )

    if init_from == "scratch":
        # Init a new model from scratch.
        print("Initializing a new model from scratch")
        gptconf = ModelArgs(**model_args)
        model = Transformer(gptconf)
        step = 0
    elif init_from == "resume":
        print(f"Resuming training from {out_dir}")
        # Resume training from a checkpoint.
        checkpoint = torch.load(ckpt_path, map_location=device)
        checkpoint_model_args = checkpoint["model_args"]
        # Force these config attributes to match the checkpoint, otherwise the
        # weights cannot be loaded. The rest (e.g. dropout) stay as configured.
        for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
            model_args[k] = checkpoint_model_args[k]
        # Create the model.
        gptconf = ModelArgs(**model_args)
        model = Transformer(gptconf)

        # Checkpoints are written from the wrapped training model, so keys may
        # carry the DDP ("module.") and/or torch.compile ("_orig_mod.") prefix.
        # Strip them so the bare Transformer accepts the weights.
        wrapper_prefixes = ("module.", "_orig_mod.")
        state_dict = {}
        for key, value in checkpoint["model"].items():
            while key.startswith(wrapper_prefixes):
                for prefix in wrapper_prefixes:
                    if key.startswith(prefix):
                        key = key[len(prefix):]
            state_dict[key] = value
        model.load_state_dict(state_dict)
        step = checkpoint["step"]
    else:
        raise ValueError(
            f"unsupported init_from={init_from!r}; expected 'scratch' or 'resume'"
        )
    return model, model_args, step
def save_ckpt(model, model_args, step, best_val_loss, epoch, ckpt_path):
    """Write a training checkpoint to ``ckpt_path``.

    Args:
        model (torch.nn.Module): Model to save; its ``state_dict()`` is stored.
        model_args: Arguments the model was built with, stored unchanged.
        step (int): Current iteration count.
        best_val_loss (float): Best validation loss.
        epoch (int): Current training epoch.
        ckpt_path (str): Path of the checkpoint file.
    """
    checkpoint = {
        'model': model.state_dict(),
        'model_args': model_args,
        'step': step,
        'best_val_loss': best_val_loss,
        'epoch': epoch,
    }
    # Write to a sibling temp file and rename it into place, so an interrupted
    # save never leaves a truncated checkpoint for the next resume to load.
    tmp_path = f'{ckpt_path}.tmp'
    torch.save(checkpoint, tmp_path)
    os.replace(tmp_path, ckpt_path)
    print(f'Checkpoint saved to {ckpt_path}')
# I/O
if __name__ == "__main__":
    # -------------------------------------------------------------------------
    # Configuration. These names are read as module globals by get_lr,
    # train_epoch, valid_epoch and init_model.
    # -------------------------------------------------------------------------
    out_dir = 'testout'
    max_epoch = 10
    eval_interval = 1
    log_interval = 100
    eval_iters = 200
    eval_only = False  # if True, script exits right after the first eval
    always_save_checkpoint = True  # if True, always save a checkpoint after each eval
    init_from = 'scratch'  # 'scratch' or 'resume' or 'gpt2*'
    gradient_accumulation_steps = 1  # used to simulate larger batch sizes
    batch_size = 32  # if gradient_accumulation_steps > 1, this is the micro-batch size
    # model (adjust as needed)
    max_seq_len = 512
    dim = 512
    n_layers = 8
    n_heads = 8
    multiple_of = 32
    dropout = 0.0  # for pretraining 0 is good, for finetuning try 0.1+
    bias = False  # do we use bias inside LayerNorm and Linear layers?
    # adamw optimizer
    learning_rate = 3e-4  # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.95
    grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
    # learning rate decay settings
    decay_lr = True  # whether to decay the learning rate
    warmup_iters = 1000  # how many steps to warm up for
    lr_decay_iters = 80000  # should be ~= max_iters per Chinchilla
    min_lr = 1e-5  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
    # DDP settings
    backend = 'nccl'  # 'nccl', 'gloo', etc.
    # system
    device = 'cuda'  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
    dtype = 'float16'  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    compile = False  # use PyTorch 2.0 to compile the model to be faster
    start_step = 0
    start_epoch = 0

    save_dir = os.path.join(out_dir, '20230815_baike_pretrain')
    os.makedirs(save_dir, exist_ok=True)
    logger = get_logger(os.path.join(save_dir, 'log.log'))
    ckpt_path = os.path.join(save_dir, "ckpt.pt")
    # An existing checkpoint switches the run to resume mode automatically.
    if os.path.exists(ckpt_path):
        init_from = "resume"
    # -------------------------------------------------------------------------
    config_keys = [
        k
        for k, v in globals().items()
        if not k.startswith("_") and isinstance(v, (int, float, bool, str))
    ]
    # exec(open("configurator.py").read())  # overrides from command line or config file
    # config = {k: globals()[k] for k in config_keys}  # will be useful for logging
    # -------------------------------------------------------------------------
    # various inits, derived attributes, I/O setup
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    if ddp:
        init_process_group(backend=backend)
        ddp_rank = int(os.environ["RANK"])
        ddp_local_rank = int(os.environ["LOCAL_RANK"])
        ddp_world_size = int(os.environ["WORLD_SIZE"])
        device = f"cuda:{ddp_local_rank}"
        torch.cuda.set_device(device)
        master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
        seed_offset = ddp_rank  # each process gets a different seed
        # world_size number of processes will be training simultaneously, so we can scale
        # down the desired gradient accumulation iterations per process proportionally
        # assert gradient_accumulation_steps % ddp_world_size == 0
        # gradient_accumulation_steps //= ddp_world_size
    else:
        # if not ddp, we are running on a single gpu, and one process
        master_process = True
        seed_offset = 0
        ddp_world_size = 1

    tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len
    if master_process:
        print(f"tokens per iteration will be: {tokens_per_iter:,}")
        print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len")
        os.makedirs(out_dir, exist_ok=True)

    torch.manual_seed(1337 + seed_offset)
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
    device_type = "cuda" if "cuda" in device else "cpu"  # for later use in torch.autocast
    # note: float16 data type will automatically use a GradScaler
    ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]
    # Autocast follows the configured dtype so it stays in step with the
    # GradScaler, which is enabled only for float16.
    ctx = (
        nullcontext()
        if device_type == "cpu"
        else torch.autocast(device_type=device_type, dtype=ptdtype)
    )
    best_val_loss = 1e9

    # ----- init dataloader -----
    # NOTE(review): the active dataset entry is not visible in this excerpt;
    # confirm the path below before running.
    data_path_list = [
        './data/pretrain_data.bin',
        # './data/medical_book.bin',
        # './data/medical_encyclopedia.bin',
        # './data/medical_qa.bin',
        # './data/wiki.bin',
        # './data/baidubaike_563w.bin',
    ]
    train_ds = PretrainDataset(data_path_list, max_length=max_seq_len, memmap=True)
    train_sampler = torch.utils.data.RandomSampler(train_ds)
    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=False,
        shuffle=False,  # ordering is delegated to the sampler
        num_workers=0,
        sampler=train_sampler,
    )
    # val_ds = PretrainDataset(data_path_list, max_length=256)
    # val_loader = torch.utils.data.DataLoader(
    #     val_ds,
    #     batch_size=batch_size,
    #     pin_memory=False,
    #     drop_last=False,
    #     shuffle=False,
    #     num_workers=0,
    # )
    # Number of batches per epoch; train_epoch uses it for the LR schedule
    # and for the progress log.
    iter_per_epoch = len(train_loader)

    # init model
    if init_from == "resume":
        # load the model from the checkpoint when resuming
        # NOTE(review): the checkpoint also stores 'epoch', but init_model does
        # not return it, so a resumed run still starts at start_epoch.
        model, model_args, start_step = init_model(init_from, ckpt_path=ckpt_path)
    else:
        model, model_args, start_step = init_model(init_from)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
    # optimizer
    optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
    # NOTE(review): this scheduler is never stepped; train_epoch sets the
    # learning rate directly through get_lr.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=max_epoch, T_mult=1, eta_min=1e-6, last_epoch=-1)

    # compile the model
    if compile:
        print("compiling the model... (takes a ~minute)")
        unoptimized_model = model
        model = torch.compile(model)  # requires PyTorch 2.0

    # wrap model into DDP container
    if ddp:
        # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
        # construction time since NCCL does not support `ComplexFloat`
        prefix = "_orig_mod." if compile else ""
        model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
        model = DDP(model, device_ids=[ddp_local_rank])
    raw_model = model.module if ddp else model  # unwrap DDP container if needed

    # training loop
    for epoch in range(start_epoch, max_epoch):
        train_epoch(epoch, start_step, ckpt_path)
        # val_loss = valid_epoch(epoch)
        # Only the first (possibly resumed) epoch skips batches; later epochs
        # start from the beginning.
        start_step = 0
        if master_process:
            save_ckpt(model, model_args, start_step, 1, epoch + 1, ckpt_path)

    if ddp:
        destroy_process_group()