Day 18 - Baby LLama2 Chinese (12) - iT 邦幫忙::一起幫忙解決難題，拯救 IT 人的一天

2023 iThome 鐵人賽
DAY 18
AI & Data
用單張顯卡探索大型語言模型的奧秘系列第 18 篇

Day 18 - Baby LLama2 Chinese (12)

15th鐵人賽
jjchen1
團隊我在鐵人賽烙賽、也在外木山裸泳◑ω◐
2023-09-19 23:55:22
921 瀏覽
分享至
我剛把 pretrain.py 改成可以從最近的中斷點（checkpoint）繼續訓練：因為跑完一個 epoch 實在太久，我的電腦沒有辦法長時間不間斷地訓練這個網路。以下是修改後的程式碼，目前先試試看它是否有用。
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
from model import Transformer, ModelArgs
from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer

from dataset import PretrainDataset
import logging
from tqdm import tqdm

#To run with DDP on 4 gpus on 1 node, example:
# torchrun --standalone --nproc_per_node=4 pretrain.py OR python -m torch.distributed.launch --nproc_per_node=4 pretrain.py
        
def get_logger(filename, verbosity=1, name=None):
    """Return a logger that writes to both ``filename`` and the console.

    Parameters:
        filename (str): path of the log file; it is opened in "w" mode, so an
            existing file is truncated.
        verbosity (int): 0 -> DEBUG, 1 -> INFO, 2 -> WARNING. Any other value
            raises ``KeyError``.
        name (str | None): logger name passed to ``logging.getLogger``;
            ``None`` selects the root logger.

    Returns:
        logging.Logger: the configured logger.

    Calling this function again for the same logger replaces the handlers a
    previous call installed instead of stacking new ones, so each record is
    emitted once per destination. Handlers attached by other code are left
    untouched.
    """
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    # Remove (and close) only the handlers that an earlier get_logger() call
    # added; otherwise every call would duplicate all log output.
    stale_handlers = [
        h for h in logger.handlers if getattr(h, "_installed_by_get_logger", False)
    ]
    for stale in stale_handlers:
        logger.removeHandler(stale)
        stale.close()

    file_handler = logging.FileHandler(filename, "w")
    stream_handler = logging.StreamHandler()
    for handler in (file_handler, stream_handler):
        handler.setFormatter(formatter)
        # Marker used above to recognise our own handlers on a later call.
        handler._installed_by_get_logger = True
        logger.addHandler(handler)
    return logger
# -----------------------------------------------------------------------------
def get_lr(it):
    """Return the learning rate to use at global iteration ``it``.

    The schedule has three phases, driven by the module-level settings
    ``warmup_iters``, ``lr_decay_iters``, ``learning_rate`` and ``min_lr``:
    a linear ramp from 0 up to ``learning_rate``, a cosine decay from
    ``learning_rate`` down to ``min_lr``, and a constant ``min_lr`` afterwards.
    """
    # Phase 1: linear warmup.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters

    # Phase 3: past the end of the decay window, hold the floor value.
    if it > lr_decay_iters:
        return min_lr

    # Phase 2: cosine interpolation between the peak and the floor.
    elapsed = it - warmup_iters
    window = lr_decay_iters - warmup_iters
    progress = elapsed / window
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # 1 at start, 0 at end
    return min_lr + cosine_weight * (learning_rate - min_lr)

def _save_resume_ckpt(next_step, epoch, ckpt_path):
    """Write a resume checkpoint of the unwrapped model, on the master process only."""
    if master_process:
        # raw_model (not the DDP wrapper) is saved so the state-dict keys match
        # what init_model() loads into a freshly built Transformer.
        save_ckpt(raw_model, model_args, next_step, best_val_loss, epoch, ckpt_path)


def train_epoch(epoch, start_step, ckpt_path):
    """Train for one epoch and checkpoint the progress if the run is cut short.

    Relies on module-level state created by the ``__main__`` section:
    ``train_loader``, ``model``, ``raw_model``, ``optimizer``, ``scaler``,
    ``ctx``, ``device``, ``ddp``, ``master_process``, ``model_args``,
    ``best_val_loss``, ``logger`` and the learning-rate / clipping settings.

    Parameters:
        epoch (int): index of the epoch being trained (drives the LR schedule
            and the progress-bar text).
        start_step (int): number of leading batches of this epoch to skip
            because they were already trained before an interruption.
        ckpt_path (str): where the resume checkpoint is written.

    Behaviour on interruption:
        * ``KeyboardInterrupt`` (Ctrl-C): a checkpoint is saved and the process
          exits with status 0.
        * Any other exception: the error is logged, a checkpoint is saved, and
          the exception is re-raised so the failure is not hidden.

    The ``step`` stored in the checkpoint is the number of batches of this
    epoch that were fully processed, i.e. the value to pass back as
    ``start_step`` when resuming.
    """
    log_interval = 100
    # Count of batches of this epoch that are completely done.
    next_step = start_step
    try:
        # Progress bar over the loader.
        # NOTE(review): tqdm documents `mininterval` in seconds, so the bar is
        # redrawn at most once every `log_interval` seconds - confirm intended.
        train_loader_progress = tqdm(train_loader, dynamic_ncols=True, mininterval=log_interval)

        for step, (X, Y) in enumerate(train_loader_progress):
            # Skip only batches that were already processed; batch 0 of a fresh
            # epoch (start_step == 0) must be trained, not skipped.
            if step < start_step:
                continue
            X = X.to(device)
            Y = Y.to(device)

            # Set this iteration's learning rate on every parameter group.
            lr = get_lr(epoch*iter_per_epoch+step) if decay_lr else learning_rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            if ddp:
                # There is no gradient-accumulation micro-step loop here: every
                # backward pass is followed by an optimizer step, so gradients
                # must be synchronised across ranks on every step.
                model.require_backward_grad_sync = True

            with ctx:
                # The forward pass stores the loss on the unwrapped model.
                logits = model(X, Y)
                loss = raw_model.last_loss

            # Backward pass, with loss scaling when training in fp16.
            scaler.scale(loss).backward()

            # Clip the gradient norm (gradients must be unscaled first).
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            # Step the optimizer and update the loss scale.
            scaler.step(optimizer)
            scaler.update()
            # Free gradient memory as soon as possible.
            optimizer.zero_grad(set_to_none=True)

            if step % log_interval == 0:
                train_loader_progress.set_description(
                    f'Epoch:[{epoch}/{max_epoch}] Step:[{step}/{iter_per_epoch}] '
                    f'Loss: {loss.item():.3f} LR: {optimizer.param_groups[-1]["lr"]:.7f}'
                )
            next_step = step + 1
    except KeyboardInterrupt:
        # Deliberate stop by the user: save progress and leave cleanly.
        _save_resume_ckpt(next_step, epoch, ckpt_path)
        raise SystemExit(0)
    except Exception:
        # Unexpected failure: keep the progress, but surface the error and a
        # non-zero exit status instead of silently exiting.
        logger.exception('training aborted at epoch %s, step %s', epoch, next_step)
        _save_resume_ckpt(next_step, epoch, ckpt_path)
        raise

@torch.no_grad()
def valid_epoch(epoch):
    """Compute the mean loss over ``val_loader`` and keep the best weights.

    Relies on module-level ``model``, ``raw_model``, ``ctx``, ``device``,
    ``logger``, ``save_dir`` and ``val_loader``. Note that the construction of
    ``val_loader`` is commented out in the ``__main__`` section, so it must be
    re-enabled before this function can be called.

    Parameters:
        epoch (int): epoch index, used only in the log message.

    Returns:
        The mean validation loss (NaN if the loader yields no batches).

    Side effects:
        Updates the global ``best_val_loss`` and writes ``best.pth`` into
        ``save_dir`` whenever the mean loss improves.
    """
    global best_val_loss
    losses = []
    model.eval()
    try:
        for X, Y in val_loader:
            X = X.to(device)
            Y = Y.to(device)
            with ctx:
                # Same convention as train_epoch: the forward call returns the
                # logits and leaves the loss on raw_model.last_loss.
                model(X, Y)
                loss = raw_model.last_loss
            losses.append(loss.item())
    finally:
        # Always restore training mode, even if validation fails midway.
        model.train()

    if losses:
        val_loss = np.mean(losses)
    else:
        # np.mean([]) would also give NaN, but with a less helpful warning.
        logger.warning('validation loader produced no batches')
        val_loss = float('nan')
    #
    logger.info('valid loss = {:.4f}'.format(val_loss))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        logger.info('best val_loss: {} best_epoch: {} '.format(best_val_loss,epoch))
        torch.save(raw_model.state_dict(),'{}/best.pth'.format(save_dir))
    #
    return val_loss

def _strip_wrapper_prefixes(state_dict):
    """Return a copy of ``state_dict`` whose keys have wrapper prefixes removed.

    ``"_orig_mod."`` is the prefix a torch.compile'd model puts on its
    parameter names and ``"module."`` the one added by the DDP container;
    checkpoints saved from a wrapped model carry them, while a freshly built
    Transformer expects the bare names.
    NOTE(review): assumes no real parameter name starts with either prefix.
    """
    wrapper_prefixes = ("module.", "_orig_mod.")
    cleaned = {}
    for key, value in state_dict.items():
        changed = True
        while changed:  # prefixes can be nested, e.g. "module._orig_mod.x"
            changed = False
            for prefix in wrapper_prefixes:
                if key.startswith(prefix):
                    key = key[len(prefix):]
                    changed = True
        cleaned[key] = value
    return cleaned


def init_model(init_from, ckpt_path=None):
    """Build the Transformer, either fresh or restored from a checkpoint.

    Reads the model hyperparameters (``dim``, ``n_layers``, ``n_heads``,
    ``multiple_of``, ``max_seq_len``, ``dropout``) plus ``device`` and
    ``out_dir`` from module level.

    Parameters:
        init_from (str): ``"scratch"`` for a new model or ``"resume"`` to load
            the weights stored at ``ckpt_path``.
        ckpt_path (str | None): checkpoint file; required for ``"resume"``.

    Returns:
        tuple: ``(model, model_args, step)`` where ``model_args`` is the dict
        used to build the model and ``step`` is the step counter stored in the
        checkpoint (0 for a new model).

    Raises:
        ValueError: if ``init_from`` is not a supported mode, or ``"resume"``
            is requested without a ``ckpt_path``.
    """
    # Architecture settings taken from the module-level configuration.
    model_args = dict(
        dim=dim,
        n_layers=n_layers,
        n_heads=n_heads,
        n_kv_heads=n_heads,
        vocab_size=64793,
        multiple_of=multiple_of,
        max_seq_len=max_seq_len,
        dropout=dropout,
    )
    if init_from == "scratch":
        print("Initializing a new model from scratch")
        gptconf = ModelArgs(**model_args)
        model = Transformer(gptconf)
        step = 0
    elif init_from == "resume":
        if ckpt_path is None:
            raise ValueError("init_from='resume' requires ckpt_path")
        print(f"Resuming training from {out_dir}")
        checkpoint = torch.load(ckpt_path, map_location=device)
        checkpoint_model_args = checkpoint["model_args"]
        # The architecture must match the checkpoint or the weights cannot be
        # loaded; other settings (e.g. dropout) keep their configured values.
        for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
            model_args[k] = checkpoint_model_args[k]
        gptconf = ModelArgs(**model_args)
        model = Transformer(gptconf)
        # Accept checkpoints that were saved from a DDP-wrapped or compiled
        # model by normalising their parameter names first.
        state_dict = _strip_wrapper_prefixes(checkpoint["model"])
        model.load_state_dict(state_dict)
        step = checkpoint["step"]
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(
            f"unsupported init_from {init_from!r}; expected 'scratch' or 'resume'"
        )
    return model, model_args, step
def save_ckpt(model, model_args, step, best_val_loss, epoch, ckpt_path):
    """Save the model weights and training progress to a checkpoint file.

    The checkpoint is first written to ``ckpt_path + ".tmp"`` and then moved
    into place with ``os.replace``, so an interruption while writing never
    leaves a truncated file at ``ckpt_path`` (a previous checkpoint, if any,
    stays intact).

    Parameters:
        model (torch.nn.Module): model whose ``state_dict()`` is stored.
        model_args (dict): arguments the model was built with.
        step (int): current step counter within the epoch.
        best_val_loss (float): best validation loss so far.
        epoch (int): current training epoch.
        ckpt_path (str): destination path of the checkpoint file.
    """
    checkpoint = {
        'model': model.state_dict(),
        'model_args': model_args,
        'step': step,
        'best_val_loss': best_val_loss,
        'epoch': epoch
    }

    tmp_path = f'{ckpt_path}.tmp'
    try:
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, ckpt_path)
    except BaseException:
        # Also covers Ctrl-C during the write: drop the partial file, keep the
        # old checkpoint, and let the caller see the original error.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f'Checkpoint saved to {ckpt_path}')

# I/O
if __name__ == "__main__":
    # Script entry point: configure the run, build data loader / model /
    # optimizer, then train epoch by epoch, writing a resume checkpoint after
    # every epoch. If a checkpoint already exists in save_dir, training
    # continues from the epoch and step stored in it.
    out_dir = 'testout'
    max_epoch = 10
    eval_interval = 1
    log_interval = 100
    eval_iters = 200
    eval_only = False # if True, script exits right after the first eval
    always_save_checkpoint = True # if True, always save a checkpoint after each eval
    init_from = 'scratch' # 'scratch' or 'resume'; switched to 'resume' below when a checkpoint exists
    #
    gradient_accumulation_steps = 1 # used to simulate larger batch sizes
    batch_size = 32  # if gradient_accumulation_steps > 1, this is the micro-batch size
    # model (change as needed)
    max_seq_len = 512
    dim = 512
    n_layers = 8
    n_heads = 8
    multiple_of = 32
    dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
    bias = False # do we use bias inside LayerNorm and Linear layers?
    # adamw optimizer
    learning_rate = 3e-4 # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.95
    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
    # learning rate decay settings
    decay_lr = True # whether to decay the learning rate
    warmup_iters = 1000 # how many steps to warm up for
    lr_decay_iters = 80000 # should be ~= max_iters per Chinchilla
    min_lr = 1e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
    # DDP settings
    backend = 'nccl' # 'nccl', 'gloo', etc.
    # system
    device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
    dtype = 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    compile = False # use PyTorch 2.0 to compile the model to be faster
    ###
    start_step = 0
    start_epoch = 0
    save_dir =os.path.join(out_dir , '20230815_baike_pretrain')
    # exist_ok avoids a race when several DDP processes start at once.
    os.makedirs(save_dir, exist_ok=True)
    logger = get_logger(os.path.join(save_dir,'log.log'))
    ckpt_path = os.path.join(save_dir, "ckpt.pt")
    # An existing checkpoint means a previous run was interrupted: resume it.
    if os.path.exists(ckpt_path):
        init_from = "resume"
    # -----------------------------------------------------------------------------
    config_keys = [
        k
        for k, v in globals().items()
        if not k.startswith("_") and isinstance(v, (int, float, bool, str))
    ]
    # exec(open("configurator.py").read())  # overrides from command line or config file
    # config = {k: globals()[k] for k in config_keys}  # will be useful for logging
    # -----------------------------------------------------------------------------

    # various inits, derived attributes, I/O setup
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?

    if ddp:
        # Use the configured backend rather than a hard-coded one.
        init_process_group(backend=backend)
        ddp_rank = int(os.environ["RANK"])
        ddp_local_rank = int(os.environ["LOCAL_RANK"])
        ddp_world_size = int(os.environ["WORLD_SIZE"])
        device = f"cuda:{ddp_local_rank}"
        torch.cuda.set_device(device)
        master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
        seed_offset = ddp_rank  # each process gets a different seed
        # world_size number of processes will be training simultaneously, so we can scale
        # down the desired gradient accumulation iterations per process proportionally
        #assert gradient_accumulation_steps % ddp_world_size == 0
        #gradient_accumulation_steps //= ddp_world_size
    else:
        # if not ddp, we are running on a single gpu, and one process
        master_process = True
        seed_offset = 0
        ddp_world_size = 1
    tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len
    if master_process:
        print(f"tokens per iteration will be: {tokens_per_iter:,}")
        print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len")

    if master_process:
        os.makedirs(out_dir, exist_ok=True)
    torch.manual_seed(1337 + seed_offset)
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
    device_type = "cuda" if "cuda" in device else "cpu"  # for later use in torch.autocast
    # note: float16 data type will automatically use a GradScaler
    ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]
    # Autocast follows the configured dtype; with the default 'float16' this is
    # the same as before. It is skipped on CPU and for full-precision float32.
    use_autocast = device_type == "cuda" and ptdtype is not torch.float32
    ctx = (
        torch.autocast(device_type=device_type, dtype=ptdtype)
        if use_autocast
        else nullcontext()
    )
    #
    best_val_loss = 1e9
    #
    #-----init dataloader------
    data_path_list=[
        './data/pretrain_data.bin',
        # './data/medical_book.bin',
        # './data/medical_encyclopedia.bin',
        # './data/medical_qa.bin',
        # './data/wiki.bin',
        # './data/baidubaike_563w.bin',
    ]
    train_ds = PretrainDataset(data_path_list, max_length=max_seq_len,memmap=True)
    # NOTE(review): under DDP every rank draws from the full dataset with its
    # own random order (no DistributedSampler); also, because the order is
    # random, the batches skipped on resume are not the ones seen before the
    # interruption - confirm both are acceptable.
    train_sampler = torch.utils.data.RandomSampler(train_ds)
    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=False,
        shuffle=False,
        num_workers=4,
        sampler=train_sampler
    )
    # val_ds = PretrainDataset(data_path_list, max_length=256)
    # val_loader = torch.utils.data.DataLoader(
    #     val_ds,
    #     batch_size=batch_size,
    #     pin_memory=False,
    #     drop_last=False,
    #     shuffle=False,
    #     num_workers=0,
    # )
    #init model
    if init_from == "resume":
        # load the weights and the step counter from the checkpoint
        model, model_args, start_step = init_model(init_from, ckpt_path=ckpt_path)
        # init_model() does not return the epoch, so read it here; without it
        # a resumed run would restart its epoch counter (and LR schedule) at 0.
        start_epoch = torch.load(ckpt_path, map_location="cpu").get("epoch", 0)
    else:
        model, model_args, start_step = init_model(init_from)
    model.to(device)
    # Handle on the plain module, kept before compile/DDP wrapping so that
    # checkpoints are saved with unprefixed parameter names.
    unoptimized_model = model
    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
    # optimizer
    optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
    # NOTE: this scheduler is created but never stepped; the learning rate is
    # set per iteration from get_lr() inside train_epoch().
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=max_epoch, T_mult=1, eta_min=1e-6, last_epoch=-1)
    iter_per_epoch=len(train_loader)
    warmup_epoch=1

    # compile the model
    if compile:
        print("compiling the model... (takes a ~minute)")
        model = torch.compile(model) # requires PyTorch 2.0
    # wrap model into DDP container
    if ddp:
        # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
        # construction time since NCCL does not support `ComplexFloat`
        prefix = "_orig_mod." if compile else ""
        model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
        model = DDP(model, device_ids=[ddp_local_rank])
        #
    raw_model = model.module if ddp else model # unwrap DDP container if needed
    # training loop
    for epoch in range(start_epoch, max_epoch):
        train_epoch(epoch, start_step, ckpt_path)
        # val_loss=valid_epoch(epoch)
        # if torch.distributed.get_rank() == 0:  # rank 0 is the usual choice, any rank could save
            # torch.save(raw_model.state_dict(),'{}/epoch_{}.pth'.format(save_dir,epoch))
        # torch.save(raw_model.state_dict(),'{}/epoch_{}.pth'.format(save_dir,epoch))
        # The next epoch starts from its first batch; record that in the
        # checkpoint together with the next epoch index.
        start_step = 0
        if master_process:
            # Only one process writes the file, and it saves the plain module.
            save_ckpt(unoptimized_model, model_args, start_step, best_val_loss, epoch+1, ckpt_path)
    if ddp:
        destroy_process_group()