
2024 iThome 鐵人賽

DAY 11
Generative AI

LLM and Generative AI Notes series, part 11

Day 11: Fine-tuning a Medical Q&A Chatbot on a Dataset

The finished content comes from: 1. Medical_chat

  1. Set up the environment
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install peft
!pip install pandas pyarrow
!pip install -U bitsandbytes
!pip install transformers datasets
!apt-get install wget
  2. Import the required packages
import bitsandbytes
import gc
import os
import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive, output
from huggingface_hub import login, HfApi
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig, AutoModelForSequenceClassification, AdamW,  get_linear_schedule_with_warmup
  3. Download the dataset

Remember to apply for access on the dataset's Hugging Face page first.

!wget https://huggingface.co/datasets/ChenWeiLi/Medtext_zhtw/raw/main/MedText_zhtw.json
  4. Log in to Hugging Face
# Replace with your own Hugging Face API token
my_read_token = "replace with your own read token"
os.environ["HUGGINGFACE_TOKEN"] = my_read_token
# Log in to Hugging Face
login(token=os.environ["HUGGINGFACE_TOKEN"])
  5. Process the dataset
# Load the JSON file into a pandas DataFrame and convert it to a Hugging Face Dataset
df = pd.read_json('/content/MedText_zhtw.json')
ds = Dataset.from_pandas(df)
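The preprocessing in step 7 assumes the dataset exposes instruction, input, and output columns; a quick look at the schema and the first record (my own addition) can confirm this before tokenizing.

# Sanity check: show the column names, row count, and one raw record
print(ds)
print(ds[0])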
  6. Initialize the tokenizer (remember to apply for access to the model first)
model_name = "taide/TAIDE-LX-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, token=my_read_token)
  7. Tokenize the data and build the training labels
def process_func(example):
    MAX_LENGTH = 384  # the Llama tokenizer may split one Chinese character into several tokens, so allow some extra length to keep the data intact
    input_ids, attention_mask, labels = [], [], []

    # Build and tokenize the prompt (instruction + input) and the response
    instruction = tokenizer(f"user\n\n{example['instruction'] + example['input']}assistant\n\n", add_special_tokens=False)
    response = tokenizer(f"{example['output']}", add_special_tokens=False)

    # Concatenate the prompt and response token IDs and attention masks
    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]  # mask the prompt tokens so the loss is only computed on the response

    # Truncate if the sequence exceeds the maximum length
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]

    # Keep attention_mask and labels the same length as input_ids
    attention_mask = attention_mask[:len(input_ids)]
    labels = labels[:len(input_ids)]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Apply the preprocessing to the whole dataset
tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)

# Sanity check: decode one example and its (unmasked) labels
print(tokenized_ds)
print(tokenizer.decode(tokenized_ds[0]['input_ids']))
print(tokenizer.decode([token for token in tokenized_ds[0]["labels"] if token != -100]))
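It may also be worth checking how many examples actually hit the MAX_LENGTH = 384 cap, since truncation cuts off the end of the answer. A small sketch (my own addition):

# Count how many tokenized examples reach the 384-token cap
lengths = [len(example["input_ids"]) for example in tokenized_ds]
print(f"max: {max(lengths)}, mean: {sum(lengths) / len(lengths):.1f}, truncated: {sum(l >= 384 for l in lengths)}")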

  8. Create the model

# Load the pre-trained model (same checkpoint as the tokenizer above)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
# Alternatively, resume from a saved checkpoint, e.g.:
# model_checkpoint = '/content/drive/MyDrive/colab_results/checkpoint-38500'  # change to your latest checkpoint path
# model = AutoModelForCausalLM.from_pretrained(model_checkpoint)


# Set pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# Set eos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
# Inspect the model architecture
model

model.enable_input_require_grads()  # required when gradient checkpointing is enabled
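One caveat that is not in the original notebook: if the TAIDE tokenizer happens to ship without a pad token, tokenizer.pad_token_id above would be None and padding in the data collator would fail. A common fallback is to reuse the eos token:

# Hedge (assumption): fall back to the eos token if no pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id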

  9. Apply LoRA

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA paper for how it scales the update
    lora_dropout=0.1  # dropout rate
)
model = get_peft_model(model, config)
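It is worth confirming how small the trainable LoRA slice is compared with the full 7B model; PEFT provides a helper for this:

# Prints trainable vs. total parameter counts and the trainable percentage
# (with r=8 on these projection modules the trainable share is typically well under 1%)
model.print_trainable_parameters()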

  10. Configure the training arguments

torch.utils.checkpoint.use_reentrant = False
# TODO: switch to automatic hyperparameter tuning with Optuna later (see the sketch after the Trainer below)
# Training arguments
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/colab_results",
    save_steps=50,  # save a checkpoint every 50 steps
    logging_dir='/content/drive/MyDrive/colab_logs',
    per_device_train_batch_size=64,  # per-device batch size
    gradient_accumulation_steps=4,  # gradient accumulation: effective batch size is 64 x 4 = 256 examples per optimizer step
    logging_steps=10,
    num_train_epochs=40,
    learning_rate=5e-5,
    save_strategy="steps",  # save every save_steps steps
    save_total_limit=5,  # keep at most 5 checkpoints
    gradient_checkpointing=True,
    weight_decay=0.01,
)

# Optimizer
optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

# Learning-rate scheduler
num_training_steps = len(tokenized_ds) // (args.per_device_train_batch_size * args.gradient_accumulation_steps) * args.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # warm-up steps, set to 10% of the total steps
    num_training_steps=num_training_steps
)
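For a sense of scale (illustrative numbers only, since the real count depends on the size of MedText_zhtw): with per_device_train_batch_size=64 and gradient_accumulation_steps=4, each optimizer step consumes 256 examples, so a hypothetical 10,000-example dataset would give 10000 // 256 = 39 steps per epoch, 39 * 40 = 1,560 total steps, and about 156 warm-up steps.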

# Create the Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    optimizers=(optimizer, lr_scheduler),  # pass in the custom optimizer and LR scheduler
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    tokenizer=tokenizer
)
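The TODO above mentions switching to Optuna for hyperparameter tuning. A rough sketch of how that could look with the Trainer's built-in hyperparameter_search; the search space, trial count, and 10% eval split are my own illustrative choices, and optuna has to be installed separately.

# Rough sketch only: hyperparameter search with Optuna (assumes `pip install optuna`)
def model_init():
    # hyperparameter_search() re-creates the model for every trial, so it needs a factory function
    base = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
    return get_peft_model(base, config)

def optuna_hp_space(trial):
    # hypothetical search space
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 6),
    }

split = tokenized_ds.train_test_split(test_size=0.1)  # hold out a small eval set for the objective
search_trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    tokenizer=tokenizer,
)
best_run = search_trainer.hyperparameter_search(
    direction="minimize", backend="optuna", hp_space=optuna_hp_space, n_trials=5
)
print(best_run)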

  11. Train
trainer.train()

# To resume training from a checkpoint:
#def forward_with_checkpoint(*args, **kwargs):
#    return torch.utils.checkpoint.checkpoint(trainer.training_step, *args, use_reentrant=False, **kwargs)

#trainer.training_step = forward_with_checkpoint
#trainer.train(resume_from_checkpoint=True)
  12. Inference
gc.collect()
torch.cuda.empty_cache()
prompt = "每天只睡三小時會有啥狀況?"
messages = [
    {"role": "system", "content": "你是一位專業的醫療人員,請用心且專業的以三到五句話回答問題。"},
    {"role": "user", "content": prompt}
]

# Merge the messages into a single text string (for inspection only)
def format_messages(messages):
    formatted_messages = ""
    for message in messages:
        if message['role'] == 'system':
            formatted_messages += f"[SYSTEM] {message['content']}\n"
        elif message['role'] == 'user':
            formatted_messages += f"[USER] {message['content']}\n"
    return formatted_messages

# Format the messages
formatted_text = format_messages(messages)
print("Formatted text:", formatted_text)

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
print("Generated text with chat template:", text)


model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

# Print model_inputs for inspection
print("Model inputs:", model_inputs)

# Generate text
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=90,
    #eos_token_id=tokenizer.encode('<|eot_id|>')[0],
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    attention_mask=model_inputs.attention_mask,
    repetition_penalty=1.6,  # penalize repetition
    top_k=50,  # sample only from the top-k most likely tokens
    # stop_token=tokenizer.eos_token,  # intended to stop at the end-of-sequence marker (left commented out)
    do_sample=True,
    top_p=0.15,  # nucleus sampling threshold
    temperature=0.15,  # sampling temperature
    #forced_eos_token_id=tokenizer.encode('</s>')[0]
)

# Print generated_ids for inspection
print("Generated IDs:", generated_ids)
# Strip the input tokens, keeping only the newly generated portion
generated_ids = generated_ids[:, model_inputs.input_ids.shape[-1]:]


response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)


# Clean up the generated text
import re
response = re.sub(r'</s>.*', '', response, flags=re.DOTALL)  # drop everything from the first </s> onward
response = re.sub(r'\[.*?\]', '', response)  # remove bracketed tags such as [INST] or [/EMBB]
response = re.sub(r'</?[^>]+>', '', response)  # remove HTML-style tags
response = re.sub(r'dress|dressing', '', response, flags=re.IGNORECASE)  # remove stray "dress"/"dressing" fragments
response = re.sub(r'<<.*?>>', '', response)  # remove <<SYS>>-style tags
response = response.strip()  # strip leading and trailing whitespace
# Find the last full stop or exclamation mark and drop anything after it
def remove_after_last_period(text):
    last_period_index = max(text.rfind('。'), text.rfind('!'))
    if last_period_index != -1:
        return text[:last_period_index + 1]
    return text

# Trim the generated text to the last complete sentence
response = remove_after_last_period(response).strip()

print(response)

The generated output looks like this:

Formatted text: [SYSTEM] 你是一位專業的醫療人員,請用心且專業的以三到五句話回答問題。
[USER] 每天只睡三小時會有啥狀況?

Generated text with chat template: <s>[INST] <<SYS>>
你是一位專業的醫療人員,請用心且專業的以三到五句話回答問題。
<</SYS>>

每天只睡三小時會有啥狀況? [/INST]
Model inputs: {'input_ids': tensor([[    1,     1, 29961, 25580, 29962,  3532, 14816, 29903,  6778,    13,
         33013, 32052, 37319, 52781, 32701, 30214, 50772, 44775, 32350, 37319,
         30651, 30457, 30780, 30904, 34694, 35616, 35211, 30267,    13, 29966,
           829, 14816, 29903,  6778,    13,    13, 42265, 31557, 45710, 30457,
         37374, 40939, 35236, 44124, 30882,   518, 29914, 25580, 29962]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')}
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:91: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
  warnings.warn(
Generated IDs: tensor([[    1,     1, 29961, 25580, 29962,  3532, 14816, 29903,  6778,    13,
         33013, 32052, 37319, 52781, 32701, 30214, 50772, 44775, 32350, 37319,
         30651, 30457, 30780, 30904, 34694, 35616, 35211, 30267,    13, 29966,
           829, 14816, 29903,  6778,    13,    13, 42265, 31557, 45710, 30457,
         37374, 40939, 35236, 44124, 30882,   518, 29914, 25580, 29962, 29871,
         45711, 32336, 31411, 37362, 50691, 30503, 38796, 53952, 30330, 42602,
         30413, 53999, 32827, 33813, 33388, 38424, 30419, 33084, 55042, 36182,
         30409, 31184, 35560, 38367, 53628, 33389, 38792, 30267, 36557, 39567,
         37016, 31391, 37334, 44739, 52244, 52819, 35647, 45394, 34113, 54098,
         52270, 50614, 32373, 34744, 51949, 50584, 47240, 38513, 37054, 39228,
         37318, 32703, 52283, 42541, 50811, 30267, 43711, 33327, 42600, 45322,
         42263, 30210, 47976, 31074, 42400, 32262, 31608, 40973, 34725, 38454,
         41633, 30898, 44951, 30275, 38693, 32377, 30505, 45835, 40684, 37105,
         32854, 31325, 43692, 33647, 32887, 45220, 34747, 38908, 44950]],
       device='cuda:0')
睡眠不足會導致認知和情緒障礙、注意力不集中以及其他健康後果(例如體重增加)等嚴重影響長期健康的情況。如果持續存在或對生活造成重大困擾的話則需要進一步評估並可能轉診給心理學家或其他專業人士進行治療諮詢。然而值得注意的是每個人的耐力水平不同;有些人可以從極度疲勞中恢復並且在短時間內完成任務而無需充分休息的人可能會感到疲
睡眠不足會導致認知和情緒障礙、注意力不集中以及其他健康後果(例如體重增加)等嚴重影響長期健康的情況。如果持續存在或對生活造成重大困擾的話則需要進一步評估並可能轉診給心理學家或其他專業人士進行治療諮詢。
  13. Save the model
model.save_pretrained("/hfmodel")
tokenizer.save_pretrained("/hftokenizer")
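Only the LoRA adapter weights are saved here. One option before uploading is to merge the adapter back into the base model so users do not need PEFT at inference time; a minimal sketch using the already-imported PeftModel (the /hfmodel_merged path is just a placeholder):

# Reload the base model, fold the LoRA weights into it, and save the merged model
base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, "/hfmodel").merge_and_unload()
merged.save_pretrained("/hfmodel_merged")
tokenizer.save_pretrained("/hfmodel_merged")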
  14. Upload to the Hub
os.environ["HUGGINGFACE_UPLOAD_TOKEN"] = "replace with your write token"
# Log in to Hugging Face with a write token
login(token=os.environ["HUGGINGFACE_UPLOAD_TOKEN"])

api = HfApi()
api.upload_folder(
    folder_path="/hfmodel",  # local directory the model was saved to
    path_in_repo="",  # empty string uploads the files to the repo root
    repo_id="mark1098/TAIDE-LX-7B-Chat-Medical-Fintune",  # model repo name on Hugging Face
    repo_type="model"  # this repo holds a model
)
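The tokenizer saved to /hftokenizer is not part of this upload; it can be pushed to the same repo in the same way:

api.upload_folder(
    folder_path="/hftokenizer",  # local directory the tokenizer was saved to
    path_in_repo="",
    repo_id="mark1098/TAIDE-LX-7B-Chat-Medical-Fintune",
    repo_type="model"
)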

What comes after this is the Gradio demo and quantization.
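As a small preview of the quantization step (purely illustrative, not the final setup), the uploaded adapter could be loaded on top of the base model in 4-bit with bitsandbytes:

from transformers import BitsAndBytesConfig

# Illustrative sketch: load the base model in 4-bit NF4, then attach the fine-tuned adapter
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_4bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
model_4bit = PeftModel.from_pretrained(base_4bit, "mark1098/TAIDE-LX-7B-Chat-Medical-Fintune")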

