1. Compress the fine-tuned model and get it running on Ollama
First, install the required packages
!pip install transformers gradio
!pip install peft
Import the required packages
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os
from huggingface_hub import login
Log in to Hugging Face with your token
# Replace with your Hugging Face API token
my_read_token = "replace with your read token"
os.environ["HUGGINGFACE_TOKEN"] = my_read_token
# Log in to Hugging Face
login(token=os.environ["HUGGINGFACE_TOKEN"])
Load the base model and the PEFT weights
model_name = "taide/TAIDE-LX-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Load the PEFT adapter on top of the base model
peft_name = "mark1098/TAIDE-LX-7B-Chat-Medical-Fintune"
model = PeftModel.from_pretrained(model, peft_name)
model = model.to('cuda')
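Optionally, a quick sanity check confirms the adapter loaded before building the UI. A minimal sketch, using one sample medical question:
# Optional sanity check: run one sample prompt through the adapted model
test_inputs = tokenizer("每天肚子痛是啥狀況?", return_tensors="pt").to("cuda")
with torch.no_grad():
    test_ids = model.generate(**test_inputs, max_new_tokens=50)
print(tokenizer.decode(test_ids[0], skip_special_tokens=True))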
Set up Gradio
import gradio as gr
import re
# Define a text-generation function for the chatbot
def chat_with_model(input_text):
    messages = [
        {"role": "system", "content": "你是一位專業的醫療人員,請用心且專業的以三到五句話回答問題。"},
        {"role": "user", "content": input_text}
    ]

    # Merge the messages into one text string (for inspection/debugging only)
    def format_messages(messages):
        formatted_messages = ""
        for message in messages:
            if message['role'] == 'system':
                formatted_messages += f"[SYSTEM] {message['content']}\n"
            elif message['role'] == 'user':
                formatted_messages += f"[USER] {message['content']}\n"
        return formatted_messages

    formatted_text = format_messages(messages)
    print("Formatted text:", formatted_text)

    # Build the actual prompt with the model's chat template
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    print("Generated text with chat template:", text)
    model_inputs = tokenizer([text], return_tensors="pt").to('cuda')
    print("Model inputs:", model_inputs)  # print model_inputs for inspection

    # Generate text
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=90,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        attention_mask=model_inputs.attention_mask,
        repetition_penalty=1.6,  # raised repetition_penalty to curb loops
        top_k=50,                # sample only from the 50 most likely tokens
        do_sample=True,
        top_p=0.15,              # nucleus sampling threshold
        temperature=0.15,        # low temperature for conservative answers
    )
    # Keep only the newly generated tokens
    generated_ids = generated_ids[:, model_inputs.input_ids.shape[-1]:]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Clean up the generated text. NOTE: several regex bodies were lost in
    # formatting; the patterns below are reconstructions based on the original
    # comments — adjust them to the artifacts your model actually emits.
    response = re.sub(r'\[.*?\]', '', response)    # strip bracketed content
    response = re.sub(r'</?[^>]+>', '', response)  # strip HTML tags
    response = re.sub(r'dress|dressing', '', response, flags=re.IGNORECASE)  # strip "dress"/"dressing"
    response = re.sub(r'<<.*?>>', '', response)    # strip `<<...>>` tags
    response = re.sub(r'\[/EMBB\]\]', '', response)  # strip `[/EMBB]]` tags
    response = response.strip()  # trim leading/trailing whitespace

    # Find the last sentence-ending mark (。 or !) and drop everything after it
    def remove_after_last_period(text):
        last_period_index = max(text.rfind('。'), text.rfind('!'))
        if last_period_index != -1:
            return text[:last_period_index + 1]
        return text

    response = remove_after_last_period(response).strip()
    return response
# Create the Gradio interface
iface = gr.Interface(fn=chat_with_model, inputs="text", outputs="text", title="Medical Chatbot", description="A medical chatbot built on the TAIDE-LX-7B-Chat-Medical-Fintune model")
# Launch the interface
iface.launch()
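If the inline view does not render (as happens in some Colab setups), iface.launch(share=True) produces a temporary public URL instead.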
Merge the adapter into the base model and save
model = model.merge_and_unload()
save_directory = "./full_model_with_peft"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
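As an optional check (a minimal sketch), the merged checkpoint should now load standalone, without the PEFT wrapper:
# Optional: verify the merged model loads without PEFT
merged_model = AutoModelForCausalLM.from_pretrained(save_directory)
merged_tokenizer = AutoTokenizer.from_pretrained(save_directory)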
Install and build llama.cpp (note: recent llama.cpp checkouts have moved from make to CMake; if !make fails, try cmake -B build && cmake --build build, in which case the quantize binary lives at build/bin/llama-quantize)
!git clone https://github.com/ggerganov/llama.cpp.git
%cd llama.cpp
!make
Convert to GGUF
!python convert_hf_to_gguf.py /content/full_model_with_peft \
--outfile /content/TAIDE-LX-7B-Chat-Medical-Fintune.gguf \
--outtype f16
Quantize to Q5_K_M
!./llama-quantize /content/TAIDE-LX-7B-Chat-Medical-Fintune.gguf /content/TAIDE-LX-7B-Chat-Medical-Fintune-q5_k_m.gguf q5_k_m
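A quick check that the quantized file was produced and shrank as expected (a 7B model at Q5_K_M usually lands around 4–5 GB, versus roughly 13 GB at f16):
!ls -lh /content/*.gguf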
Upload to the Hugging Face Hub
from huggingface_hub import HfApi
import os
api = HfApi()
HF_ACCESS_TOKEN = "replace with your write token"
model_id = "mark1098/TAIDE-LX-7B-Chat-Medical-Fintune.gguf"
api.create_repo(
    model_id,
    exist_ok=True,
    repo_type="model",  # the repo holds a model
    token=HF_ACCESS_TOKEN,
)
# Upload every .gguf file in the folder to the hub
os.chdir("/content")  # the .gguf files were written to /content above
for file in os.listdir():
    if file.endswith(".gguf"):
        model_name = file.lower()
        api.upload_file(
            repo_id=model_id,
            path_in_repo=model_name,
            path_or_fileobj=f"{os.getcwd()}/{file}",
            repo_type="model",  # uploading a model file
            token=HF_ACCESS_TOKEN)
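Finally, the "run it on Ollama" part of this section: with Ollama installed on the target machine and the quantized GGUF in the working directory, a minimal Modelfile is enough. A sketch under those assumptions (the model name taide-medical is an arbitrary choice):
%%writefile Modelfile
FROM ./TAIDE-LX-7B-Chat-Medical-Fintune-q5_k_m.gguf
SYSTEM 你是一位專業的醫療人員,請用心且專業的以三到五句話回答問題。
PARAMETER temperature 0.15

Then create and run the model:
!ollama create taide-medical -f Modelfile
!ollama run taide-medical "每天肚子痛是啥狀況?"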
2. Fine-tuning with the Breeze model
The only real change is that the model name passed to the model and tokenizer loaders becomes MediaTek-Research/Breeze-7B-Instruct-v1_0; nothing else is modified, as the sketch below shows.
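Concretely, only the loading step changes (a minimal sketch; a PEFT adapter trained on Breeze would be loaded on top in the same way):
# Swap the base model; the rest of the pipeline is identical
model_name = "MediaTek-Research/Breeze-7B-Instruct-v1_0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)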
3. Fine-tuning with LLaMA Factory
This is essentially the example code pulled from the official repo, with programmatic changes that fetch a Traditional Chinese dataset and add it to the web GUI dropdown (a sketch follows); the details are easy to spot in the code.
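For reference, LLaMA Factory reads its dataset list from data/dataset_info.json, and the web GUI dropdown is populated from that file. A hedged sketch of such a programmatic addition (the dataset name, file name, and column mapping are placeholders, not the ones used in this project):
import json

# Register a Traditional Chinese dataset so it appears in the web GUI dropdown
info_path = "LLaMA-Factory/data/dataset_info.json"
with open(info_path, "r", encoding="utf-8") as f:
    dataset_info = json.load(f)

dataset_info["zh_tw_medical"] = {       # placeholder dataset name
    "file_name": "zh_tw_medical.json",  # put the data file under LLaMA-Factory/data/
    "columns": {"prompt": "instruction", "query": "input", "response": "output"},
}

with open(info_path, "w", encoding="utf-8") as f:
    json.dump(dataset_info, f, ensure_ascii=False, indent=2)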
The figure below shows a rough example.