{
"instruction": "法国的首都是什么?",
"input": "",
"output": "法国的首都是巴黎。"
}
{
"instruction": "将给定的数字按升序排列。",
"input": "输入:2、4、0、8、3",
"output": "输出:0、2、3、4、8"
}
在 SFT 階段,每一筆訓練樣本都分成兩個部分,prompt 與 answer:
prompt = instruction + input
answer = output
def sft_process(jsonpath_list, outcsvpath, *, min_prompt_len=10,
                min_answer_len=5, max_len=256):
    """Convert JSON-Lines SFT records into a two-column prompt/answer CSV.

    Each non-blank line of every input file is parsed as one JSON object
    with the keys ``instruction``, ``input`` and ``output``. The prompt is
    ``instruction + input`` and the answer is ``output``. Pairs whose
    prompt or answer falls outside the length limits are dropped.

    Args:
        jsonpath_list: Iterable of paths to JSON-Lines files.
        outcsvpath: Destination path of the CSV file (columns ``prompt``
            and ``answer``, no index column).
        min_prompt_len: Prompts shorter than this many characters are
            skipped.
        min_answer_len: Answers shorter than this many characters are
            skipped.
        max_len: Prompts or answers longer than this many characters are
            skipped.

    Raises:
        KeyError: If a record lacks ``instruction`` or ``output``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    prompts = []
    answers = []
    for jsonpath in jsonpath_list:
        # Explicit UTF-8 so decoding does not depend on the platform
        # locale; the context manager closes the handle even on error.
        with open(jsonpath, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not records.
                if not line.strip():
                    continue
                per = json.loads(line)
                # "input" is optional extra context; treat a missing or
                # null value as an empty string.
                prompt = per['instruction'] + (per.get('input') or '')
                answer = per['output']
                if len(prompt) < min_prompt_len or len(answer) < min_answer_len:
                    continue
                if len(prompt) > max_len or len(answer) > max_len:
                    continue
                prompts.append(prompt)
                answers.append(answer)
    df = pd.DataFrame(
        {'prompt': prompts, 'answer': answers},
        columns=['prompt', 'answer'],
    )
    df.to_csv(outcsvpath, index=False)
    print(df)