昨天,我們把建立決策樹條件設定完,那今天,我打算寫建立決策樹後半:
有了條件後,就開始建立整個決策樹-->利用遞迴放入
# Build the decision tree (first, list-based draft)
def create_Tree(data):
    """Recursively build a decision tree as nested dictionaries.

    The stopping checks (``no_data_check``, ``one_data_check``,
    ``one_feature_check``) and the split helpers (``Best_Feature``,
    ``split_Data``) are defined elsewhere.

    Args:
        data: The samples to split; passed unchanged to the helpers above.

    Returns:
        ``[None]`` when ``no_data_check`` reports empty data, a single class
        label when one of the other stopping checks fires, otherwise a dict
        of the form ``{feature: {'<v': subtree, '>v': subtree}}``.
        The ``'<v'`` / ``'>v'`` key text is kept as originally written.
    """
    # Stopping conditions from the earlier steps.
    if no_data_check(data):
        return [None]
    # Evaluate each check once and reuse its result instead of calling twice.
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    majority = one_feature_check(data)
    if majority:
        return majority[0]
    # Choose the best split and partition the data around it.
    best_feature_col, best_split_value = Best_Feature(data)
    _, leftData, _, rightData, _, best_feature_col = split_Data(
        data, best_feature_col, best_split_value
    )
    # Recurse into both halves. create_Tree takes exactly one argument, so
    # only the sub-data is passed (the draft passed an extra argument).
    left_key = '<' + str(best_split_value)
    right_key = '>' + str(best_split_value)
    myTree = {best_feature_col: {left_key: {}, right_key: {}}}
    myTree[best_feature_col][left_key] = create_Tree(leftData)
    myTree[best_feature_col][right_key] = create_Tree(rightData)
    return myTree
# Build the tree from `data` and print the resulting nested dict.
print(create_Tree(data))
這樣子就會得到結果:
{2: {'<0.4495': {1: {'<2.51': 2, '>2.51': {1: {'<9.0': 1, '>9.0': 2}}}}, '>0.4495': 1}}
但這個結果有問題:列表在刪除某個特徵欄位後,後面欄位的索引會往前移,所以樹裡的欄位編號(例如上面巢狀出現兩次的 1)已經不是原始資料的欄位編號。
因為列表沒有欄位名稱(我覺得這就是列表的極限),所以我現在要把資料放進 pandas.DataFrame,讓每個欄位都有固定的名稱:
# Put the five sample rows into a DataFrame; with no names given, the
# columns are labelled 0..4 (see the printed output below).
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)
執行後就會長這樣(這樣就有欄位名)
0 1 2 3 4
0 1 1.01 0.852 5 1.5
1 2 2.01 0.310 8 8.1
2 1 3.01 0.589 9 5.6
3 1 3.01 0.010 8 2.3
4 2 4.01 0.258 10 1.1
當然,其他函數(像是基尼函數、split_Data 等)也要跟著改寫;以下是整個修改完的程式:
import random as rd
import numpy as np
import pandas as pd
# Sample data: 5 rows x 5 columns. Column 0 is read as the class label by
# one_data_check and skipped by Best_Feature; columns 1-4 are the features.
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)
# Partition one column's values around a threshold
def split_Data_Set(data, index, value):
    """Split the values of ``data[index]`` into two lists around ``value``.

    Args:
        data: Any container where ``data[index]`` is iterable
            (a DataFrame column in this file).
        index: Key / column label to read.
        value: Threshold to compare each element against.

    Returns:
        A tuple ``(low, high)``: ``low`` holds every element for which
        ``element <= value`` is true, ``high`` holds all the others,
        both in their original order.
    """
    low, high = [], []
    buckets = (high, low)
    for element in data[index]:
        # bool indexes the tuple: True -> low, False -> high
        buckets[bool(element <= value)].append(element)
    return low, high
def Best_Feature(data):
    """Find the feature column and threshold with the lowest Gini score.

    Every column except the one labelled ``0`` is examined. Candidate
    thresholds are the midpoints between consecutive distinct sorted values
    of the column, rounded to 5 decimals. Each candidate is scored by
    passing the two halves from ``split_Data_Set`` to ``Gini_cofe``
    (defined elsewhere); the lowest score wins, and the first candidate
    seen wins ties.

    Args:
        data: DataFrame-like object exposing ``.columns`` and column access
            via ``data[label]``.

    Returns:
        ``(best_feature_col, best_split_value)``. When no candidate scores
        below the initial value of 1 (for example when every feature column
        has a single distinct value) the result is ``(-1, None)``.
    """
    # Start at 1 so that any candidate scoring below it is accepted.
    best_Gini_cofe = 1
    # -1 / None act as "no split found" markers.
    best_feature_col = -1
    best_split_value = None
    for l in data.columns:
        # Column 0 is never used as a split feature.
        if l == 0:
            continue
        sortfeats = sorted(set(data[l]))
        # A column with one distinct value yields no midpoints and is
        # skipped. The original assigned that value to a misspelled,
        # never-read variable (`splitList`), so such columns were never
        # tested; that dead assignment is removed and the skip kept.
        split_list = [
            np.round((low + high) / 2, 5)
            for low, high in zip(sortfeats, sortfeats[1:])
        ]
        # Score every candidate threshold for this column.
        for split_value in split_list:
            subdata1, subdata2 = split_Data_Set(data, l, split_value)
            new_Gini = Gini_cofe(subdata1, subdata2)
            # A smaller Gini coefficient means a better split.
            if new_Gini < best_Gini_cofe:
                best_Gini_cofe = new_Gini
                best_feature_col = l
                best_split_value = split_value
    return best_feature_col, best_split_value
# Run the search on the sample data and print the chosen column and threshold.
best_feature_col, best_split_value=Best_Feature(data)
print("最佳分割特徵為: 第",best_feature_col,"特徵")
print("最佳分割特徵數值為:",best_split_value)
import copy
def split_Data(data, best_feature_col, best_split_value):
    """Drop the chosen feature column and split the rows around a threshold.

    Args:
        data: DataFrame holding the samples.
        best_feature_col: Label of the column to split on; it is removed
            from the returned frames.
        best_split_value: Threshold; rows whose feature value is
            ``<= best_split_value`` go left, all other rows go right.

    Returns:
        A 6-tuple ``(new_data, leftData, leftDataindex, rightData,
        rightDataindex, best_feature_col)``:
        ``new_data`` is ``data`` without the split column; ``leftData`` /
        ``rightData`` are new DataFrames (fresh 0..n-1 index) built from the
        matching rows of ``new_data``; ``leftDataindex`` /
        ``rightDataindex`` list the index labels those rows had in ``data``.
    """
    # drop() returns a new frame and leaves `data` untouched, so no explicit
    # deep copy is needed beforehand.
    new_data = data.drop(columns=best_feature_col)
    leftData, rightData = [], []
    leftDataindex, rightDataindex = [], []
    # Walk the rows by position and pair each with its index label. The
    # original used the label as a position (iloc[label]), which only
    # works when the index is exactly 0..n-1.
    feature_values = data[best_feature_col]
    for pos, (label, feature_value) in enumerate(zip(data.index, feature_values)):
        # Rows are rebuilt from plain lists, as in the original; a row mixing
        # ints and floats therefore comes back as floats.
        row = new_data.iloc[pos].tolist()
        if feature_value <= best_split_value:
            leftData.append(row)
            leftDataindex.append(label)
        else:
            rightData.append(row)
            rightDataindex.append(label)
    leftData = pd.DataFrame(leftData, columns=new_data.columns)
    rightData = pd.DataFrame(rightData, columns=new_data.columns)
    return new_data, leftData, leftDataindex, rightData, rightDataindex, best_feature_col
# Split the sample data on the column/threshold found above and print each
# piece that split_Data returns.
new_data,leftData,leftDataindex,rightData,rightDataindex, best_feature_col=split_Data(data,best_feature_col,best_split_value)
print("去除特徵後資料:",new_data)
print("去除特徵後左資料:",leftData)
print("左資料在原始資料序列:",leftDataindex)
print("去除特徵後右資料:",rightData)
print("右資料在原始資料序列:",rightDataindex)
print("被去除特徵點:第",best_feature_col,"位")
# Check whether every sample belongs to the same class
def one_data_check(data):
    """Report whether all entries ``data[0][i]`` share one value.

    Args:
        data: Container indexed as ``data[0][i]`` for ``i`` in
            ``range(len(data))`` (for a DataFrame, column ``0`` at row
            label ``i``).

    Returns:
        ``[value]`` (a one-element list holding ``data[0][0]``) when every
        inspected entry equals it, otherwise ``False``.
    """
    total = len(data)
    # The first entry is the reference everything else is compared against.
    reference = data[0][0]
    matching = sum(1 for i in range(total) if data[0][i] == reference)
    if matching != total:
        return False
    return [reference]
# Check whether there is no data left
def no_data_check(data):
    """Return ``True`` when ``len(data)`` is zero, ``False`` otherwise."""
    return len(data) == 0
# All features are used up: only the class column is left
import random as rd
def one_feature_check(data):
    """Pick a class by majority vote once only the class column remains.

    Accepts a DataFrame or a list of rows. For a DataFrame the column count
    is read from ``data.shape[1]``; the original used ``len(data[0])``,
    which on a DataFrame is the number of rows, so a label-only frame was
    never recognised.

    Args:
        data: DataFrame, or a non-empty list of rows.

    Returns:
        ``False`` when the data has more than one column. Otherwise a
        one-element list holding the most frequent value of the single
        column; if several values tie for most frequent, one of them is
        chosen at random.
    """
    if hasattr(data, "iloc"):
        # DataFrame input: one column left means only the class labels remain.
        if data.shape[1] != 1:
            return False
        labels = data.iloc[:, 0].tolist()
    else:
        # List-of-rows input: the width of the first row decides.
        if len(data[0]) != 1:
            return False
        labels = [row[0] for row in data]
    # Count how often each class occurs.
    count_use = {}
    for label in labels:
        count_use[label] = count_use.get(label, 0) + 1
    # Collect every class that reaches the highest count.
    top_count = max(count_use.values())
    the_same = [label for label, count in count_use.items() if count == top_count]
    # Break ties at random.
    return rd.sample(the_same, 1)
# Demo: both classes occur twice, so the result may be [0] or [1].
print(one_feature_check([[0],[0],[1],[1]]))
# Build the decision tree
def create_Tree(data):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        data: DataFrame of samples. Column ``0`` is read as the class label
            when no further split is possible.

    Returns:
        ``[None]`` when ``no_data_check`` reports empty data; a single class
        label when ``one_data_check`` or ``one_feature_check`` fires, or
        when ``Best_Feature`` finds no split; otherwise a dict
        ``{feature: {'<v': subtree, '>v': subtree}}``. ``split_Data`` sends
        rows with ``value <= v`` to the ``'<v'`` branch; the key text is
        kept as originally written.
    """
    # Stopping conditions from the earlier steps.
    if no_data_check(data):
        return [None]
    # Call each check once: one_feature_check breaks ties at random, so a
    # second call could return a different answer than the one tested.
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    majority = one_feature_check(data)
    if majority:
        return majority[0]
    # Look for the best split.
    best_feature_col, best_split_value = Best_Feature(data)
    if best_split_value is None:
        # Best_Feature found no usable threshold; splitting on its -1
        # placeholder would fail in split_Data. Fall back to the most
        # common label in column 0 (ties resolved arbitrarily).
        labels = data[0].tolist()
        return max(set(labels), key=labels.count)
    _, leftData, _, rightData, _, best_feature_col = split_Data(
        data, best_feature_col, best_split_value
    )
    # Recurse into both halves and assemble this node.
    left_key = '<' + str(best_split_value)
    right_key = '>' + str(best_split_value)
    myTree = {best_feature_col: {left_key: {}, right_key: {}}}
    myTree[best_feature_col][left_key] = create_Tree(leftData)
    myTree[best_feature_col][right_key] = create_Tree(rightData)
    return myTree
# Build the tree from the sample DataFrame and print the nested dict.
print(create_Tree(data))
結果:
{3: {'<8.5': {1: {'<1.51': 1.0, '>1.51': {2: {'<0.16': 1.0, '>0.16': 2.0}}}}, '>8.5': {1: {'<3.51': 1.0, '>3.51': 2.0}}}}
好,今天實作部分就到這,明天就把資料去做帶入動作
男孩沿著森林小徑,朝著歌聲方向前進,一路上除了歌聲之外,森林原本的聲音都不見了,沒有鳥叫的聲音,也沒有樹枝摩擦的聲音,但男孩忽略了這種異常,繼續朝著歌聲前進,走了一段時間後,他看見一棟小木屋,歌聲似乎是從裡面傳了出來,男孩想從窗外窺視裡面,但裡面被窗簾遮得死死的,於是男孩走到門前,敲了敲門,並說:有人在嗎?
--|我看著你,你卻看不到我|-- MS.CM