iT邦幫忙

2021 iThome 鐵人賽

DAY 7
0

昨天,我們把建立決策樹條件設定完,那今天,我打算寫建立決策樹後半:
有了條件後,就開始建立整個決策樹-->利用遞迴放入

# Build the decision tree
def create_Tree(data):
    """Recursively build a decision tree and return it as a nested dict.

    Args:
        data: The samples to split, in whatever format the helper checks
            (no_data_check, one_data_check, one_feature_check, Best_Feature,
            split_Data) expect.

    Returns:
        ``[None]`` when ``data`` is empty, a single class label when one of
        the terminal checks succeeds, otherwise a dict of the form
        ``{feature: {'<split': subtree, '>split': subtree}}``.
    """
    # Terminal conditions defined earlier; each is evaluated only once.
    if no_data_check(data):
        return [None]
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    last_feature = one_feature_check(data)
    if last_feature:
        return last_feature[0]

    # Pick the best split point, then partition the samples around it.
    best_feature_col, best_split_value = Best_Feature(data)
    (new_data, leftData, leftDataindex, rightData, rightDataindex,
     best_feature_col) = split_Data(data, best_feature_col, best_split_value)

    # Build this node and recurse into both partitions.
    # create_Tree takes a single argument, so only the partition is passed.
    lower_key = '<' + str(best_split_value)
    upper_key = '>' + str(best_split_value)
    myTree = {best_feature_col: {lower_key: {}, upper_key: {}}}
    myTree[best_feature_col][lower_key] = create_Tree(leftData)
    myTree[best_feature_col][upper_key] = create_Tree(rightData)
    return myTree
# Build the tree from `data` and print the result.
print(create_Tree(data))

這樣子就會得到結果:

{2: {'<0.4495': {1: {'<2.51': 2, '>2.51': {1: {'<9.0': 1, '>9.0': 2}}}}, '>0.4495': 1}}

但這樣的結果會有問題,因為列表在刪除某個元素(特徵)之後,後面元素的位置會往前移,原本的欄位編號就對不上了
因為列表是沒有欄位名(我覺得這就是列表極限),所以我現在要把資料帶入pandas.DataFrame裡,並命名欄位名稱:

# Load the five sample rows into a DataFrame; no column names are passed here.
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)

執行後就會長這樣(這樣就有欄位名)

   0     1      2   3    4
0  1  1.01  0.852   5  1.5
1  2  2.01  0.310   8  8.1
2  1  3.01  0.589   9  5.6
3  1  3.01  0.010   8  2.3
4  2  4.01  0.258  10  1.1

當然,其他函數(像是基尼函數、split_Data 等)也要跟著改寫,以下是整個修改完的程式:

import random as rd
import numpy as np
import pandas as pd
# Sample data: 5 rows with 5 values each
data=pd.DataFrame([[1,1.01,0.852,5,1.5],[2,2.01,0.31,8,8.1],[1,3.01,0.589,9,5.6],[1,3.01,0.01,8,2.3],[2,4.01,0.258,10,1.1]])
print(data)
#劃分方式
def split_Data_Set(data, index, value):
    """Partition the items of ``data[index]`` around ``value``.

    Args:
        data: Any container supporting ``data[index]`` whose result is
            iterable (e.g. a DataFrame indexed by column label).
        index: Key used to select the items to partition.
        value: Threshold to compare each item against.

    Returns:
        A tuple ``(data1, data2)`` of lists: items ``<= value`` and the
        remaining items, each in their original order.
    """
    at_or_below, above = [], []
    for item in data[index]:
        # Anything that is not <= value goes to the second list.
        bucket = at_or_below if item <= value else above
        bucket.append(item)
    return at_or_below, above
def Best_Feature(data):
    """Find the column and threshold whose split gives the lowest Gini score.

    Every column of ``data`` except column ``0`` is examined (column 0 is
    presumably the class label -- confirm against the data layout).
    Candidate thresholds are the midpoints between consecutive distinct
    values of the column, rounded to 5 decimals. Each candidate is scored
    with ``Gini_cofe`` (defined elsewhere) on the two lists returned by
    ``split_Data_Set``.

    Args:
        data: A DataFrame-like object exposing ``columns`` and ``data[col]``.

    Returns:
        ``(best_feature_col, best_split_value)``. When no candidate scores
        below the initial value of 1, this is ``(-1, None)``.
    """
    # 1 is used as the starting (worst) score; lower is treated as better.
    best_Gini_cofe = 1
    # Sentinel values meaning "no split found yet".
    best_feature_col = -1
    best_split_value = None

    for column in data.columns:
        if column == 0:
            continue
        distinct_values = sorted(set(data[column]))

        # Midpoints between neighbouring distinct values. A column with a
        # single distinct value yields no candidates and is skipped; the
        # original reached the same result through a misspelled assignment.
        split_list = [
            np.round((low + high) / 2, 5)
            for low, high in zip(distinct_values, distinct_values[1:])
        ]

        # Score every candidate threshold for this column.
        for split_value in split_list:
            subdata1, subdata2 = split_Data_Set(data, column, split_value)
            new_Gini = Gini_cofe(subdata1, subdata2)

            # Strict "<" keeps the first candidate on ties.
            if new_Gini < best_Gini_cofe:
                best_Gini_cofe = new_Gini
                best_feature_col = column
                best_split_value = split_value

    return best_feature_col, best_split_value
# Run the search on the sample data and report the chosen column and threshold.
best_feature_col, best_split_value=Best_Feature(data)    
print("最佳分割特徵為: 第",best_feature_col,"特徵")
print("最佳分割特徵數值為:",best_split_value) 
import copy
def split_Data(data, best_feature_col, best_split_value):
    """Drop the split column and partition the rows around the threshold.

    Args:
        data: DataFrame holding the samples; it is not modified.
        best_feature_col: Label of the column to split on (and to drop).
        best_split_value: Threshold; rows whose value in the split column is
            ``<=`` the threshold go left, all others go right.

    Returns:
        A tuple ``(new_data, leftData, leftDataindex, rightData,
        rightDataindex, best_feature_col)`` where ``new_data`` is ``data``
        without the split column, ``leftData``/``rightData`` are new
        DataFrames (fresh 0..n-1 index, same columns as ``new_data``) and the
        ``*index`` lists hold the index labels the rows had in ``data``.
    """
    # drop() already returns a new frame, so no explicit deep copy is needed.
    new_data = data.drop(columns=best_feature_col)

    leftData, rightData = [], []
    leftDataindex, rightDataindex = [], []

    # Walk the split column by position so that the positional row lookup
    # below stays correct even when the index is not 0..n-1.
    for position, (label, feature_value) in enumerate(data[best_feature_col].items()):
        row = new_data.iloc[position].tolist()
        if feature_value <= best_split_value:
            leftData.append(row)
            leftDataindex.append(label)
        else:
            rightData.append(row)
            rightDataindex.append(label)

    leftData = pd.DataFrame(leftData, columns=new_data.columns)
    rightData = pd.DataFrame(rightData, columns=new_data.columns)
    return new_data, leftData, leftDataindex, rightData, rightDataindex, best_feature_col
# Split the sample data on the column/threshold found above and print each part.
new_data,leftData,leftDataindex,rightData,rightDataindex, best_feature_col=split_Data(data,best_feature_col,best_split_value)
print("去除特徵後資料:",new_data)
print("去除特徵後左資料:",leftData)
print("左資料在原始資料序列:",leftDataindex)
print("去除特徵後右資料:",rightData)
print("右資料在原始資料序列:",rightDataindex)
print("被去除特徵點:第",best_feature_col,"位")

#確認是否為同一類別
def one_data_check(data):
    """Check whether the first ``len(data)`` entries of ``data[0]`` are equal.

    ``data[0]`` is selected first and then indexed by ``0 .. len(data)-1``;
    for a DataFrame with default labels this walks every row of column 0.

    Args:
        data: Container supporting ``data[0][i]`` and ``len(data)``.

    Returns:
        ``[value]`` (a one-element list holding the shared value) when all
        inspected entries equal the first one, otherwise ``False``.
    """
    first_series = data[0]
    # The first entry is the reference every other entry is compared to.
    reference = first_series[0]
    total = len(data)
    matching = sum(1 for pos in range(total) if first_series[pos] == reference)
    return [reference] if matching == total else False
#確認是否為空
def no_data_check(data):
    """Return ``True`` when ``data`` has length zero, ``False`` otherwise."""
    # len() is used rather than truthiness so DataFrames are accepted too.
    return len(data) == 0
#類別都分類完
import random as rd
def one_feature_check(data):
    """Return the majority value when only a single column is left.

    Accepts both a list of rows and a DataFrame. The original tested
    ``len(data[0])``, which for a DataFrame is the number of rows in column
    0 rather than the number of columns, so the check misfired on
    DataFrames; the column count is now read from ``shape`` in that case.

    Args:
        data: A list of rows (each row a sequence) or a DataFrame.

    Returns:
        A one-element list with the most frequent value of the single
        remaining column (chosen at random among ties), or ``False`` when
        ``data`` is empty or has more than one column.
    """
    if len(data) == 0:
        return False

    if hasattr(data, "iloc"):
        # DataFrame-like input: the column count comes from the shape.
        if data.shape[1] != 1:
            return False
        labels = data.iloc[:, 0].tolist()
    else:
        # List of rows: the first row tells how many columns there are.
        if len(data[0]) != 1:
            return False
        labels = [row[0] for row in data]

    # Count how often each value occurs.
    count_use = {}
    for label in labels:
        count_use[label] = count_use.get(label, 0) + 1

    # Collect every value that reaches the highest count, then pick one of
    # them at random so ties are not always resolved the same way.
    top_count = max(count_use.values())
    the_same = [label for label, count in count_use.items() if count == top_count]
    return rd.sample(the_same, 1)
# Demo: with two 0s and two 1s the result may be either [0] or [1]
print(one_feature_check([[0],[0],[1],[1]]))
# Build the decision tree
def create_Tree(data):
    """Recursively build a decision tree and return it as a nested dict.

    Args:
        data: DataFrame of samples, passed unchanged to the helper checks,
            ``Best_Feature`` and ``split_Data``.

    Returns:
        ``[None]`` when ``data`` is empty, a single value when
        ``one_data_check`` or ``one_feature_check`` succeeds, otherwise a
        dict ``{feature: {'<split': subtree, '>split': subtree}}``.
    """
    # Terminal conditions. Each check runs once and its result is reused, so
    # the random tie-break in one_feature_check is not drawn twice.
    if no_data_check(data):
        return [None]
    same_class = one_data_check(data)
    if same_class:
        return same_class[0]
    last_feature = one_feature_check(data)
    if last_feature:
        return last_feature[0]

    # Choose the split.
    # NOTE(review): Best_Feature returns (-1, None) when it finds no
    # candidate; that case is not handled here.
    best_feature_col, best_split_value = Best_Feature(data)

    (new_data, leftData, leftDataindex, rightData, rightDataindex,
     best_feature_col) = split_Data(data, best_feature_col, best_split_value)

    # Build this node and recurse into both partitions.
    lower_key = '<' + str(best_split_value)
    upper_key = '>' + str(best_split_value)
    myTree = {best_feature_col: {lower_key: {}, upper_key: {}}}
    myTree[best_feature_col][lower_key] = create_Tree(leftData)
    myTree[best_feature_col][upper_key] = create_Tree(rightData)
    return myTree
# Build the tree from the sample DataFrame and print the result.
print(create_Tree(data))

結果:

{3: {'<8.5': {1: {'<1.51': 1.0, '>1.51': {2: {'<0.16': 1.0, '>0.16': 2.0}}}}, '>8.5': {1: {'<3.51': 1.0, '>3.51': 2.0}}}}

好,今天實作部分就到這,明天就把資料去做帶入動作

男孩沿著森林小徑,朝著歌聲方向前進,一路上除了歌聲之外,森林原本的聲音都不見了,沒有鳥叫的聲音,也沒有樹枝摩擦的聲音,但男孩忽略了這種異常,繼續朝著歌聲前進,走了一段時間後,他看見一棟小木屋,歌聲似乎是從裡面傳了出來,男孩想從窗外窺視裡面,但裡面被窗簾遮得死死的,於是男孩走到門前,敲了敲門,並說:有人在嗎?
				--|我看著你,你卻看不到我|--     MS.CM

上一篇
DAY06隨機森林演算法(續3)
下一篇
DAY08隨機森林演算法(續6)
系列文
數據分析方法研究和理解演算法30
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言