[2020鐵人賽Day9]邂逅PHP Machine Learning-用KNearestNeighbors (KNN) 來定位 - 訓練樣本與測試樣本

第 11 屆 iThome 鐵人賽

DAY 9

AI & Data

學習PHP Machine Learning的冒險歷程系列第 9 篇

[2020鐵人賽Day9]邂逅PHP Machine Learning-用KNearestNeighbors (KNN) 來定位 - 訓練樣本與測試樣本

11th鐵人賽 php-ml

Old Siao

2019-09-25 23:57:41

1508 瀏覽

分享至

訓練樣本與測試樣本

首先我們將樣本分為80%的訓練樣本與20%的測試樣本，如下程式碼：

$samples_20percent = []; // test samples (20%)
$labels_20percent = [];  // test labels (20%)
$samples_80percent = []; // training samples (80%)
$labels_80percent = [];  // training labels (80%)

/**
 * Draw $count_20percent distinct random row indexes.
 */
$randValue = []; // distinct indexes picked so far
$count = $count_20percent; // how many indexes to draw
$lastIndex = count($getSample) - 1; // highest valid row index
// Keep drawing until enough distinct indexes are collected; a duplicate draw is
// simply discarded instead of rewinding a for-loop counter.
while (count($randValue) < $count) {
    $randValueTemp = mt_rand(0, $lastIndex); // random index in 0..(total - 1)
    if (!in_array($randValueTemp, $randValue, true)) { // strict: compare ints as ints
        $randValue[] = $randValueTemp;
    }
}

sort($randValue); // ascending, so the test split is filled in dataset row order
foreach ($randValue as $value) {
    // Copy the selected row and its label into the 20% test split
    $samples_20percent[] = $total_sample[$value];
    $labels_20percent[] = $getTargets[$value];

    // Remove the row from the pool so it cannot also end up in the training split
    unset($total_sample[$value]);
    unset($getTargets[$value]);
}

// Whatever is left is the 80% training split; array_values() re-indexes the
// arrays from 0 because unset() left gaps in the keys.
$samples_80percent = array_values($total_sample);
$labels_80percent = array_values($getTargets);

接下來建立一個新的分類器 KNearestNeighbors()，這個分類器可以調整兩個參數：一個是 k 的大小，另一個是距離的計算方式。
因為我們已經先將各特徵標準化，理論上預測結果不會被某一個數值範圍特別大的特徵所主導。
其中參數 k 我們設定為 3，距離的計算方式則使用預設值。

KNearestNeighbors($k=3)

接下來將訓練樣本（80%）及其標籤放入分類器，完成訓練。
再使用 predict()，將測試樣本（20%）放入分類器進行預測。
這樣我們就可以獲得預測的花之種類了。

完整Code：

require_once __DIR__ . '/vendor/autoload.php';

use Phpml\Classification\KNearestNeighbors;
use Phpml\Dataset\CsvDataset;

// Load the dataset from a CSV file (not an Excel workbook).
// NOTE(review): the second argument, 4, is presumably the number of feature columns —
// the code below reads indexes 0-3 of every sample; confirm against php-ml's CsvDataset.
$dataset = new CsvDataset('iris.csv',4);

// Pull the feature rows and their labels out of the dataset
$getSample = $dataset->getSamples();
$getTargets = $dataset->getTargets();

// Min-max normalization of the four feature columns (indexes 0-3 of a sample row):
//   x' = (x - min) / (max - min), rounded to 3 decimal places.
// Each column is scanned once for its minimum and maximum, then rescaled.
$featureMin = [];    // column index => smallest value seen
$featureMax = [];    // column index => largest value seen
$featureScaled = []; // column index => list of normalized values, in row order

for ($column = 0; $column < 4; $column++) {
    $values = array_column($getSample, $column);

    // min()/max() reject an empty array, so an empty dataset keeps 0 for both bounds.
    $columnMin = $values === [] ? 0 : min($values);
    $columnMax = $values === [] ? 0 : max($values);
    $columnRange = $columnMax - $columnMin;

    $scaled = [];
    foreach ($values as $value) {
        // A constant column has a zero range; dividing by it raises
        // DivisionByZeroError on PHP 8, so such a column is mapped to 0.0 instead.
        $scaled[] = $columnRange == 0
            ? 0.0
            : round(($value - $columnMin) / $columnRange, 3);
    }

    $featureMin[$column] = $columnMin;
    $featureMax[$column] = $columnMax;
    $featureScaled[$column] = $scaled;
}

// Expose the results under the per-feature variable names the rest of the script uses.
list($sepalLength_max, $sepalWidth_max, $petalLength_max, $petalWidth_max) = $featureMax;
list($sepalLength_min, $sepalWidth_min, $petalLength_min, $petalWidth_min) = $featureMin;
list($sepalLength_array, $sepalWidth_array, $petalLength_array, $petalWidth_array) = $featureScaled;

// Sizes of the whole dataset and of the 20% / 80% partitions.
$count_total = count($getSample);
$count_20percent = round($count_total * 0.2);
$count_80percent = $count_total - $count_20percent;

// Zip the four normalized feature columns back into one row per sample:
// array_map() with a null callback returns, for each index, an array holding
// the value found at that index in every input array.
$total_sample = array_map(
    null,
    $sepalLength_array,
    $sepalWidth_array,
    $petalLength_array,
    $petalWidth_array
);

$samples_20percent = []; // test samples (20%)
$labels_20percent = [];  // test labels (20%)
$samples_80percent = []; // training samples (80%)
$labels_80percent = [];  // training labels (80%)

/**
 * Draw $count_20percent distinct random row indexes.
 */
$randValue = []; // distinct indexes picked so far
$count = $count_20percent; // how many indexes to draw
$lastIndex = count($getSample) - 1; // highest valid row index
// Keep drawing until enough distinct indexes are collected; a duplicate draw is
// simply discarded instead of rewinding a for-loop counter.
while (count($randValue) < $count) {
    $randValueTemp = mt_rand(0, $lastIndex); // random index in 0..(total - 1)
    if (!in_array($randValueTemp, $randValue, true)) { // strict: compare ints as ints
        $randValue[] = $randValueTemp;
    }
}

sort($randValue); // ascending, so the test split is filled in dataset row order
foreach ($randValue as $value) {
    // Copy the selected row and its label into the 20% test split
    $samples_20percent[] = $total_sample[$value];
    $labels_20percent[] = $getTargets[$value];

    // Remove the row from the pool so it cannot also end up in the training split
    unset($total_sample[$value]);
    unset($getTargets[$value]);
}

// Whatever is left is the 80% training split; array_values() re-indexes the
// arrays from 0 because unset() left gaps in the keys.
$samples_80percent = array_values($total_sample);
$labels_80percent = array_values($getTargets);

// Create the KNN classifier with k = 3. No distance argument is passed, so the
// library's default distance applies. `$k=3` is an ordinary assignment expression,
// so it also leaves a variable $k set to 3.
$classifier = new KNearestNeighbors($k=3);
// Fit on the 80% training split only
$classifier->train($samples_80percent, $labels_80percent);

// Predict a label for every sample in the 20% test split.
// NOTE(review): $labels_20percent is never compared with these predictions in this script.
$resultDate = $classifier->predict($samples_20percent);

// Dump the predictions; wrapping in <pre> keeps var_dump's line breaks when viewed in a browser
echo "<pre>";
var_dump($resultDate);
echo "</pre>";