iT邦幫忙

0

Python 演算法 學習日誌 Day 3

機器學習九大步驟:

https://ithelp.ithome.com.tw/upload/images/20210621/20138527bNvtSwZ6Fc.png

https://yourfreetemplates.com/free-machine-learning-diagram/

Homework 1. 酒類分類 (classification):

import pandas as pd
import numpy as np
from sklearn import datasets

# Load the wine classification dataset bundled with scikit-learn.
ds = datasets.load_wine()

# 1. Dataset
# Features go into a DataFrame labelled with the dataset's feature names;
# the target array is used as the label vector.
X =pd.DataFrame(ds.data, columns=ds.feature_names)
y = ds.target

# 2. Data clean
# Print the number of missing values in each feature column.
print(X.isna().sum())

# 3. Feature Engineering

# 4. Split
from sklearn.model_selection import train_test_split as tts
# Hold out 10% of the samples for testing. No random_state is passed, so the
# split (and therefore the score below) changes from run to run.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)
print(X_train.shape, y_train.shape)
>> (160, 13) (160,)

第一種分類方式 (KNN):

# 5. Define and train the KNN model
from sklearn.neighbors import KNeighborsClassifier as KNN

# k-nearest-neighbours classifier using the 3 closest training samples.
clf = KNN(n_neighbors=3)
clf.fit(X_train, y_train)
# Accuracy on the held-out test set, printed with 2 significant digits.
print(f'score = {clf.score(X_test, y_test):.2}')
>> score = 0.78

# Verify the answers: print the true labels, then the KNN predictions,
# so the two lists can be compared position by position.
print(list(y_test))
print(list(clf.predict(X_test)))
>> [0, 0, 1, 1, 1, 0, 0, 2, 0, 1, 0, 2, 1, 2, 1, 0, 1, 0]
>> [0, 0, 1, 2, 2, 0, 0, 2, 0, 1, 0, 0, 1, 2, 2, 0, 1, 0]
# 錯 4 個

第二種分類方式 (LogisticRegression):

# 5. Define and train the LogisticRegression model
from sklearn.linear_model import LogisticRegression as lr
clf2 = lr(solver='liblinear')

# Train
clf2.fit(X_train, y_train)

# Score: accuracy on the held-out test set, printed in the same
# 'score = ...' format as the KNN snippet so the two results compare directly.
print(f'score = {clf2.score(X_test, y_test):.2}')
>> score = 1.0

# Verify the answers: print the true labels, then the predictions of the
# LogisticRegression model (clf2) — not the earlier KNN model (clf).
print(list(y_test))
print(list(clf2.predict(X_test)))
>> [2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1]
>> [2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1]
# 全對!

以上兩種模型比較後,選擇第二種模型。
此步驟是在做機器學習第八步:Evaluate Model

Homework 2. 糖尿病 (regression):

import pandas as pd
import numpy as np
from sklearn import datasets

# Load the diabetes regression dataset bundled with scikit-learn.
ds = datasets.load_diabetes()

# 1. Dataset
# Note: the feature values are already mean-centred and scaled, which is why
# columns such as age contain negative values.
# scikit-learn scales each column by (std * sqrt(n_samples)), so every column
# has mean 0 and a sum of squares of 1 (not a standard deviation of 1).
X = pd.DataFrame(ds.data, columns=ds.feature_names)
y = ds.target

# 2. Data clean
# Print the number of missing values in each feature column.
print(X.isna().sum())

# 3. Feature Engineering

# 4. Split
from sklearn.model_selection import train_test_split as tts
# Hold out 20% of the samples for testing. No random_state is passed, so the
# split changes from run to run.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
>> (353, 10) (353,)

# 5. Define and train the LinearRegression model
from sklearn.linear_model import LinearRegression as lr
clf = lr()

# Train
clf.fit(X_train, y_train)

# Score: for a regressor, score() is the coefficient of determination (R^2)
# on the held-out test set; printed with the 'score = ' label shown in the
# expected output.
print(f'score = {clf.score(X_test, y_test):.2}')
>> score = 0.49

# Compute the MSE and inspect the coefficients
from sklearn.metrics import mean_squared_error, r2_score
# Predictions on the test set, reused by the metric calls further down.
y_pred = clf.predict(X_test)

# Coefficients (weights of the first-order terms)
# y = w1*x1 + w2*x2 + w3*x3 ... w10*x10 + b
print('Coefficients: ', clf.coef_)
>> Coefficients:
>> [-67.61407073  -234.0154753  531.59534257  333.23390321  -1043.20891453  666.24505827  248.17865333  267.79106365  820.36323647  75.61684284]

print('Intercept: ', clf.intercept_)
>> Intercept:  151.54632577037566

# MSE (mean squared error): 1/n * sum((y_pred - y_test)**2)
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
>> MSE: 3324.9499520969694

# Coefficient of determination (R^2): the closer to 1, the better
print(f'Coefficient of determination: {r2_score(y_test, y_pred)}')
>> Coefficient of determination: 0.49401537426842035

Homework 3. 小費 (regression):

'tips.csv' 資料如下:
https://ithelp.ithome.com.tw/upload/images/20210621/20138527HWDjRZ7hzZ.png

import pandas as pd
import numpy as np
from sklearn import datasets

# Read the tips data from a CSV file in the working directory and preview it.
df = pd.read_csv('tips.csv')
print(df.head())

# 1. Dataset
X = df.drop('tip', axis=1)   # features: every column except 'tip'
y = df['tip']                # target: the 'tip' column to be predicted

# 2. Data clean
# Print the number of missing values in each column.
print(df.isna().sum())

# 3. Feature Engineering
# Show the distinct values found in the 'sex' column
print(X['sex'].unique())
>> ['Female' 'Male']

此時發現演算結果評分過低,推斷為X['day']轉化為數字時編碼問題。
解決方式如下:

# Average tip for each distinct value of the 'day' column.
gb = df.groupby(['day'])['tip'].mean()
print(gb)
>> Fri    2.73
>> Sat    2.99
>> Sun    3.26
>> Thur   2.77

import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the average tip per day. x and y are passed by keyword because
# seaborn 0.12+ no longer accepts the data vectors as positional arguments.
sns.barplot(x=gb.index, y=gb.values)
plt.show()

https://ithelp.ithome.com.tw/upload/images/20210622/20138527fC6VtRq2i6.png

# Convert every categorical column to numbers so the model can use it.
# Each mapped column is assigned back to X explicitly: calling
# replace(inplace=True) on a column selection acts on an intermediate object
# and does not reliably update X under pandas copy-on-write.
# Series.map turns any value missing from the mapping into NaN.
X['sex'] = X['sex'].map({'Female': 0, 'Male': 1})
X['smoker'] = X['smoker'].map({'Yes': 0, 'No': 1})
# NOTE(review): 'Thur' and 'Fri' share code 0 and code 1 is unused; kept
# exactly as in the original — confirm this grouping is intended.
X['day'] = X['day'].map({'Thur': 0, 'Fri': 0, 'Sat': 2, 'Sun': 3})
X['time'] = X['time'].map({'Lunch': 0, 'Dinner': 1})

# 4. Split
from sklearn.model_selection import train_test_split as tts

# test_size=0.2 : 20% of the data is held out for testing
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2)
print(X_train.shape, y_train.shape)
>> (195, 6) (195,)

# 5. Define and train the LinearRegression model
from sklearn.linear_model import LinearRegression

clf = LinearRegression()
clf.fit(X_train, y_train)

# Score on the held-out test set (R^2 for a regressor), printed with
# 2 significant digits.
print(f'score = {clf.score(X_test, y_test):.2}')
>> score = 0.4

# Verify the answers: print the true tips, then the model's predictions
print(list(y_test))
# Keep two significant digits of each prediction so the lists are easy to compare
b = [float(f'{i:.2}') for i in clf.predict(X_test)]
print(b)

補充:儲存 & 載入模型

import joblib as jb

# Save the trained model to disk
jb.dump(clf, 'wine.joblib')
# Load the saved model back and predict with the restored copy, so the
# round trip through the file is actually exercised
loaded_clf = jb.load('wine.joblib')
print(loaded_clf.predict(X_test))

尚未有邦友留言

立即登入留言