0

## 機器學習九大步驟：

https://yourfreetemplates.com/free-machine-learning-diagram/

### Homework 1. 酒類分類 (classification)：

``````import pandas as pd
import numpy as np
from sklearn import datasets

# 1. Dataset
# Load the scikit-learn wine dataset; `ds` must be defined before its
# data/target attributes can be used below.
ds = datasets.load_wine()
X = pd.DataFrame(ds.data, columns=ds.feature_names)
y = ds.target

# 2. Data clean
# Count the missing values in each column.
print(X.isna().sum())

# 3. Feature engineering
# (no feature engineering is performed in this example)

# 4. Split
from sklearn.model_selection import train_test_split as tts
# Hold out 10% of the rows as the test set.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)
print(X_train.shape, y_train.shape)
>> (160, 13) (160,)
``````

``````# 5. Define and train the KNN model
from sklearn.neighbors import KNeighborsClassifier as KNN

# Build a 3-nearest-neighbours classifier and fit it in one step
# (fit returns the fitted estimator itself).
clf = KNN(n_neighbors=3).fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(f'score = {accuracy:.2}')
>> score = 0.78

# Verify: print the true labels, then the model's predictions, for comparison
print(list(y_test))
print(list(clf.predict(X_test)))
>> [0, 0, 1, 1, 1, 0, 0, 2, 0, 1, 0, 2, 1, 2, 1, 0, 1, 0]
>> [0, 0, 1, 2, 2, 0, 0, 2, 0, 1, 0, 0, 1, 2, 2, 0, 1, 0]
# 錯 4 個
``````

``````# 5. Define and train the LogisticRegression model
from sklearn.linear_model import LogisticRegression as lr
clf2 = lr(solver='liblinear')

# Train
clf2.fit(X_train, y_train)

# Score on the test set, labelled 'score = ' to match the displayed output
print(f'score = {clf2.score(X_test, y_test):.2}')
>> score = 1.0

# Verify: print the true labels, then the LogisticRegression predictions
print(list(y_test))
# Predict with clf2, the LogisticRegression model trained in this section;
# `clf` would be a different, previously trained model.
print(list(clf2.predict(X_test)))
>> [2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1]
>> [2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1]
# 全對!
``````

### Homework 2. 糖尿病 (regression)：

``````import pandas as pd
import numpy as np
from sklearn import datasets

# 1. Dataset
# Load the scikit-learn diabetes dataset; `ds` must be defined before its
# data/target attributes can be used below.
# Note: the feature values are already mean-centered and scaled
# ((X - m) / scale, so every column has mean 0), which is why values such
# as age can be negative. Per the scikit-learn docs the scale is
# std * sqrt(n_samples), so the columns do not have unit standard deviation.
ds = datasets.load_diabetes()
X = pd.DataFrame(ds.data, columns=ds.feature_names)
y = ds.target

# 2. Data clean
# Count the missing values in each column.
print(X.isna().sum())

# 3. Feature engineering
# (no feature engineering is performed in this example)

# 4. Split
from sklearn.model_selection import train_test_split as tts
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
>> (353, 10) (353,)

# 5. Define and train the LinearRegression model
from sklearn.linear_model import LinearRegression as lr
clf = lr()

# Train
clf.fit(X_train, y_train)

# Score on the test set, labelled 'score = ' to match the displayed output
print(f'score = {clf.score(X_test, y_test):.2}')
>> score = 0.49

# Compute the MSE & coefficients
from sklearn.metrics import mean_squared_error, r2_score
y_pred = clf.predict(X_test)

# Coefficients (weights of the first-degree terms)
# y = w1*x1 + w2*x2 + w3*x3 ... w10*x10 + b
print('Coefficients: ', clf.coef_)
>> Coefficients:
>> [-67.61407073  -234.0154753  531.59534257  333.23390321  -1043.20891453  666.24505827  248.17865333  267.79106365  820.36323647  75.61684284]

print('Intercept: ', clf.intercept_)
>> Intercept:  151.54632577037566

# MSE (mean squared error): 1/n * sum((y_pred - y_test) ** 2)
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
>> MSE: 3324.9499520969694

# Coefficient of determination (R^2): the closer to 1, the better
print(f'Coefficient of determination: {r2_score(y_test, y_pred)}')
>> Coefficient of determination: 0.49401537426842035
``````

### Homework 3. 小費 (regression)：

'tips.csv'　資料如下：

``````import pandas as pd
import numpy as np
from sklearn import datasets

# 1. Dataset
# Load the tips data; `df` must exist before it is split into X and y.
df = pd.read_csv('tips.csv')
X = df.drop('tip', axis=1)   # features: every column except 'tip'
y = df['tip']                # target: the 'tip' column to be predicted

# 2. Data clean
# Count the missing values in each column.
print(df.isna().sum())

# 3. Feature engineering
# Show the distinct values in the 'sex' column
print(X['sex'].unique())
>> ['Female' 'Male']
``````

``````gb = df.groupby(['day'])['tip'].mean()
print(gb)
>> Fri    2.73
>> Sat    2.99
>> Sun    3.26
>> Thur   2.77

import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the grouped mean tips (index on x, mean value on y).
# x and y are passed as keywords: seaborn >= 0.12 no longer accepts them
# positionally.
sns.barplot(x=gb.index, y=gb.values)
plt.show()
``````

``````# 把所有值換成數字才能分析
# Encode each categorical column as integers so the regressor can use it.
# The mapped column is assigned back to X rather than calling
# replace(inplace=True) on a column selection, which may not update X
# under pandas copy-on-write.
# Note: values not listed in a mapping become NaN.
category_codes = {
    'sex': {'Female': 0, 'Male': 1},
    'smoker': {'Yes': 0, 'No': 1},
    # Every day gets its own code ('Fri' must not share 0 with 'Thur').
    'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
    'time': {'Lunch': 0, 'Dinner': 1},
}
for column, codes in category_codes.items():
    X[column] = X[column].map(codes)

# 4. Split
from sklearn.model_selection import train_test_split as tts

# test_size=0.2: hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2)
print(X_train.shape, y_train.shape)
>> (195, 6) (195,)

# 5. Define and train the LinearRegression model
from sklearn.linear_model import LinearRegression

# Construct and fit in one step (fit returns the fitted estimator itself).
clf = LinearRegression().fit(X_train, y_train)

test_score = clf.score(X_test, y_test)
print(f'score = {test_score:.2}')
>> score = 0.4

# Verify: print the true tips, then the model's predictions, for comparison
print(list(y_test))
# Each prediction is formatted to two significant digits for readability
# (the stray '!' after float(...) was a syntax error and is removed).
b = [float(f'{i:.2}') for i in clf.predict(X_test)]
print(b)
``````

### 補充：存取 & 取用模型

``````import joblib as jb

# Save the trained model to disk
jb.dump(clf, 'wine.joblib')
# Load the saved model back and predict with the restored estimator
loaded_clf = jb.load('wine.joblib')
print(loaded_clf.predict(X_test))
``````