DAY 22
0
Big Data

## [第 22 天] 機器學習（2）複迴歸與 Logistic 迴歸

Logistic regression, despite its name, is a linear model for classification rather than regression.
— Generalized Linear Models, scikit-learn 0.18.1 documentation

## 建立複迴歸模型

### Python

``````import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([
[10, 80], [8, 0], [8, 200], [5, 200], [7, 300], [8, 230], [7, 40], [9, 0], [6, 330], [9, 180]
])
y = np.array([469, 366, 371, 208, 246, 297, 363, 436, 198, 364])

lm = LinearRegression()
lm.fit(X, y)

# 印出係數
print(lm.coef_)

# 印出截距
print(lm.intercept_ )
``````

### R 語言

``````r
# Features and target for ten bakeries
store_area <- c(10, 8, 8, 5, 7, 8, 7, 9, 6, 9)
dist_to_station <- c(80, 0, 200, 200, 300, 230, 40, 0, 330, 180)
monthly_sales <- c(469, 366, 371, 208, 246, 297, 363, 436, 198, 364)
bakery_df <- data.frame(store_area, dist_to_station, monthly_sales)

# Fit the multiple regression model on all predictors
lm_fit <- lm(monthly_sales ~ ., data = bakery_df)

# Print the coefficients (drop the intercept, element 1)
lm_fit$coefficients[-1]

# Print the intercept
lm_fit$coefficients[1]
``````

## 利用複迴歸模型預測

### Python

``````import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([
[10, 80], [8, 0], [8, 200], [5, 200], [7, 300], [8, 230], [7, 40], [9, 0], [6, 330], [9, 180]
])
y = np.array([469, 366, 371, 208, 246, 297, 363, 436, 198, 364])

lm = LinearRegression()
lm.fit(X, y)

# 新蛋糕店資料
to_be_predicted = np.array([
[10, 110]
])
predicted_sales = lm.predict(to_be_predicted)

# 預測新蛋糕店的單月銷量
print(predicted_sales)
``````

### R 語言

``````r
# Features and target for ten bakeries
store_area <- c(10, 8, 8, 5, 7, 8, 7, 9, 6, 9)
dist_to_station <- c(80, 0, 200, 200, 300, 230, 40, 0, 330, 180)
monthly_sales <- c(469, 366, 371, 208, 246, 297, 363, 436, 198, 364)
bakery_df <- data.frame(store_area, dist_to_station, monthly_sales)

# Fit the multiple regression model on all predictors
lm_fit <- lm(monthly_sales ~ ., data = bakery_df)

# A new bakery: area 10, 110 m from the station
to_be_predicted <- data.frame(store_area = 10, dist_to_station = 110)
predicted_sales <- predict(lm_fit, newdata = to_be_predicted)

# Predicted monthly sales for the new bakery
predicted_sales
``````

## 複迴歸模型的績效

### Python

``````import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([
[10, 80], [8, 0], [8, 200], [5, 200], [7, 300], [8, 230], [7, 40], [9, 0], [6, 330], [9, 180]
])
y = np.array([469, 366, 371, 208, 246, 297, 363, 436, 198, 364])

lm = LinearRegression()
lm.fit(X, y)

# 模型績效
mse = np.mean((lm.predict(X) - y) ** 2)
r_squared = lm.score(X, y)
adj_r_squared = r_squared - (1 - r_squared) * (X.shape[1] / (X.shape[0] - X.shape[1] - 1))

# 印出模型績效
print(mse)
print(r_squared)
``````

### R 語言

``````r
# Features and target for ten bakeries
store_area <- c(10, 8, 8, 5, 7, 8, 7, 9, 6, 9)
dist_to_station <- c(80, 0, 200, 200, 300, 230, 40, 0, 330, 180)
monthly_sales <- c(469, 366, 371, 208, 246, 297, 363, 436, 198, 364)
bakery_df <- data.frame(store_area, dist_to_station, monthly_sales)

# Fit the model, then predict back on the training data
lm_fit <- lm(monthly_sales ~ ., data = bakery_df)
predicted_sales <- predict(lm_fit, newdata = data.frame(store_area, dist_to_station))

# Model performance: mean squared error on the training data
mse <- mean((monthly_sales - predicted_sales) ^ 2)

# Print model performance
mse
summary(lm_fit)$r.squared
``````

## 複迴歸模型的係數檢定

### Python

``````import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression

X = np.array([
[10, 80], [8, 0], [8, 200], [5, 200], [7, 300], [8, 230], [7, 40], [9, 0], [6, 330], [9, 180]
])
y = np.array([469, 366, 371, 208, 246, 297, 363, 436, 198, 364])

lm = LinearRegression()
lm.fit(X, y)

# 印出 p-value
print(f_regression(X, y)[1])
``````

### R 語言

``````r
# Features and target for ten bakeries
store_area <- c(10, 8, 8, 5, 7, 8, 7, 9, 6, 9)
dist_to_station <- c(80, 0, 200, 200, 300, 230, 40, 0, 330, 180)
monthly_sales <- c(469, 366, 371, 208, 246, 297, 363, 436, 198, 364)
bakery_df <- data.frame(store_area, dist_to_station, monthly_sales)

# Fit the multiple regression model on all predictors
lm_fit <- lm(monthly_sales ~ ., data = bakery_df)

# Print the coefficient p-values (column 4), excluding the intercept row
summary(lm_fit)$coefficients[-1, 4]
``````

## 建立 Logistic 迴歸模型

Kaggle 著名的鐵達尼克號資料，我們使用 Sex、Pclass、Age 來預測 Survived

### Python

``````import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model

# 將 Age 遺漏值以 median 填補
age_median = np.nanmedian(titanic_train["Age"])
new_Age = np.where(titanic_train["Age"].isnull(), age_median, titanic_train["Age"])
titanic_train["Age"] = new_Age
titanic_train

# 創造 dummy variables
label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(titanic_train["Sex"])

# 建立 train_X
train_X = pd.DataFrame([titanic_train["Pclass"],
encoded_Sex,
titanic_train["Age"]
]).T

# 建立模型
logistic_regr = linear_model.LogisticRegression()
logistic_regr.fit(train_X, titanic_train["Survived"])

# 印出係數
print(logistic_regr.coef_)

# 印出截距
print(logistic_regr.intercept_ )
``````

### R 語言

``````r
# Load the Kaggle Titanic training data
url <- "https://storage.googleapis.com/2017_ithome_ironman/data/kaggle_titanic_train.csv"
titanic_train <- read.csv(url)

# Fill missing Age values with the median
age_median <- median(titanic_train$Age, na.rm = TRUE)
new_Age <- ifelse(is.na(titanic_train$Age), age_median, titanic_train$Age)
titanic_train$Age <- new_Age

# Fit the logistic regression model
logistic_regr <- glm(Survived ~ Age + Pclass + Sex, data = titanic_train, family = binomial(link = "logit"))

# Print the coefficients (drop the intercept, element 1)
logistic_regr$coefficients[-1]

# Print the intercept
logistic_regr$coefficients[1]
``````

## Logistic 迴歸模型係數檢定

Logistic 迴歸模型我們也可以檢定變數的顯著性，以 P-value 是否小於 0.05（信心水準 95%）來判定。

### Python

``````import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model
from sklearn.feature_selection import f_regression

# 將 Age 遺漏值以 median 填補
age_median = np.nanmedian(titanic_train["Age"])
new_Age = np.where(titanic_train["Age"].isnull(), age_median, titanic_train["Age"])
titanic_train["Age"] = new_Age
titanic_train

# 創造 dummy variables
label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(titanic_train["Sex"])

# 建立 train_X
train_X = pd.DataFrame([titanic_train["Pclass"],
encoded_Sex,
titanic_train["Age"]
]).T

# 建立模型
logistic_regr = linear_model.LogisticRegression()
logistic_regr.fit(train_X, titanic_train["Survived"])

# 印出 p-value
print(f_regression(train_X, titanic_train["Survived"])[1])
``````

### R 語言

``````r
# Load the Kaggle Titanic training data
url <- "https://storage.googleapis.com/2017_ithome_ironman/data/kaggle_titanic_train.csv"
titanic_train <- read.csv(url)

# Fill missing Age values with the median
age_median <- median(titanic_train$Age, na.rm = TRUE)
new_Age <- ifelse(is.na(titanic_train$Age), age_median, titanic_train$Age)
titanic_train$Age <- new_Age

# Fit the logistic regression model
logistic_regr <- glm(Survived ~ Age + Pclass + Sex, data = titanic_train, family = binomial(link = "logit"))

# Print the coefficient p-values (column 4), excluding the intercept row
summary(logistic_regr)$coefficients[-1, 4]
``````

## Logistic 迴歸模型績效

### Python

``````import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model

# 將 Age 遺漏值以 median 填補
age_median = np.nanmedian(titanic_train["Age"])
new_Age = np.where(titanic_train["Age"].isnull(), age_median, titanic_train["Age"])
titanic_train["Age"] = new_Age
titanic_train

# 創造 dummy variables
label_encoder = preprocessing.LabelEncoder()
encoded_Sex = label_encoder.fit_transform(titanic_train["Sex"])

# 建立 train_X
train_X = pd.DataFrame([titanic_train["Pclass"],
encoded_Sex,
titanic_train["Age"]
]).T

# 建立模型
logistic_regr = linear_model.LogisticRegression()
logistic_regr.fit(train_X, titanic_train["Survived"])

# 計算準確率
survived_predictions = logistic_regr.predict(train_X)
accuracy = logistic_regr.score(train_X, titanic_train["Survived"])
print(accuracy)
``````

### R 語言

``````r
# Load the Kaggle Titanic training data
url <- "https://storage.googleapis.com/2017_ithome_ironman/data/kaggle_titanic_train.csv"
titanic_train <- read.csv(url)

# Fill missing Age values with the median
age_median <- median(titanic_train$Age, na.rm = TRUE)
new_Age <- ifelse(is.na(titanic_train$Age), age_median, titanic_train$Age)
titanic_train$Age <- new_Age

# Fit the logistic regression model
logistic_regr <- glm(Survived ~ Age + Pclass + Sex, data = titanic_train, family = binomial(link = "logit"))

# Compute the training accuracy: predict probabilities, threshold at 0.5,
# then take the share of correct predictions from the confusion matrix
x_features <- titanic_train[, c("Age", "Pclass", "Sex")]
survived_predictions <- predict(logistic_regr, newdata = x_features, type = "response")
prediction_cutoff <- ifelse(survived_predictions > 0.5, 1, 0)
confusion_matrix <- table(titanic_train$Survived, prediction_cutoff)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
``````