# Read the Titanic training and test sets from the local ./data/ directory.
df_train = pd.read_csv('./data/' + 'titanic_train.csv')
df_test = pd.read_csv('./data/' + 'titanic_test.csv')
# Pull out the target column, then remove it from the training frame so that
# only the remaining columns are left in df_train.
Y_train = df_train['Survived']
df_train = df_train.drop(columns=['Survived'])  # drop the target column
# Set aside the non-feature identifier column. The test-set ids are kept
# because they are written to the submission file later; the column is then
# removed from both frames.
ids = df_test['PassengerId']
df_train = df_train.drop(columns=['PassengerId'])  # drop the id column
df_test = df_test.drop(columns=['PassengerId'])  # drop the id column
# Stack df_train on top of df_test so both go through the same preprocessing.
# The training rows come first; the later train/test split relies on that order.
df = pd.concat([df_train, df_test])
df.head()  # notebook-style preview; has no effect when run as a script
# Preprocess every column of the combined frame in three steps:
#   1. fill missing values,
#   2. label-encode object (text) columns,
#   3. min-max normalise.
# NOTE(review): the original lines had lost their indentation. The nesting
# below (print + label encoding inside the `if`, scaling once per column
# inside the `for`) is a reconstruction based on the inline comments --
# please confirm it matches the intended behaviour.
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for c in df.columns:
    df[c] = df[c].fillna(-1)  # fill missing values with a -1 sentinel
    if df[c].dtype == 'object':
        print(c)  # report which columns are being label-encoded
        df[c] = LEncoder.fit_transform(list(df[c].values))  # encode to integer labels
    # The scaler is given a single-column 2-D array, hence reshape(-1, 1).
    df[c] = MMEncoder.fit_transform(df[c].values.reshape(-1, 1))  # normalise
df.head()  # notebook-style preview; has no effect when run as a script
# Split the preprocessed frame back into X_train and X_test.
# The first len(Y_train) rows are the training rows; the rest are test rows.
train_num = Y_train.shape[0]
X_train = df.iloc[:train_num]  # explicit positional slice
X_test = df.iloc[train_num:]
# Train the model: a gradient-boosting classifier with its default settings.
model = GradientBoostingClassifier()
model.fit(X_train, Y_train)
# List the feature importances of the fitted model, labelled by column name
# and sorted from most to least important.
importance = pd.Series(data=model.feature_importances_, index=X_train.columns)
importance = importance.sort_values(ascending=False)
print(importance)
# Predict on X_test and pair each prediction with the passenger id that was
# set aside from the test set.
pred = model.predict(X_test)
sub = pd.DataFrame({'PassengerId': ids, 'Survived': pred})
sub.head()  # notebook-style preview; has no effect when run as a script
# Save the predictions to a CSV file, without writing the DataFrame index.
sub.to_csv('titanic_baseline.csv', index=False)