# Fit every estimator on the full training set.  A scikit-learn-style
# fit() returns the fitted estimator itself, so each name bound below is
# the trained model that the blending step uses later.
print('START Fit')

print('stack_gen')
# The stacking generalizer is fed plain ndarrays rather than DataFrames.
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

_fitted = {}
for _label, _estimator in (
    ('elasticnet', elasticnet),
    ('Lasso', lasso),
    ('Ridge', ridge),
    ('Svr', svr),
    ('GradientBoosting', gbr),
    ('xgboost', xgboost),
    ('lightgbm', lightgbm),
):
    print(_label)
    _fitted[_label] = _estimator.fit(X, y)

elastic_model_full_data = _fitted['elasticnet']
lasso_model_full_data = _fitted['Lasso']
ridge_model_full_data = _fitted['Ridge']
svr_model_full_data = _fitted['Svr']
gbr_model_full_data = _fitted['GradientBoosting']
xgb_model_full_data = _fitted['xgboost']
lgb_model_full_data = _fitted['lightgbm']
# Blend the fitted models again.  The blend weights here are empirical —
# there is no single correct ratio, and the best mix can differ across
# models and datasets.
def blend_models_predict(X):
    """Return the weighted blend of all fitted models' predictions.

    The weights are empirical and sum to 1.0; the stacked generalizer
    carries the largest share (0.3) and, unlike the base models, is fed
    a plain ndarray.
    """
    weighted_base_models = (
        (0.1, elastic_model_full_data),
        (0.05, lasso_model_full_data),
        (0.1, ridge_model_full_data),
        (0.1, svr_model_full_data),
        (0.1, gbr_model_full_data),
        (0.15, xgb_model_full_data),
        (0.1, lgb_model_full_data),
    )
    blended = sum(w * model.predict(X) for w, model in weighted_base_models)
    return blended + 0.3 * stack_gen_model.predict(np.array(X))
# Sanity-check the blended ensemble against the training data.
print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

print('Predict submission')
submission = pd.read_csv("sample_submission.csv")
# The models were trained on log1p-transformed targets, so expm1 (the
# inverse of log1p) maps the blended predictions back to price scale.
test_predictions = np.expm1(blend_models_predict(X_sub))
submission.iloc[:, 1] = test_predictions
# The head and tail of the data follow different trends (unusually small
# or unusually large values), so stretch the extreme ends further out to
# improve the chance of accurate predictions there.
# Quantiles serve as the thresholds for extreme-value adjustment.
q1 = submission['SalePrice'].quantile(0.0042)
q2 = submission['SalePrice'].quantile(0.99)

# Keep every SalePrice above the lower bound q1 unchanged; scale the
# values at or below it down by a factor of 0.77.
prices = submission['SalePrice']
submission['SalePrice'] = prices.where(prices > q1, prices * 0.77)

# Keep every SalePrice below the upper bound q2 unchanged; scale the
# values at or above it up by a factor of 1.1.
prices = submission['SalePrice']
submission['SalePrice'] = prices.where(prices < q2, prices * 1.1)

# Export the final submission file.
submission.to_csv("submission.csv", index=False)