from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data: impute missing values with a constant
# (no fill_value is given, so SimpleImputer's default constant is used).
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: impute with the most frequent value,
# then one-hot encode (handle_unknown='ignore' is passed to the encoder).
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Bundle preprocessing for numerical and categorical data: combine the two
# transformers above into one complete data-processing step, each applied to
# its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)
# Step 2: Define the model (步驟2: 定義模型)
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees; random_state is fixed so repeated
# runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=0)
# Bundle preprocessing and modeling code in a single pipeline: the
# preprocessing step and the model are stacked into one new pipeline.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Preprocess the training data and fit the model.
my_pipeline.fit(X_train, y_train)

# Preprocess the validation data and get predictions.
preds = my_pipeline.predict(X_valid)
# Use the fitted pipeline to predict on the test set for submission; the
# same preprocessing is applied automatically, which keeps the code short
# and less error-prone.
preds_test = my_pipeline.predict(X_test)

# Save test predictions to file.
output = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test,
    }
)
output.to_csv('submission.csv', index=False)
# You advanced 5,020 places on the leaderboard! Your submission scored 16459.13640, which is an improvement on your previous score of 16619.07644. Great job!
# 誤差有點提升,哈哈,加油!🚀 (The error improved slightly, haha, keep going! 🚀)
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Minimal pipeline for cross-validation: impute missing values with
# SimpleImputer's default settings, then fit a 50-tree random forest
# (random_state fixed for reproducibility).
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0)),
    ]
)
from sklearn.model_selection import cross_val_score
# Multiply by -1 since sklearn calculates *negative* MAE
# ('neg_mean_absolute_error'); cv=5 requests five folds.
scores = -1 * cross_val_score(
    my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error'
)

print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())
# Mean cross-validation score for different numbers of trees.
def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of random forest model.

    The pipeline imputes missing values with ``SimpleImputer()`` and fits a
    ``RandomForestRegressor`` with ``random_state=0``. The data is read from
    the module-level names ``X`` and ``y``.

    Keyword argument:
    n_estimators -- the number of trees in the forest
    """
    pipeline = Pipeline(
        steps=[
            ('preprocessing', SimpleImputer()),
            ('model', RandomForestRegressor(n_estimators=n_estimators,
                                            random_state=0)),
        ]
    )
    # sklearn reports MAE as a negative score, so flip the sign.
    scores = -1 * cross_val_score(
        pipeline, X, y, cv=3, scoring='neg_mean_absolute_error'
    )
    return scores.mean()


# Evaluate the model for 50, 100, ..., 400 trees; keys are the tree counts,
# values are the corresponding average MAE.
results = {n_trees: get_score(n_trees) for n_trees in range(50, 401, 50)}
# Visualize the model's score for each parameter value stored in `results`.
import matplotlib.pyplot as plt

# `%matplotlib inline` is an IPython/Jupyter magic, not Python syntax. It is
# kept commented out so this file parses as a plain script; re-enable it in a
# notebook if figures do not render inline.
# %matplotlib inline

plt.plot(list(results.keys()), list(results.values()))
plt.show()
# Most suitable parameter: the key of `results` whose stored value is smallest.
n_estimators_best = min(results, key=lambda n_trees: results[n_trees])