House Prices - Advanced Regression Techniques
This notebook comes from my personal work on a Kaggle competition.
Data exploration
In [94]:
import numpy as np
import pandas as pd
import sklearn.linear_model  # a bare `import sklearn` does not expose the submodules used below
import sklearn.neighbors
In [95]:
df_train = pd.read_csv("data/train.csv").drop(columns=["Id"])
df_test = pd.read_csv("data/test.csv").set_index("Id")
features_num = df_train.select_dtypes(include=np.number).columns
features_cat = df_train.columns.difference(features_num)
features_num = features_num.drop("SalePrice")  # the target is not a feature
In [96]:
corr = df_train.corr(numeric_only=True).query("SalePrice > 0.5") # keep the features whose correlation with the price exceeds 0.5
corr.loc[["SalePrice"], corr.index].sort_values(by="SalePrice", axis=1, ascending=False)
Out[96]:
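A quick sketch (not part of the original run) to check how these top features correlate with one another, since collinearity among them matters for the linear models used below:
In [ ]:
import plotly.express as px
# heatmap of pairwise correlations among the most price-correlated features
px.imshow(df_train[list(corr.index)].corr())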
In [69]:
df_train[corr.index].hist(bins=30, figsize=(20, 10));
In [97]:
import scipy.stats
scipy.stats.shapiro(df_train["SalePrice"]) # p-value < 0.05: reject normality of SalePrice
Out[97]:
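As a sanity check (a sketch, not in the original run), the Shapiro-Wilk statistic W should move closer to 1 on log-transformed prices, which motivates the log1p transform applied below:
In [ ]:
# Compare normality before and after log1p; W closer to 1 means closer to normal.
print(scipy.stats.shapiro(df_train["SalePrice"]))
print(scipy.stats.shapiro(np.log1p(df_train["SalePrice"])))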
In [98]:
skewness = df_train[features_num].skew().sort_values(ascending=False)
print(skewness.to_frame("Skewness").head())
features_skewed = skewness[skewness > 0.75].index # keep only the markedly right-skewed features
df_train[features_skewed] = np.log1p(df_train[features_skewed]) # log transform to reduce skewness
df_test[features_skewed] = np.log1p(df_test[features_skewed])
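A one-line check (a sketch, assuming the cell above has run) that the transform worked:
In [ ]:
# Skewness after log1p should be much lower than before.
df_train[features_skewed].skew().sort_values(ascending=False).head()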
In [99]:
pd.DataFrame({"price": df_train["SalePrice"], "log(price + 1)": np.log1p(df_train["SalePrice"])}) \
.hist(figsize=(12, 4));
df_train["SalePrice"] = np.log1p(df_train["SalePrice"]) # reduce skewness
In [100]:
from sklearn import preprocessing
scaler = preprocessing.RobustScaler()  # median/IQR scaling, robust to outliers
df_train[features_num] = scaler.fit_transform(df_train[features_num])
df_test[features_num] = scaler.transform(df_test[features_num])  # reuse the train statistics
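For intuition (a toy sketch, not part of the pipeline): RobustScaler subtracts the column median and divides by the interquartile range, so a single extreme value barely affects the scaling.
In [ ]:
# The manual median/IQR computation matches RobustScaler's output.
toy = pd.Series([1., 2., 3., 4., 100.])
print(((toy - toy.median()) / (toy.quantile(.75) - toy.quantile(.25))).values)
print(preprocessing.RobustScaler().fit_transform(toy.to_frame()).ravel())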
In [101]:
print(df_train[features_num].isnull().values.sum())
df_train[features_num] = df_train[features_num].fillna(df_train[features_num].mean())
print(df_train[features_num].isnull().values.sum())
df_test[features_num] = df_test[features_num].fillna(df_train[features_num].mean()) # impute with the train statistics
In [102]:
train = pd.concat([df_train[features_num], pd.get_dummies(df_train[features_cat])], axis=1)
test = pd.concat([df_test[features_num], pd.get_dummies(df_test[features_cat])], axis=1)
In [103]:
col = train.columns.difference(test.columns); col # some categorical values do not appear in the test set
Out[103]:
In [104]:
test = test.reindex(columns=train.columns, fill_value=0) # align the test columns (order included) with train, filling unseen dummies with 0
train_Y = df_train["SalePrice"]
In [105]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
def scorer(estimator, X, y):
    # RMSE on log prices, i.e. the competition's RMSLE metric
    return mean_squared_error(y, estimator.predict(X)) ** .5

def error(estimator, n_splits=5):
    return cross_val_score(estimator, train, train_Y,
                           scoring=scorer, cv=n_splits).mean()

def submit(estimator):
    e = estimator.fit(train, train_Y)
    pd.DataFrame({"Id": df_test.index, "SalePrice": np.expm1(e.predict(test))}) \
        .to_csv("submission.csv", index=False)
    train_error = scorer(estimator, train, train_Y)
    print(f"Error on train set: {train_error}")
In [106]:
print(f"CV Error: {error(sklearn.linear_model.LinearRegression())}")
submit(sklearn.linear_model.LinearRegression()) # overfitting a lot
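To make the overfitting visible (a sketch using the helpers defined above), the train RMSE should come out far below the cross-validated RMSE:
In [ ]:
lr = sklearn.linear_model.LinearRegression().fit(train, train_Y)
print("train RMSE:", scorer(lr, train, train_Y))
print("CV RMSE:   ", error(sklearn.linear_model.LinearRegression()))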
In [107]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
alphas = [1, 3, 5, 7, 8, 9, 10, 12, 20, 50]
errors = [error(sklearn.linear_model.Ridge(alpha=a)) for a in alphas] # error() already averages over the folds
fig = px.line(pd.DataFrame({"alpha": alphas, "error": errors}).set_index("alpha"))
fig.show()
We could also have used linear_model.RidgeCV to search for the best alpha of a ridge regression:
In [108]:
ridgeCV = sklearn.linear_model.RidgeCV(alphas=np.linspace(15, 25, 10), cv=5) \
.fit(train, train_Y)
print(f"Alpha: {ridgeCV.alpha_}")
error(ridgeCV)
Out[108]:
In [109]:
submit(sklearn.linear_model.Ridge(alpha=19.44)) # alpha found by RidgeCV above
In [110]:
lassoCV = sklearn.linear_model.LassoCV(cv=5) \
.fit(train, train_Y)
print(f"CV error: {error(lassoCV)}")
submit(lassoCV) # reuse the fitted LassoCV; submit() refits it anyway
Let's see the relative importance of each feature in the Lasso regression by looking at its coefficients:
In [54]:
coef = pd.DataFrame({"Coefficient": lassoCV.coef_, "Feature": train.columns}) \
.sort_values(by="Coefficient")
px.bar(pd.concat([coef.head(10), coef.tail(10)]), x="Coefficient", y="Feature", orientation='h').show()
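The L1 penalty drives many coefficients exactly to zero, so Lasso doubles as feature selection. A quick sketch to count the surviving features:
In [ ]:
kept = int((lassoCV.coef_ != 0).sum())
print(f"{kept} features kept out of {train.shape[1]}")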
In [111]:
from sklearn.ensemble import RandomForestRegressor
error(RandomForestRegressor(n_estimators=200)) # note: tree-based models are insensitive to feature scaling
Out[111]:
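Random forests also provide a built-in feature importance measure; a sketch (it fits a fresh forest, so the ranking may vary a little between runs):
In [ ]:
rf = RandomForestRegressor(n_estimators=200).fit(train, train_Y)
pd.Series(rf.feature_importances_, index=train.columns).nlargest(10)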
In [112]:
import xgboost
reg_xgb = xgboost.XGBRegressor()
reg_xgb.fit(train, df_train["SalePrice"], verbose=False)
error(reg_xgb) # overfitting
Out[112]:
In [113]:
dtrain = xgboost.DMatrix(train, label=train_Y)
params = {"max_depth": 2, "eta": 0.1}
cv_results = xgboost.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100) # one row of CV metrics per boosting round
In [114]:
cv_results.index.names = ["Number of trees"]
cv_results.sort_values(by="test-rmse-mean")
Out[114]:
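The number of boosting rounds can be read straight off this table (a sketch, assuming the CV cell above has run); this is presumably where the n_estimators=492 used below comes from:
In [ ]:
best_round = cv_results["test-rmse-mean"].idxmin()
print(best_round, cv_results.loc[best_round, "test-rmse-mean"])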
In [115]:
submit(xgboost.XGBRegressor(n_estimators=492, max_depth=2, learning_rate=0.1)) # learning rate matching the eta used in the CV above; bad on test set though
In [116]:
from sklearn.ensemble import StackingRegressor
stacking_regressor = StackingRegressor(
    estimators=[
        ("Lasso", sklearn.linear_model.LassoCV(cv=5)),
        ("xgboost", xgboost.XGBRegressor()),
    ]) # the final estimator is RidgeCV by default
print(f"CV error: {error(stacking_regressor)}")
submit(stacking_regressor) # this is the best model I got
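Under the hood, StackingRegressor trains the base models on the full training set and fits the final estimator on their out-of-fold predictions. A sketch making the default explicit (RidgeCV is the default final estimator per the scikit-learn docs):
In [ ]:
StackingRegressor(
    estimators=[("Lasso", sklearn.linear_model.LassoCV(cv=5)),
                ("xgboost", xgboost.XGBRegressor())],
    final_estimator=sklearn.linear_model.RidgeCV(),  # equivalent to the default used above
)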
In [ ]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=3)]
# Number of features to consider at every split ('auto' was removed for regressors in recent sklearn; 1.0 is equivalent)
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
rf_random = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=random_grid,
                               n_iter=10, cv=3, n_jobs=-1)
# Fit the random search model
rf_random.fit(train, train_Y)
rf_random.best_params_
In [ ]:
error(rf_random.best_estimator_) # evaluate the best forest found, without re-running the whole search on each fold
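A common follow-up (a sketch with a hypothetical grid; in practice, center it on rf_random.best_params_) is a narrow grid search around the random-search winner:
In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {  # hypothetical neighborhood of the best random-search params
    "n_estimators": [1000, 1100, 1200],
    "max_depth": [50, 60, None],
    "min_samples_split": [2, 5],
}
rf_grid = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, n_jobs=-1)
rf_grid.fit(train, train_Y)
rf_grid.best_params_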
In [ ]:
submit(sklearn.neighbors.KNeighborsRegressor())