House Prices - Advanced Regression Techniques

This notebook comes from my personal work on a Kaggle competition

Data exploration

import numpy as np
import pandas as pd
import sklearn

df_train = pd.read_csv("data/train.csv").drop(columns=["Id"])
df_test = pd.read_csv("data/test.csv").set_index("Id")
features_num = df_train.select_dtypes(include=np.number).columns
features_cat = df_train.columns.difference(features_num)
features_num = features_num.drop('SalePrice')

corr = df_train.corr().query("SalePrice > 0.5")  # most relevant features
corr.loc[["SalePrice"], corr.index].sort_values(by="SalePrice", axis=1, ascending=False)

df_train[corr.index].hist(bins=30, figsize=(20, 10));

Features preprocessing

Deal with skewed data

The SalePrice histogram looks right-skewed. Indeed, the Shapiro-Wilk rejects the null hypothesis that the data is normally distributed:

import scipy 

scipy.stats.shapiro(df_train["SalePrice"]) # p-value < 0.05

ShapiroResult(statistic=0.869671642780304, pvalue=3.206247534576162e-33)

print(df_train[features_num].skew().sort_values(ascending=False).to_frame().rename(columns=lambda x:"Skewness")[:5])
features_skewed = (df_train[features_num].skew() > 0.75).index
df_train[features_skewed] = np.log1p(df_train[features_skewed]) # log transform to reduce skewness 
df_test[features_skewed] = np.log1p(df_test[features_skewed])

               Skewness
MiscVal       24.476794
PoolArea      14.828374
LotArea       12.207688
3SsnPorch     10.304342
LowQualFinSF   9.011341

pd.DataFrame({"price": df_train["SalePrice"], "log(price + 1)": np.log1p(df_train["SalePrice"])}) \
  .hist(figsize=(12, 4));
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])  # reduce skewness

Normalize data

from sklearn import preprocessing

std = preprocessing.RobustScaler()
df_train[features_num] = std.fit_transform(df_train[features_num])
df_test[features_num] = std.transform(df_test[features_num])

Fill NA

print(df_train[features_num].isnull().values.sum())
df_train[features_num] = df_train[features_num].fillna(df_train[features_num].mean())
print(df_train[features_num].isnull().values.sum())
df_test[features_num] = df_test[features_num].fillna(df_test[features_num].mean())

348
0

Convert categorical variables

train = pd.concat([df_train[features_num], pd.get_dummies(df_train[features_cat])], axis=1)
test = pd.concat([df_test[features_num], pd.get_dummies(df_test[features_cat])], axis=1)

col = train.columns.difference(test.columns); col  # some categorical values does not appear in the test set

Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'Exterior1st_ImStucc', 'Exterior1st_Stone',
       'Exterior2nd_Other', 'GarageQual_Ex', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'MiscFeature_TenC', 'PoolQC_Fa',
       'RoofMatl_ClyTile', 'RoofMatl_Membran', 'RoofMatl_Metal',
       'RoofMatl_Roll', 'Utilities_NoSeWa'],
      dtype='object')

test[train.columns.difference(test.columns)] = 0
train_Y = df_train["SalePrice"]

Utility functions

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def scorer(estimator, X, y):
    return mean_squared_error(estimator.predict(X), y)**.5

def error(estimator, n_splits=5):
    return cross_val_score(estimator, train, train_Y, 
                           scoring=scorer, cv=n_splits).mean()

def submit(estimator):
    e = estimator.fit(train, df_train["SalePrice"])
    pd.DataFrame({"Id": df_test.index, "SalePrice": np.expm1(e.predict(test) )}) \
      .to_csv("submission.csv", index=False)
    error = scorer(estimator, train, df_train["SalePrice"])
    print(f"Error on train set: {error}")

Linear regression

Simple linear regression

print(f"CV Error: {error(sklearn.linear_model.LinearRegression())}")
submit(sklearn.linear_model.LinearRegression())  # overfitting a lot

CV Error: 298646942.7842525
Error on train set: 0.09243823419181545

Ridge regression

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

alphas = [1, 3, 5, 7, 8, 9, 10, 12, 20, 50]
errors = [error(sklearn.linear_model.Ridge(alpha=a)).mean() for a in alphas]
fig = px.line(pd.DataFrame({"alpha": alphas, "error": errors}).set_index("alpha"))
fig.show()

We also could have used linear_model.RidgeCV to search for the best params of a ridge regression:

ridgeCV = sklearn.linear_model.RidgeCV(alphas=np.linspace(15, 25, 10), cv=5) \
                      .fit(train, train_Y)
print(f"Alpha: {ridgeCV.alpha_}")
error(ridgeCV)

Alpha: 19.444444444444443

0.1272122750809639

submit(sklearn.linear_model.Ridge(alpha=19.44))

Error on train set: 0.1090730854198304

Lasso

lassoCV = sklearn.linear_model.LassoCV(cv=5) \
                      .fit(train, train_Y)
print(f"CV error: {error(lassoCV)}")
submit(sklearn.linear_model.LassoCV(cv=5))

CV error: 0.12371922010918017
Error on train set: 0.10994273586695323

Let's see the relative importance of each feature in the Lasso regression by looking at its coefficient:

coef = pd.DataFrame({"Coefficient": lassoCV.coef_, "Feature": train.columns}) \
         .sort_values(by="Coefficient")
px.bar(pd.concat([coef.head(10), coef.tail(10)]), x="Coefficient", y="Feature", orientation='h').show()

Random forests

from sklearn.ensemble import RandomForestRegressor

error(RandomForestRegressor(n_estimators=200))  # note: standardization is not useful on RF

0.14254555961172075

Gradient boosted trees

import xgboost
reg_xgb = xgboost.XGBRegressor()
reg_xgb.fit(train, df_train["SalePrice"], verbose=False)
error(reg_xgb)  # overfitting

0.1390387488205515

dtrain = xgboost.DMatrix(train, label=train_Y)
params = {"max_depth":2, "eta":0.1}
model = xgboost.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)

model.index.names=["Number of trees"]
model.sort_values(by="test-rmse-mean")

submit(xgboost.XGBRegressor(n_estimators=492, max_depth=2, learning_rate=1)) # bad on test set though

Error on train set: 0.015154050659435873

Stacking regressors

Lasso and Ridge are good on train and are not overfitting much
Xgboost is excellent on train but is overfitting

Let's try to stack them:

from sklearn.ensemble import StackingRegressor

stacking_regressor = StackingRegressor(
    estimators=[
        ("Lasso", sklearn.linear_model.LassoCV(cv=5)),
        ("xgboost", xgboost.XGBRegressor()), 
    ])  # final estimator is RidgeCV by default
print(f"CV error: {error(stacking_regressor)}")
submit(stacking_regressor) # this is the best model I got

CV error: 0.12013848127162992
Error on train set: 0.07290394083360184

Randomized Search

Trying to optimize parameters of RF:

from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)]
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = random_grid, n_iter = 10, cv = 3, n_jobs = -1)
# Fit the random search model
rf_random.fit(train, df_train["SalePrice"])
rf_random.best_params_

error(rf_random)

KNN

submit(sklearn.neighbors.KNeighborsRegressor())

	train-rmse-mean	train-rmse-std	test-rmse-mean	test-rmse-std
Number of trees
492	0.063138	0.002819	0.123247	0.011814
486	0.063448	0.002786	0.123251	0.011719
493	0.063082	0.002799	0.123252	0.011801
491	0.063185	0.002812	0.123263	0.011798
494	0.063053	0.002805	0.123269	0.011805
...	...	...	...	...
4	6.820173	0.002320	6.820488	0.007688
3	7.574888	0.002511	7.575220	0.007952
2	8.413392	0.002711	8.413386	0.007926
1	9.345149	0.002914	9.345143	0.007585
0	10.380516	0.003151	10.380511	0.007226

Notebooks