In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import ensemble, linear_model
In [2]:
train_df = pd.read_csv("data/train.csv").drop(columns=["Id"])
test_df = pd.read_csv("data/test.csv").set_index("Id")
features_num = train_df.select_dtypes(include=np.number).columns
features_cat = train_df.columns.difference(features_num)
features_num = features_num.drop('SalePrice')
train_df["SalePrice"].describe()
Out[2]:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
In [3]:
corr = train_df.corr().query("SalePrice > 0.5")  # most relevant features
corr[corr.index].style.background_gradient(cmap='coolwarm')
Out[3]:
  OverallQual YearBuilt YearRemodAdd TotalBsmtSF 1stFlrSF GrLivArea FullBath TotRmsAbvGrd GarageCars GarageArea SalePrice
OverallQual 1.000000 0.572323 0.550684 0.537808 0.476224 0.593007 0.550600 0.427452 0.600671 0.562022 0.790982
YearBuilt 0.572323 1.000000 0.592855 0.391452 0.281986 0.199010 0.468271 0.095589 0.537850 0.478954 0.522897
YearRemodAdd 0.550684 0.592855 1.000000 0.291066 0.240379 0.287389 0.439046 0.191740 0.420622 0.371600 0.507101
TotalBsmtSF 0.537808 0.391452 0.291066 1.000000 0.819530 0.454868 0.323722 0.285573 0.434585 0.486665 0.613581
1stFlrSF 0.476224 0.281986 0.240379 0.819530 1.000000 0.566024 0.380637 0.409516 0.439317 0.489782 0.605852
GrLivArea 0.593007 0.199010 0.287389 0.454868 0.566024 1.000000 0.630012 0.825489 0.467247 0.468997 0.708624
FullBath 0.550600 0.468271 0.439046 0.323722 0.380637 0.630012 1.000000 0.554784 0.469672 0.405656 0.560664
TotRmsAbvGrd 0.427452 0.095589 0.191740 0.285573 0.409516 0.825489 0.554784 1.000000 0.362289 0.337822 0.533723
GarageCars 0.600671 0.537850 0.420622 0.434585 0.439317 0.467247 0.469672 0.362289 1.000000 0.882475 0.640409
GarageArea 0.562022 0.478954 0.371600 0.486665 0.489782 0.468997 0.405656 0.337822 0.882475 1.000000 0.623431
SalePrice 0.790982 0.522897 0.507101 0.613581 0.605852 0.708624 0.560664 0.533723 0.640409 0.623431 1.000000
In [4]:
corr[corr.index].loc["SalePrice"].sort_values()[:-1]
Out[4]:
YearRemodAdd    0.507101
YearBuilt       0.522897
TotRmsAbvGrd    0.533723
FullBath        0.560664
1stFlrSF        0.605852
TotalBsmtSF     0.613581
GarageArea      0.623431
GarageCars      0.640409
GrLivArea       0.708624
OverallQual     0.790982
Name: SalePrice, dtype: float64

Features preprocessing

Deal with skewed data

In [5]:
print(train_df[features_num].skew().sort_values(ascending=False).to_frame().rename(columns=lambda x:"Skewness"))
features_skewed = (train_df[features_num].skew() > 0.75).index
train_df[features_skewed] = np.log1p(train_df[features_skewed]) # log transform to reduce skewness 
test_df[features_skewed] = np.log1p(test_df[features_skewed])
                Skewness
MiscVal        24.476794
PoolArea       14.828374
LotArea        12.207688
3SsnPorch      10.304342
LowQualFinSF    9.011341
KitchenAbvGr    4.488397
BsmtFinSF2      4.255261
ScreenPorch     4.122214
BsmtHalfBath    4.103403
EnclosedPorch   3.089872
MasVnrArea      2.669084
OpenPorchSF     2.364342
LotFrontage     2.163569
BsmtFinSF1      1.685503
WoodDeckSF      1.541376
TotalBsmtSF     1.524255
MSSubClass      1.407657
1stFlrSF        1.376757
GrLivArea       1.366560
BsmtUnfSF       0.920268
2ndFlrSF        0.813030
OverallCond     0.693067
TotRmsAbvGrd    0.676341
HalfBath        0.675897
Fireplaces      0.649565
BsmtFullBath    0.596067
OverallQual     0.216944
MoSold          0.212053
BedroomAbvGr    0.211790
GarageArea      0.179981
YrSold          0.096269
FullBath        0.036562
GarageCars     -0.342549
YearRemodAdd   -0.503562
YearBuilt      -0.613461
GarageYrBlt    -0.649415
In [6]:
pd.DataFrame({"price": train_df["SalePrice"], "log(price + 1)": np.log1p(train_df["SalePrice"])}) \
  .hist(figsize=(12, 4));
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])  # reduce skewness

Normalize data

In [7]:
from sklearn import preprocessing

std = preprocessing.RobustScaler()
train_df[features_num] = std.fit_transform(train_df[features_num])
test_df[features_num] = std.transform(test_df[features_num])

Fill NA

In [8]:
print(train_df[features_num].isnull().values.sum())
train_df[features_num] = train_df[features_num].fillna(train_df[features_num].mean())
print(train_df[features_num].isnull().values.sum())
test_df[features_num] = test_df[features_num].fillna(test_df[features_num].mean())
348
0

Convert categorical variables

In [9]:
train = pd.concat([train_df[features_num], pd.get_dummies(train_df[features_cat])], axis=1)
test = pd.concat([test_df[features_num], pd.get_dummies(test_df[features_cat])], axis=1)
In [10]:
col = train.columns.difference(test.columns); col  # some categorical values does not appear in the test set
Out[10]:
Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'Exterior1st_ImStucc', 'Exterior1st_Stone',
       'Exterior2nd_Other', 'GarageQual_Ex', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'MiscFeature_TenC', 'PoolQC_Fa',
       'RoofMatl_ClyTile', 'RoofMatl_Membran', 'RoofMatl_Metal',
       'RoofMatl_Roll', 'Utilities_NoSeWa'],
      dtype='object')
In [11]:
train = train.drop(columns=train.columns.difference(test.columns))  # we delete them
train_Y = train_df["SalePrice"]

Utility functions

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def scorer(estimator, X, y):
    return mean_squared_error(estimator.predict(X), y)**.5

def error(estimator, n_splits=5):
    return cross_val_score(estimator, train, train_Y, 
                           scoring=scorer, cv=n_splits).mean()

def submit(estimator):
    e = estimator.fit(train, train_df["SalePrice"])
    pd.DataFrame({"Id": test_df.index, "SalePrice": np.expm1(e.predict(test) )}) \
      .to_csv("submission.csv", index=False)
    return scorer(estimator, train, train_df["SalePrice"])

Linear regression

Simple linear regression

In [13]:
linear_model.LinearRegression()
submit(linear_model.LinearRegression())  # overflow without regularization
Out[13]:
0.09743640174538538

Ridge regression

In [14]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

alphas = [1, 3, 5, 7, 8, 9, 10, 12, 20, 50]
errors = [error(linear_model.Ridge(alpha=a)).mean() for a in alphas]
fig = px.line(pd.DataFrame({"alpha": alphas, "error": errors}).set_index("alpha"))
fig.show()