# standard python libs
import os
import random
import time
import warnings
from typing import Dict, List, Tuple # this one helps me for typing.
warnings.filterwarnings("ignore")
# importing machine learning stuff
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# SKLearn stuff
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import (
KFold,
RandomizedSearchCV,
GridSearchCV,
cross_validate,
learning_curve,
train_test_split,
)
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.svm import SVRsns.set_theme(style="whitegrid")
RANDOM_STATE = 67
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
print(f"[INFO] Random state fixed at {RANDOM_STATE}")
#
SALEPRICE_PLOT_COLOR = "steelblue"
LOGSALEPRICE_PLOT_COLOR = "darkorange"DATASET_PATH = "/kaggle/input/competitions/house-prices-advanced-regression-techniques/train.csv" df = pd.read_csv(DATASET_PATH) df.shape
(1460, 81)
display(df.head())
print(f"Shape: {df.shape}")
print(f"Columns: {len(df.columns)}\n")
print("Data types:")
display(df.dtypes.value_counts())
missing = df.isna().mean().sort_values(ascending=False)
missing_top = missing[missing > 0].head(20)
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(df['SalePrice'], kde=True, ax=axes[0], color='teal')
axes[0].set_title('SalePrice Distribution (Raw)')
if len(missing_top) > 0:
sns.barplot(x=missing_top.values, y=missing_top.index, ax=axes[1], palette='viridis', hue=missing_top.index, legend=False)
axes[1].set_title('Top Missingness Ratios')
axes[1].set_xlabel('Missing Ratio')
else:
axes[1].text(0.5, 0.5, 'No Missing Values', ha='center', va='center')
axes[1].set_axis_off()
plt.tight_layout()
plt.show()| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
object 43 int64 35 float64 3 Name: count, dtype: int64
if "Id" in df.columns:
df = df.drop(columns = ["Id"])
df["LogSalePrice"] = np.log1p(df["SalePrice"])
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.histplot(df["SalePrice"], kde=True, ax=axes[0], color=SALEPRICE_PLOT_COLOR)
axes[0].set_title("Raw SalePrice")
sns.histplot(df["LogSalePrice"], kde=True, ax=axes[1], color=LOGSALEPRICE_PLOT_COLOR)
axes[1].set_title("Log(1 + SalePrice)")
plt.tight_layout()
plt.show()def engineerFeatures(inputDf: pd.DataFrame) -> pd.DataFrame:
data = inputDf.copy()
data["TotalSF"] = 0
for col in ["TotalBsmtSF", "1stFlrSF", "2ndFlrSF"]:
if col not in data.columns:
data[col] = 0
data["TotalSF"] += data[col].fillna(0)
if {'YrSold', 'YearBuilt'}.issubset(data.columns):
data['HouseAgeAtSale'] = data['YrSold'].fillna(data['YrSold'].median()) - data['YearBuilt'].fillna(data['YearBuilt'].median())
if {'YrSold', 'YearRemodAdd'}.issubset(data.columns):
data['RemodAgeAtSale'] = data['YrSold'].fillna(data['YrSold'].median()) - data['YearRemodAdd'].fillna(data['YearRemodAdd'].median())
full_bath_cols = [c for c in ['FullBath', 'BsmtFullBath'] if c in data.columns]
half_bath_cols = [c for c in ['HalfBath', 'BsmtHalfBath'] if c in data.columns]
data['TotalBaths'] = data[full_bath_cols].fillna(0).sum(axis=1) + 0.5 * data[half_bath_cols].fillna(0).sum(axis=1)
if 'GarageArea' in data.columns:
data['HasGarage'] = (data['GarageArea'].fillna(0) > 0).astype(int)
if 'TotalBsmtSF' in data.columns:
data['HasBsmt'] = (data['TotalBsmtSF'].fillna(0) > 0).astype(int)
return data
x = df.drop(columns=['SalePrice', 'LogSalePrice'])
y = df['LogSalePrice']
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE)
print(f"Train shape: {xTrain.shape}, Test shape: {xTest.shape}")VIF_THRESHOLD = 10.0
MAX_DROPS = 15
def computeVIF(numericDF: pd.DataFrame) -> pd.DataFrame:
work = numericDF.select_dtypes(include=[np.number]).copy()
work = work.replace([np.inf, -np.inf], np.nan)
# Median imputation per column; if a column is fully missing, fallback to 0.
for col in work.columns:
colMedian = work[col].median()
if pd.isna(colMedian):
colMedian = 0.0
work[col] = work[col].fillna(colMedian)
vals = []
cols = list(work.columns)
for col in cols:
yCol = work[col]
xCols = work.drop(columns=[col])
if xCols.shape[1] == 0:
vals.append((col, 1.0))
continue
model = LinearRegression()
model.fit(xCols, yCol)
r2 = model.score(xCols, yCol)
vif = 1.0 / max(1e-8, (1.0 - r2))
vals.append((col, float(vif)))
return pd.DataFrame(vals, columns=["feature", "vif"]).sort_values('vif', ascending=False) if vals else pd.DataFrame(vals, columns=["feature", "vif"])
XTrainFe = engineerFeatures(xTrain)
numericVIFCols = XTrainFe.select_dtypes(include=[np.number]).columns.tolist()
vifDF = XTrainFe[numericVIFCols].copy()
droppedMulticollinear: List[str] = []
for _ in range(MAX_DROPS):
currentVIF = computeVIF(vifDF)
if currentVIF.empty:
break
worst = currentVIF.iloc[0]
if worst['vif'] <= VIF_THRESHOLD:
break
dropFeature = worst["feature"]
droppedMulticollinear.append(dropFeature)
vifDF = vifDF.drop(columns=[dropFeature], errors="ignore")
finalVIF = computeVIF(vifDF)
finalVIFHead = finalVIF.head(15)
print("Dropped due to multicollinearity:")
print(droppedMulticollinear if droppedMulticollinear else "None")
display(finalVIFHead)
plt.figure(figsize=(10, 6))
sns.barplot(data=finalVIFHead, x="vif", y="feature", palette='mako', hue="feature", legend=False)
plt.xlabel("VIF (Variance Inflation Factor)")
plt.ylabel("Feature")
plt.title("Top Remaining VIF Values (Plot Mitigation)")
plt.tight_layout()
plt.show()| feature | vif | |
|---|---|---|
| 8 | 1stFlrSF | 6.300462 |
| 19 | GarageCars | 6.245212 |
| 20 | GarageArea | 5.906320 |
| 9 | 2ndFlrSF | 5.772305 |
| 30 | HouseAgeAtSale | 5.526510 |
| 6 | BsmtFinSF1 | 5.047759 |
| 16 | TotRmsAbvGrd | 5.036058 |
| 7 | BsmtUnfSF | 4.470887 |
| 18 | GarageYrBlt | 4.313016 |
| 3 | OverallQual | 3.456460 |
| 12 | FullBath | 3.039315 |
| 31 | RemodAgeAtSale | 2.435597 |
| 14 | BedroomAbvGr | 2.343514 |
| 32 | HasGarage | 2.284594 |
| 13 | HalfBath | 2.160178 |
def dropSelectedColumns(data: pd.DataFrame, targets: List[str]) -> pd.DataFrame:
return data.drop(targets, errors="ignore")
xTrainPrepared = dropSelectedColumns(engineerFeatures(xTrain), droppedMulticollinear)
numFeatures = xTrainPrepared.select_dtypes(include=[np.number]).columns.tolist()
catFeatures = xTrainPrepared.select_dtypes(exclude=[np.number]).columns.tolist()
numericTransformLinear = Pipeline(steps=[
("imputer", SimpleImputer(strategy="median"))
])
numericTransformScaled = Pipeline(steps=[
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
])
categoricalTransform = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessorLinear = ColumnTransformer(transformers=[
("num", numericTransformLinear, numFeatures),
("cat", categoricalTransform, catFeatures)
])
preprocessorScaled = ColumnTransformer(transformers=[
('num', numericTransformScaled, numFeatures),
('cat', categoricalTransform, catFeatures),
])
featureEngineeringStep = FunctionTransformer(engineerFeatures, validate=False)
dropMulticolStep = FunctionTransformer(lambda d: dropSelectedColumns(d, droppedMulticollinear), validate=False)
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
print(f'Numeric features: {len(numFeatures)}')
print(f'Categorical features: {len(catFeatures)}')Pipeline and are tuned with CV search.def rmse(yTrue, yPred):
return np.sqrt(mean_squared_error(yTrue, yPred))
pipeLR = Pipeline(steps=[
("feat", featureEngineeringStep),
("drop_multi", dropMulticolStep),
("prep", preprocessorLinear),
("model", LinearRegression())
])
pipeSVR = Pipeline(steps=[
("feat", featureEngineeringStep),
("drop_multi", dropMulticolStep),
("prep", preprocessorScaled),
("model", SVR())
])
pipeANN = Pipeline(steps=[
("feat", featureEngineeringStep),
("drop_multi", dropMulticolStep),
("prep", preprocessorLinear),
("model", MLPRegressor(
random_state = RANDOM_STATE,
max_iter = 2000,
early_stopping = True,
validation_fraction=0.1
))
])
gridLR = {
'model__fit_intercept': [True, False],
'model__positive': [False, True],
}
randSVR = {
'model__kernel': ['rbf'],
'model__C': np.logspace(-1, 2.5, 30),
'model__epsilon': np.linspace(0.01, 0.5, 30),
'model__gamma': ['scale', 'auto'],
}
randANN = {
'model__hidden_layer_sizes': [(64,), (128,), (64, 32), (128, 64)],
'model__activation': ['relu', 'tanh'],
'model__alpha': np.logspace(-6, -2, 20),
'model__learning_rate_init': np.logspace(-4, -2, 20),
}
searches = {
"Linear Regression": GridSearchCV(
estimator= pipeLR,
param_grid= gridLR,
scoring= "neg_root_mean_squared_error",
cv= cv,
n_jobs= 1,
return_train_score= True
),
"SVR": RandomizedSearchCV(
estimator= pipeSVR,
param_distributions= randSVR,
n_iter= 25,
random_state= RANDOM_STATE,
scoring= "neg_root_mean_squared_error",
cv= cv,
n_jobs= 1,
return_train_score= True
),
"ANN": RandomizedSearchCV(
estimator= pipeANN,
param_distributions= randANN,
n_iter= 25,
random_state= RANDOM_STATE,
scoring= "neg_root_mean_squared_error",
cv= cv,
n_jobs= 1,
return_train_score= True
)
}
bestModels = {}
searchSummaries = []
for modelName, search in searches.items():
print(f"\nTraining {modelName}...")
start= time.perf_counter()
search.fit(xTrain, yTrain)
elapsed= time.perf_counter() - start
bestModels[modelName] = search.best_estimator_
searchSummaries.append({
"model": modelName,
"bestCVRmse": -search.best_score_,
"bestTimeSec": elapsed,
"MeanFitTimeBest": search.cv_results_['mean_fit_time'][search.best_index_],
"bestParams": search.best_params_
})
summaryDF = pd.DataFrame(searchSummaries).sort_values("bestCVRmse")
display(summaryDF[["model", "bestCVRmse", "bestTimeSec", "MeanFitTimeBest"]])
summaryDF| model | bestCVRmse | bestTimeSec | MeanFitTimeBest | |
|---|---|---|---|---|
| 1 | SVR | 0.136272 | 20.969226 | 0.102099 |
| 0 | Linear Regression | 0.182306 | 2.835429 | 0.061095 |
| 2 | ANN | 0.319546 | 377.929576 | 2.057789 |
| model | bestCVRmse | bestTimeSec | MeanFitTimeBest | bestParams | |
|---|---|---|---|---|---|
| 1 | SVR | 0.136272 | 20.969226 | 0.102099 | {'model__kernel': 'rbf', 'model__gamma': 'auto... |
| 0 | Linear Regression | 0.182306 | 2.835429 | 0.061095 | {'model__fit_intercept': True, 'model__positiv... |
| 2 | ANN | 0.319546 | 377.929576 | 2.057789 | {'model__learning_rate_init': 0.00233572146909... |
evaluationRows = []
foldScores = {}
for model_name, model in bestModels.items():
fitStart = time.perf_counter()
model.fit(xTrain, yTrain)
fitTime = time.perf_counter() - fitStart
predStart = time.perf_counter()
predTest = model.predict(xTest)
predTime = time.perf_counter() - predStart
cvResult = cross_validate(
model,
xTrain,
yTrain,
cv=cv,
scoring="neg_root_mean_squared_error",
return_train_score=True,
n_jobs=1
)
trainRmse = -cvResult["train_score"]
valRmse = -cvResult["test_score"]
foldScores[model_name] = valRmse
evaluationRows.append({
'model': model_name,
'testRmse': rmse(yTest, predTest),
'testMae': mean_absolute_error(yTest, predTest),
'testR2': r2_score(yTest, predTest),
'cvTrainRmseMean': trainRmse.mean(),
'cvValRmseMean': valRmse.mean(),
'biasVarianceGap': valRmse.mean() - trainRmse.mean(),
'fitTimeSec': fitTime,
'predTimeSec': predTime,
})
evalDF = pd.DataFrame(evaluationRows).sort_values("testRmse")
display(evalDF)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.barplot(data=evalDF, x="model", y="testRmse", ax=axes[0], palette="crest", hue="testRmse", legend=True)
axes[0].set_title("Test RMSE (lower is better)")
axes[0].set_xlabel("Model")
axes[0].set_ylabel("Test RMSE")
axes[0].tick_params(axis="x", rotation=15)
sns.barplot(data=evalDF, x="model", y="biasVarianceGap", ax=axes[1], palette="flare", hue="biasVarianceGap", legend=True)
axes[1].axhline(0, linestyle="--", color="black", linewidth=1)
axes[1].set_title("Bias-Variance Gap: CV Val RMSE - CV Train RMSE")
axes[1].set_xlabel("Model")
axes[1].set_ylabel("Bias-Variance Gap")
axes[1].tick_params(axis="x", rotation=15)
plt.tight_layout()
plt.show()| model | testRmse | testMae | testR2 | cvTrainRmseMean | cvValRmseMean | biasVarianceGap | fitTimeSec | predTimeSec | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | SVR | 0.106811 | 0.081612 | 0.918196 | 0.104451 | 0.136272 | 0.031822 | 0.195910 | 0.025118 |
| 0 | Linear Regression | 0.133803 | 0.091687 | 0.871627 | 0.089362 | 0.182306 | 0.092944 | 0.063651 | 0.022782 |
| 2 | ANN | 0.268396 | 0.199568 | 0.483471 | 0.303049 | 0.319546 | 0.016497 | 1.612850 | 0.034098 |
modelNames = list(foldScores.keys())
stateRows = []
for i in range(len(modelNames)):
for j in range(i + 1, len(modelNames)):
a = modelNames[i]
b = modelNames[j]
tStat, pVal = stats.ttest_rel(foldScores[a], foldScores[b])
stateRows.append({
"modelA": a,
"modelB": b,
"meanRmseA": np.mean(foldScores[a]),
"meanRmseB": np.mean(foldScores[b]),
"tStat": tStat,
"pValue": pVal,
})
statsDF = pd.DataFrame(stateRows).sort_values("pValue")
display(statsDF)
print('Interpretation tip: pValue < 0.05 suggests statistically significant fold-level performance difference.')| modelA | modelB | meanRmseA | meanRmseB | tStat | pValue | |
|---|---|---|---|---|---|---|
| 2 | SVR | ANN | 0.136272 | 0.319546 | -15.571130 | 0.000099 |
| 1 | Linear Regression | ANN | 0.182306 | 0.319546 | -4.834344 | 0.008435 |
| 0 | Linear Regression | SVR | 0.182306 | 0.136272 | 2.314003 | 0.081672 |
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
trainSizes = np.linspace(0.2, 1.0, 5)
for ax, (model_name, model) in zip(axes, bestModels.items()):
sizes, trainScores, valScores = learning_curve(
model,
xTrain,
yTrain,
cv=cv,
train_sizes=trainSizes,
scoring="neg_root_mean_squared_error",
n_jobs = 1
)
trainRmse = -trainScores.mean(axis=1)
valRmse = -valScores.mean(axis=1)
ax.plot(sizes, trainRmse, marker="o", label="Train RMSE")
ax.plot(sizes, valRmse, marker="s", label="CV RMSE")
ax.set_title(f"{model_name} learning curve")
ax.set_xlabel("Training Samples")
ax.grid(alpha=0.3)
axes[0].set_ylabel("RMSE")
axes[0].legend()
plt.tight_layout()
plt.show()def getFeatureNamesFromPreprocessor(preprocessor: ColumnTransformer) -> np.ndarray:
return preprocessor.get_feature_names_out()
impactFrames = []
# Linear coefs:
LRModel = bestModels["Linear Regression"]
LRPrep = LRModel.named_steps["prep"]
LREst = LRModel.named_steps["model"]
LRFeatureNames = getFeatureNamesFromPreprocessor(LRPrep)
coefDF = pd.DataFrame({
"feature": LRFeatureNames,
"impact": LREst.coef_
})
coefDF["absImpact"] = coefDF["impact"].abs()
coefTop = coefDF.sort_values("absImpact", ascending=False).head(20)
coefTop["model"] = "Linear Regression (coefficient)"
impactFrames.append(coefTop[["model", "feature", "impact", "absImpact"]])
# Permutation importance for non linear models
for modelName in ["SVR", "ANN"]:
mdl = bestModels[modelName]
pi = permutation_importance(
mdl,
xTest,
yTest,
n_repeats=10,
random_state=RANDOM_STATE,
scoring="neg_root_mean_squared_error",
n_jobs=1
)
piDF = pd.DataFrame({
"feature": xTest.columns,
"impact": pi.importances_mean
})
piDF["absImpact"] = piDF["impact"].abs()
piTop = piDF.sort_values("absImpact", ascending=False).head(20)
piTop["model"] = f"{modelName} (permutation)"
impactFrames.append(piTop[["model", "feature", "impact", "absImpact"]])
impactDF = pd.concat(impactFrames, ignore_index=True)
display(impactDF.head(50))
fig, axes = plt.subplots(1, 3, figsize=(18, 8))
for ax, mdl in zip(axes, impactDF["model"].unique()):
sub = impactDF[impactDF["model"] == mdl].sort_values("absImpact", ascending=True).tail(15)
sns.barplot(data=sub, x="absImpact", y="feature", ax=ax, palette="cubehelix", hue="feature", legend=False)
ax.set_xlabel("Absolute Impact")
ax.set_ylabel("Feature")
ax.set_title(mdl)
plt.tight_layout()
plt.show()| model | feature | impact | absImpact | |
|---|---|---|---|---|
| 0 | Linear Regression (coefficient) | cat__RoofMatl_ClyTile | -2.258783 | 2.258783 |
| 1 | Linear Regression (coefficient) | cat__RoofMatl_Membran | 0.812536 | 0.812536 |
| 2 | Linear Regression (coefficient) | cat__RoofMatl_Metal | 0.640716 | 0.640716 |
| 3 | Linear Regression (coefficient) | cat__Condition2_PosN | -0.617063 | 0.617063 |
| 4 | Linear Regression (coefficient) | cat__Utilities_AllPub | 0.589445 | 0.589445 |
| 5 | Linear Regression (coefficient) | cat__GarageQual_Ex | 0.499449 | 0.499449 |
| 6 | Linear Regression (coefficient) | cat__BsmtCond_Po | 0.496187 | 0.496187 |
| 7 | Linear Regression (coefficient) | cat__Condition2_PosA | 0.494142 | 0.494142 |
| 8 | Linear Regression (coefficient) | cat__CentralAir_Y | 0.486507 | 0.486507 |
| 9 | Linear Regression (coefficient) | cat__Street_Pave | 0.478744 | 0.478744 |
| 10 | Linear Regression (coefficient) | cat__Alley_Pave | 0.465652 | 0.465652 |
| 11 | Linear Regression (coefficient) | cat__Alley_Grvl | 0.436322 | 0.436322 |
| 12 | Linear Regression (coefficient) | cat__Street_Grvl | 0.423230 | 0.423230 |
| 13 | Linear Regression (coefficient) | cat__CentralAir_N | 0.415467 | 0.415467 |
| 14 | Linear Regression (coefficient) | cat__LandSlope_Mod | 0.395719 | 0.395719 |
| 15 | Linear Regression (coefficient) | cat__Condition2_Feedr | 0.393939 | 0.393939 |
| 16 | Linear Regression (coefficient) | cat__GarageCond_Po | 0.384189 | 0.384189 |
| 17 | Linear Regression (coefficient) | cat__RoofMatl_WdShngl | 0.382020 | 0.382020 |
| 18 | Linear Regression (coefficient) | cat__RoofMatl_Roll | 0.379270 | 0.379270 |
| 19 | Linear Regression (coefficient) | cat__MiscFeature_Othr | 0.362884 | 0.362884 |
| 20 | SVR (permutation) | 2ndFlrSF | 0.031193 | 0.031193 |
| 21 | SVR (permutation) | OverallQual | 0.029872 | 0.029872 |
| 22 | SVR (permutation) | OverallCond | 0.021233 | 0.021233 |
| 23 | SVR (permutation) | GrLivArea | 0.019280 | 0.019280 |
| 24 | SVR (permutation) | TotalBsmtSF | 0.015383 | 0.015383 |
| 25 | SVR (permutation) | YearBuilt | 0.013890 | 0.013890 |
| 26 | SVR (permutation) | LotArea | 0.013328 | 0.013328 |
| 27 | SVR (permutation) | 1stFlrSF | 0.012957 | 0.012957 |
| 28 | SVR (permutation) | BsmtFinSF1 | 0.006109 | 0.006109 |
| 29 | SVR (permutation) | GarageArea | 0.005968 | 0.005968 |
| 30 | SVR (permutation) | Fireplaces | 0.005531 | 0.005531 |
| 31 | SVR (permutation) | TotRmsAbvGrd | 0.003590 | 0.003590 |
| 32 | SVR (permutation) | Neighborhood | 0.003291 | 0.003291 |
| 33 | SVR (permutation) | MSZoning | 0.003248 | 0.003248 |
| 34 | SVR (permutation) | MiscVal | 0.003209 | 0.003209 |
| 35 | SVR (permutation) | YearRemodAdd | 0.002899 | 0.002899 |
| 36 | SVR (permutation) | FullBath | 0.002764 | 0.002764 |
| 37 | SVR (permutation) | GarageCars | 0.002282 | 0.002282 |
| 38 | SVR (permutation) | HalfBath | 0.002265 | 0.002265 |
| 39 | SVR (permutation) | PoolArea | 0.001972 | 0.001972 |
| 40 | ANN (permutation) | 2ndFlrSF | 0.038569 | 0.038569 |
| 41 | ANN (permutation) | LotArea | 0.034728 | 0.034728 |
| 42 | ANN (permutation) | TotalBsmtSF | 0.025261 | 0.025261 |
| 43 | ANN (permutation) | BsmtFinSF1 | 0.013872 | 0.013872 |
| 44 | ANN (permutation) | GrLivArea | 0.012307 | 0.012307 |
| 45 | ANN (permutation) | GarageArea | 0.005593 | 0.005593 |
| 46 | ANN (permutation) | MasVnrArea | 0.003570 | 0.003570 |
| 47 | ANN (permutation) | 1stFlrSF | 0.003274 | 0.003274 |
| 48 | ANN (permutation) | YearBuilt | 0.002667 | 0.002667 |
| 49 | ANN (permutation) | BsmtUnfSF | 0.001814 | 0.001814 |
compareDF = evalDF.merge(summaryDF[['model', 'bestTimeSec']], on='model', how='left')
display(compareDF.sort_values('testRmse'))
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
sns.barplot(data=compareDF, x='model', y='testRmse', ax=axes[0], palette='Blues')
axes[0].set_title('Test RMSE')
axes[0].set_xlabel("Model")
axes[0].set_ylabel("Test RMSE")
axes[0].tick_params(axis='x', rotation=15)
sns.barplot(data=compareDF, x='model', y='testR2', ax=axes[1], palette='Greens')
axes[1].set_title('Test $R^2$')
axes[0].set_xlabel("Model")
axes[1].set_ylabel("Test R2")
axes[1].tick_params(axis='x', rotation=15)
sns.barplot(data=compareDF, x='model', y='bestTimeSec', ax=axes[2], palette='Reds')
axes[2].set_title('Hyperparameter Search Time (sec)')
axes[0].set_xlabel("Model")
axes[1].set_ylabel("Search Time")
axes[2].tick_params(axis='x', rotation=15)
plt.tight_layout()
plt.show()| model | testRmse | testMae | testR2 | cvTrainRmseMean | cvValRmseMean | biasVarianceGap | fitTimeSec | predTimeSec | bestTimeSec | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SVR | 0.106811 | 0.081612 | 0.918196 | 0.104451 | 0.136272 | 0.031822 | 0.195910 | 0.025118 | 20.969226 |
| 1 | Linear Regression | 0.133803 | 0.091687 | 0.871627 | 0.089362 | 0.182306 | 0.092944 | 0.063651 | 0.022782 | 2.835429 |
| 2 | ANN | 0.268396 | 0.199568 | 0.483471 | 0.303049 | 0.319546 | 0.016497 | 1.612850 | 0.034098 | 377.929576 |
RMSE = 0.1068, MAE = 0.0816, R^2 = 0.9182)RMSE = 0.2684, R^2 = 0.4835) and did not generalize as well as the other models.cvValRMSE - cvTrainRMSE = 0.0318), suggesting a good balance between fit and generalization.0.0929), indicating greater variance/overfitting risk relative to SVR.0.0165) but high absolute error, which is consistent with underfitting (high bias) in the current search space.p = 0.000099).p = 0.008435).alpha = 0.05 (p = 0.081672), though SVR is better in all main holdout metrics.2ndFlrSF, OverallQual, OverallCond, GrLivArea, TotalBsmtSF, LotArea, YearBuilt, and 1stFlrSF.bestTimeSec ~ 2.79s) and predict, making it a strong lightweight baseline.bestTimeSec ~ 20.10s), representing the best accuracy-cost compromise.bestTimeSec ~ 366.38s) while also giving the weakest performance in this setup.TEST_DATASET_PATH = "/kaggle/input/competitions/house-prices-advanced-regression-techniques/train.csv"
testDF = pd.read_csv(TEST_DATASET_PATH)
submissionIds = testDF["Id"].copy()
xProd = testDF.drop(columns=["Id"])
svrModel = bestModels["SVR"]
predLog = svrModel.predict(xProd)
predPrice = np.expm1(predLog)
submissionDF = pd.DataFrame({
"Id": submissionIds,
"SalePrice": predPrice
})
submissionDF.to_csv("/kaggle/working/submission_svr.csv", index=False)