reverting to baseline performance which was just filling area with zero

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-04 19:38:18 +01:00
parent 68e903e492
commit 0b41057db8

View file

@ -11,13 +11,16 @@ from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm
with open("all_data.pkl", "rb") as f:
all_data = pickle.load(f)
class SapModel:
# We want to estimate for making improvements on different property components
RESPONSE = "environment-impact-current"
RESPONSE = "current-energy-efficiency"
# We could potentially build models by constituency to avoid having too many
# features in the model
BASE_FEATURES = [
@ -50,6 +53,12 @@ class SapModel:
# "lighting-description" # Might not need to use this
"low-energy-lighting",
"number-open-fireplaces",
"mainheatcont-description",
"fixed-lighting-outlets-count",
"floor-height",
"floor-level",
"total-floor-area",
"extension-count",
]
CATEGORICAL_COLS = [
@ -69,8 +78,17 @@ class SapModel:
"glazed-type",
"glazed-area",
"construction-age-band",
# Testing
"lighting-description"
"lighting-description",
"mainheatcont-description",
"floor-level",
]
NUMERICAL_COLUMNS = [
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
"fixed-lighting-outlets-count",
"floor-height",
"total-floor-area",
"extension-count",
]
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
@ -145,14 +163,82 @@ class SapModel:
model_data = model_data.drop(columns=["transaction-type"])
return model_data
@staticmethod
def _clean_numericals(model_data):
def _clean_numericals(self, model_data):
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
# Try binning numericals
def bucket_and_fill(df, column_name, n_bins=10):
# Check if the column is numerical
if np.issubdtype(df[column_name].dtype, np.number):
# Create a new categorical column from numerical one by binning the data
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
# Replace missing data with "NO_RECORD"
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
df[column_name + "_bucket"] = np.where(
df[column_name + "_bucket"] == "nan",
"NO_RECORD",
df[column_name + "_bucket"]
)
return df
bucket_variables = []
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables]
for col in bucket_variables:
model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
model_data = bucket_and_fill(model_data, col)
# Replace the data with the binned version
model_data = model_data.drop(columns=bucket_variables)
model_data = model_data.rename(
columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables))
)
for col in remaining_numericals:
model_data[col] = np.where(
model_data[col] == "", "0", model_data[col]
).astype(float)
# Basic fill the rest of the columns with 0
# grouping_variables = ["constituency", "built-form", "property-type"]
# secondary_grouping_variables = ["constituency", "property-type"]
#
# def append_missings(model_data, col, grouping_variables, secondary=False):
# if not secondary:
# numerical_data = model_data[(model_data[col] != "")].copy()
# else:
# numerical_data = model_data[~pd.isnull(model_data[col])].copy()
#
# numerical_data[col] = numerical_data[col].astype(float)
# numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index()
# numerical_data = numerical_data.rename(columns={col: f"{col}_avg"})
# model_data = model_data.merge(numerical_data, how="left", on=grouping_variables)
#
# if not secondary:
# model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col])
# else:
# model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col])
#
# model_data = model_data.drop(columns=[f"{col}_avg"])
# model_data[col] = model_data[col].astype(float)
#
# return model_data
#
# for col in numerical_cols:
# model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
# model_data[col] = model_data[col].fillna(model_data[col].mean())
# model_data = append_missings(model_data, col, grouping_variables)
#
# if pd.isnull(model_data[col]).sum():
# model_data = append_missings(model_data, col, secondary_grouping_variables, True)
#
# if pd.isnull(model_data[col]).sum():
# raise Exception("Cleaning failed, missing values remain")
#
# model_data[col] = np.where(
# model_data[col] == "", "0", model_data[col]
# ).astype(float)
return model_data
@staticmethod
@ -212,7 +298,7 @@ class SapModel:
features = [
x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx"
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap"
] if x not in exclude_features
]
@ -251,7 +337,9 @@ class SapModel:
def fit_model(self):
# Dummy out the categorical variables
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True)
binned = []
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True)
# Convert booleans to integer
for col in x.columns:
@ -276,16 +364,6 @@ class SapModel:
train_x = train_x.drop(columns=["idx"])
test_x = test_x.drop(columns=["idx"])
# importance_df = self.make_importance(train_x)
# Test dropping the least important features
# to_drop = importance_df.tail(2)["Feature"].values
# Dropping this is a good idea
to_drop = [
"hotwater-description_Electric immersion, off-peak",
]
train_x = train_x.drop(columns=to_drop)
test_x = test_x[train_x.columns]
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
@ -342,13 +420,13 @@ class SapModel:
).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")
# temp hardcoded values
best_fit = {'MAPE': 0.042824355225087686, 'Mean Squared Error': 21.49263731368226,
'Mean Absolute Error': 3.298755911054327, 'R2 Score': 0.794118580154128,
'Explained Variance Score': 0.794118580154128, 'Median Absolute Error': 2.426789554039914}
best_fit = {'MAPE': 0.04617542805587113, 'Mean Squared Error': 18.62306128026334,
'Mean Absolute Error': 2.865262003625814, 'R2 Score': 0.8008316762496143,
'Explained Variance Score': 0.8008316762496143, 'Median Absolute Error': 1.911197425417548}
best_predict = {'MAPE': 0.04413439429441669, 'Mean Squared Error': 22.700373062051142,
'Mean Absolute Error': 3.3961241443022008, 'R2 Score': 0.750296045867001,
'Explained Variance Score': 0.7503518147827141, 'Median Absolute Error': 2.4442017110145855}
best_predict = {'MAPE': 0.04358926901734807, 'Mean Squared Error': 21.197491698961528,
'Mean Absolute Error': 3.046853690257838, 'R2 Score': 0.7215087343364782,
'Explained Variance Score': 0.7215726927575035, 'Median Absolute Error': 1.921094388694634}
def check_successes(experiment_error, best_error):
@ -485,38 +563,7 @@ class SapModel:
worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
worst_x = worst_x.sort_values("Absolute Residual", ascending=False)
def make_importance(self, train_x):
# Create a DMatrix from your training data
dtrain = xgb.DMatrix(train_x, label=self.train_y)
# Set the parameters for the XGBoost model
params = {
'objective': 'reg:squarederror',
'eval_metric': 'rmse'
}
# Train the XGBoost model
model = xgb.train(params, dtrain)
# Get feature importance scores
importance_scores = model.get_score(importance_type='gain')
# Create a dataframe with feature names and importance scores
importance_df = pd.DataFrame({
'Feature': importance_scores.keys(),
'Importance': importance_scores.values()
})
# Sort the dataframe by importance score in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)
# Print the feature importances
return importance_df
def detect_multi_collinearity(self):
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm
# Get the VIFs for each variable
vifs = pd.DataFrame()
vifs["features"] = self.train_x.columns
@ -616,14 +663,14 @@ class SapModel:
Returns:
dict: Dictionary containing the calculated metrics.
"""
metrics = {}
metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
metrics['R2 Score'] = r2_score(y_true, y_pred)
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
metrics = {
'MAPE': mean_absolute_percentage_error(y_true, y_pred),
'Mean Squared Error': mean_squared_error(y_true, y_pred),
'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
'R2 Score': r2_score(y_true, y_pred),
'Explained Variance Score': explained_variance_score(y_true, y_pred),
'Median Absolute Error': median_absolute_error(y_true, y_pred)
}
errors = pd.DataFrame()
errors['Fit'] = y_true
@ -645,20 +692,40 @@ from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=self.random_state)
X = self.df.drop(columns=self.RESPONSE)
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
X = X.drop(
columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating",
"environment-impact-potential", "heating-cost-current", "address3", "local-authority-label",
"hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2",
"posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn",
"energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key",
"potential-energy-efficiency",
"energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1",
"constituency-label",
"building-reference-number", "environment-impact-current", "co2-emissions-current",
])
# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length,
# fixed-lighting-outlets-count
for col in [
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height",
"low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area"
]:
X[col] = np.where(
X[col] == "", "0", X[col]
).astype(float)
Y = self.df[self.RESPONSE]
X = pd.get_dummies(X)
rf.fit(X, Y)
# Print the name and importance of each feature
importance_df = []
for feature, importance in zip(train_x.columns, rf.feature_importances_):
for feature, importance in zip(X.columns, rf.feature_importances_):
importance_df.append(
{
"Feature": feature,
"rf_importance": importance
}
)
importance_df = pd.DataFrame(importance_df)
importance_df = importance_df.sort_values("rf_importance", ascending=False)