mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
reverting to baseline performance which was just filling area with zero
This commit is contained in:
parent
68e903e492
commit
0b41057db8
1 changed files with 132 additions and 65 deletions
|
|
@ -11,13 +11,16 @@ from sklearn.linear_model import Lasso
|
|||
from sklearn.preprocessing import StandardScaler
|
||||
import xgboost as xgb
|
||||
|
||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||
from tqdm import tqdm
|
||||
|
||||
with open("all_data.pkl", "rb") as f:
|
||||
all_data = pickle.load(f)
|
||||
|
||||
|
||||
class SapModel:
|
||||
# We want to estimate for making improvements on different property components
|
||||
RESPONSE = "environment-impact-current"
|
||||
RESPONSE = "current-energy-efficiency"
|
||||
# We could potentially build models by constituency to avoid having too many
|
||||
# features in the model
|
||||
BASE_FEATURES = [
|
||||
|
|
@ -50,6 +53,12 @@ class SapModel:
|
|||
# "lighting-description" # Might not need to use this
|
||||
"low-energy-lighting",
|
||||
"number-open-fireplaces",
|
||||
"mainheatcont-description",
|
||||
"fixed-lighting-outlets-count",
|
||||
"floor-height",
|
||||
"floor-level",
|
||||
"total-floor-area",
|
||||
"extension-count",
|
||||
]
|
||||
|
||||
CATEGORICAL_COLS = [
|
||||
|
|
@ -69,8 +78,17 @@ class SapModel:
|
|||
"glazed-type",
|
||||
"glazed-area",
|
||||
"construction-age-band",
|
||||
# Testing
|
||||
"lighting-description"
|
||||
"lighting-description",
|
||||
"mainheatcont-description",
|
||||
"floor-level",
|
||||
]
|
||||
|
||||
NUMERICAL_COLUMNS = [
|
||||
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
|
||||
"fixed-lighting-outlets-count",
|
||||
"floor-height",
|
||||
"total-floor-area",
|
||||
"extension-count",
|
||||
]
|
||||
|
||||
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
|
||||
|
|
@ -145,14 +163,82 @@ class SapModel:
|
|||
model_data = model_data.drop(columns=["transaction-type"])
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
def _clean_numericals(model_data):
|
||||
def _clean_numericals(self, model_data):
|
||||
|
||||
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
|
||||
# Try binning numericals
|
||||
def bucket_and_fill(df, column_name, n_bins=10):
|
||||
# Check if the column is numerical
|
||||
if np.issubdtype(df[column_name].dtype, np.number):
|
||||
# Create a new categorical column from numerical one by binning the data
|
||||
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
|
||||
# Replace missing data with "NO_RECORD"
|
||||
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
|
||||
df[column_name + "_bucket"] = np.where(
|
||||
df[column_name + "_bucket"] == "nan",
|
||||
"NO_RECORD",
|
||||
df[column_name + "_bucket"]
|
||||
)
|
||||
return df
|
||||
|
||||
bucket_variables = []
|
||||
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables]
|
||||
|
||||
for col in bucket_variables:
|
||||
model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
|
||||
model_data = bucket_and_fill(model_data, col)
|
||||
|
||||
# Replace the data with the binned version
|
||||
model_data = model_data.drop(columns=bucket_variables)
|
||||
model_data = model_data.rename(
|
||||
columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables))
|
||||
)
|
||||
|
||||
for col in remaining_numericals:
|
||||
model_data[col] = np.where(
|
||||
model_data[col] == "", "0", model_data[col]
|
||||
).astype(float)
|
||||
|
||||
# Basic fill the rest of the columns with 0
|
||||
|
||||
# grouping_variables = ["constituency", "built-form", "property-type"]
|
||||
# secondary_grouping_variables = ["constituency", "property-type"]
|
||||
#
|
||||
# def append_missings(model_data, col, grouping_variables, secondary=False):
|
||||
# if not secondary:
|
||||
# numerical_data = model_data[(model_data[col] != "")].copy()
|
||||
# else:
|
||||
# numerical_data = model_data[~pd.isnull(model_data[col])].copy()
|
||||
#
|
||||
# numerical_data[col] = numerical_data[col].astype(float)
|
||||
# numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index()
|
||||
# numerical_data = numerical_data.rename(columns={col: f"{col}_avg"})
|
||||
# model_data = model_data.merge(numerical_data, how="left", on=grouping_variables)
|
||||
#
|
||||
# if not secondary:
|
||||
# model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col])
|
||||
# else:
|
||||
# model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col])
|
||||
#
|
||||
# model_data = model_data.drop(columns=[f"{col}_avg"])
|
||||
# model_data[col] = model_data[col].astype(float)
|
||||
#
|
||||
# return model_data
|
||||
#
|
||||
# for col in numerical_cols:
|
||||
# model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
|
||||
# model_data[col] = model_data[col].fillna(model_data[col].mean())
|
||||
# model_data = append_missings(model_data, col, grouping_variables)
|
||||
#
|
||||
# if pd.isnull(model_data[col]).sum():
|
||||
# model_data = append_missings(model_data, col, secondary_grouping_variables, True)
|
||||
#
|
||||
# if pd.isnull(model_data[col]).sum():
|
||||
# raise Exception("Cleaning failed, missing values remain")
|
||||
#
|
||||
# model_data[col] = np.where(
|
||||
# model_data[col] == "", "0", model_data[col]
|
||||
# ).astype(float)
|
||||
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -212,7 +298,7 @@ class SapModel:
|
|||
|
||||
features = [
|
||||
x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
|
||||
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx"
|
||||
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap"
|
||||
] if x not in exclude_features
|
||||
]
|
||||
|
||||
|
|
@ -251,7 +337,9 @@ class SapModel:
|
|||
def fit_model(self):
|
||||
|
||||
# Dummy out the categorical variables
|
||||
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True)
|
||||
binned = []
|
||||
|
||||
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True)
|
||||
|
||||
# Convert booleans to integer
|
||||
for col in x.columns:
|
||||
|
|
@ -276,16 +364,6 @@ class SapModel:
|
|||
train_x = train_x.drop(columns=["idx"])
|
||||
test_x = test_x.drop(columns=["idx"])
|
||||
|
||||
# importance_df = self.make_importance(train_x)
|
||||
# Test dropping the least important features
|
||||
# to_drop = importance_df.tail(2)["Feature"].values
|
||||
# Dropping this is a good idea
|
||||
to_drop = [
|
||||
"hotwater-description_Electric immersion, off-peak",
|
||||
]
|
||||
train_x = train_x.drop(columns=to_drop)
|
||||
test_x = test_x[train_x.columns]
|
||||
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.inspection import permutation_importance
|
||||
|
||||
|
|
@ -342,13 +420,13 @@ class SapModel:
|
|||
).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")
|
||||
|
||||
# temp hardcoded values
|
||||
best_fit = {'MAPE': 0.042824355225087686, 'Mean Squared Error': 21.49263731368226,
|
||||
'Mean Absolute Error': 3.298755911054327, 'R2 Score': 0.794118580154128,
|
||||
'Explained Variance Score': 0.794118580154128, 'Median Absolute Error': 2.426789554039914}
|
||||
best_fit = {'MAPE': 0.04617542805587113, 'Mean Squared Error': 18.62306128026334,
|
||||
'Mean Absolute Error': 2.865262003625814, 'R2 Score': 0.8008316762496143,
|
||||
'Explained Variance Score': 0.8008316762496143, 'Median Absolute Error': 1.911197425417548}
|
||||
|
||||
best_predict = {'MAPE': 0.04413439429441669, 'Mean Squared Error': 22.700373062051142,
|
||||
'Mean Absolute Error': 3.3961241443022008, 'R2 Score': 0.750296045867001,
|
||||
'Explained Variance Score': 0.7503518147827141, 'Median Absolute Error': 2.4442017110145855}
|
||||
best_predict = {'MAPE': 0.04358926901734807, 'Mean Squared Error': 21.197491698961528,
|
||||
'Mean Absolute Error': 3.046853690257838, 'R2 Score': 0.7215087343364782,
|
||||
'Explained Variance Score': 0.7215726927575035, 'Median Absolute Error': 1.921094388694634}
|
||||
|
||||
def check_successes(experiment_error, best_error):
|
||||
|
||||
|
|
@ -485,38 +563,7 @@ class SapModel:
|
|||
worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
|
||||
worst_x = worst_x.sort_values("Absolute Residual", ascending=False)
|
||||
|
||||
def make_importance(self, train_x):
|
||||
|
||||
# Create a DMatrix from your training data
|
||||
dtrain = xgb.DMatrix(train_x, label=self.train_y)
|
||||
|
||||
# Set the parameters for the XGBoost model
|
||||
params = {
|
||||
'objective': 'reg:squarederror',
|
||||
'eval_metric': 'rmse'
|
||||
}
|
||||
|
||||
# Train the XGBoost model
|
||||
model = xgb.train(params, dtrain)
|
||||
|
||||
# Get feature importance scores
|
||||
importance_scores = model.get_score(importance_type='gain')
|
||||
|
||||
# Create a dataframe with feature names and importance scores
|
||||
importance_df = pd.DataFrame({
|
||||
'Feature': importance_scores.keys(),
|
||||
'Importance': importance_scores.values()
|
||||
})
|
||||
|
||||
# Sort the dataframe by importance score in descending order
|
||||
importance_df = importance_df.sort_values(by='Importance', ascending=False)
|
||||
|
||||
# Print the feature importances
|
||||
return importance_df
|
||||
|
||||
def detect_multi_collinearity(self):
|
||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||
from tqdm import tqdm
|
||||
# Get the VIFs for each variable
|
||||
vifs = pd.DataFrame()
|
||||
vifs["features"] = self.train_x.columns
|
||||
|
|
@ -616,14 +663,14 @@ class SapModel:
|
|||
Returns:
|
||||
dict: Dictionary containing the calculated metrics.
|
||||
"""
|
||||
metrics = {}
|
||||
|
||||
metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
|
||||
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
|
||||
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
|
||||
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
||||
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
||||
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
||||
metrics = {
|
||||
'MAPE': mean_absolute_percentage_error(y_true, y_pred),
|
||||
'Mean Squared Error': mean_squared_error(y_true, y_pred),
|
||||
'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
|
||||
'R2 Score': r2_score(y_true, y_pred),
|
||||
'Explained Variance Score': explained_variance_score(y_true, y_pred),
|
||||
'Median Absolute Error': median_absolute_error(y_true, y_pred)
|
||||
}
|
||||
|
||||
errors = pd.DataFrame()
|
||||
errors['Fit'] = y_true
|
||||
|
|
@ -645,20 +692,40 @@ from sklearn.ensemble import RandomForestRegressor
|
|||
|
||||
rf = RandomForestRegressor(random_state=self.random_state)
|
||||
X = self.df.drop(columns=self.RESPONSE)
|
||||
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
|
||||
X = X.drop(
|
||||
columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating",
|
||||
"environment-impact-potential", "heating-cost-current", "address3", "local-authority-label",
|
||||
"hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2",
|
||||
"posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn",
|
||||
"energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key",
|
||||
"potential-energy-efficiency",
|
||||
"energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1",
|
||||
"constituency-label",
|
||||
"building-reference-number", "environment-impact-current", "co2-emissions-current",
|
||||
])
|
||||
# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length,
|
||||
# fixed-lighting-outlets-count
|
||||
for col in [
|
||||
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height",
|
||||
"low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area"
|
||||
]:
|
||||
X[col] = np.where(
|
||||
X[col] == "", "0", X[col]
|
||||
).astype(float)
|
||||
|
||||
Y = self.df[self.RESPONSE]
|
||||
X = pd.get_dummies(X)
|
||||
rf.fit(X, Y)
|
||||
|
||||
# Print the name and importance of each feature
|
||||
importance_df = []
|
||||
for feature, importance in zip(train_x.columns, rf.feature_importances_):
|
||||
for feature, importance in zip(X.columns, rf.feature_importances_):
|
||||
importance_df.append(
|
||||
{
|
||||
"Feature": feature,
|
||||
"rf_importance": importance
|
||||
}
|
||||
)
|
||||
|
||||
importance_df = pd.DataFrame(importance_df)
|
||||
importance_df = importance_df.sort_values("rf_importance", ascending=False)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue