mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
266 lines
8.4 KiB
Python
266 lines
8.4 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
import statsmodels.api as sm
|
|
import matplotlib.pyplot as plt
|
|
import pickle
|
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
|
median_absolute_error, mean_absolute_percentage_error
|
|
|
|
# Load the pickled bundle from the working directory; its "data" and "cleaner"
# entries are passed to SalModel at the bottom of this file.
# NOTE: unpickling can execute arbitrary code, so "all_data.pkl" must come
# from a trusted source.
with open("all_data.pkl", mode="rb") as f:
    all_data = pickle.load(f)
|
|
|
|
|
|
class SalModel:
    """OLS model of ``environment-impact-current`` on property features.

    The model is intended to estimate the effect of improving individual
    property components.  Typical use: ``create_dataset()`` to build the
    modelling frame, populate ``train_x`` / ``train_y``, then ``fit_model()``.
    """

    # Response variable that the regression explains.
    RESPONSE = "environment-impact-current"

    # We could potentially build models by constituency to avoid having too
    # many features in the model.
    BASE_FEATURES = [
        "property-type",
        "built-form",
        "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "transaction-type",
    ]

    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "roof-description",
        "mainheat-description",
        "hotwater-description",
        "main-fuel",
        "mechanical-ventilation",
        "secondheat-description",
        "energy-tariff",
        "solar-water-heating-flag",
        "photo-supply",
        "windows-description",
        "glazed-type",
        "glazed-area",
        "multi-glaze-proportion",
        "low-energy-lighting",
        "number-open-fireplaces",
    ]

    # Columns converted to pandas' ``category`` dtype by ``create_dataset``.
    CATEGORICAL_COLS = [
        "property-type",
        "built-form",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "lighting-description",
        "mainheat-description",
        "hotwater-description",
        "main-fuel",
        "mechanical-ventilation",
        "secondheat-description",
        "energy-tariff",
        "solar-water-heating-flag",
        "windows-description",
        "glazed-type",
        "glazed-area",
    ]

    # Columns holding numbers as strings, where "" stands for zero.
    _NUMERIC_STRING_COLS = ("photo-supply", "multi-glaze-proportion", "low-energy-lighting")

    # Component description column -> name of the U-value column derived from it.
    _U_VALUE_COLUMNS = {
        "walls-description": "walls_u_value",
        "floor-description": "floor_u_value",
        "roof-description": "roof_u_value",
    }

    def __init__(self, data, cleaner):
        """Store the raw records and the cleaner that supplies U-value lookups.

        Args:
            data: Anything ``pd.DataFrame`` accepts; must provide the
                ``RESPONSE``, ``BASE_FEATURES`` and ``COMPONENT_FEATURES``
                columns for ``create_dataset`` to work.
            cleaner: Object exposing ``cleaned``, a mapping from description
                column name to records with ``original_description`` and
                ``thermal_transmittance`` fields.
        """
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner

        self.train_x = None
        self.train_y = None
        self.results = None
        self.model_data = None
        self.fit_error = None
        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}

    def _append_extracted_u_values(self, model_data):
        """Left-join the extracted U-values for walls, floor and roof.

        For each component the cleaner's ``thermal_transmittance`` is matched
        on the raw description text and added as ``<component>_u_value``.
        Descriptions without a match get NaN.

        Args:
            model_data: Frame containing the three ``*-description`` columns.

        Returns:
            A new frame with ``walls_u_value``, ``floor_u_value`` and
            ``roof_u_value`` appended.
        """
        for description_col, u_value_col in self._U_VALUE_COLUMNS.items():
            lookup = pd.DataFrame(self.cleaner.cleaned[description_col])[
                ["original_description", "thermal_transmittance"]
            ].rename(columns={"thermal_transmittance": u_value_col})

            # The join key from the lookup side is dropped after every merge
            # (including the last one) so it never leaks into the result.
            model_data = model_data.merge(
                lookup,
                how="left",
                left_on=description_col,
                right_on="original_description",
            ).drop(columns=["original_description"])

        return model_data

    @staticmethod
    def _convert_transaction_type(model_data):
        """Replace ``transaction-type`` with the boolean ``is_rdsap`` column.

        ``is_rdsap`` is True for every row whose transaction type is not
        ``"new dwelling"``.  The input frame is left unmodified.
        """
        is_rdsap = model_data["transaction-type"] != "new dwelling"
        return model_data.assign(is_rdsap=is_rdsap).drop(columns=["transaction-type"])

    @staticmethod
    def _clean_numericals(model_data):
        """Convert the numeric string columns to float, treating "" as 0.

        Returns a cleaned copy; the input frame is left unmodified.
        """
        cleaned = model_data.copy()

        for col in SalModel._NUMERIC_STRING_COLS:
            values = cleaned[col]
            # Each column is cleaned from its own values (previously every
            # column was overwritten with the "photo-supply" values).
            cleaned[col] = np.where(values == "", "0", values).astype(float)

        # TODO: lighting still needs cleaning.

        return cleaned

    def create_dataset(self):
        """Build the modelling frame, store it on ``self.model_data`` and return it.

        Steps: select response and feature columns, append U-values, derive
        ``is_rdsap``, clean numeric columns, keep only rows that have all
        three U-values, and cast the categorical columns.

        Returns:
            The modelling frame (also available as ``self.model_data``).
        """
        model_data = self.df[
            [self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES
        ].reset_index(drop=True)

        model_data = self._append_extracted_u_values(model_data)
        model_data = self._convert_transaction_type(model_data)
        model_data = self._clean_numericals(model_data)

        u_value_cols = list(self._U_VALUE_COLUMNS.values())

        # Take just entries with U-values.
        # TODO: Rather than doing this, do we want to include the estimated
        # u-values?  Since this ends up with just 2k entries.
        model_data = model_data.dropna(subset=u_value_cols)

        # Raw descriptions are superseded by the U-values and the transaction
        # type by ``is_rdsap``.
        exclude_features = set(self._U_VALUE_COLUMNS) | {"transaction-type"}
        features = [
            col
            for col in self.BASE_FEATURES + self.COMPONENT_FEATURES
            if col not in exclude_features
        ] + ["is_rdsap"] + u_value_cols + [self.RESPONSE]

        model_data = model_data[features].astype(
            dict.fromkeys(self.CATEGORICAL_COLS, "category")
        )

        self.model_data = model_data
        return model_data

    def make_training_test(self):
        """Placeholder: split into training and test sets and dummy-encode.

        Not implemented yet; currently a no-op, so ``train_x`` / ``train_y``
        must be assigned by the caller before ``fit_model``.
        """
        # Split into training and test
        # Dummy data

    def fit_model(self):
        """Fit an OLS regression with intercept and record fit diagnostics.

        Uses ``self.train_x`` / ``self.train_y``; if those are unset, falls
        back to ad-hoc ``self.X`` / ``self.Y`` attributes when present.
        Populates ``self.results``, ``self.fit_error`` and
        ``self.worst["errors"]``.  When ``self.model_data`` is available, the
        fitted values are added to it as ``fit`` and the rows with the worst
        errors are stored in ``self.worst["x"]``.

        Raises:
            AttributeError: If no training data has been set.
        """
        train_x = self.train_x if self.train_x is not None else getattr(self, "X", None)
        train_y = self.train_y if self.train_y is not None else getattr(self, "Y", None)
        if train_x is None or train_y is None:
            raise AttributeError(
                "No training data: set train_x and train_y before calling fit_model()"
            )

        # Add a constant so the regression has an intercept term.
        design = sm.add_constant(train_x)
        self.results = sm.OLS(train_y, design).fit()

        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
            y_true=train_y, y_pred=self.results.fittedvalues
        )

        if self.model_data is not None:
            self.model_data["fit"] = self.results.fittedvalues
            # Author's observation: the worst errors over-index heavily for flats.
            worst_rows = self.model_data.index.isin(self.worst["errors"].index)
            self.worst["x"] = self.model_data[worst_rows]

    @staticmethod
    def plot_regression(df):
        """Plot the ``fit`` and ``actual`` columns of *df* against row position."""
        fit = df["fit"]
        actual = df["actual"]

        # x-axis is simply the sequential row position.
        x = np.arange(len(df))

        plt.plot(x, fit, color="red", label="Fit")
        plt.plot(x, actual, color="blue", label="Actual")

        plt.xlabel("Index")
        plt.ylabel("Value")
        plt.title("Linear Regression - Fit vs Actual")
        plt.legend()
        plt.show()

    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """Compute regression accuracy metrics and the worst-fitted rows.

        Args:
            y_true: True target values (must support ``.mean()``).
            y_pred: Predicted target values (must support ``.mean()``).
            n: Number of rows with the largest absolute residual to return.

        Returns:
            A ``(metrics, worst_errors)`` tuple: ``metrics`` is a dict of
            metric name to value, ``worst_errors`` is a DataFrame with the
            columns ``Fit``, ``Actual``, ``Residual`` (``Actual - Fit``) and
            ``Absolute Residual`` for the *n* worst rows.
        """
        metrics = {
            "MAPE": mean_absolute_percentage_error(y_true, y_pred),
            "Mean Squared Error": mean_squared_error(y_true, y_pred),
            "Mean Absolute Error": mean_absolute_error(y_true, y_pred),
            "R2 Score": r2_score(y_true, y_pred),
            "Explained Variance Score": explained_variance_score(y_true, y_pred),
            "Median Absolute Error": median_absolute_error(y_true, y_pred),
            "Mean True Value": y_true.mean(),
            "Mean Predicted Value": y_pred.mean(),
        }

        # "Actual" holds the observed values and "Fit" the predictions (these
        # labels used to be swapped); the index follows y_true.
        errors = pd.DataFrame({"Actual": y_true})
        errors.insert(0, "Fit", y_pred)
        errors["Residual"] = errors["Actual"] - errors["Fit"]
        errors["Absolute Residual"] = errors["Residual"].abs()

        worst_errors = errors.nlargest(n, "Absolute Residual")

        return metrics, worst_errors
|
|
|
|
|
|
# Build the model from the unpickled bundle.  The instance is bound to the
# module-level name ``self``.
self = SalModel(data=all_data["data"], cleaner=all_data["cleaner"])
|