mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
creating rough framework for sap model
This commit is contained in:
parent
c08f74cefb
commit
f941a3c512
2 changed files with 170 additions and 134 deletions
170
model_data/analysis/SapModel.py
Normal file
170
model_data/analysis/SapModel.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
import matplotlib.pyplot as plt
|
||||
import pickle
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error, mean_absolute_percentage_error
|
||||
|
||||
# Load the cached dataset. The file must be opened for *reading* in binary
# mode: opening it with "wb" truncates the pickle to zero bytes and then makes
# pickle.load() fail because the handle is write-only.
# NOTE(review): pickle executes arbitrary code on load - only use this with a
# file produced by this project, never with untrusted input.
with open("all_data.pkl", "rb") as f:
    all_data = pickle.load(f)
|
||||
|
||||
|
||||
class SalModel:
    """Rough framework for an OLS model of a property's environmental impact.

    The workflow is ``create_dataset()`` -> ``make_training_test()`` ->
    ``fit_model()``. ``make_training_test`` is still a placeholder, so
    ``train_x`` / ``train_y`` have to be populated before ``fit_model`` can run.

    Attributes:
        df: Raw input records as a DataFrame.
        cleaner: Object exposing ``cleaned[<feature>]``; each entry is passed
            to ``pd.DataFrame`` and must provide the columns
            ``original_description`` and ``thermal_transmittance``.
        train_x, train_y: Design matrix and response used by ``fit_model``.
        results: Fitted statsmodels results object (set by ``fit_model``).
        model_data: Modelling frame built by ``create_dataset``.
        fit_error: Dict of accuracy metrics from the last fit.
        worst: ``{"errors": ..., "x": ...}`` - the largest residuals and the
            matching rows of ``model_data``.
    """

    # We want to estimate for making improvements on different property components
    RESPONSE = "environment-impact-current"

    # We could potentially build models by constituency to avoid having too many
    # features in the model
    BASE_FEATURES = [
        "property-type",
        "built-form",
        # "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
    ]

    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "windows-description",
        "roof-description",
        "mainheat-description",
        "main-fuel",
    ]

    CATEGORICAL_COLS = [
        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
    ]

    def __init__(self, data, cleaner):
        """Store the raw data and the description cleaner; nothing is computed yet."""
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner

        self.train_x = None
        self.train_y = None
        self.results = None
        self.model_data = None
        self.fit_error = None
        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}

    def _u_value_lookup(self, feature, u_value_col):
        """Return a description -> U-value lookup frame for one component feature."""
        lookup = pd.DataFrame(self.cleaner.cleaned[feature])
        return lookup[["original_description", "thermal_transmittance"]].rename(
            columns={"thermal_transmittance": u_value_col}
        )

    def create_dataset(self):
        """Build the modelling frame and store it on ``self.model_data``.

        Wall and floor descriptions are replaced by their estimated U-values
        (looked up through ``self.cleaner``). Rows without both U-values are
        dropped, and the columns in ``CATEGORICAL_COLS`` are cast to
        ``category``.
        """
        model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
        model_data = model_data.reset_index(drop=True)

        # Append on u-value estimates
        u_value_sources = (
            ("walls-description", "walls_u_value"),
            ("floor-description", "floor_u_value"),
        )
        for feature, u_value_col in u_value_sources:
            model_data = model_data.merge(
                self._u_value_lookup(feature, u_value_col),
                how="left",
                left_on=feature,
                right_on="original_description",
            ).drop(columns=["original_description"])

        # Take just entries with U-values
        has_u_values = model_data["walls_u_value"].notna() & model_data["floor_u_value"].notna()
        other_components = [
            c for c in self.COMPONENT_FEATURES
            if c not in ("walls-description", "floor-description")
        ]
        keep_cols = self.BASE_FEATURES + other_components + ["walls_u_value", "floor_u_value", self.RESPONSE]
        # .copy() so the category casts below write to an independent frame
        # instead of a filtered view (avoids pandas' chained-assignment warning).
        model_data = model_data.loc[has_u_values, keep_cols].copy()

        for col in self.CATEGORICAL_COLS:
            model_data[col] = model_data[col].astype('category')

        # The original built this frame and then discarded it, leaving
        # self.model_data as None for every later step.
        self.model_data = model_data

    def make_training_test(self):
        """Placeholder: split into training and test sets and dummy-encode.

        TODO: not implemented yet. ``fit_model`` needs ``self.train_x`` and
        ``self.train_y`` to be populated by this step.
        """
        # Split into training and test
        # Dummy data
        pass

    def fit_model(self):
        """Fit an OLS regression of ``train_y`` on ``train_x`` plus an intercept.

        Populates ``results``, ``fit_error``, ``worst`` and adds a ``fit``
        column to ``model_data``.

        Raises:
            RuntimeError: If the training data or ``model_data`` is not set.
        """
        if self.train_x is None or self.train_y is None:
            raise RuntimeError(
                "train_x/train_y are not set; build the training data before calling fit_model()"
            )
        if self.model_data is None:
            raise RuntimeError("model_data is not set; call create_dataset() before fit_model()")

        # Add a constant to the independent value
        design = sm.add_constant(self.train_x)

        # make regression model and fit it
        self.results = sm.OLS(self.train_y, design).fit()

        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
            y_true=self.train_y, y_pred=self.results.fittedvalues
        )

        self.model_data['fit'] = self.results.fittedvalues
        # The worst errors over-index heavily for flats
        self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]

    @staticmethod
    def plot_regression(df):
        """Plot the ``fit`` and ``actual`` columns of ``df`` against row position."""
        # Extract the "fit" and "actual" columns from the dataframe
        fit = df['fit']
        actual = df['actual']

        # Create an array of x-values (assumed to be sequential integers)
        x = np.arange(len(df))

        # Plot the fit and actual data
        plt.plot(x, fit, color='red', label='Fit')
        plt.plot(x, actual, color='blue', label='Actual')

        # Set labels and title
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title('Linear Regression - Fit vs Actual')

        # Display legend
        plt.legend()

        # Show the plot
        plt.show()

    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """Calculate accuracy metrics for a regression and its worst residuals.

        Args:
            y_true (array-like): Array of true target values.
            y_pred (array-like): Array of predicted target values.
            n (int): Number of largest absolute residuals to return.

        Returns:
            tuple: ``(metrics, worst_errors)`` where ``metrics`` is a dict of
            the calculated metrics and ``worst_errors`` is a DataFrame holding
            the ``n`` rows with the largest absolute residual (columns
            ``Actual``, ``Fit``, ``Residual``, ``Absolute Residual``).
        """
        metrics = {
            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
            'Mean Squared Error': mean_squared_error(y_true, y_pred),
            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
            'R2 Score': r2_score(y_true, y_pred),
            'Explained Variance Score': explained_variance_score(y_true, y_pred),
            'Median Absolute Error': median_absolute_error(y_true, y_pred),
            'Mean True Value': y_true.mean(),
            'Mean Predicted Value': y_pred.mean(),
        }

        # The original stored y_true under 'Fit' and y_pred under 'Actual';
        # the labels are now the right way round (Residual = Actual - Fit).
        errors = pd.DataFrame()
        errors['Actual'] = y_true
        errors['Fit'] = y_pred
        errors['Residual'] = errors['Actual'] - errors['Fit']
        errors['Absolute Residual'] = np.abs(errors['Residual'])

        worst_errors = errors.nlargest(n, 'Absolute Residual')

        return metrics, worst_errors


# NOTE(review): "SalModel" looks like a typo for "SapModel" (the file is
# SapModel.py). The original name is kept so existing callers keep working;
# the alias offers the presumably intended spelling.
SapModel = SalModel
|
||||
|
|
@ -239,71 +239,9 @@ def handler():
|
|||
pd.set_option('display.width', 1000)
|
||||
df = pd.DataFrame(data)
|
||||
|
||||
# We want to estimate for making improvements on different property components
|
||||
response = "environment-impact-current"
|
||||
# We could potentially build models by constituency to avoid having too many
|
||||
# features in the model
|
||||
base_features = [
|
||||
"property-type",
|
||||
"built-form",
|
||||
# "construction-age-band",
|
||||
"number-habitable-rooms",
|
||||
"constituency",
|
||||
"number-heated-rooms",
|
||||
]
|
||||
|
||||
component_features = [
|
||||
"walls-description",
|
||||
"floor-description",
|
||||
"lighting-description",
|
||||
"windows-description",
|
||||
"roof-description",
|
||||
"mainheat-description",
|
||||
"main-fuel"
|
||||
]
|
||||
|
||||
model_data = df[[response] + component_features + base_features]
|
||||
model_data = model_data.reset_index(drop=True)
|
||||
model_data["idx"] = model_data.index.copy()
|
||||
|
||||
# Append on u-value estimates
|
||||
model_data = model_data.merge(
|
||||
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
|
||||
columns={"thermal_transmittance": "walls_u_value", }
|
||||
),
|
||||
how="left",
|
||||
left_on="walls-description",
|
||||
right_on="original_description"
|
||||
) \
|
||||
.drop(columns=["original_description"]) \
|
||||
.merge(
|
||||
pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
|
||||
columns={"thermal_transmittance": "floor_u_value", }
|
||||
),
|
||||
how="left",
|
||||
left_on="floor-description",
|
||||
right_on="original_description"
|
||||
)
|
||||
# Take just entries with U-values
|
||||
model_data = model_data[
|
||||
~pd.isnull(model_data["walls_u_value"]) &
|
||||
~pd.isnull(model_data["floor_u_value"])
|
||||
]
|
||||
model_data = model_data[
|
||||
base_features + [c for c in component_features if c not in [
|
||||
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
|
||||
]
|
||||
|
||||
# We need to split the data into a train and test set for model build
|
||||
categorical_cols = [
|
||||
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
|
||||
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
|
||||
|
||||
]
|
||||
|
||||
# If these categorical variables are not of type 'category', convert them
|
||||
for col in categorical_cols:
|
||||
model_data[col] = model_data[col].astype('category')
|
||||
|
||||
# Dummy out the categorical variables
|
||||
training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
|
||||
|
|
@ -316,88 +254,16 @@ def handler():
|
|||
if training_data[col].dtype == object:
|
||||
training_data[col] = training_data[col].astype(float)
|
||||
|
||||
import statsmodels.api as sm
|
||||
|
||||
# Assuming 'df' is your DataFrame
|
||||
X = training_data.drop(columns=response)
|
||||
Y = training_data[response]
|
||||
|
||||
# Add a constant to the independent value
|
||||
X1 = sm.add_constant(X)
|
||||
|
||||
# make regression model
|
||||
model = sm.OLS(Y, X1)
|
||||
|
||||
# fit model and print results
|
||||
results = model.fit()
|
||||
print(results.summary())
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
def plot_regression(df):
|
||||
# Extract the "fit" and "actual" columns from the dataframe
|
||||
fit = df['fit']
|
||||
actual = df['actual']
|
||||
|
||||
# Create an array of x-values (assumed to be sequential integers)
|
||||
x = np.arange(len(df))
|
||||
|
||||
# Plot the fit and actual data
|
||||
plt.plot(x, fit, color='red', label='Fit')
|
||||
plt.plot(x, actual, color='blue', label='Actual')
|
||||
|
||||
# Set labels and title
|
||||
plt.xlabel('Index')
|
||||
plt.ylabel('Value')
|
||||
plt.title('Linear Regression - Fit vs Actual')
|
||||
|
||||
# Display legend
|
||||
plt.legend()
|
||||
|
||||
# Show the plot
|
||||
plt.show()
|
||||
|
||||
import numpy as np
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error, mean_absolute_percentage_error
|
||||
|
||||
def calculate_regression_metrics(y_true, y_pred, n=20):
|
||||
"""
|
||||
Calculate the 5 most important accuracy metrics for regression.
|
||||
|
||||
Args:
|
||||
y_true (array-like): Array of true target values.
|
||||
y_pred (array-like): Array of predicted target values.
|
||||
|
||||
Returns:
|
||||
dict: Dictionary containing the calculated metrics.
|
||||
"""
|
||||
metrics = {}
|
||||
|
||||
metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
|
||||
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
|
||||
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
|
||||
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
||||
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
||||
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
||||
metrics['Mean True Value'] = y_true.mean()
|
||||
metrics['Mean Predicted Value'] = y_pred.mean()
|
||||
|
||||
errors = pd.DataFrame()
|
||||
errors['Fit'] = y_true
|
||||
errors['Actual'] = y_pred
|
||||
errors['Residual'] = errors['Actual'] - errors['Fit']
|
||||
errors['Absolute Residual'] = np.abs(errors['Residual'])
|
||||
|
||||
worst_errors = errors.nlargest(n, 'Absolute Residual')
|
||||
|
||||
return metrics, worst_errors
|
||||
|
||||
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
|
||||
|
||||
model_data['fit'] = results.fittedvalues
|
||||
# The worst errors over-index heavily for flats
|
||||
worst_x = model_data[model_data.index.isin(worst_errors.index)]
|
||||
|
||||
# Notes
|
||||
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue