mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
first working version of sap model
This commit is contained in:
parent
d586441769
commit
ff84635cb8
2 changed files with 74 additions and 80 deletions
|
|
@ -3,6 +3,8 @@ import pandas as pd
|
||||||
import statsmodels.api as sm
|
import statsmodels.api as sm
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pickle
|
import pickle
|
||||||
|
from typing import Any, Dict, Tuple
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||||
median_absolute_error, mean_absolute_percentage_error
|
median_absolute_error, mean_absolute_percentage_error
|
||||||
|
|
||||||
|
|
@ -10,7 +12,7 @@ with open("all_data.pkl", "rb") as f:
|
||||||
all_data = pickle.load(f)
|
all_data = pickle.load(f)
|
||||||
|
|
||||||
|
|
||||||
class SalModel:
|
class SapModel:
|
||||||
# We want to estimate for making improvements on different property components
|
# We want to estimate for making improvements on different property components
|
||||||
RESPONSE = "environment-impact-current"
|
RESPONSE = "environment-impact-current"
|
||||||
# We could potentially build models by constituency to avoid having too many
|
# We could potentially build models by constituency to avoid having too many
|
||||||
|
|
@ -64,81 +66,71 @@ class SalModel:
|
||||||
"windows-description",
|
"windows-description",
|
||||||
"glazed-type",
|
"glazed-type",
|
||||||
"glazed-area",
|
"glazed-area",
|
||||||
"mainheat-description",
|
"construction-age-band",
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, data, cleaner):
|
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
|
||||||
self.df = pd.DataFrame(data)
|
self.df = pd.DataFrame(data)
|
||||||
self.cleaner = cleaner
|
self.cleaner = cleaner
|
||||||
|
self.random_state = random_state if random_state is not None else 42
|
||||||
|
self.test_size = 0.2 if test_size is None else test_size
|
||||||
|
|
||||||
self.model_data = None
|
self.model_data = None
|
||||||
self.train_x = None
|
self.train_x = None
|
||||||
self.train_y = None
|
self.train_y = None
|
||||||
|
self.test_x = None
|
||||||
|
self.test_y = None
|
||||||
self.results = None
|
self.results = None
|
||||||
self.model_data = None
|
self.model_data = None
|
||||||
self.fit_error = None
|
self.fit_error = None
|
||||||
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
|
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
|
||||||
|
self.fit_df = None
|
||||||
|
|
||||||
def _append_cleaned_data(self, model_data):
|
def run(self, plot=False):
|
||||||
"""
|
"""
|
||||||
We need to estimate the u-value impact for:
|
A pipeline method to run all necessary methods in correct order.
|
||||||
1) Walls
|
|
||||||
2) Roof
|
|
||||||
3) Floors
|
|
||||||
We append this data on
|
|
||||||
|
|
||||||
Additionally, we append on the extracted proportion of low energy lighting, which
|
|
||||||
is more reliably extracted than by using the low-energy-lighting column
|
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
|
self.create_dataset()
|
||||||
|
self.fit_model()
|
||||||
|
if plot:
|
||||||
|
self.plot_regression(self.fit_df)
|
||||||
|
except Exception as e:
|
||||||
|
print("An error occurred during execution.")
|
||||||
|
print(str(e))
|
||||||
|
|
||||||
wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[
|
def _merge_with_u_values(
|
||||||
["original_description", "thermal_transmittance"]].rename(
|
self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
|
||||||
columns={"thermal_transmittance": "walls_u_value"}
|
) -> pd.DataFrame:
|
||||||
|
|
||||||
|
u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
|
||||||
|
["original_description", thermal_transmittance]].rename(
|
||||||
|
columns={thermal_transmittance: f"{description}_u_value"}
|
||||||
)
|
)
|
||||||
|
|
||||||
floor_u_values = pd.DataFrame(self.cleaner.cleaned["floor-description"])[
|
model_data = model_data.merge(
|
||||||
["original_description", "thermal_transmittance"]].rename(
|
u_values,
|
||||||
columns={"thermal_transmittance": "floor_u_value"}
|
how="left",
|
||||||
)
|
left_on=f"{description}-description",
|
||||||
|
right_on="original_description"
|
||||||
|
).drop(columns=["original_description"])
|
||||||
|
|
||||||
roof_u_values = pd.DataFrame(self.cleaner.cleaned["roof-description"])[
|
return model_data
|
||||||
["original_description", "thermal_transmittance"]].rename(
|
|
||||||
columns={"thermal_transmittance": "roof_u_value", }
|
|
||||||
)
|
|
||||||
|
|
||||||
|
def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
for description in ["walls", "floor", "roof"]:
|
||||||
|
model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
|
||||||
|
|
||||||
|
# lighting_proportions added separately as it doesn't use the _merge_with_u_values method
|
||||||
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
|
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
|
||||||
["original_description", "low_energy_proportion"]]
|
["original_description", "low_energy_proportion"]]
|
||||||
|
|
||||||
model_data = model_data.merge(
|
model_data = model_data.merge(
|
||||||
wall_u_values,
|
|
||||||
how="left",
|
|
||||||
left_on="walls-description",
|
|
||||||
right_on="original_description"
|
|
||||||
).drop(
|
|
||||||
columns=["original_description"]
|
|
||||||
).merge(
|
|
||||||
floor_u_values,
|
|
||||||
how="left",
|
|
||||||
left_on="floor-description",
|
|
||||||
right_on="original_description"
|
|
||||||
).drop(
|
|
||||||
columns=["original_description"]
|
|
||||||
).merge(
|
|
||||||
roof_u_values,
|
|
||||||
how="left",
|
|
||||||
left_on="roof-description",
|
|
||||||
right_on="original_description"
|
|
||||||
).drop(
|
|
||||||
columns=["original_description"]
|
|
||||||
).merge(
|
|
||||||
lighting_proportions,
|
lighting_proportions,
|
||||||
how="left",
|
how="left",
|
||||||
left_on="lighting-description",
|
left_on="lighting-description",
|
||||||
right_on="original_description"
|
right_on="original_description"
|
||||||
).drop(
|
).drop(columns=["original_description"])
|
||||||
columns=["original_description"]
|
|
||||||
)
|
|
||||||
|
|
||||||
return model_data
|
return model_data
|
||||||
|
|
||||||
|
|
@ -195,33 +187,59 @@ class SalModel:
|
||||||
model_data[col] = model_data[col].astype('category')
|
model_data[col] = model_data[col].astype('category')
|
||||||
|
|
||||||
# Convert response
|
# Convert response
|
||||||
self.model_data[self.RESPONSE] = self.model_data[self.RESPONSE].astype(float)
|
model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)
|
||||||
|
|
||||||
self.model_data = model_data
|
self.model_data = model_data
|
||||||
|
|
||||||
def make_training_test(self):
|
def make_training_test(self, x):
|
||||||
# Split into training and test
|
# Split into training and test
|
||||||
# Dummy data
|
self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
|
||||||
pass
|
x.drop(self.RESPONSE, axis=1),
|
||||||
|
x[self.RESPONSE],
|
||||||
|
test_size=self.test_size,
|
||||||
|
random_state=self.random_state
|
||||||
|
)
|
||||||
|
|
||||||
def fit_model(self):
|
def fit_model(self):
|
||||||
|
|
||||||
|
# Dummy out the categorical variables
|
||||||
|
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True)
|
||||||
|
|
||||||
|
# Convert booleans to integer
|
||||||
|
for col in x.columns:
|
||||||
|
if x[col].dtype == bool:
|
||||||
|
x[col] = x[col].astype(int)
|
||||||
|
|
||||||
|
if x[col].dtype == object:
|
||||||
|
x[col] = x[col].astype(float)
|
||||||
|
|
||||||
|
# Create the training and test sets for each run
|
||||||
|
self.make_training_test(x)
|
||||||
|
|
||||||
# Add a constant to the independent value
|
# Add a constant to the independent value
|
||||||
x1 = sm.add_constant(self.X)
|
train_x = sm.add_constant(self.train_x)
|
||||||
|
|
||||||
# make regression model
|
# make regression model
|
||||||
model = sm.OLS(self.Y, x1)
|
model = sm.OLS(self.train_y, train_x)
|
||||||
|
|
||||||
# fit model and print results
|
# fit model and print results
|
||||||
self.results = model.fit()
|
self.results = model.fit()
|
||||||
|
|
||||||
self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
|
self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
|
||||||
y_true=self.Y, y_pred=self.results.fittedvalues
|
y_true=self.train_y, y_pred=self.results.fittedvalues
|
||||||
)
|
)
|
||||||
|
|
||||||
self.model_data['fit'] = self.results.fittedvalues
|
self.model_data['fit'] = self.results.fittedvalues
|
||||||
# The worst errors over-index heavily for flats
|
# The worst errors over-index heavily for flats
|
||||||
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
|
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
|
||||||
|
|
||||||
|
self.fit_df = pd.DataFrame(
|
||||||
|
{
|
||||||
|
"fit": self.results.fittedvalues,
|
||||||
|
"actual": self.train_y
|
||||||
|
}
|
||||||
|
).sort_values("actual", ascending=True)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def plot_regression(df):
|
def plot_regression(df):
|
||||||
# Extract the "fit" and "actual" columns from the dataframe
|
# Extract the "fit" and "actual" columns from the dataframe
|
||||||
|
|
@ -280,7 +298,7 @@ class SalModel:
|
||||||
return metrics, worst_errors
|
return metrics, worst_errors
|
||||||
|
|
||||||
|
|
||||||
self = SalModel(
|
self = SapModel(
|
||||||
data=all_data["data"],
|
data=all_data["data"],
|
||||||
cleaner=all_data["cleaner"]
|
cleaner=all_data["cleaner"]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -243,21 +243,6 @@ def handler():
|
||||||
|
|
||||||
# If these categorical variables are not of type 'category', convert them
|
# If these categorical variables are not of type 'category', convert them
|
||||||
|
|
||||||
# Dummy out the categorical variables
|
|
||||||
training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
|
|
||||||
|
|
||||||
# Convert booleans to integer
|
|
||||||
for col in training_data.columns:
|
|
||||||
if training_data[col].dtype == bool:
|
|
||||||
training_data[col] = training_data[col].astype(int)
|
|
||||||
|
|
||||||
if training_data[col].dtype == object:
|
|
||||||
training_data[col] = training_data[col].astype(float)
|
|
||||||
|
|
||||||
# Assuming 'df' is your DataFrame
|
|
||||||
X = training_data.drop(columns=response)
|
|
||||||
Y = training_data[response]
|
|
||||||
|
|
||||||
print(results.summary())
|
print(results.summary())
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
@ -281,15 +266,6 @@ def handler():
|
||||||
grouped_error = pd.DataFrame(grouped_error)
|
grouped_error = pd.DataFrame(grouped_error)
|
||||||
grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
|
grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
|
||||||
|
|
||||||
fit_df = pd.DataFrame(
|
|
||||||
{
|
|
||||||
"fit": results.fittedvalues,
|
|
||||||
"actual": Y
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Sort on magnitude of actual
|
|
||||||
fit_df = fit_df.sort_values("actual", ascending=True)
|
|
||||||
plot_regression(fit_df)
|
plot_regression(fit_df)
|
||||||
|
|
||||||
model_data[["thermal_transmittance", response]].corr()
|
model_data[["thermal_transmittance", response]].corr()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue