first working version of sap model

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-04 10:00:15 +01:00
parent d586441769
commit ff84635cb8
2 changed files with 74 additions and 80 deletions

View file

@ -3,6 +3,8 @@ import pandas as pd
import statsmodels.api as sm import statsmodels.api as sm
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pickle import pickle
from typing import Any, Dict, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error, mean_absolute_percentage_error median_absolute_error, mean_absolute_percentage_error
@ -10,7 +12,7 @@ with open("all_data.pkl", "rb") as f:
all_data = pickle.load(f) all_data = pickle.load(f)
class SalModel: class SapModel:
# We want to estimate for making improvements on different property components # We want to estimate for making improvements on different property components
RESPONSE = "environment-impact-current" RESPONSE = "environment-impact-current"
# We could potentially build models by constituency to avoid having too many # We could potentially build models by constituency to avoid having too many
@ -64,81 +66,71 @@ class SalModel:
"windows-description", "windows-description",
"glazed-type", "glazed-type",
"glazed-area", "glazed-area",
"mainheat-description", "construction-age-band",
] ]
def __init__(self, data, cleaner): def __init__(self, data, cleaner, test_size=0.2, random_state=None):
self.df = pd.DataFrame(data) self.df = pd.DataFrame(data)
self.cleaner = cleaner self.cleaner = cleaner
self.random_state = random_state if random_state is not None else 42
self.test_size = 0.2 if test_size is None else test_size
self.model_data = None self.model_data = None
self.train_x = None self.train_x = None
self.train_y = None self.train_y = None
self.test_x = None
self.test_y = None
self.results = None self.results = None
self.model_data = None self.model_data = None
self.fit_error = None self.fit_error = None
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()} self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
self.fit_df = None
def _append_cleaned_data(self, model_data): def run(self, plot=False):
""" """
We need to estimate the u-value impact for: A pipeline method to run all necessary methods in correct order.
1) Walls
2) Roof
3) Floors
We append this data on
Additionally, we append on the extracted proportion of low energy lighting, which
is more reliably extracted than by using the low-energy-lighting column
""" """
try:
self.create_dataset()
self.fit_model()
if plot:
self.plot_regression(self.fit_df)
except Exception as e:
print("An error occurred during execution.")
print(str(e))
wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[ def _merge_with_u_values(
["original_description", "thermal_transmittance"]].rename( self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
columns={"thermal_transmittance": "walls_u_value"} ) -> pd.DataFrame:
u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
["original_description", thermal_transmittance]].rename(
columns={thermal_transmittance: f"{description}_u_value"}
) )
floor_u_values = pd.DataFrame(self.cleaner.cleaned["floor-description"])[ model_data = model_data.merge(
["original_description", "thermal_transmittance"]].rename( u_values,
columns={"thermal_transmittance": "floor_u_value"} how="left",
) left_on=f"{description}-description",
right_on="original_description"
).drop(columns=["original_description"])
roof_u_values = pd.DataFrame(self.cleaner.cleaned["roof-description"])[ return model_data
["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "roof_u_value", }
)
def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
for description in ["walls", "floor", "roof"]:
model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
# lighting_proportions added separately as it doesn't use the _merge_with_u_values method
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[ lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
["original_description", "low_energy_proportion"]] ["original_description", "low_energy_proportion"]]
model_data = model_data.merge( model_data = model_data.merge(
wall_u_values,
how="left",
left_on="walls-description",
right_on="original_description"
).drop(
columns=["original_description"]
).merge(
floor_u_values,
how="left",
left_on="floor-description",
right_on="original_description"
).drop(
columns=["original_description"]
).merge(
roof_u_values,
how="left",
left_on="roof-description",
right_on="original_description"
).drop(
columns=["original_description"]
).merge(
lighting_proportions, lighting_proportions,
how="left", how="left",
left_on="lighting-description", left_on="lighting-description",
right_on="original_description" right_on="original_description"
).drop( ).drop(columns=["original_description"])
columns=["original_description"]
)
return model_data return model_data
@ -195,33 +187,59 @@ class SalModel:
model_data[col] = model_data[col].astype('category') model_data[col] = model_data[col].astype('category')
# Convert response # Convert response
self.model_data[self.RESPONSE] = self.model_data[self.RESPONSE].astype(float) model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)
self.model_data = model_data self.model_data = model_data
def make_training_test(self): def make_training_test(self, x):
# Split into training and test # Split into training and test
# Dummy data self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
pass x.drop(self.RESPONSE, axis=1),
x[self.RESPONSE],
test_size=self.test_size,
random_state=self.random_state
)
def fit_model(self): def fit_model(self):
# Dummy out the categorical variables
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True)
# Convert booleans to integer
for col in x.columns:
if x[col].dtype == bool:
x[col] = x[col].astype(int)
if x[col].dtype == object:
x[col] = x[col].astype(float)
# Create the training and test sets for each run
self.make_training_test(x)
# Add a constant to the independent value # Add a constant to the independent value
x1 = sm.add_constant(self.X) train_x = sm.add_constant(self.train_x)
# make regression model # make regression model
model = sm.OLS(self.Y, x1) model = sm.OLS(self.train_y, train_x)
# fit model and print results # fit model and print results
self.results = model.fit() self.results = model.fit()
self.fit_error, self.worst["errors"] = self.calculate_regression_metrics( self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
y_true=self.Y, y_pred=self.results.fittedvalues y_true=self.train_y, y_pred=self.results.fittedvalues
) )
self.model_data['fit'] = self.results.fittedvalues self.model_data['fit'] = self.results.fittedvalues
# The worst errors over-index heavily for flats # The worst errors over-index heavily for flats
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)] self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
self.fit_df = pd.DataFrame(
{
"fit": self.results.fittedvalues,
"actual": self.train_y
}
).sort_values("actual", ascending=True)
@staticmethod @staticmethod
def plot_regression(df): def plot_regression(df):
# Extract the "fit" and "actual" columns from the dataframe # Extract the "fit" and "actual" columns from the dataframe
@ -280,7 +298,7 @@ class SalModel:
return metrics, worst_errors return metrics, worst_errors
self = SalModel( self = SapModel(
data=all_data["data"], data=all_data["data"],
cleaner=all_data["cleaner"] cleaner=all_data["cleaner"]
) )

View file

@ -243,21 +243,6 @@ def handler():
# If these categorical variables are not of type 'category', convert them # If these categorical variables are not of type 'category', convert them
# Dummy out the categorical variables
training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
# Convert booleans to integer
for col in training_data.columns:
if training_data[col].dtype == bool:
training_data[col] = training_data[col].astype(int)
if training_data[col].dtype == object:
training_data[col] = training_data[col].astype(float)
# Assuming 'df' is your DataFrame
X = training_data.drop(columns=response)
Y = training_data[response]
print(results.summary()) print(results.summary())
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -281,15 +266,6 @@ def handler():
grouped_error = pd.DataFrame(grouped_error) grouped_error = pd.DataFrame(grouped_error)
grouped_error = grouped_error.sort_values("R2 Score", ascending=True) grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
fit_df = pd.DataFrame(
{
"fit": results.fittedvalues,
"actual": Y
}
)
# Sort on magnitude of actual
fit_df = fit_df.sort_values("actual", ascending=True)
plot_regression(fit_df) plot_regression(fit_df)
model_data[["thermal_transmittance", response]].corr() model_data[["thermal_transmittance", response]].corr()