diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index d18429dc..8b0013c3 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -3,6 +3,8 @@ import pandas as pd import statsmodels.api as sm import matplotlib.pyplot as plt import pickle +from typing import Any, Dict, Tuple +from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ median_absolute_error, mean_absolute_percentage_error @@ -10,7 +12,7 @@ with open("all_data.pkl", "rb") as f: all_data = pickle.load(f) -class SalModel: +class SapModel: # We want to estimate for making improvements on different property components RESPONSE = "environment-impact-current" # We could potentially build models by constituency to avoid having too many @@ -64,81 +66,71 @@ class SalModel: "windows-description", "glazed-type", "glazed-area", - "mainheat-description", - + "construction-age-band", ] - def __init__(self, data, cleaner): + def __init__(self, data, cleaner, test_size=0.2, random_state=None): self.df = pd.DataFrame(data) self.cleaner = cleaner + self.random_state = random_state if random_state is not None else 42 + self.test_size = 0.2 if test_size is None else test_size self.model_data = None self.train_x = None self.train_y = None + self.test_x = None + self.test_y = None self.results = None self.model_data = None self.fit_error = None self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()} + self.fit_df = None - def _append_cleaned_data(self, model_data): + def run(self, plot=False): """ - We need to estimate the u-value impact for: - 1) Walls - 2) Roof - 3) Floors - We append this data on - - Additionally, we append on the extracted proportion of low energy lighting, which - is moreliably extracted that using the low-energy-lighting column + A pipeline method to run all necessary methods in correct order. """ + try: + self.create_dataset() + self.fit_model() + if plot: + self.plot_regression(self.fit_df) + except Exception as e: + print("An error occurred during execution.") + print(str(e)) - wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[ - ["original_description", "thermal_transmittance"]].rename( - columns={"thermal_transmittance": "walls_u_value"} + def _merge_with_u_values( + self, model_data: pd.DataFrame, description: str, thermal_transmittance: str + ) -> pd.DataFrame: + + u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[ + ["original_description", thermal_transmittance]].rename( + columns={thermal_transmittance: f"{description}_u_value"} ) - floor_u_values = pd.DataFrame(self.cleaner.cleaned["floor-description"])[ - ["original_description", "thermal_transmittance"]].rename( - columns={"thermal_transmittance": "floor_u_value"} - ) + model_data = model_data.merge( + u_values, + how="left", + left_on=f"{description}-description", + right_on="original_description" + ).drop(columns=["original_description"]) - roof_u_values = pd.DataFrame(self.cleaner.cleaned["roof-description"])[ - ["original_description", "thermal_transmittance"]].rename( - columns={"thermal_transmittance": "roof_u_value", } - ) + return model_data + def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame: + for description in ["walls", "floor", "roof"]: + model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance") + + # lighting_proportions added separately as it doesn't use the _merge_with_u_values method lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[ ["original_description", "low_energy_proportion"]] model_data = model_data.merge( - wall_u_values, - how="left", - left_on="walls-description", - right_on="original_description" - ).drop( - columns=["original_description"] - ).merge( - floor_u_values, - how="left", - left_on="floor-description", - right_on="original_description" - ).drop( - columns=["original_description"] - ).merge( - roof_u_values, - how="left", - left_on="roof-description", - right_on="original_description" - ).drop( - columns=["original_description"] - ).merge( lighting_proportions, how="left", left_on="lighting-description", right_on="original_description" - ).drop( - columns=["original_description"] - ) + ).drop(columns=["original_description"]) return model_data @@ -195,33 +187,59 @@ class SalModel: model_data[col] = model_data[col].astype('category') # Convert response - self.model_data[self.RESPONSE] = self.model_data[self.RESPONSE].astype(float) + model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float) self.model_data = model_data - def make_training_test(self): + def make_training_test(self, x): # Split into training and test - # Dummy data - pass + self.train_x, self.test_x, self.train_y, self.test_y = train_test_split( + x.drop(self.RESPONSE, axis=1), + x[self.RESPONSE], + test_size=self.test_size, + random_state=self.random_state + ) def fit_model(self): + + # Dummy out the categorical variables + x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True) + + # Convert booleans to integer + for col in x.columns: + if x[col].dtype == bool: + x[col] = x[col].astype(int) + + if x[col].dtype == object: + x[col] = x[col].astype(float) + + # Create the training and test sets for each run + self.make_training_test(x) + # Add a constant to the independent value - x1 = sm.add_constant(self.X) + train_x = sm.add_constant(self.train_x) # make regression model - model = sm.OLS(self.Y, x1) + model = sm.OLS(self.train_y, train_x) # fit model and print results self.results = model.fit() self.fit_error, self.worst["errors"] = self.calculate_regression_metrics( - y_true=self.Y, y_pred=self.results.fittedvalues + y_true=self.train_y, y_pred=self.results.fittedvalues ) self.model_data['fit'] = self.results.fittedvalues # The worst errors over index heavily for flats self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)] + self.fit_df = pd.DataFrame( + { + "fit": self.results.fittedvalues, + "actual": self.train_y + } + ).sort_values("actual", ascending=True) + @staticmethod def plot_regression(df): # Extract the "fit" and "actual" columns from the dataframe @@ -280,7 +298,7 @@ class SalModel: return metrics, worst_errors -self = SalModel( +self = SapModel( data=all_data["data"], cleaner=all_data["cleaner"] ) diff --git a/model_data/app.py b/model_data/app.py index 2fcf48a9..586337db 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -243,21 +243,6 @@ def handler(): # If these categorical variables are not of type 'category', convert them - # Dummy out the categorical variables - training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True) - - # Convert booleans to integer - for col in training_data.columns: - if training_data[col].dtype == bool: - training_data[col] = training_data[col].astype(int) - - if training_data[col].dtype == object: - training_data[col] = training_data[col].astype(float) - - # Assuming 'df' is your DataFrame - X = training_data.drop(columns=response) - Y = training_data[response] - print(results.summary()) import matplotlib.pyplot as plt @@ -281,15 +266,6 @@ def handler(): grouped_error = pd.DataFrame(grouped_error) grouped_error = grouped_error.sort_values("R2 Score", ascending=True) - fit_df = pd.DataFrame( - { - "fit": results.fittedvalues, - "actual": Y - } - ) - - # Sort on magnitude of actual - fit_df = fit_df.sort_values("actual", ascending=True) plot_regression(fit_df) model_data[["thermal_transmittance", response]].corr()