first working version of sap model

2026-06-30 13:10:47 +00:00 · 2023-07-04 10:00:15 +01:00 · 2023-07-04 10:00:15 +01:00 · ff84635cb8
commit ff84635cb8
parent d586441769
2 changed files with 74 additions and 80 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -3,6 +3,8 @@ import pandas as pd
 import statsmodels.api as sm
 import matplotlib.pyplot as plt
 import pickle
 from typing import Any, Dict, Tuple
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
    median_absolute_error, mean_absolute_percentage_error
@ -10,7 +12,7 @@ with open("all_data.pkl", "rb") as f:
    all_data = pickle.load(f)
-class SalModel:
+class SapModel:
    # We want to estimate for making improvements on different property components
    RESPONSE = "environment-impact-current"
    # We could potentially build models by constituency to avoid having too many
@ -64,81 +66,71 @@ class SalModel:
        "windows-description",
        "glazed-type",
        "glazed-area",
-        "mainheat-description",
+        "construction-age-band",
    ]
-    def __init__(self, data, cleaner):
+    def __init__(self, data, cleaner, test_size=0.2, random_state=None):
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner
        self.random_state = random_state if random_state is not None else 42
        self.test_size = 0.2 if test_size is None else test_size
        self.model_data = None
        self.train_x = None
        self.train_y = None
        self.test_x = None
        self.test_y = None
        self.results = None
        self.model_data = None
        self.fit_error = None
        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
        self.fit_df = None
-    def _append_cleaned_data(self, model_data):
+    def run(self, plot=False):
        """
-        We need to estimate the u-value impact for:
+        A pipeline method to run all necessary methods in correct order.
        1) Walls
        2) Roof
        3) Floors
        We append this data on
        Additionally, we append on the extracted proportion of low energy lighting, which
        is more reliably extracted than by using the low-energy-lighting column
        """
        try:
            self.create_dataset()
            self.fit_model()
            if plot:
                self.plot_regression(self.fit_df)
        except Exception as e:
            print("An error occurred during execution.")
            print(str(e))
-        wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[
+    def _merge_with_u_values(
-            ["original_description", "thermal_transmittance"]].rename(
+        self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
-            columns={"thermal_transmittance": "walls_u_value"}
+    ) -> pd.DataFrame:
        u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
            ["original_description", thermal_transmittance]].rename(
            columns={thermal_transmittance: f"{description}_u_value"}
        )
-        floor_u_values = pd.DataFrame(self.cleaner.cleaned["floor-description"])[
+        model_data = model_data.merge(
-            ["original_description", "thermal_transmittance"]].rename(
+            u_values,
-            columns={"thermal_transmittance": "floor_u_value"}
+            how="left",
-        )
+            left_on=f"{description}-description",
            right_on="original_description"
        ).drop(columns=["original_description"])
-        roof_u_values = pd.DataFrame(self.cleaner.cleaned["roof-description"])[
+        return model_data
            ["original_description", "thermal_transmittance"]].rename(
            columns={"thermal_transmittance": "roof_u_value", }
        )
    def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
        for description in ["walls", "floor", "roof"]:
            model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
        # lighting_proportions added separately as it doesn't use the _merge_with_u_values method
        lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
            ["original_description", "low_energy_proportion"]]
        model_data = model_data.merge(
            wall_u_values,
            how="left",
            left_on="walls-description",
            right_on="original_description"
        ).drop(
            columns=["original_description"]
        ).merge(
            floor_u_values,
            how="left",
            left_on="floor-description",
            right_on="original_description"
        ).drop(
            columns=["original_description"]
        ).merge(
            roof_u_values,
            how="left",
            left_on="roof-description",
            right_on="original_description"
        ).drop(
            columns=["original_description"]
        ).merge(
            lighting_proportions,
            how="left",
            left_on="lighting-description",
            right_on="original_description"
-        ).drop(
+        ).drop(columns=["original_description"])
            columns=["original_description"]
        )
        return model_data
@ -195,33 +187,59 @@ class SalModel:
            model_data[col] = model_data[col].astype('category')
        # Convert response
-        self.model_data[self.RESPONSE] = self.model_data[self.RESPONSE].astype(float)
+        model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)
        self.model_data = model_data
-    def make_training_test(self):
+    def make_training_test(self, x):
        # Split into training and test
-        # Dummy data
+        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
-        pass
+            x.drop(self.RESPONSE, axis=1),
            x[self.RESPONSE],
            test_size=self.test_size,
            random_state=self.random_state
        )
    def fit_model(self):
        # Dummy out the categorical variables
        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True)
        # Convert booleans to integer
        for col in x.columns:
            if x[col].dtype == bool:
                x[col] = x[col].astype(int)
            if x[col].dtype == object:
                x[col] = x[col].astype(float)
        # Create the training and test sets for each run
        self.make_training_test(x)
        # Add a constant (intercept) column to the independent variables
-        x1 = sm.add_constant(self.X)
+        train_x = sm.add_constant(self.train_x)
        # make regression model
-        model = sm.OLS(self.Y, x1)
+        model = sm.OLS(self.train_y, train_x)
        # fit model and store the results
        self.results = model.fit()
        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
-            y_true=self.Y, y_pred=self.results.fittedvalues
+            y_true=self.train_y, y_pred=self.results.fittedvalues
        )
        self.model_data['fit'] = self.results.fittedvalues
        # The worst errors over-index heavily for flats
        self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
        self.fit_df = pd.DataFrame(
            {
                "fit": self.results.fittedvalues,
                "actual": self.train_y
            }
        ).sort_values("actual", ascending=True)
    @staticmethod
    def plot_regression(df):
        # Extract the "fit" and "actual" columns from the dataframe
@ -280,7 +298,7 @@ class SalModel:
        return metrics, worst_errors
-self = SalModel(
+self = SapModel(
    data=all_data["data"],
    cleaner=all_data["cleaner"]
 )
--- a/model_data/app.py
+++ b/model_data/app.py
@ -243,21 +243,6 @@ def handler():
    # If these categorical variables are not of type 'category', convert them
    # Dummy out the categorical variables
    training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
    # Convert booleans to integer
    for col in training_data.columns:
        if training_data[col].dtype == bool:
            training_data[col] = training_data[col].astype(int)
        if training_data[col].dtype == object:
            training_data[col] = training_data[col].astype(float)
    # Assuming 'df' is your DataFrame
    X = training_data.drop(columns=response)
    Y = training_data[response]
    print(results.summary())
    import matplotlib.pyplot as plt
@ -281,15 +266,6 @@ def handler():
    grouped_error = pd.DataFrame(grouped_error)
    grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
    fit_df = pd.DataFrame(
        {
            "fit": results.fittedvalues,
            "actual": Y
        }
    )
    # Sort on magnitude of actual
    fit_df = fit_df.sort_values("actual", ascending=True)
    plot_regression(fit_df)
    model_data[["thermal_transmittance", response]].corr()