creating rough framework for sap model

2026-08-02 21:08:24 +00:00 · 2023-07-03 13:01:41 +01:00 · 2023-07-03 13:01:41 +01:00 · f941a3c512
commit f941a3c512
parent c08f74cefb
2 changed files with 170 additions and 134 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -0,0 +1,170 @@
+import numpy as np
+import pandas as pd
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+import pickle
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
+    median_absolute_error, mean_absolute_percentage_error
+
+# Load the pre-assembled dataset at import time.
+# The file has to be opened for *reading* in binary mode: the previous "wb"
+# mode truncated all_data.pkl to zero bytes and handed pickle.load() a
+# write-only handle, so the load could never succeed.
+# NOTE(review): unpickling can execute arbitrary code - only point this at a
+# file written by a trusted process.
+with open("all_data.pkl", "rb") as f:
+    all_data = pickle.load(f)
+
+
+class SalModel:
+    """Ordinary-least-squares model of the ``environment-impact-current`` score.
+
+    The instance wraps a table of raw property records plus a ``cleaner``
+    object. ``cleaner.cleaned`` is indexed by component description column
+    (``"walls-description"``, ``"floor-description"``) and each entry must be
+    convertible to a DataFrame with ``original_description`` and
+    ``thermal_transmittance`` columns.
+
+    NOTE(review): the module is SapModel.py, so this class name looks like a
+    typo for ``SapModel``. It is left unchanged so existing imports keep working.
+    """
+
+    # We want to estimate for making improvements on different property components
+    RESPONSE = "environment-impact-current"
+    # We could potentially build models by constituency to avoid having too many
+    # features in the model
+    BASE_FEATURES = [
+        "property-type",
+        "built-form",
+        # "construction-age-band",
+        "number-habitable-rooms",
+        "constituency",
+        "number-heated-rooms",
+    ]
+
+    COMPONENT_FEATURES = [
+        "walls-description",
+        "floor-description",
+        "lighting-description",
+        "windows-description",
+        "roof-description",
+        "mainheat-description",
+        "main-fuel"
+    ]
+
+    # Columns that create_dataset() casts to the pandas 'category' dtype.
+    CATEGORICAL_COLS = [
+        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
+        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
+
+    ]
+
+    # Components whose free-text description is replaced by a numeric U-value,
+    # mapped to the name of the resulting column.
+    _U_VALUE_COLUMNS = {
+        "walls-description": "walls_u_value",
+        "floor-description": "floor_u_value",
+    }
+
+    def __init__(self, data, cleaner):
+        """Store the raw records and the cleaner; nothing is computed yet.
+
+        Args:
+            data: Anything ``pd.DataFrame`` accepts; must contain the
+                ``RESPONSE``, ``BASE_FEATURES`` and ``COMPONENT_FEATURES`` columns
+                by the time ``create_dataset`` is called.
+            cleaner: Object exposing the ``cleaned`` mapping described on the class.
+        """
+        self.df = pd.DataFrame(data)
+        self.cleaner = cleaner
+
+        self.train_x = None
+        self.train_y = None
+        self.results = None
+        self.model_data = None
+        self.fit_error = None
+        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
+
+    def _u_value_lookup(self, component, u_value_column):
+        """Return the description -> U-value lookup table for one component."""
+        lookup = pd.DataFrame(self.cleaner.cleaned[component])
+        return lookup[["original_description", "thermal_transmittance"]].rename(
+            columns={"thermal_transmittance": u_value_column}
+        )
+
+    def create_dataset(self):
+        """Build the modelling table and store it on ``self.model_data``.
+
+        Wall and floor descriptions are swapped for their U-value estimates,
+        rows missing either U-value are dropped, and the columns listed in
+        ``CATEGORICAL_COLS`` are cast to ``category``.
+
+        Returns:
+            pd.DataFrame: the table that was stored on ``self.model_data``
+            (previously the result was built and then thrown away).
+        """
+        model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
+        model_data = model_data.reset_index(drop=True)
+
+        # Append on u-value estimates
+        for component, u_value_column in self._U_VALUE_COLUMNS.items():
+            model_data = model_data.merge(
+                self._u_value_lookup(component, u_value_column),
+                how="left",
+                left_on=component,
+                right_on="original_description",
+            ).drop(columns=["original_description"])
+
+        u_value_columns = list(self._U_VALUE_COLUMNS.values())
+
+        # Take just entries with U-values
+        model_data = model_data[model_data[u_value_columns].notnull().all(axis=1)]
+
+        remaining_components = [
+            c for c in self.COMPONENT_FEATURES if c not in self._U_VALUE_COLUMNS
+        ]
+        # .copy() so the dtype casts below act on an independent frame rather
+        # than on a slice of the merged table (avoids SettingWithCopyWarning).
+        model_data = model_data[
+            self.BASE_FEATURES + remaining_components + u_value_columns + [self.RESPONSE]
+        ].copy()
+
+        for col in self.CATEGORICAL_COLS:
+            model_data[col] = model_data[col].astype('category')
+
+        self.model_data = model_data
+        return model_data
+
+    def make_training_test(self):
+        """Placeholder for building ``train_x`` / ``train_y``; currently a no-op."""
+        # TODO: dummy-encode the categorical columns and split into training
+        # and test sets, then populate self.train_x / self.train_y.
+
+    def _training_inputs(self):
+        """Return ``(x, y)`` for fitting, or raise if they have not been set."""
+        # ``X`` / ``Y`` are honoured as a fallback because earlier revisions of
+        # fit_model read those attribute names directly.
+        x = self.train_x if self.train_x is not None else getattr(self, "X", None)
+        y = self.train_y if self.train_y is not None else getattr(self, "Y", None)
+        if x is None or y is None:
+            raise ValueError(
+                "No training data: set train_x and train_y before calling fit_model()"
+            )
+        return x, y
+
+    def fit_model(self):
+        """Fit an OLS regression with intercept and record its diagnostics.
+
+        Populates ``results``, ``fit_error``, ``worst['errors']`` and
+        ``worst['x']``, and adds a ``fit`` column to ``model_data``.
+
+        Raises:
+            ValueError: if the training inputs or ``model_data`` are missing.
+        """
+        x, y = self._training_inputs()
+        if self.model_data is None:
+            raise ValueError("No model data: call create_dataset() before fit_model()")
+
+        # Add a constant to the independent value
+        design = sm.add_constant(x)
+
+        # make regression model and fit it
+        self.results = sm.OLS(y, design).fit()
+
+        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
+            y_true=y, y_pred=self.results.fittedvalues
+        )
+
+        self.model_data['fit'] = self.results.fittedvalues
+        # The worst errors over index heavily for flats
+        self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
+
+    @staticmethod
+    def plot_regression(df):
+        """Plot the ``fit`` and ``actual`` columns of ``df`` against row position.
+
+        Args:
+            df (pd.DataFrame): must contain ``fit`` and ``actual`` columns.
+        """
+        # Extract the "fit" and "actual" columns from the dataframe
+        fit = df['fit']
+        actual = df['actual']
+
+        # Create an array of x-values (assumed to be sequential integers)
+        x = np.arange(len(df))
+
+        # Plot the fit and actual data
+        plt.plot(x, fit, color='red', label='Fit')
+        plt.plot(x, actual, color='blue', label='Actual')
+
+        # Set labels and title
+        plt.xlabel('Index')
+        plt.ylabel('Value')
+        plt.title('Linear Regression - Fit vs Actual')
+
+        # Display legend
+        plt.legend()
+
+        # Show the plot
+        plt.show()
+
+    @staticmethod
+    def calculate_regression_metrics(y_true, y_pred, n=20):
+        """
+        Calculate accuracy metrics for a regression and find its worst rows.
+
+        Args:
+            y_true (array-like): Array of true target values.
+            y_pred (array-like): Array of predicted target values.
+            n (int): Number of largest-absolute-residual rows to return.
+
+        Returns:
+            tuple: ``(metrics, worst_errors)`` where ``metrics`` is a dict of
+            the calculated metrics and ``worst_errors`` is a DataFrame with
+            ``Actual``, ``Fit``, ``Residual`` (actual minus fit) and
+            ``Absolute Residual`` columns for the ``n`` worst rows.
+        """
+        metrics = {
+            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
+            'Mean Squared Error': mean_squared_error(y_true, y_pred),
+            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
+            'R2 Score': r2_score(y_true, y_pred),
+            'Explained Variance Score': explained_variance_score(y_true, y_pred),
+            'Median Absolute Error': median_absolute_error(y_true, y_pred),
+            'Mean True Value': np.mean(y_true),
+            'Mean Predicted Value': np.mean(y_pred),
+        }
+
+        # The observed values are the "Actual" column and the predictions are
+        # the "Fit" column (these two labels used to be swapped).
+        errors = pd.DataFrame()
+        errors['Actual'] = y_true
+        errors['Fit'] = y_pred
+        errors['Residual'] = errors['Actual'] - errors['Fit']
+        errors['Absolute Residual'] = np.abs(errors['Residual'])
+
+        worst_errors = errors.nlargest(n, 'Absolute Residual')
+
+        return metrics, worst_errors
--- a/model_data/app.py
+++ b/model_data/app.py
@ -239,71 +239,9 @@ def handler():
    pd.set_option('display.width', 1000)
    df = pd.DataFrame(data)

-    # We want to estimate for making improvements on different property components
-    response = "environment-impact-current"
-    # We could potentially  build models by constituency to avoid having too many
-    # features in the model
-    base_features = [
-        "property-type",
-        "built-form",
-        # "construction-age-band",
-        "number-habitable-rooms",
-        "constituency",
-        "number-heated-rooms",
-    ]
-
-    component_features = [
-        "walls-description",
-        "floor-description",
-        "lighting-description",
-        "windows-description",
-        "roof-description",
-        "mainheat-description",
-        "main-fuel"
-    ]
-
-    model_data = df[[response] + component_features + base_features]
-    model_data = model_data.reset_index(drop=True)
-    model_data["idx"] = model_data.index.copy()
-
-    # Append on u-value estimates
-    model_data = model_data.merge(
-        pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
-            columns={"thermal_transmittance": "walls_u_value", }
-        ),
-        how="left",
-        left_on="walls-description",
-        right_on="original_description"
-    ) \
-        .drop(columns=["original_description"]) \
-        .merge(
-        pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
-            columns={"thermal_transmittance": "floor_u_value", }
-        ),
-        how="left",
-        left_on="floor-description",
-        right_on="original_description"
-    )
-    # Take just entries with U-values
-    model_data = model_data[
-        ~pd.isnull(model_data["walls_u_value"]) &
-        ~pd.isnull(model_data["floor_u_value"])
-        ]
-    model_data = model_data[
-        base_features + [c for c in component_features if c not in [
-            "walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
-        ]
-
    # We need to split the data into a train and test set for model build
-    categorical_cols = [
-        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
-        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
-
-    ]

    # If these categorical variables are not of type 'category', convert them
-    for col in categorical_cols:
-        model_data[col] = model_data[col].astype('category')

    # Dummy out the categorical variables
    training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
@ -316,88 +254,16 @@ def handler():
        if training_data[col].dtype == object:
            training_data[col] = training_data[col].astype(float)

-    import statsmodels.api as sm
-
    # Assuming 'df' is your DataFrame
    X = training_data.drop(columns=response)
    Y = training_data[response]

-    # Add a constant to the independent value
-    X1 = sm.add_constant(X)
-
-    # make regression model
-    model = sm.OLS(Y, X1)
-
-    # fit model and print results
-    results = model.fit()
    print(results.summary())

    import matplotlib.pyplot as plt
    import numpy as np
-    def plot_regression(df):
-        # Extract the "fit" and "actual" columns from the dataframe
-        fit = df['fit']
-        actual = df['actual']
-
-        # Create an array of x-values (assumed to be sequential integers)
-        x = np.arange(len(df))
-
-        # Plot the fit and actual data
-        plt.plot(x, fit, color='red', label='Fit')
-        plt.plot(x, actual, color='blue', label='Actual')
-
-        # Set labels and title
-        plt.xlabel('Index')
-        plt.ylabel('Value')
-        plt.title('Linear Regression - Fit vs Actual')
-
-        # Display legend
-        plt.legend()
-
-        # Show the plot
-        plt.show()

    import numpy as np
-    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
-        median_absolute_error, mean_absolute_percentage_error
-
-    def calculate_regression_metrics(y_true, y_pred, n=20):
-        """
-        Calculate the 5 most important accuracy metrics for regression.
-
-        Args:
-            y_true (array-like): Array of true target values.
-            y_pred (array-like): Array of predicted target values.
-
-        Returns:
-            dict: Dictionary containing the calculated metrics.
-        """
-        metrics = {}
-
-        metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
-        metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
-        metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
-        metrics['R2 Score'] = r2_score(y_true, y_pred)
-        metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
-        metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
-        metrics['Mean True Value'] = y_true.mean()
-        metrics['Mean Predicted Value'] = y_pred.mean()
-
-        errors = pd.DataFrame()
-        errors['Fit'] = y_true
-        errors['Actual'] = y_pred
-        errors['Residual'] = errors['Actual'] - errors['Fit']
-        errors['Absolute Residual'] = np.abs(errors['Residual'])
-
-        worst_errors = errors.nlargest(n, 'Absolute Residual')
-
-        return metrics, worst_errors
-
-    fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
-
-    model_data['fit'] = results.fittedvalues
-    # The worst errors over index heavily for flats
-    worst_x = model_data[model_data.index.isin(worst_errors.index)]

    # Notes
    # TODO: We might want to look at adding in the u-value estimates for the properties that do not have them