PoC SAP model (WIP); probably needs the full panel of data

2026-07-27 23:35:01 +00:00 · 2023-06-30 18:34:41 +01:00 · 2023-06-30 18:34:41 +01:00 · 8c55df82fa
commit 8c55df82fa
parent cbfb9a5a93
2 changed files with 99 additions and 3 deletions
--- a/model_data/app.py
+++ b/model_data/app.py
@ -232,6 +232,8 @@ def handler():

    # We want to estimate for making improvements on different property components
    response = "environment-impact-current"
+    # We could potentially build models by constituency to avoid having too many
+    # features in the model
    base_features = [
        "property-type",
        "built-form",
@ -258,14 +260,33 @@ def handler():
    )
    # Take just entries with U-values
    model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
+    model_data = model_data[base_features + ["thermal_transmittance", response]]

    # We need to split the data into a train and test set for model build
+    categorical_cols = [
+        "property-type", "built-form", "number-habitable-rooms", "constituency",
+    ]
+
+    # If these categorical variables are not of type 'category', convert them
+    for col in categorical_cols:
+        model_data[col] = model_data[col].astype('category')
+
+    # Dummy out the categorical variables
+    training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
+
+    # Convert boolean columns to integer and object columns to float
+    for col in training_data.columns:
+        if training_data[col].dtype == bool:
+            training_data[col] = training_data[col].astype(int)
+
+        if training_data[col].dtype == object:
+            training_data[col] = training_data[col].astype(float)

    import statsmodels.api as sm

    # Assuming 'df' is your DataFrame
-    X = model_data[base_features + ["thermal_transmittance"]]
-    Y = model_data[response]
+    X = training_data.drop(columns=response)
+    Y = training_data[response]

    # Add a constant to the independent value
    X1 = sm.add_constant(X)
@ -277,6 +298,80 @@ def handler():
    results = model.fit()
    print(results.summary())

+    import matplotlib.pyplot as plt
+    import numpy as np
+    def plot_regression(df):
+        """Plot the 'fit' and 'actual' columns of df against row position.
+
+        The x-axis is 0..len(df)-1, so the row order of df is what is drawn.
+        """
+        fitted, observed = df['fit'], df['actual']
+        positions = np.arange(len(df))
+
+        # Fitted values in red, observed values in blue
+        for series, colour, legend_name in (
+            (fitted, 'red', 'Fit'),
+            (observed, 'blue', 'Actual'),
+        ):
+            plt.plot(positions, series, color=colour, label=legend_name)
+
+        # Axis labels, title and legend
+        plt.xlabel('Index')
+        plt.ylabel('Value')
+        plt.title('Linear Regression - Fit vs Actual')
+        plt.legend()
+
+        plt.show()
+
+    import numpy as np
+    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
+        median_absolute_error
+
+    def calculate_regression_metrics(y_true, y_pred, n=20):
+        """
+        Compute five regression accuracy metrics and the n worst-fitted rows.
+
+        Args:
+            y_true (array-like): Array of true target values.
+            y_pred (array-like): Array of predicted target values.
+            n (int): Number of largest absolute residuals to return.
+
+        Returns:
+            tuple: (dict of metric name -> value, DataFrame of the n worst rows).
+        """
+        metrics = {
+            'Mean Squared Error': mean_squared_error(y_true, y_pred),
+            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
+            'R2 Score': r2_score(y_true, y_pred),
+            'Explained Variance Score': explained_variance_score(y_true, y_pred),
+            'Median Absolute Error': median_absolute_error(y_true, y_pred),
+        }
+
+        # Observed values go in 'Actual' and predictions in 'Fit' (these labels
+        # were previously swapped), so Residual is actual minus fitted.
+        errors = pd.DataFrame()
+        errors['Actual'] = y_true
+        errors['Fit'] = y_pred
+        errors['Residual'] = errors['Actual'] - errors['Fit']
+        errors['Absolute Residual'] = np.abs(errors['Residual'])
+        return metrics, errors.nlargest(n, 'Absolute Residual')
+
+    fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
+
+    worst_x = model_data[model_data.index.isin(worst_errors.index)]
+    # The worst errors over-index heavily on flats
+
+    fit_df = pd.DataFrame(
+        {
+            "fit": results.fittedvalues,
+            "actual": Y
+        }
+    )
+
+    # Sort on magnitude of actual
+    fit_df = fit_df.sort_values("actual", ascending=True)
+    plot_regression(fit_df)
+
    model_data[["thermal_transmittance", response]].corr()

    summary = model_data.groupby(["property-type", "built-form"], observed=True)[
--- a/model_data/requirements.txt
+++ b/model_data/requirements.txt
@ -15,4 +15,5 @@ pint
 geopandas
 mip
 seaborn
-statsmodels
+statsmodels
+scikit-learn