reverting to baseline performance which was just filling area with zero

2026-07-27 23:35:01 +00:00 · 2023-07-04 19:38:18 +01:00 · 2023-07-04 19:38:18 +01:00 · 0b41057db8
commit 0b41057db8
parent 68e903e492
1 changed files with 132 additions and 65 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -11,13 +11,16 @@ from sklearn.linear_model import Lasso
 from sklearn.preprocessing import StandardScaler
 import xgboost as xgb

+from statsmodels.stats.outliers_influence import variance_inflation_factor
+from tqdm import tqdm
+
 with open("all_data.pkl", "rb") as f:
    all_data = pickle.load(f)


 class SapModel:
    # We want to estimate for making improvements on different property components
-    RESPONSE = "environment-impact-current"
+    RESPONSE = "current-energy-efficiency"
    # We could potentially  build models by constituency to avoid having too many
    # features in the model
    BASE_FEATURES = [
@ -50,6 +53,12 @@ class SapModel:
        # "lighting-description"  # Might not need to use this
        "low-energy-lighting",
        "number-open-fireplaces",
+        "mainheatcont-description",
+        "fixed-lighting-outlets-count",
+        "floor-height",
+        "floor-level",
+        "total-floor-area",
+        "extension-count",
    ]

    CATEGORICAL_COLS = [
@ -69,8 +78,17 @@ class SapModel:
        "glazed-type",
        "glazed-area",
        "construction-age-band",
-        # Testing
-        "lighting-description"
+        "lighting-description",
+        "mainheatcont-description",
+        "floor-level",
+    ]
+
+    NUMERICAL_COLUMNS = [
+        "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
+        "fixed-lighting-outlets-count",
+        "floor-height",
+        "total-floor-area",
+        "extension-count",
    ]

    def __init__(self, data, cleaner, test_size=0.2, random_state=None):
@ -145,14 +163,82 @@ class SapModel:
        model_data = model_data.drop(columns=["transaction-type"])
        return model_data

-    @staticmethod
-    def _clean_numericals(model_data):
+    def _clean_numericals(self, model_data):

-        for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
+        # Try binning numericals
+        def bucket_and_fill(df, column_name, n_bins=10):
+            # Check if the column is numerical
+            if np.issubdtype(df[column_name].dtype, np.number):
+                # Create a new categorical column from numerical one by binning the data
+                df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
+                # Replace missing data with "NO_RECORD"
+                df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
+                df[column_name + "_bucket"] = np.where(
+                    df[column_name + "_bucket"] == "nan",
+                    "NO_RECORD",
+                    df[column_name + "_bucket"]
+                )
+            return df
+
+        bucket_variables = []
+        remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables]
+
+        for col in bucket_variables:
+            model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
+            model_data = bucket_and_fill(model_data, col)
+
+        # Replace the data with the binned version
+        model_data = model_data.drop(columns=bucket_variables)
+        model_data = model_data.rename(
+            columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables))
+        )
+
+        for col in remaining_numericals:
            model_data[col] = np.where(
                model_data[col] == "", "0", model_data[col]
            ).astype(float)

+        # Basic fill the rest of the columns with 0
+
+        # grouping_variables = ["constituency", "built-form", "property-type"]
+        # secondary_grouping_variables = ["constituency", "property-type"]
+        #
+        # def append_missings(model_data, col, grouping_variables, secondary=False):
+        #     if not secondary:
+        #         numerical_data = model_data[(model_data[col] != "")].copy()
+        #     else:
+        #         numerical_data = model_data[~pd.isnull(model_data[col])].copy()
+        #
+        #     numerical_data[col] = numerical_data[col].astype(float)
+        #     numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index()
+        #     numerical_data = numerical_data.rename(columns={col: f"{col}_avg"})
+        #     model_data = model_data.merge(numerical_data, how="left", on=grouping_variables)
+        #
+        #     if not secondary:
+        #         model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col])
+        #     else:
+        #         model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col])
+        #
+        #     model_data = model_data.drop(columns=[f"{col}_avg"])
+        #     model_data[col] = model_data[col].astype(float)
+        #
+        #     return model_data
+        #
+        # for col in numerical_cols:
+        #     model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
+        #     model_data[col] = model_data[col].fillna(model_data[col].mean())
+        #     model_data = append_missings(model_data, col, grouping_variables)
+        #
+        #     if pd.isnull(model_data[col]).sum():
+        #         model_data = append_missings(model_data, col, secondary_grouping_variables, True)
+        #
+        #     if pd.isnull(model_data[col]).sum():
+        #         raise Exception("Cleaning failed, missing values remain")
+        #
+        #     model_data[col] = np.where(
+        #         model_data[col] == "", "0", model_data[col]
+        #     ).astype(float)
+
        return model_data

    @staticmethod
@ -212,7 +298,7 @@ class SapModel:

        features = [
            x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
-                "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx"
+                "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap"
            ] if x not in exclude_features
        ]

@ -251,7 +337,9 @@ class SapModel:
    def fit_model(self):

        # Dummy out the categorical variables
-        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True)
+        binned = []
+
+        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True)

        # Convert booleans to integer
        for col in x.columns:
@ -276,16 +364,6 @@ class SapModel:
        train_x = train_x.drop(columns=["idx"])
        test_x = test_x.drop(columns=["idx"])

-        # importance_df = self.make_importance(train_x)
-        # Test dropping the least important features
-        # to_drop = importance_df.tail(2)["Feature"].values
-        # Dropping this is a good idea
-        to_drop = [
-            "hotwater-description_Electric immersion, off-peak",
-        ]
-        train_x = train_x.drop(columns=to_drop)
-        test_x = test_x[train_x.columns]
-
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.inspection import permutation_importance

@ -342,13 +420,13 @@ class SapModel:
        ).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")

        # temp hardcoded values
-        best_fit = {'MAPE': 0.042824355225087686, 'Mean Squared Error': 21.49263731368226,
-                    'Mean Absolute Error': 3.298755911054327, 'R2 Score': 0.794118580154128,
-                    'Explained Variance Score': 0.794118580154128, 'Median Absolute Error': 2.426789554039914}
+        best_fit = {'MAPE': 0.04617542805587113, 'Mean Squared Error': 18.62306128026334,
+                    'Mean Absolute Error': 2.865262003625814, 'R2 Score': 0.8008316762496143,
+                    'Explained Variance Score': 0.8008316762496143, 'Median Absolute Error': 1.911197425417548}

-        best_predict = {'MAPE': 0.04413439429441669, 'Mean Squared Error': 22.700373062051142,
-                        'Mean Absolute Error': 3.3961241443022008, 'R2 Score': 0.750296045867001,
-                        'Explained Variance Score': 0.7503518147827141, 'Median Absolute Error': 2.4442017110145855}
+        best_predict = {'MAPE': 0.04358926901734807, 'Mean Squared Error': 21.197491698961528,
+                        'Mean Absolute Error': 3.046853690257838, 'R2 Score': 0.7215087343364782,
+                        'Explained Variance Score': 0.7215726927575035, 'Median Absolute Error': 1.921094388694634}

        def check_successes(experiment_error, best_error):

@ -485,38 +563,7 @@ class SapModel:
        worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
        worst_x = worst_x.sort_values("Absolute Residual", ascending=False)

-    def make_importance(self, train_x):
-
-        # Create a DMatrix from your training data
-        dtrain = xgb.DMatrix(train_x, label=self.train_y)
-
-        # Set the parameters for the XGBoost model
-        params = {
-            'objective': 'reg:squarederror',
-            'eval_metric': 'rmse'
-        }
-
-        # Train the XGBoost model
-        model = xgb.train(params, dtrain)
-
-        # Get feature importance scores
-        importance_scores = model.get_score(importance_type='gain')
-
-        # Create a dataframe with feature names and importance scores
-        importance_df = pd.DataFrame({
-            'Feature': importance_scores.keys(),
-            'Importance': importance_scores.values()
-        })
-
-        # Sort the dataframe by importance score in descending order
-        importance_df = importance_df.sort_values(by='Importance', ascending=False)
-
-        # Print the feature importances
-        return importance_df
-
    def detect_multi_collinearity(self):
-        from statsmodels.stats.outliers_influence import variance_inflation_factor
-        from tqdm import tqdm
        # Get the VIFs for each variable
        vifs = pd.DataFrame()
        vifs["features"] = self.train_x.columns
@ -616,14 +663,14 @@ class SapModel:
        Returns:
            dict: Dictionary containing the calculated metrics.
        """
-        metrics = {}
-
-        metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
-        metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
-        metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
-        metrics['R2 Score'] = r2_score(y_true, y_pred)
-        metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
-        metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
+        metrics = {
+            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
+            'Mean Squared Error': mean_squared_error(y_true, y_pred),
+            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
+            'R2 Score': r2_score(y_true, y_pred),
+            'Explained Variance Score': explained_variance_score(y_true, y_pred),
+            'Median Absolute Error': median_absolute_error(y_true, y_pred)
+        }

        errors = pd.DataFrame()
        errors['Fit'] = y_true
@ -645,20 +692,40 @@ from sklearn.ensemble import RandomForestRegressor

 rf = RandomForestRegressor(random_state=self.random_state)
 X = self.df.drop(columns=self.RESPONSE)
-for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
+X = X.drop(
+    columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating",
+             "environment-impact-potential", "heating-cost-current", "address3", "local-authority-label",
+             "hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2",
+             "posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn",
+             "energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key",
+             "potential-energy-efficiency",
+             "energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1",
+             "constituency-label",
+             "building-reference-number", "environment-impact-current", "co2-emissions-current",
+             ])
+# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length,
+# fixed-lighting-outlets-count
+for col in [
+    "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height",
+    "low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area"
+]:
    X[col] = np.where(
        X[col] == "", "0", X[col]
    ).astype(float)

 Y = self.df[self.RESPONSE]
+X = pd.get_dummies(X)
 rf.fit(X, Y)

 # Print the name and importance of each feature
 importance_df = []
-for feature, importance in zip(train_x.columns, rf.feature_importances_):
+for feature, importance in zip(X.columns, rf.feature_importances_):
    importance_df.append(
        {
            "Feature": feature,
            "rf_importance": importance
        }
    )
+
+importance_df = pd.DataFrame(importance_df)
+importance_df = importance_df.sort_values("rf_importance", ascending=False)