Added coercion of boolean-like columns to actual booleans

2026-07-27 23:35:01 +00:00 · 2023-09-23 15:35:58 +01:00 · 2023-09-23 15:35:58 +01:00 · 20d12c157a
commit 20d12c157a
parent dd90e280ca
1 changed files with 39 additions and 8 deletions
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -509,7 +509,28 @@ class DataProcessor:
        return self.data[FIXED_FEATURES]

    @staticmethod
-    def difference_data(df):
+    def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
+        """
+        Coerce object columns holding only True/False (as strings or bools) to real booleans.
+
+        :param df: Input DataFrame; matching columns are overwritten in place, NaN stays missing.
+        :param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
+        :return: The same DataFrame with coerced columns.
+        """
+        # Explicit lookup: astype(bool) would turn the non-empty string 'False' (and NaN) into True
+        bool_lookup = {'True': True, 'False': False, True: True, False: False}
+        object_columns = df.select_dtypes(include=['object']).columns
+        if cols_to_ignore:
+            object_columns = [c for c in object_columns if c not in cols_to_ignore]
+
+        for column in object_columns:
+            if set(df[column].dropna().unique()) in ({'True', 'False'}, {True, False}):
+                df[column] = df[column].map(bool_lookup)
+
+        return df
+
+    @classmethod
+    def difference_data(cls, df: pd.DataFrame):

        """
        Given a dataframe and starting and ending columns, this function will convert the features to
@@ -521,12 +542,14 @@ class DataProcessor:
        for uvalue_col in uvalue_columns:
            df[uvalue_col] = pd.to_numeric(df[uvalue_col])

-        columns = {
-            x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
-                "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
-                "CARBON_STARTING", "UPRN", "CONSTITUENCY",
-            ]
-        }
+        key_columns = [
+            "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
+            "CARBON_STARTING", "UPRN", "CONSTITUENCY",
+        ]
+
+        ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
+
+        columns = {x for x in df.columns if x not in ignore_cols}

        non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
        non_numerical_columns = [col for col in non_numerical_columns if col in columns]
@@ -549,10 +572,15 @@ class DataProcessor:
                    no_diff_columns.append(col)

        if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
-            raise Exception("Something went wrong, potentially missed a differencing colunn")
+            raise Exception("Something went wrong, potentially missed a differencing column")

        datatypes = df.dtypes

+        # Note: We also difference columns like floor area and floor height. We should experiment with this.
+        # Starting floor area will heavily impact the starting SAP value, so that feature may already be
+        # encapsulated by the starting value. To explain any differences due to the new floor area, it may
+        # therefore be enough to just consider the difference; however, we can play around with this.
+
        # Do the differencing
        cols_to_append = {}
        for starting_col in diff_columns:
@@ -616,4 +644,7 @@ class DataProcessor:
        cols_to_append = pd.DataFrame(cols_to_append)
        df = pd.concat([df, cols_to_append], axis=1)

+        # Perform a final coercing of string True/False columns to boolean
+        df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
+
        return df