From 20d12c157a8d6066588a77347da10f538319ac22 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 23 Sep 2023 15:35:58 +0100
Subject: [PATCH] Added in coercing boolean-like columns to actual booleans

---
 .../simulation_system/core/DataProcessor.py   | 47 +++++++++++++++----
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py
index bf9b473f..1252f6c6 100644
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -509,7 +509,28 @@ class DataProcessor:
         return self.data[FIXED_FEATURES]
 
     @staticmethod
-    def difference_data(df):
+    def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
+        """
+        Coerce object columns whose non-null values are exactly 'True'/'False' (or True/False) to bool.
+
+        :param df: Input DataFrame; matching columns are overwritten on this same object.
+        :param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
+        :return: The DataFrame passed in, with matching columns coerced (missing values go through astype(bool)).
+        """
+        object_columns = df.select_dtypes(include=['object']).columns
+        if cols_to_ignore:
+            object_columns = [c for c in object_columns if c not in cols_to_ignore]
+
+        for column in object_columns:
+            unique_values = set(df[column].dropna().unique())
+            # astype(bool) alone turns 'False' into True (any non-empty string is truthy), so swap it for False first.
+            if unique_values in ({'True', 'False'}, {True, False}):
+                df[column] = df[column].map(lambda value: False if value == 'False' else value).astype(bool)
+
+        return df
+
+    @classmethod
+    def difference_data(cls, df: pd.DataFrame):
 
         """
         Given a dataframe and starting and ending columns, this function will convert the features to
@@ -521,12 +542,14 @@ class DataProcessor:
         for uvalue_col in uvalue_columns:
             df[uvalue_col] = pd.to_numeric(df[uvalue_col])
 
-        columns = {
-            x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
-                "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
-                "CARBON_STARTING", "UPRN", "CONSTITUENCY",
-            ]
-        }
+        key_columns = [
+            "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
+            "CARBON_STARTING", "UPRN", "CONSTITUENCY",
+        ]
+
+        ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
+
+        columns = {x for x in df.columns if x not in ignore_cols}
 
         non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
         non_numerical_columns = [col for col in non_numerical_columns if col in columns]
@@ -549,10 +572,15 @@ class DataProcessor:
                     no_diff_columns.append(col)
 
         if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
-            raise Exception("Something went wrong, potentially missed a differencing colunn")
+            raise Exception("Something went wrong, potentially missed a differencing column")
 
         datatypes = df.dtypes
 
+        # Note: We also difference columns like floor area and floor height. We should experiment with this.
+        # Starting floor area will heavily impact the starting SAP value, so that feature may already be captured
+        # by the starting value. To explain any change due to the new floor area, it may therefore be enough to
+        # just consider the difference; however, we can play around with this.
+
         # Do the differencing
         cols_to_append = {}
         for starting_col in diff_columns:
@@ -616,4 +644,7 @@ class DataProcessor:
         cols_to_append = pd.DataFrame(cols_to_append)
         df = pd.concat([df, cols_to_append], axis=1)
 
+        # Perform a final coercing of string True/False columns to boolean
+        df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
+
         return df