From 20d12c157a8d6066588a77347da10f538319ac22 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 23 Sep 2023 15:35:58 +0100 Subject: [PATCH] Added in coercing boolean-like columns to actual booleans --- .../simulation_system/core/DataProcessor.py | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py index bf9b473f..1252f6c6 100644 --- a/model_data/simulation_system/core/DataProcessor.py +++ b/model_data/simulation_system/core/DataProcessor.py @@ -509,7 +509,28 @@ class DataProcessor: return self.data[FIXED_FEATURES] @staticmethod - def difference_data(df): + def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None): + """ + Coerce columns with string 'True'/'False' values to boolean columns. + + :param df: Input DataFrame. + :param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids + :return: The same DataFrame (modified in place) with the boolean-like columns coerced. 
+ """ + object_columns = df.select_dtypes(include=['object']).columns + if cols_to_ignore: + object_columns = [c for c in object_columns if c not in cols_to_ignore] + + for column in object_columns: + unique_values = df[column].dropna().unique() + # If the unique values are exactly True/False, map explicitly (astype(bool) turns 'False' into True) + if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}: + df[column] = df[column].map({'True': True, 'False': False, True: True, False: False}) + + return df + + @classmethod + def difference_data(cls, df: pd.DataFrame): """ Given a dataframe and starting and ending columns, this function will convert the features to @@ -521,12 +542,14 @@ class DataProcessor: for uvalue_col in uvalue_columns: df[uvalue_col] = pd.to_numeric(df[uvalue_col]) - columns = { - x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [ - "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING", - "CARBON_STARTING", "UPRN", "CONSTITUENCY", - ] - } + key_columns = [ + "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING", + "CARBON_STARTING", "UPRN", "CONSTITUENCY", + ] + + ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns + + columns = {x for x in df.columns if x not in ignore_cols} non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist() non_numerical_columns = [col for col in non_numerical_columns if col in columns] @@ -549,10 +572,15 @@ class DataProcessor: no_diff_columns.append(col) if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns): - raise Exception("Something went wrong, potentially missed a differencing colunn") + raise Exception("Something went wrong, potentially missed a differencing column") datatypes = df.dtypes + # Note: We also difference columns like floor area and floor height. We should experiment with this. 
+ # Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by + # the starting value, therefore to explain any differences in the new floor area, it may be enough to + # just consider the difference however we can play around with this. + # Do the differencing cols_to_append = {} for starting_col in diff_columns: @@ -616,4 +644,7 @@ class DataProcessor: cols_to_append = pd.DataFrame(cols_to_append) df = pd.concat([df, cols_to_append], axis=1) + # Perform a final coercing of string True/False columns to boolean + df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns) + return df