Added in coercing boolean-like columns to actual booleans

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-23 15:35:58 +01:00
parent dd90e280ca
commit 20d12c157a

View file

@ -509,7 +509,28 @@ class DataProcessor:
return self.data[FIXED_FEATURES]
@staticmethod
def difference_data(df):
def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
"""
Coerce columns with string 'True'/'False' values to boolean columns.
:param df: Input DataFrame.
:param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
:return: DataFrame with coerced columns.
"""
object_columns = df.select_dtypes(include=['object']).columns
if cols_to_ignore:
object_columns = [c for c in object_columns if c not in cols_to_ignore]
for column in object_columns:
unique_values = df[column].dropna().unique()
# If the unique values in the column are 'True' and 'False', convert the column to boolean
if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}:
df[column] = df[column].astype(bool)
return df
@classmethod
def difference_data(cls, df: pd.DataFrame):
"""
Given a dataframe and starting and ending columns, this function will convert the features to
@ -521,12 +542,14 @@ class DataProcessor:
for uvalue_col in uvalue_columns:
df[uvalue_col] = pd.to_numeric(df[uvalue_col])
columns = {
x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
]
}
key_columns = [
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
]
ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
columns = {x for x in df.columns if x not in ignore_cols}
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
non_numerical_columns = [col for col in non_numerical_columns if col in columns]
@ -549,10 +572,15 @@ class DataProcessor:
no_diff_columns.append(col)
if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
raise Exception("Something went wrong, potentially missed a differencing colunn")
raise Exception("Something went wrong, potentially missed a differencing column")
datatypes = df.dtypes
# Note: We also difference columns like floor area and floor height. We should experiment with this.
# Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
# the starting value, therefore to explain any differences in the new floor area, it may be enough to
# just consider the difference however we can play around with this.
# Do the differencing
cols_to_append = {}
for starting_col in diff_columns:
@ -616,4 +644,7 @@ class DataProcessor:
cols_to_append = pd.DataFrame(cols_to_append)
df = pd.concat([df, cols_to_append], axis=1)
# Perform a final coercing of string True/False columns to boolean
df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
return df