mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added in coercing boolean-like columns to actual booleans
This commit is contained in:
parent
dd90e280ca
commit
20d12c157a
1 changed files with 39 additions and 8 deletions
|
|
@ -509,7 +509,28 @@ class DataProcessor:
|
|||
return self.data[FIXED_FEATURES]
|
||||
|
||||
@staticmethod
|
||||
def difference_data(df):
|
||||
def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
|
||||
"""
|
||||
Coerce columns with string 'True'/'False' values to boolean columns.
|
||||
|
||||
:param df: Input DataFrame.
|
||||
:param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
|
||||
:return: DataFrame with coerced columns.
|
||||
"""
|
||||
object_columns = df.select_dtypes(include=['object']).columns
|
||||
if cols_to_ignore:
|
||||
object_columns = [c for c in object_columns if c not in cols_to_ignore]
|
||||
|
||||
for column in object_columns:
|
||||
unique_values = df[column].dropna().unique()
|
||||
# If the unique values in the column are 'True' and 'False', convert the column to boolean
|
||||
if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}:
|
||||
df[column] = df[column].astype(bool)
|
||||
|
||||
return df
|
||||
|
||||
@classmethod
|
||||
def difference_data(cls, df: pd.DataFrame):
|
||||
|
||||
"""
|
||||
Given a dataframe and starting and ending columns, this function will convert the features to
|
||||
|
|
@ -521,12 +542,14 @@ class DataProcessor:
|
|||
for uvalue_col in uvalue_columns:
|
||||
df[uvalue_col] = pd.to_numeric(df[uvalue_col])
|
||||
|
||||
columns = {
|
||||
x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
|
||||
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
|
||||
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
|
||||
]
|
||||
}
|
||||
key_columns = [
|
||||
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
|
||||
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
|
||||
]
|
||||
|
||||
ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
|
||||
|
||||
columns = {x for x in df.columns if x not in ignore_cols}
|
||||
|
||||
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
|
||||
non_numerical_columns = [col for col in non_numerical_columns if col in columns]
|
||||
|
|
@ -549,10 +572,15 @@ class DataProcessor:
|
|||
no_diff_columns.append(col)
|
||||
|
||||
if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
|
||||
raise Exception("Something went wrong, potentially missed a differencing colunn")
|
||||
raise Exception("Something went wrong, potentially missed a differencing column")
|
||||
|
||||
datatypes = df.dtypes
|
||||
|
||||
# Note: We also difference columns like floor area and floor height. We should experiment with this.
|
||||
# Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
|
||||
# the starting value, therefore to explain any differences in the new floor area, it may be enough to
|
||||
# just consider the difference however we can play around with this.
|
||||
|
||||
# Do the differencing
|
||||
cols_to_append = {}
|
||||
for starting_col in diff_columns:
|
||||
|
|
@ -616,4 +644,7 @@ class DataProcessor:
|
|||
cols_to_append = pd.DataFrame(cols_to_append)
|
||||
df = pd.concat([df, cols_to_append], axis=1)
|
||||
|
||||
# Perform a final coercing of string True/False columns to boolean
|
||||
df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
|
||||
|
||||
return df
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue