from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

from BaseUtility import Definitions
from etl.epc.settings import (
    DATA_PROCESSOR_SETTINGS,
    EARLIEST_EPC_DATE,
    # IGNORED_TRANSACTION_TYPES,
    IGNORED_FLOOR_LEVELS,
    IGNORED_PROPERTY_TYPES,
    IGNORED_TENURES,
    FULLY_GLAZED_DESCRIPTIONS,
    AVERAGE_FIXED_FEATURES,
    BUILT_FORM_REMAP,
    COLUMNS_TO_MERGE_ON,
    FIXED_FEATURES,
    COLUMNTYPES,
    RDSAP_RESPONSE,
    MAX_SAP_SCORE,
    fill_na_map,
    STARTING_SUFFIX_COMPONENT_COLS,
    NO_SUFFIX_COMPONENT_COLS,
    ENDING_SUFFIX_COMPONENT_COLS,
    POTENTIAL_COLUMNS,
    EFFICIENCY_FEATURES,
)
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP

# TODO: change the setting columns to lower
STARTING_SUFFIX_COMPONENT_COLS = [x.lower() for x in STARTING_SUFFIX_COMPONENT_COLS]
NO_SUFFIX_COMPONENT_COLS = [x.lower() for x in NO_SUFFIX_COMPONENT_COLS]
ENDING_SUFFIX_COMPONENT_COLS = [x.lower() for x in ENDING_SUFFIX_COMPONENT_COLS]
POTENTIAL_COLUMNS = [x.lower() for x in POTENTIAL_COLUMNS]

# These lookups are used to clean the construction age band.
# Each standard band label maps to its inclusive lower ("l") and upper ("u") year.
construction_age_bounds_map = {
    "England and Wales: before 1900": {"l": 0, "u": 1899},
    "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
    "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
    "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
    "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
    "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
    "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
    "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
    "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}

# Non-standard labels that are rewritten to one of the standard bands above.
construction_age_remap = {
    "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
}

# Year (0..3000 inclusive) -> standard band label whose bounds contain that year.
expanded_map = {
    i: [
        label
        for label, bounds in construction_age_bounds_map.items()
        if (i <= bounds["u"]) and (i >= bounds["l"])
    ][0]
    for i in range(0, 3001)
}


def is_int(x):
    """
    Return True when ``int(x)`` succeeds, False otherwise.

    Only the conversion failures ``int`` itself produces are swallowed
    (TypeError for unsupported types, ValueError for bad strings / NaN,
    OverflowError for infinities), so interrupts and unrelated errors propagate.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


class EPCDataProcessor:
    """
    Handle data loading and data preprocessing
    """

    def __init__(
        self,
        data: pd.DataFrame | None = None,
        cleaning_averages: pd.DataFrame | None = None,
        run_mode: str = "training",
        violation_mode: bool = False,
    ) -> None:
        """
        :param data: Data to process. Anything that is not a DataFrame is replaced by an empty
            DataFrame (data can be loaded later via prepare_data/load_data/insert_data)
        :param cleaning_averages: Pre-computed cleaning averages. Anything that is not a DataFrame
            is replaced by an empty DataFrame
        :param run_mode: One of "training" or "newdata". In "newdata" mode the steps that accept
            ignore_step are skipped
        :param violation_mode: If True, run_mode is forced to "newdata"
        :raises ValueError: If run_mode is not "training" or "newdata"
        """
        self.data: pd.DataFrame = (
            data if isinstance(data, pd.DataFrame) else pd.DataFrame()
        )
        self.cleaning_averages: pd.DataFrame = (
            cleaning_averages
            if isinstance(cleaning_averages, pd.DataFrame)
            else pd.DataFrame()
        )

        # FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
        self.violation_mode = violation_mode
        if run_mode not in ["training", "newdata"]:
            raise ValueError("Run mode must be either training or newdata")
        self.run_mode = run_mode if not violation_mode else "newdata"

    def prepare_data(self, filepath: Path | str | None = None) -> None:
        """
        Given the run mode, we apply the relevant pipeline steps
        Ignore step is used to highlight which steps are not needed in newdata

        :param filepath: If specified, the data is (re)loaded from this CSV before processing
        :raises Exception: If there is no data to process
        """
        ignore_step = self.run_mode == "newdata"

        if filepath is not None:
            self.load_data(
                filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"]
            )

        if len(self.data) == 0:
            raise Exception("No data to process - check filepath/ data being passed in")

        self.confine_data(ignore_step=ignore_step)
        self.remap_anomalies()
        self.remap_floor_level(ignore_step=ignore_step)
        self.remap_build_form()
        self.cast_data_column_values_to_lower()
        self.standardise_construction_age_band(ignore_step=ignore_step)
        self.clean_missing_rooms(ignore_step=ignore_step)
        self.recast_df_columns(
            column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
        )
        self.clean_multi_glaze_proportion(ignore_step=ignore_step)
        self.clean_photo_supply()
        self.retain_multiple_epc_properties(
            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
            ignore_step=ignore_step,
        )
        self.fill_na_fields()
        self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)

        # Final re-casting after data transformed and prepared
        self.recast_df_columns(column_mappings=COLUMNTYPES, auto_subset_columns=True)
        self.recast_all_data(column_mappings=COLUMNTYPES, auto_subset_columns=True)
        self.na_remapping(auto_subset_columns=True)
        self.fill_invalid_constituency_fields(ignore_step=ignore_step)
        self.make_cleaning_averages(ignore_step=ignore_step)
        self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)

        # TODO: check if this has impact on training dataset
        # cleaned_data = self.apply_averages_cleaning(
        #     data_to_clean=self.data,
        #     cleaning_data=self.cleaning_averages,
        #     cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        #     colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
        # )

        # When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper
        cleaning_averages = self.cleaning_averages.copy()
        if self.run_mode == "newdata":
            cleaning_averages.columns = cleaning_averages.columns.str.upper()

        cleaned_data = self.apply_averages_cleaning(
            data_to_clean=self.data,
            cleaning_data=cleaning_averages,
            cols_to_merge_on=COLUMNS_TO_MERGE_ON,
        )
        self.data = self.data if cleaned_data is None else cleaned_data

        self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
        self.cast_data_columns_to_lower()

    def cast_data_columns_to_lower(self):
        """
        Convert all columns names to lower
        """
        self.data.columns = self.data.columns.str.lower()

    def cast_cleaning_averages_columns_to_lower(self, ignore_step: bool = False):
        """
        Convert all column names to lower
        No need in newdata mode
        """
        if ignore_step:
            return
        self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower()

    def add_local_authority_to_cleaning_average(self, ignore_step: bool = False):
        """
        Add the Local authority column to the cleaning averages
        No need in newdata mode
        """
        if ignore_step:
            return
        # The first row's local authority is broadcast to every cleaning-average row.
        # NOTE(review): presumably each dataset covers a single local authority - confirm.
        self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[
            0
        ]

    def fill_invalid_constituency_fields(self, ignore_step: bool = False):
        """
        For some weird cases, where data has missing constituency, we fill with the most
        frequent constituency in the data
        """
        if self.violation_mode:
            # TODO: to fill in
            return

        if ignore_step:
            return

        self.data = self.data.fillna(
            {"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}
        )

    def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False):
        """
        Order data by uprn and lodgement date
        No Violation mode needed
        """
        if ignore_step:
            return
        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

    def cast_data_column_values_to_lower(self):
        """
        For given columns, cast values to lower
        No Violation mode or newdata modes required
        """
        convert_to_lower = ["TRANSACTION_TYPE"]
        for col in convert_to_lower:
            self.data[col] = self.data[col].str.lower()

    def remap_build_form(self):
        """
        Remap build form to standard values
        No Violation mode or newdata modes required
        """
        self.data["BUILT_FORM"] = self.data["BUILT_FORM"].replace(BUILT_FORM_REMAP)

    def remap_anomalies(self):
        """
        Remap anomalies to None
        No Violation mode or newdata modes required
        """
        # Map all anomaly values to None
        data_anomaly_map = dict.fromkeys(Definitions.DATA_ANOMALY_MATCHES)

        # Use replace function to map data (if exists in key), to corresponding value - i.e.
        # Remove invalid values
        data = self.data.replace(data_anomaly_map)
        data = data.replace(np.nan, None)
        self.data = data

    def remap_floor_level(self, ignore_step: bool = False):
        """
        Remap floor level to standard values
        """
        if self.violation_mode:
            # TODO: We need to handle this case
            return

        if ignore_step:
            return

        self.data["FLOOR_LEVEL"] = self.data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)

    def load_data(self, filepath, low_memory=False) -> None:
        """
        Read a CSV file into self.data

        :param filepath: Location of the CSV to read
        :param low_memory: Passed straight through to pandas.read_csv
        :raises ValueError: If filepath is empty/None
        """
        if not filepath:
            raise ValueError("No filepath specified")
        self.data = pd.read_csv(filepath, low_memory=low_memory)

    def insert_data(self, data: pd.DataFrame) -> None:
        """
        Replace the data held by the processor with the given DataFrame
        """
        self.data = data

    @staticmethod
    def clean_construction_age_band(x):
        """
        Map a raw construction age band value onto one of the standard band labels.

        :param x: Raw value; may be a standard label, a known non-standard label, a year
            (int or numeric string), an anomaly value, or missing
        :return: The standard label, or x unchanged when it is missing / an anomaly value
        :raises NotImplementedError: If the value matches none of the handled cases
        """
        # Firstly, we check if it's an error value. Any float NaN counts as missing, not only
        # the np.nan singleton.
        is_missing = x is None or (isinstance(x, float) and np.isnan(x))
        if x in Definitions.DATA_ANOMALY_MATCHES or is_missing:
            return x

        # Next, we check if it's a value in our map
        if x in construction_age_bounds_map:
            return x

        # We check if it's a standard remap value
        remap_value = construction_age_remap.get(x, None)
        if remap_value:
            return remap_value

        # We check if it's a number, in which case we look up the band containing that year
        if is_int(x):
            return expanded_map[int(x)]

        raise NotImplementedError(f"Not handled the case for value {x}")

    def standardise_construction_age_band(self, ignore_step: bool = False):
        """
        This function will tidy up some of the non-standard values that are populated in the
        construction age band, which is useful for cleaning.
        Rows whose band is missing after cleaning are dropped.
        """
        if self.violation_mode:
            # TODO: to fill in
            return

        if ignore_step:
            return

        self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
            self.clean_construction_age_band
        )
        self.data = self.data[~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])]

    def clean_missing_rooms(self, ignore_step: bool = False):
        """
        For the number of heated rooms and number of habitable rooms, we clean these values up
        front, based on property archetype and age.

        Missing values are filled with the median of records sharing the same matching columns.
        We start with the most specific match (property type, built form, age band, postal area)
        and drop the last matching column each time missings remain.

        TODO: We could use a model based impution approach for possibly more accurate cleaning

        :raises NotImplementedError: If missings remain after matching on property type alone
        """
        if self.violation_mode:
            # TODO: to fill in
            return

        if ignore_step:
            return

        # TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning)
        self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(
            lambda x: x.split(" ")[0]
        )

        def apply_clean(data, col, matching_columns):
            # Median of the known values per group, merged back to fill the gaps in `col`
            cleaning_data = (
                data[~pd.isnull(data[col])]
                .groupby(matching_columns)[col]
                .median()
                .reset_index()
            )
            data = data.merge(
                cleaning_data,
                how="left",
                on=matching_columns,
                suffixes=("", "_CLEANING"),
            )
            data[col] = np.where(
                pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col]
            )
            return data.drop(columns=f"{col}_CLEANING")

        for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
            to_index = 3
            matching_columns = [
                "PROPERTY_TYPE",
                "BUILT_FORM",
                "CONSTRUCTION_AGE_BAND",
                "POSTAL_AREA",
            ]
            has_missings = pd.isnull(self.data[col]).sum()
            while has_missings:
                self.data = apply_clean(
                    data=self.data,
                    col=col,
                    matching_columns=matching_columns[0: to_index + 1],
                )
                has_missings = pd.isnull(self.data[col]).sum()

                if not has_missings or to_index == 0:
                    # Check if we've gotten to index 0 and still have missings - something has gone wrong or
                    # we have a very unique property type
                    if has_missings:
                        raise NotImplementedError(
                            f"Handle this edge case, we still have missings for column {col}"
                        )
                    break

                to_index -= 1

    def na_remapping(self, auto_subset_columns: bool = False):
        """
        Fill missing values column by column using fill_na_map

        :param auto_subset_columns: If True, only columns of fill_na_map that are present in the
            data are filled; otherwise every column in fill_na_map must exist
        """
        fill_na_map_apply = (
            {k: v for k, v in fill_na_map.items() if k in self.data.columns}
            if auto_subset_columns
            else fill_na_map
        )
        for column, fill_value in fill_na_map_apply.items():
            self.data[column] = self.data[column].fillna(fill_value)

    def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
        """
        If we have a minimum of 2 epcs, we can do back fill and forward fill on certain data fields

        :param columns_to_fill: Columns to back/forward fill within each UPRN
        """
        # Each uprn can fill backward from recent and forward fill from oldest
        # The groupby changes the order and we use the index to make the original data
        # NOTE(review): "level_1" is the name reset_index gives the original row index, which
        # assumes that index is unnamed - confirm.
        filled_data = (
            self.data.groupby("UPRN", group_keys=True)[columns_to_fill]
            .apply(lambda group: group.bfill().ffill().infer_objects(copy=False))
            .reset_index()
            .set_index("level_1")
            .sort_index()
        )

        self.data[columns_to_fill] = filled_data[columns_to_fill]

        # For floor area, we also replace "" values with None
        self.data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = self.data[
            ["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
        ].replace("", None)

    def make_cleaning_averages(self, ignore_step: bool = False) -> pd.DataFrame:
        """
        Create a dataset to hold averages based on property type, built form, construction age,
        and rooms. The result is stored on self.cleaning_averages.
        Not required in newdata mode

        :return: The cleaning averages (an empty DataFrame when the step is ignored, in which
            case self.cleaning_averages is left untouched)
        """
        if ignore_step:
            return pd.DataFrame()

        # Define a custom function to calculate the median, excluding missing values
        def median_without_missing(group):
            return group[AVERAGE_FIXED_FEATURES].median(skipna=True)

        cleaning_averages = (
            self.data.groupby(
                [
                    "PROPERTY_TYPE",
                    "BUILT_FORM",
                    "CONSTRUCTION_AGE_BAND",
                    "NUMBER_HABITABLE_ROOMS",
                    "NUMBER_HEATED_ROOMS",
                ],
                observed=True,
                dropna=False,
            )
            .apply(median_without_missing)
            .reset_index()
        )

        general_averages = (
            self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True)
            .apply(median_without_missing)
            .reset_index()
        )

        property_averages = (
            self.data.groupby(["PROPERTY_TYPE"], observed=True)
            .apply(median_without_missing)
            .reset_index()
        )

        built_form_averages = (
            self.data.groupby(["BUILT_FORM"], observed=True)
            .apply(median_without_missing)
            .reset_index()
        )

        # We can clean up any NA's in the cleaning averages with the general averages here
        cleaning_averages_filled = pd.merge(
            cleaning_averages,
            general_averages,
            on=["PROPERTY_TYPE", "BUILT_FORM"],
            suffixes=["", "_AVERAGE"],
        )
        cleaning_averages_filled = pd.merge(
            cleaning_averages_filled,
            property_averages,
            on=["PROPERTY_TYPE"],
            suffixes=["", "_PROPERTY_AVERAGE"],
        )
        cleaning_averages_filled = pd.merge(
            cleaning_averages_filled,
            built_form_averages,
            on=["BUILT_FORM"],
            suffixes=["", "_BUILT_FORM_AVERAGE"],
        )

        for variable in AVERAGE_FIXED_FEATURES:
            # Replace any missing NAN values with averages for the same Property type and built form
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[f"{variable}_AVERAGE"])
            cleaning_averages_filled = cleaning_averages_filled.drop(
                columns=f"{variable}_AVERAGE"
            )

            # If there are still NA values i.e. the averages do not have values for a specific
            # group of property type and built form
            # We can use just the property type average and replace
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"])
            cleaning_averages_filled = cleaning_averages_filled.drop(
                columns=f"{variable}_PROPERTY_AVERAGE"
            )

            # If there are still NA values, use BUILT FORM averages.
            # The result must be written back to `variable` itself; writing to a literal
            # "variable" column would discard this fallback and add a stray column.
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"])
            cleaning_averages_filled = cleaning_averages_filled.drop(
                columns=f"{variable}_BUILT_FORM_AVERAGE"
            )

            # If there still is na values, use the mean across all rows of the cleaning averages
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[variable].mean())

            # If the constituency is all NA values, then take UK AVERAGE VALUES
            # cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[
            #     "TOTAL_FLOOR_AREA"
            # ].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
            # cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[
            #     "FLOOR_HEIGHT"
            # ].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)

        self.cleaning_averages = cleaning_averages_filled
        return cleaning_averages_filled

    def retain_multiple_epc_properties(
        self, epc_minimum_count: int = 1, ignore_step: bool = False
    ) -> None:
        """
        Reduce the data further by keeping only properties with multiple epcs

        :param epc_minimum_count: Only UPRNs with strictly more than this many EPCs are kept.
            The per-UPRN "count" column is merged onto the data as a side effect
        """
        if self.violation_mode:
            # TODO: to fill in
            return

        if ignore_step:
            return

        counts = self.data.groupby("UPRN").size().reset_index()
        counts.columns = ["UPRN", "count"]
        # take UPRNS with multiple EPCs
        counts = counts[counts["count"] > epc_minimum_count]
        self.data = pd.merge(self.data, counts, on="UPRN")

    def recast_df_columns(
        self, column_mappings: dict, auto_subset_columns: bool = False
    ) -> None:
        """
        Recast columns from the dataframe to ensure the behaviour we want

        :param column_mappings: Column name -> dtype, or list of dtypes applied in order
        :param auto_subset_columns: If True, mappings for columns not in the data are ignored
        :raises ValueError: If a mapped column is not present in the data
        """
        if auto_subset_columns:
            column_mappings = {
                k: v for k, v in column_mappings.items() if k in self.data.columns
            }

        for key, values in column_mappings.items():
            if key not in self.data.columns:
                raise ValueError("Column mapping incorrectly specified")

            if isinstance(values, list):
                # A list means successive casts, e.g. via an intermediate type
                for value in values:
                    self.data[key] = self.data[key].astype(value)
            else:
                self.data[key] = self.data[key].astype(values)

    def recast_all_data(
        self, column_mappings: dict, auto_subset_columns: bool = False
    ) -> None:
        """
        Using a dictionary to recast all columns at once

        :param column_mappings: Column name -> dtype
        :param auto_subset_columns: If True, mappings for columns not in the data are ignored
        """
        if auto_subset_columns:
            column_mappings = {
                k: v for k, v in column_mappings.items() if k in self.data.columns
            }

        self.data = self.data.astype(column_mappings)

    def confine_data(self, ignore_step: bool = False):
        """
        Include all step to reduce down the data based on assumptions.

        In violation mode, one boolean column per filter is appended to the data, flagging the
        rows that the filter would remove. The filters themselves only run when ignore_step is
        False.
        """
        if self.violation_mode:
            # The transaction type filter is disabled below, so no violation flag is produced
            # for it (its definition relied on the disabled IGNORED_TRANSACTION_TYPES setting).
            # NOTE(review): IGNORED_PROPERTY_TYPES is compared with ==, so it is presumably a
            # single value rather than a list - confirm against settings.
            violation_flags = {
                "violation_uprn_missing": pd.isnull(self.data["UPRN"]),
                "violation_old_lodgment_date": (
                    self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE
                ),
                "violation_ignored_floor_level": self.data["FLOOR_LEVEL"].isin(
                    IGNORED_FLOOR_LEVELS
                ),
                "violation_rdsap_score_above_max": (
                    self.data[RDSAP_RESPONSE] > MAX_SAP_SCORE
                ),
                "violation_missing_windows_description": pd.isnull(
                    self.data["WINDOWS_DESCRIPTION"]
                ),
                "violation_missing_hotwater_description": pd.isnull(
                    self.data["HOTWATER_DESCRIPTION"]
                ),
                "violation_missing_roof_description": pd.isnull(
                    self.data["ROOF_DESCRIPTION"]
                ),
                "violation_invalid_property_type": (
                    self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES
                ),
                "violation_invalid_tenure": self.data["TENURE"].isin(IGNORED_TENURES),
            }

            violation_df = pd.concat(
                list(violation_flags.values()),
                axis=1,
                keys=list(violation_flags),
            )
            self.data = pd.concat([self.data, violation_df], axis=1)

        if ignore_step:
            return

        # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
        # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
        # before the introduction of SAP09
        # Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
        # full SAP, which produces different results to the RdSAP methodology
        # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
        # Filter 5: Remove any EPCs with a SAP score above 100
        # Filter 6: We found a small number of cases that have missing window description so we drop these
        # Filter 7: We found a small number of cases that have missing hotwater description so we drop these
        self.data = self.data[~pd.isnull(self.data["UPRN"])]
        self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
        # self.data = self.data[self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES]
        self.data = self.data[~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)]
        self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE]
        # We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them
        self.data = self.data[~pd.isnull(self.data["WINDOWS_DESCRIPTION"])]
        self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])]
        self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])]
        # Because park homes are surveyed unusually (for example, we don't have u-values to
        # look up for their different components, they need to be collected in survey and aren't reflected in
        # EPCs) we'll ignore them from the model
        self.data = self.data[self.data["PROPERTY_TYPE"] != IGNORED_PROPERTY_TYPES]

        # We remove EPCs where the tenure is unknown, but is usually an indicator of a new build
        self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)]

        # We remap zero values to None
        self.data.loc[self.data["FLOOR_HEIGHT"] == 0, "FLOOR_HEIGHT"] = None

    def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None:
        """
        If there is no multi-glaze proportion but the windows are fully glazed, then we should
        assume a score of 100
        """
        if self.violation_mode:
            # TODO:
            return

        if ignore_step:
            return

        no_multi_glaze_proportion_index = pd.isnull(
            self.data["MULTI_GLAZE_PROPORTION"]
        ) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
        self.data.loc[no_multi_glaze_proportion_index, "MULTI_GLAZE_PROPORTION"] = 100

    def clean_photo_supply(self) -> None:
        """
        We fill photo supply with zeros where it's missing
        """
        self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].astype("Int64").fillna(0)

    @staticmethod
    def apply_averages_cleaning(
        data_to_clean,
        cleaning_data,
        cols_to_merge_on,
        colnames=None,
        ignore_step: bool = False,
    ):
        """
        Clean the input DataFrame using averages from a cleaning DataFrame.

        Missing values are filled, in order, from: the matching group's mean in cleaning_data,
        the column's own mean in data_to_clean, and finally the column's overall mean in
        cleaning_data.

        :param data_to_clean: DataFrame to be cleaned. Its room-count columns are cast to float
            in place.
        :param cleaning_data: DataFrame containing data for cleaning.
        :param cols_to_merge_on: Columns on which merging is based. We pass cols_to_merge_on to
            this function as this differs depending on where the function is being used.
        :param colnames: If specified can be used to state exactly which columns to clean
        :param ignore_step: If True, nothing is done and None is returned
        :return: Cleaned DataFrame, or None when the step is ignored.
        """
        if ignore_step:
            return None

        # The desired colnames to clean - which may not be present
        if colnames is None:
            colnames = [
                "TOTAL_FLOOR_AREA",
                "FLOOR_HEIGHT",
                "FIXED_LIGHTING_OUTLETS_COUNT",
            ]

        cols_to_clean = [c for c in colnames if c in data_to_clean.columns]

        # Enforce data types
        for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]:
            data_to_clean[col] = data_to_clean[col].astype(float)

        # NOTE(review): dropna() here drops rows, not columns, so this list always equals
        # cols_to_merge_on. If the intent was to skip all-NaN columns, that needs axis=1 -
        # confirm before changing.
        columns_to_merge_on = data_to_clean[cols_to_merge_on].dropna().columns.tolist()

        # Calculate averages
        cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg(
            {col: "mean" for col in cols_to_clean}
        )

        # Merge with the original data
        data_to_clean = pd.merge(
            data_to_clean,
            cleaning_averages_to_merge,
            on=columns_to_merge_on,
            suffixes=("", "_AVERAGE"),
            how="left",
        )

        global_averages = cleaning_data[cols_to_clean].mean()

        # Fill NaN values with averages
        for col in cols_to_clean:
            data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"])
            data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"])
            # If we still have missings
            data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean())
            # Final step if we still have missings - use global mean
            data_to_clean[col] = data_to_clean[col].fillna(global_averages[col])

        return data_to_clean

    def get_component_features(self, suffix: str) -> pd.DataFrame:
        """
        This function will return the property components such as the walls, roof, heating etc
        as well as lodgement date. These are features that we expect might change from one EPC
        to the next

        :param suffix: Should be one of "_starting" or "_ending"
        :return: For "_starting": the starting component and efficiency columns (suffixed)
            alongside the un-suffixed fixed and potential columns. For "_ending": the ending
            component and efficiency columns (suffixed)
        :raises Exception: If suffix is not one of the two accepted values
        """
        if suffix not in ["_starting", "_ending"]:
            raise Exception("Suffix should be one of _starting or _ending")

        # Must compare against the lowercase value accepted above, otherwise the starting
        # branch can never run.
        if suffix == "_starting":
            starting_cols = (
                self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
                .copy()
                .add_suffix(suffix)
            )
            fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy()
            return pd.concat([starting_cols, fixed_cols], axis=1)

        return (
            self.data[ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
            .copy()
            .add_suffix(suffix)
        )

    def get_fixed_features(self) -> pd.DataFrame:
        """
        Returns the fixed features that we don't believe should vary from one EPC to the next

        :return: Pandas dataframe containing the columns defined in FIXED_FEATURES
        """
        return self.data[FIXED_FEATURES]

    @staticmethod
    def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
        """
        Coerce columns with string 'True'/'False' values to boolean columns.

        :param df: Input DataFrame (modified in place and returned).
        :param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
        :return: DataFrame with coerced columns.
        """
        object_columns = df.select_dtypes(include=["object"]).columns
        if cols_to_ignore:
            object_columns = [c for c in object_columns if c not in cols_to_ignore]

        for column in object_columns:
            unique_values = set(df[column].dropna().unique())

            # If the unique values in the column are 'True' and 'False', convert the column to boolean
            if unique_values == {"True", "False"} or unique_values == {True, False}:
                values = df[column]
                # astype(bool) treats every non-empty string as True, so the string "False"
                # has to be forced to False explicitly.
                # NOTE(review): missing values still become True, as before - confirm intent.
                df[column] = values.astype(bool) & (values != "False")

        return df

    @staticmethod
    def calculate_days_to(lodgement_date):
        """
        Number of days between EARLIEST_EPC_DATE and the lodgement date.

        :param lodgement_date: A single date string, or a Series of dates
        :return: An int for a string input, otherwise the result of ``.dt.days``
        """
        delta = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
        if isinstance(lodgement_date, str):
            return delta.days
        return delta.dt.days

    @staticmethod
    def clean_missings_after_description_process(df, ignore_cols=None):
        """
        Fill any remaining missing values after the description processing.

        Columns holding booleans are filled with False, columns that already use "none" are
        filled with "none", and everything else is filled with "Unknown".

        :param df: DataFrame to clean (modified in place and returned)
        :param ignore_cols: If specified, columns that should be left untouched
        :return: The cleaned DataFrame
        """
        missings = pd.isnull(df).sum()
        missings = missings[missings > 0]
        if ignore_cols:
            missings = missings[~missings.index.isin(ignore_cols)]

        for col in missings.index:
            unique_values = df[col].unique()
            # TODO: confirm this behaviour
            if True in unique_values or False in unique_values:
                df[col] = df[col].fillna(False)
            elif "none" in unique_values:
                df[col] = df[col].fillna("none")
            else:
                df[col] = df[col].fillna("Unknown")

        return df

    @staticmethod
    def clean_efficiency_variables(df):
        """
        There is scope to clean this by the model per corresponding description. E.g. for
        WALLS_ENG_EFF we could look at the mode efficiency rating by description and fill in the
        missing values with this.

        When looking at this initially, there are a large volume of records with missing energy
        efficiency values and therefore a simpler approach was taken just to test including
        these variables

        :param df: DataFrame whose only columns with missing values should be efficiency columns
        :return: The DataFrame with missing efficiency values filled with "NO_RATING"
        :raises ValueError: If a column with missing values does not contain "ENERGY_EFF"
        """
        missings = pd.isnull(df).sum()
        missings = missings[missings >= 1]

        if len(missings) == 0:
            return df

        # Make sure they are all efficiency columns
        if any(~missings.index.str.contains("ENERGY_EFF")):
            raise ValueError("Non efficiency columns are missing")

        for m in missings.index:
            df[m] = df[m].fillna("NO_RATING")

        return df