Model/etl/epc/DataProcessor.py
2024-10-21 18:34:18 +01:00

986 lines
36 KiB
Python

from pathlib import Path
import numpy as np
import pandas as pd
from BaseUtility import Definitions
from etl.epc.settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
# IGNORED_TRANSACTION_TYPES,
IGNORED_FLOOR_LEVELS,
IGNORED_PROPERTY_TYPES,
IGNORED_TENURES,
FULLY_GLAZED_DESCRIPTIONS,
AVERAGE_FIXED_FEATURES,
BUILT_FORM_REMAP,
COLUMNS_TO_MERGE_ON,
FIXED_FEATURES,
COLUMNTYPES,
RDSAP_RESPONSE,
MAX_SAP_SCORE,
fill_na_map,
STARTING_SUFFIX_COMPONENT_COLS,
NO_SUFFIX_COMPONENT_COLS,
ENDING_SUFFIX_COMPONENT_COLS,
POTENTIAL_COLUMNS,
EFFICIENCY_FEATURES,
)
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
from typing import List
# TODO: change the setting columns to lower
# Until the settings module stores these names in lower case, lower them here so that the
# module-level lists match the lower-cased column names produced by prepare_data().
STARTING_SUFFIX_COMPONENT_COLS = [
    name.lower() for name in STARTING_SUFFIX_COMPONENT_COLS
]
NO_SUFFIX_COMPONENT_COLS = [name.lower() for name in NO_SUFFIX_COMPONENT_COLS]
ENDING_SUFFIX_COMPONENT_COLS = [
    name.lower() for name in ENDING_SUFFIX_COMPONENT_COLS
]
POTENTIAL_COLUMNS = [name.lower() for name in POTENTIAL_COLUMNS]
# Lookups used to clean the construction age band.
# construction_age_bounds_map: standard band label -> inclusive lower ("l") / upper ("u") year.
construction_age_bounds_map = {
    "England and Wales: before 1900": {"l": 0, "u": 1899},
    "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
    "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
    "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
    "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
    "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
    "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
    "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
    "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}
# construction_age_remap: non-standard label -> the standard label it is replaced with.
construction_age_remap = {
    "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
}
# expanded_map: every year 0..3000 -> label of the first band whose bounds contain it.
expanded_map = {
    year: next(
        band_label
        for band_label, band in construction_age_bounds_map.items()
        if band["l"] <= year <= band["u"]
    )
    for year in range(3001)
}
def is_int(x):
    """
    Report whether ``int(x)`` succeeds.

    :param x: Any value
    :return: True if ``int(x)`` can be evaluated, False if it raises a conversion error
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not an int": TypeError for None/containers,
        # ValueError for non-numeric strings and NaN, OverflowError for infinities.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return False
    return True
class EPCDataProcessor:
"""
Handle data loading and data preprocessing
"""
def __init__(
self,
data: pd.DataFrame | None = None,
cleaning_averages: pd.DataFrame | None = None,
run_mode: str = "training",
violation_mode: bool = False,
) -> None:
"""
:param filepath: If specified, is the physical location of the data
:param is_newdata: Indicates if we are processing new, testing data.
In this instance, there are some operations we do not
want to perform, such as confine_data()
"""
is_data_a_dataframe = isinstance(data, pd.DataFrame)
self.data: pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame()
is_cleaning_averages_a_dataframe = isinstance(cleaning_averages, pd.DataFrame)
self.cleaning_averages: pd.DataFrame = (
cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
)
# FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
self.violation_mode = violation_mode
if run_mode not in ["training", "newdata"]:
raise ValueError("Run mode must be either training or newdata")
self.run_mode = run_mode if not violation_mode else "newdata"
def prepare_data(self, filepath: Path | str | None = None) -> None:
"""
Given the run mode, we apply the relevant pipeline steps
Ignore step is used to highlight which steps are not needed in newdata
"""
ignore_step = True if self.run_mode == "newdata" else False
if filepath is not None:
self.load_data(
filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"]
)
if len(self.data) == 0:
raise Exception("No data to process - check filepath/ data being passed in")
self.confine_data(ignore_step=ignore_step)
self.remap_anomalies()
self.remap_floor_level(ignore_step=ignore_step)
self.remap_build_form()
self.cast_data_column_values_to_lower()
self.standardise_construction_age_band(ignore_step=ignore_step)
self.clean_missing_rooms(ignore_step=ignore_step)
self.recast_df_columns(
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
)
self.clean_multi_glaze_proportion(ignore_step=ignore_step)
self.clean_photo_supply()
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
ignore_step=ignore_step,
)
self.fill_na_fields()
self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)
# Final re-casting after data transformed and prepared
self.recast_df_columns(column_mappings=COLUMNTYPES, auto_subset_columns=True)
self.recast_all_data(column_mappings=COLUMNTYPES, auto_subset_columns=True)
self.na_remapping(auto_subset_columns=True)
self.fill_invalid_constituency_fields(ignore_step=ignore_step)
self.make_cleaning_averages(ignore_step=ignore_step)
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
# TODO: check if this has impact on training dataset
# cleaned_data = self.apply_averages_cleaning(
# data_to_clean=self.data,
# cleaning_data=self.cleaning_averages,
# cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
# colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
# )
# When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper
cleaning_averages = self.cleaning_averages.copy()
if self.run_mode == "newdata":
cleaning_averages.columns = cleaning_averages.columns.str.upper()
cleaned_data = self.apply_averages_cleaning(
data_to_clean=self.data,
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON,
)
self.data = self.data if cleaned_data is None else cleaned_data
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
self.cast_data_columns_to_lower()
def cast_data_columns_to_lower(self):
"""
Convert all columns names to lower
"""
self.data.columns = self.data.columns.str.lower()
def cast_cleaning_averages_columns_to_lower(self, ignore_step: bool = False):
"""
Convert all column names to lower
No need in newdata mode
"""
if ignore_step:
return
self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower()
def add_local_authority_to_cleaning_average(self, ignore_step: bool = False):
"""
Add the Local authority column to the cleaning averages
No need in newdata mode
"""
if ignore_step:
return
self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[
0
]
def fill_invalid_constituency_fields(self, ignore_step: bool = False):
"""
For some weird cases, where data has missing constituency, we add a dummy value
"""
if self.violation_mode:
# TODO: to fill in
return
if ignore_step:
return
self.data = self.data.fillna(
{"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}
)
def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False):
"""
Order data by uprn and lodgement data
No Violation mode needed
"""
if ignore_step:
return
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
def cast_data_column_values_to_lower(self):
"""
For given columns, cast values to lower
No Violation mode or newdata modes required
"""
convert_to_lower = ["TRANSACTION_TYPE"]
for col in convert_to_lower:
self.data[col] = self.data[col].str.lower()
def remap_build_form(self):
"""
Remap build form to standard values
No Violation mode or newdata modes required
"""
self.data["BUILT_FORM"] = self.data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
def remap_anomalies(self):
"""
Remap anomalies to None
No Violation mode or newdata modes required
"""
# Map all anomaly values to None
data_anomaly_map = dict(
zip(
Definitions.DATA_ANOMALY_MATCHES,
[None] * len(Definitions.DATA_ANOMALY_MATCHES),
)
)
# Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
data = self.data.replace(data_anomaly_map)
data = data.replace(np.nan, None)
self.data = data
def remap_floor_level(self, ignore_step: bool = False):
"""
Remap floor level to standard values
"""
if self.violation_mode:
# TODO: We need to handle this case
return
if ignore_step:
return
self.data["FLOOR_LEVEL"] = self.data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
def load_data(self, filepath, low_memory=False) -> None:
if not filepath:
raise ValueError("No filepath specified")
self.data = pd.read_csv(filepath, low_memory=low_memory)
def insert_data(self, data: pd.DataFrame) -> None:
self.data = data
@staticmethod
def clean_construction_age_band(x):
# Firstly, we check if it's an error value
if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
return x
# Next, we check if it's a value in our map
if construction_age_bounds_map.get(x):
return x
# We check if it's a standard remap value
remap_value = construction_age_remap.get(x, None)
if remap_value:
return remap_value
# We check if it's a number
if is_int(x):
x_int = int(x)
return expanded_map[x_int]
raise NotImplementedError("Not handled the case for value %s" % x)
def standardise_construction_age_band(self, ignore_step: bool = False):
"""
This function will tidy up some of the non-standard values that are populated in the construction age
band, which is useful for cleaning
"""
if self.violation_mode:
# TODO: to fill in
return
if ignore_step:
return
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
lambda x: self.clean_construction_age_band(x)
)
self.data = self.data[~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])]
def clean_missing_rooms(self, ignore_step: bool = False):
"""
For the number of heated rooms and number of habitable rooms, we clean these values up front,
based on property archetype and age
TODO: We could use a model based impution approach for possibly more accurate cleaning
"""
if self.violation_mode:
# TODO: to fill in
return
if ignore_step:
return
# TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning)
self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(
lambda x: x.split(" ")[0]
)
def apply_clean(data, matching_columns):
cleaning_data = (
data[~pd.isnull(data[col])]
.groupby(matching_columns)[col]
.median()
.reset_index()
)
data = data.merge(
cleaning_data,
how="left",
on=matching_columns,
suffixes=("", "_CLEANING"),
)
data[col] = np.where(
pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col]
)
data = data.drop(columns=f"{col}_CLEANING")
return data
for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
to_index = 3
matching_columns = [
"PROPERTY_TYPE",
"BUILT_FORM",
"CONSTRUCTION_AGE_BAND",
"POSTAL_AREA",
]
has_missings = pd.isnull(self.data[col]).sum()
while has_missings:
self.data = apply_clean(
data=self.data, matching_columns=matching_columns[0: to_index + 1]
)
has_missings = pd.isnull(self.data[col]).sum()
if not has_missings or to_index == 0:
# Check if we've gotten to index 0 and still have missings - something has gone wrong or
# we have a very unique property type
if has_missings:
raise NotImplementedError(
"Handle this edge case, we still have missings for column %s"
% col
)
break
to_index -= 1
# def pre_process(self, filepath: Path | None = None) -> tuple[pd.DataFrame, pd.DataFrame]:
# """
# Load data and begin initial cleaning
# """
# if self.data is None:
# self.load_data(filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
# if not self.is_newdata:
# self.confine_data()
# self.remap_columns()
# # We have some non-standard construction age bands which we'll clean for matching
# if not self.is_newdata:
# self.standardise_construction_age_band()
# self.clean_missing_rooms()
# self.recast_df_columns(
# column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
# )
# if not self.is_newdata:
# self.clean_multi_glaze_proportion()
# self.clean_photo_supply()
# if not self.is_newdata:
# self.retain_multiple_epc_properties(
# epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
# )
# if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1:
# # If we have multiple EPC records, we can try and do filling
# self.fill_na_fields()
# if not self.is_newdata:
# self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
# # Final re-casting after data transformed and prepared
# coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else
# COLUMNTYPES
# for k, v in coltypes.items():
# self.data[k] = self.data[k].astype(v)
# self.data = self.data.astype(coltypes)
# self.na_remapping()
# self.cleaning_averages = None
# if not self.is_newdata:
# # We have some odd cases with missing constituency so we fill
# self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]})
# self.cleaning_averages = self.make_cleaning_averages()
# # We apply averages cleaning to the data
# self.data = self.apply_averages_cleaning(
# data_to_clean=self.data,
# cleaning_data=self.cleaning_averages,
# cols_to_merge_on=COLUMNS_TO_MERGE_ON
# )
# self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
# self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower()
# self.data.columns = self.data.columns.str.lower()
# return self.data, self.cleaning_averages
def na_remapping(self, auto_subset_columns: bool = False):
fill_na_map_apply = (
{k: v for k, v in fill_na_map.items() if k in self.data.columns}
if auto_subset_columns
else fill_na_map
)
for column, fill_value in fill_na_map_apply.items():
self.data[column] = self.data[column].fillna(fill_value)
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
"""
If we have a minimum of 2 epcs, we can do back fill and forward fill on certain data fields
"""
# Each uprn can fille backward from recent and forward fill from oldest
# The groupby changes the order and we use the index to make the original data
filled_data = (
self.data.groupby("UPRN", group_keys=True)[columns_to_fill]
.apply(lambda group: group.bfill().ffill().infer_objects(copy=False))
.reset_index()
.set_index("level_1")
.sort_index()
)
self.data[columns_to_fill] = filled_data[columns_to_fill]
# For floor area, we also replace "" values with None
self.data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = self.data[
["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
].replace("", None)
def make_cleaning_averages(self, ignore_step: bool = False) -> pd.DataFrame:
"""
Create a dataset to hold averages based on property type, built form, construction age, and rooms.
Not require in newdata mode
"""
if ignore_step:
return pd.DataFrame()
# Define a custom function to calculate the median, excluding missing values
def median_without_missing(group):
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
cleaning_averages = (
self.data.groupby(
[
"PROPERTY_TYPE",
"BUILT_FORM",
"CONSTRUCTION_AGE_BAND",
"NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS",
],
observed=True,
dropna=False,
)
.apply(median_without_missing)
.reset_index()
)
general_averages = (
self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True)
.apply(median_without_missing)
.reset_index()
)
property_averages = (
self.data.groupby(["PROPERTY_TYPE"], observed=True)
.apply(median_without_missing)
.reset_index()
)
built_form_averages = (
self.data.groupby(["BUILT_FORM"], observed=True)
.apply(median_without_missing)
.reset_index()
)
# We can clean up any NA's in the cleaning averages with the general averages here
cleaning_averages_filled = pd.merge(
cleaning_averages,
general_averages,
on=["PROPERTY_TYPE", "BUILT_FORM"],
suffixes=["", "_AVERAGE"],
)
cleaning_averages_filled = pd.merge(
cleaning_averages_filled,
property_averages,
on=["PROPERTY_TYPE"],
suffixes=["", "_PROPERTY_AVERAGE"],
)
cleaning_averages_filled = pd.merge(
cleaning_averages_filled,
built_form_averages,
on=["BUILT_FORM"],
suffixes=["", "_BUILT_FORM_AVERAGE"],
)
for variable in AVERAGE_FIXED_FEATURES:
# Replace any missing NAN values with averages for the same Property type and built form
cleaning_averages_filled[variable] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[f"{variable}_AVERAGE"])
cleaning_averages_filled = cleaning_averages_filled.drop(
columns=f"{variable}_AVERAGE"
)
# If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope
# and built form
# We can use just the property type average and replace
cleaning_averages_filled[variable] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"])
cleaning_averages_filled = cleaning_averages_filled.drop(
columns=f"{variable}_PROPERTY_AVERAGE"
)
# If there are still NA values, use BUILT FORM averages
cleaning_averages_filled["variable"] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"])
cleaning_averages_filled = cleaning_averages_filled.drop(
columns=f"{variable}_BUILT_FORM_AVERAGE"
)
# If there still is na values, use average across all epc in consituecy
cleaning_averages_filled[variable] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[variable].mean())
# If the consituency is all NA values, then take UK AVERAGE VALUES
# cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[
# "TOTAL_FLOOR_AREA"
# ].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
# cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[
# "FLOOR_HEIGHT"
# ].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)
self.cleaning_averages = cleaning_averages_filled
def retain_multiple_epc_properties(
self, epc_minimum_count: int = 1, ignore_step: bool = False
) -> None:
"""
Reduce the data futher by keeping only datasets with multiple epcs
"""
if self.violation_mode:
# TODO: to fill in
return
if ignore_step:
return
counts = self.data.groupby("UPRN").size().reset_index()
counts.columns = ["UPRN", "count"]
# take UPRNS with multiple EPCs
counts = counts[counts["count"] > epc_minimum_count]
self.data = pd.merge(self.data, counts, on="UPRN")
def recast_df_columns(
self, column_mappings: dict, auto_subset_columns: bool = False
) -> None:
"""
Recast columns from the dataframe to ensure the behaviour we want
"""
if auto_subset_columns:
column_mappings = {
k: v for k, v in column_mappings.items() if k in self.data.columns
}
for key, values in column_mappings.items():
if key not in self.data.columns:
raise ValueError("Column mapping incorrectly specified")
if isinstance(values, list):
for value in values:
self.data[key] = self.data[key].astype(value)
else:
self.data[key] = self.data[key].astype(values)
def recast_all_data(
self, column_mappings: dict, auto_subset_columns: bool = False
) -> None:
"""
Using a dictionary to recast all columns at once
"""
if auto_subset_columns:
column_mappings = {
k: v for k, v in column_mappings.items() if k in self.data.columns
}
self.data = self.data.astype(column_mappings)
def confine_data(self, ignore_step: bool = False):
"""
Include all step to reduce down the data based on assumptions
"""
if self.violation_mode:
violation_uprn_missing = pd.isnull(self.data["UPRN"])
violation_old_lodgment_date = (
self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE
)
# violation_invalid_transaction_type = self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES
violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin(
IGNORED_FLOOR_LEVELS
)
violation_rdsap_score_above_max = self.data[RDSAP_RESPONSE] > MAX_SAP_SCORE
violation_missing_windows_description = pd.isnull(
self.data["WINDOWS_DESCRIPTION"]
)
violation_missing_hotwater_description = pd.isnull(
self.data["HOTWATER_DESCRIPTION"]
)
violation_missing_roof_description = pd.isnull(
self.data["ROOF_DESCRIPTION"]
)
violation_invalid_property_type = (
self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES
)
violation_invalid_tenure = self.data["TENURE"].isin(IGNORED_TENURES)
violation_df = pd.concat(
[
violation_uprn_missing,
violation_old_lodgment_date,
violation_invalid_transaction_type,
violation_ignored_floor_level,
violation_rdsap_score_above_max,
violation_missing_windows_description,
violation_missing_hotwater_description,
violation_missing_roof_description,
violation_invalid_property_type,
violation_invalid_tenure,
],
axis=1,
keys=[
"violation_uprn_missing",
"violation_old_lodgment_date",
"violation_invalid_transaction_type",
"violation_ignored_floor_level",
"violation_rdsap_score_above_max",
"violation_missing_windows_description",
"violation_missing_hotwater_description",
"violation_missing_roof_description",
"violation_invalid_property_type",
"violation_invalid_tenure",
],
)
self.data = pd.concat([self.data, violation_df], axis=1)
if ignore_step:
return
# Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
# Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
# before the introduction of SAP09
# Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
# full SAP, which produces different results to the RdSAP methodology
# Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
# Filter 5: Remove any EPCs with a SAP score above 100
# Filter 6: We found a small number of cases that have missing window description so we drop these
# Filter 7: We found a small number of cases that have missing hotwater description so we drop these
self.data = self.data[~pd.isnull(self.data["UPRN"])]
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
# self.data = self.data[self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES]
self.data = self.data[~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)]
self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE]
# We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them
self.data = self.data[~pd.isnull(self.data["WINDOWS_DESCRIPTION"])]
self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])]
self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])]
# Because park homes are surveyed unusually (for example, we don't have u-values to
# look up for their different components, they need to be collected in survey and aren't reflected in
# EPCs) we'll ignore them from the model
self.data = self.data[self.data["PROPERTY_TYPE"] != IGNORED_PROPERTY_TYPES]
# We remove EPCs where the tenure is unknown, but is usually an indicator of a new build
self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)]
# We remap zero values to None
self.data.loc[self.data["FLOOR_HEIGHT"] == 0, "FLOOR_HEIGHT"] = None
def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None:
"""
If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
"""
if self.violation_mode:
# TODO:
return
if ignore_step:
return
no_multi_glaze_proportion_index = pd.isnull(
self.data["MULTI_GLAZE_PROPORTION"]
) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
self.data.loc[no_multi_glaze_proportion_index, "MULTI_GLAZE_PROPORTION"] = 100
def clean_photo_supply(self) -> None:
"""
We fill photo supply with zeros where it's missing
"""
self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].astype("Int64").fillna(0)
@staticmethod
def apply_averages_cleaning(
data_to_clean,
cleaning_data,
cols_to_merge_on,
colnames=None,
ignore_step: bool = False,
):
"""
Clean the input DataFrame using averages from a cleaning DataFrame.
:param data_to_clean: DataFrame to be cleaned.
:param cleaning_data: DataFrame containing data for cleaning.
:param cols_to_merge_on: Columns on which merging is based. We pass cols_to_merge_on to this function as this
differs depending on where the function is being used.
:param colnames: If specified can be used to state exactly which columns to clean
:return: Cleaned DataFrame.
"""
if ignore_step:
return None
# The desired colnames to clean - which may not be present
if colnames is None:
colnames = [
"TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT",
"FIXED_LIGHTING_OUTLETS_COUNT",
]
cols_to_clean = [c for c in colnames if c in data_to_clean.columns]
# Enforce data types
for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]:
data_to_clean[col] = data_to_clean[col].astype(float)
# Identify columns with non-NaN values
columns_to_merge_on = data_to_clean[cols_to_merge_on].dropna().columns.tolist()
# Calculate averages
cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg(
dict(
zip(
cols_to_clean,
[
"mean",
]
* len(cols_to_clean),
)
)
)
# Merge with the original data
data_to_clean = pd.merge(
data_to_clean,
cleaning_averages_to_merge,
on=columns_to_merge_on,
suffixes=("", "_AVERAGE"),
how="left",
)
global_averages = cleaning_data[cols_to_clean].mean()
# Fill NaN values with averages
for col in cols_to_clean:
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"])
data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"])
# If we still have missings
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean())
# Final step if we still have missings - use global mean
data_to_clean[col] = data_to_clean[col].fillna(global_averages[col])
return data_to_clean
def get_component_features(self, suffix: str) -> pd.DataFrame:
"""
This function will return the property components such as the walls, roof, heating etc
as well as lodgement date. These are features that we expect might change from one EPC to the
next
:param suffix: Should be one of "_STARTING" or "_ENDING"
:return: Pandas dataframe containing the subset of columns defined in COMPONENT_FEATURES
"""
if suffix not in ["_starting", "_ending"]:
raise Exception("Suffix should be one of _starting or _ending")
if suffix == "_STARTING":
starting_cols = (
self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
.copy()
.add_suffix(suffix)
)
fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy()
return pd.concat([starting_cols, fixed_cols], axis=1)
return (
self.data[ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
.copy()
.add_suffix(suffix)
)
def get_fixed_features(self) -> pd.DataFrame:
"""
Returns the fixed features that we don't believe should vary from one EPC to the next
:return: Pandas dataframe containing the columns defined in FIXED_FEATURES
"""
return self.data[FIXED_FEATURES]
@staticmethod
def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
"""
Coerce columns with string 'True'/'False' values to boolean columns.
:param df: Input DataFrame.
:param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
:return: DataFrame with coerced columns.
"""
object_columns = df.select_dtypes(include=["object"]).columns
if cols_to_ignore:
object_columns = [c for c in object_columns if c not in cols_to_ignore]
for column in object_columns:
unique_values = df[column].dropna().unique()
# If the unique values in the column are 'True' and 'False', convert the column to boolean
if set(unique_values) == {"True", "False"} or set(unique_values) == {
True,
False,
}:
df[column] = df[column].astype(bool)
return df
@staticmethod
def calculate_days_to(lodgement_date):
if isinstance(lodgement_date, str):
return (
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
).days
return (
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
).dt.days
@staticmethod
def clean_missings_after_description_process(df, ignore_cols=None):
missings = pd.isnull(df).sum()
missings = missings[missings > 0]
if ignore_cols:
missings = missings[~missings.index.isin(ignore_cols)]
for col in missings.index:
unique_values = df[col].unique()
# TODO: confirm this behaviour
if True in unique_values or False in unique_values:
df[col] = df[col].fillna(False)
if "none" in unique_values:
df[col] = df[col].fillna("none")
else:
df[col] = df[col].fillna("Unknown")
return df
@staticmethod
def clean_efficiency_variables(df):
"""
These is scope to clean this by the model per corresponding description.
E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
fill in the missing values with this.
When looking at this initially, there are a large volume of records with missing energy efficiency
values and therefore a simpler approach was taken just to test including these variables
:param df:
:return:
"""
missings = pd.isnull(df).sum()
missings = missings[missings >= 1]
if len(missings) == 0:
return df
# Make sure they are all efficiency columns
if any(~missings.index.str.contains("ENERGY_EFF")):
raise ValueError("Non efficiency columns are missing")
for m in missings.index:
df[m] = df[m].fillna("NO_RATING")
return df