mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
605 lines
22 KiB
Python
605 lines
22 KiB
Python
from pathlib import Path
|
|
import numpy as np
|
|
import pandas as pd
|
|
from BaseUtility import Definitions
|
|
from etl.epc.settings import (
|
|
DATA_PROCESSOR_SETTINGS,
|
|
EARLIEST_EPC_DATE,
|
|
FULLY_GLAZED_DESCRIPTIONS,
|
|
AVERAGE_FIXED_FEATURES,
|
|
BUILT_FORM_REMAP,
|
|
COLUMNS_TO_MERGE_ON,
|
|
FIXED_FEATURES,
|
|
COLUMNTYPES,
|
|
RDSAP_RESPONSE,
|
|
MAX_SAP_SCORE,
|
|
fill_na_map,
|
|
STARTING_SUFFIX_COMPONENT_COLS,
|
|
NO_SUFFIX_COMPONENT_COLS,
|
|
ENDING_SUFFIX_COMPONENT_COLS,
|
|
POTENTIAL_COLUMNS,
|
|
EFFICIENCY_FEATURES,
|
|
)
|
|
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
|
|
|
|
from typing import List
|
|
|
|
# Lookup tables used to normalise the CONSTRUCTION_AGE_BAND column.

# Inclusive lower ("l") and upper ("u") year for every standard age band label.
bounds_map = {
    "England and Wales: before 1900": {"l": 0, "u": 1899},
    "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
    "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
    "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
    "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
    "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
    "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
    "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
    "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}

# Non-standard labels that are rewritten to a standard label.
remap = {
    "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
}

# Year (0..3000 inclusive) -> label of the first band in bounds_map whose bounds contain it.
expanded_map = {
    year: next(
        band
        for band, limits in bounds_map.items()
        if limits["l"] <= year <= limits["u"]
    )
    for year in range(0, 3001)
}
|
|
|
|
|
|
def is_int(x):
    """
    Report whether ``int(x)`` succeeds for the given value.

    :param x: Any value, e.g. a string, a number or None
    :return: True if ``int(x)`` does not raise, otherwise False
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types such as None
        # ValueError: non-numeric strings and NaN
        # OverflowError: infinite floats
        # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
        return False
    return True
|
|
|
|
|
|
class DataProcessor:
|
|
"""
|
|
Handle data loading and data preprocessing
|
|
"""
|
|
|
|
def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
|
|
"""
|
|
:param filepath: If specified, is the physical location of the data
|
|
:param newdata: Indicates if we are processing new, testing data.
|
|
In this instance, there are some operations we do not
|
|
want to perform, such as confine_data()
|
|
"""
|
|
self.filepath = filepath
|
|
self.data = None
|
|
self.newdata = newdata
|
|
|
|
def load_data(self, low_memory=False) -> None:
    """
    Read the CSV found at self.filepath and store it on self.data.

    :param low_memory: Passed straight through to pandas.read_csv
    :raises ValueError: If no filepath has been set on the instance
    """
    if not self.filepath:
        raise ValueError("No filepath specified")

    frame = pd.read_csv(self.filepath, low_memory=low_memory)
    self.data = frame
|
|
|
|
def insert_data(self, data: pd.DataFrame) -> None:
    """
    Use an already-loaded dataframe as the data to process.

    :param data: Dataframe stored on self.data as-is (no copy is taken)
    """
    self.data = data
|
|
|
|
@staticmethod
def clean_construction_age_band(x):
    """
    Map a raw construction age band value onto one of the standard band labels.

    :param x: Raw value; may be missing, an anomaly marker, a standard label,
        a known non-standard label, or something convertible to an integer year
    :return: The value unchanged if it is missing, an anomaly marker or already a
        standard label; otherwise the standard label it maps to
    :raises NotImplementedError: If the value matches none of the handled cases
    :raises KeyError: If the value converts to an integer outside 0..3000
    """
    # Missing values pass through untouched. pd.isna covers None, any NaN float and
    # pd.NA, not only the np.nan singleton that an identity-based ``in`` check matches.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return x

    # Known error values also pass through untouched
    if x in Definitions.DATA_ANOMALY_MATCHES:
        return x

    # Already a standard label
    if x in bounds_map:
        return x

    # A known non-standard label with a fixed replacement
    remap_value = remap.get(x, None)
    if remap_value:
        return remap_value

    # A year: look up the band that contains it
    if is_int(x):
        return expanded_map[int(x)]

    # Wrap in a tuple so that a tuple-valued x is formatted rather than unpacked
    raise NotImplementedError("Not handled the case for value %s" % (x,))
|
|
|
|
def standardise_construction_age_band(self):
    """
    Normalise CONSTRUCTION_AGE_BAND and discard rows where it is missing.

    Each value is passed through clean_construction_age_band(), then every row
    whose band is null is removed from self.data.
    """
    cleaned_bands = self.data["CONSTRUCTION_AGE_BAND"].apply(self.clean_construction_age_band)
    self.data["CONSTRUCTION_AGE_BAND"] = cleaned_bands

    has_band = self.data["CONSTRUCTION_AGE_BAND"].notna()
    self.data = self.data[has_band]
|
|
|
|
def clean_missing_rooms(self):
    """
    Fill missing NUMBER_HEATED_ROOMS and NUMBER_HABITABLE_ROOMS values with group medians.

    Rows are first matched on PROPERTY_TYPE, BUILT_FORM, CONSTRUCTION_AGE_BAND and
    POSTAL_AREA. While values are still missing, the last matching column is dropped
    and the fill is repeated, down to PROPERTY_TYPE alone.

    Side effects: adds a POSTAL_AREA column to self.data, and each fill pass replaces
    self.data with the result of a merge (which resets the index).

    TODO: We could use a model based imputation approach for possibly more accurate cleaning

    :raises NotImplementedError: If values are still missing after matching on
        PROPERTY_TYPE alone
    """
    # Text before the first space of the postcode. Missing or non-string postcodes are
    # passed through unchanged instead of raising AttributeError.
    self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(
        lambda x: x.split(" ")[0] if isinstance(x, str) else x
    )

    def apply_clean(data, col, matching_columns):
        """Fill nulls in ``col`` with the median of ``col`` per ``matching_columns`` group."""
        cleaning_data = (
            data[~pd.isnull(data[col])]
            .groupby(matching_columns)[col]
            .median()
            .reset_index()
        )
        data = data.merge(
            cleaning_data, how="left", on=matching_columns, suffixes=("", "_CLEANING")
        )
        data[col] = np.where(pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col])
        return data.drop(columns=f"{col}_CLEANING")

    matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "POSTAL_AREA"]

    for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
        # Most specific grouping first, then progressively coarser ones
        for n_keys in range(len(matching_columns), 0, -1):
            if not pd.isnull(self.data[col]).any():
                break
            self.data = apply_clean(
                data=self.data,
                col=col,
                matching_columns=matching_columns[:n_keys],
            )

        # Still missing after the coarsest grouping - something has gone wrong or
        # we have a very unique property type
        if pd.isnull(self.data[col]).any():
            raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
|
|
|
|
def pre_process(self) -> pd.DataFrame:
    """
    Load the data if necessary and run the initial cleaning pipeline.

    Steps that only apply to training data are skipped when self.newdata is set.

    :return: The cleaned dataframe, which is also stored on self.data
    """
    if self.data is None:
        self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])

    if not self.newdata:
        self.confine_data()

    self.remap_columns()

    # We have some non-standard construction age bands which we'll clean for matching
    if not self.newdata:
        self.standardise_construction_age_band()

    self.clean_missing_rooms()

    self.recast_df_columns(
        column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
    )

    if not self.newdata:
        self.clean_multi_glaze_proportion()

    self.clean_photo_supply()

    if not self.newdata:
        self.retain_multiple_epc_properties(
            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
        )

    # NOTE(review): it is unclear whether this check was originally nested under the
    # ``if not self.newdata`` guard above - confirm against the intended behaviour.
    if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1:
        # If we have multiple EPC records, we can try and do filling
        self.fill_na_fields()

    if not self.newdata:
        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

    # Final re-casting after data transformed and prepared. For new data only the
    # columns that are actually present are cast.
    if self.newdata:
        coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns}
    else:
        coltypes = COLUMNTYPES
    # A single frame-level cast covers every listed column; casting each column
    # individually beforehand was redundant work.
    self.data = self.data.astype(coltypes)

    self.na_remapping()

    return self.data
|
|
|
|
def na_remapping(self):
    """
    Fill remaining nulls using the per-column fill values defined in fill_na_map.

    For new data only the columns present in self.data are filled; otherwise every
    column listed in fill_na_map is expected to exist.
    """
    if self.newdata:
        fill_values = {
            column: value
            for column, value in fill_na_map.items()
            if column in self.data.columns
        }
    else:
        fill_values = fill_na_map

    for column in fill_values:
        self.data[column] = self.data[column].fillna(fill_values[column])
|
|
|
|
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
    """
    Fill gaps in selected columns using the other EPC records of the same UPRN.

    Within each UPRN group, missing values are first filled backward and then filled
    forward, following the current row order of self.data. Empty strings in
    FLOOR_HEIGHT and TOTAL_FLOOR_AREA are afterwards replaced with None.

    :param columns_to_fill: Columns to fill within each UPRN group
    """
    # The groupby changes the row order and prefixes the index with the UPRN key, so we
    # drop that key level again and sort to realign with the original index.
    # bfill()/ffill() replace the deprecated fillna(method=...) form.
    filled_data = (
        self.data.groupby("UPRN", group_keys=True)[columns_to_fill]
        .apply(lambda group: group.bfill().ffill())
        .droplevel(0)
        .sort_index()
    )

    self.data[columns_to_fill] = filled_data[columns_to_fill]

    # For floor area, we also replace "" values with None. The dict form makes the
    # explicit None replacement unambiguous.
    floor_columns = ["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
    self.data[floor_columns] = self.data[floor_columns].replace({"": None})
|
|
|
|
def remap_columns(self):
    """
    Replace anomaly values and NaN with None, then remap selected columns.

    Every value listed in Definitions.DATA_ANOMALY_MATCHES is replaced with None across
    the whole dataframe. FLOOR_LEVEL and BUILT_FORM are remapped for non-new data, and
    TRANSACTION_TYPE is lower-cased. The result is stored back on self.data.
    """
    # Map all anomaly values to None
    data_anomaly_map = dict.fromkeys(Definitions.DATA_ANOMALY_MATCHES)

    # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
    data = self.data.replace(data_anomaly_map)
    # np.nan rather than the np.NAN alias, which no longer exists in NumPy 2.0
    data = data.replace(np.nan, None)

    # Remap certain columns
    # NOTE(review): the extent of this guard is reconstructed (FLOOR_LEVEL and BUILT_FORM
    # inside, lower-casing outside) - confirm against the intended behaviour.
    if not self.newdata:
        data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
        data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)

    convert_to_lower = ["TRANSACTION_TYPE"]
    for col in convert_to_lower:
        data[col] = data[col].str.lower()

    self.data = data
|
|
|
|
def make_cleaning_averages(self) -> pd.DataFrame:
    """
    Build a table of median AVERAGE_FIXED_FEATURES values per property archetype.

    Medians are computed per (PROPERTY_TYPE, BUILT_FORM, CONSTRUCTION_AGE_BAND,
    NUMBER_HABITABLE_ROOMS, NUMBER_HEATED_ROOMS) group. Any median that is missing is
    filled, in order, from the (PROPERTY_TYPE, BUILT_FORM) median, the PROPERTY_TYPE
    median, the BUILT_FORM median and finally the mean of that column in the table.

    :return: One row per archetype group with the filled medians
    """

    # Define a custom function to calculate the median, excluding missing values
    def median_without_missing(group):
        return group[AVERAGE_FIXED_FEATURES].median(skipna=True)

    def grouped_medians(keys, **groupby_kwargs):
        """Medians of the averaged features for every observed combination of ``keys``."""
        return (
            self.data.groupby(keys, observed=True, **groupby_kwargs)
            .apply(median_without_missing)
            .reset_index()
        )

    cleaning_averages_filled = grouped_medians(
        [
            "PROPERTY_TYPE",
            "BUILT_FORM",
            "CONSTRUCTION_AGE_BAND",
            "NUMBER_HABITABLE_ROOMS",
            "NUMBER_HEATED_ROOMS",
        ],
        dropna=False,
    )

    # Coarser groupings used as fallbacks, most specific first, with the suffix that
    # their columns receive when merged onto the detailed table.
    fallbacks = [
        (["PROPERTY_TYPE", "BUILT_FORM"], "_AVERAGE"),
        (["PROPERTY_TYPE"], "_PROPERTY_AVERAGE"),
        (["BUILT_FORM"], "_BUILT_FORM_AVERAGE"),
    ]

    # NOTE(review): these are inner merges (the pd.merge default), so detailed groups
    # with no match in a fallback table are dropped - confirm this is intended.
    for keys, suffix in fallbacks:
        cleaning_averages_filled = pd.merge(
            cleaning_averages_filled,
            grouped_medians(keys),
            on=keys,
            suffixes=("", suffix),
        )

    for variable in AVERAGE_FIXED_FEATURES:
        # Fill from each fallback in turn and drop its helper column. The built-form
        # fallback now writes to the real column rather than a column named "variable".
        for _, suffix in fallbacks:
            fallback_column = f"{variable}{suffix}"
            cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
                cleaning_averages_filled[fallback_column]
            )
            cleaning_averages_filled = cleaning_averages_filled.drop(columns=fallback_column)

        # If there are still NA values, use the mean of the column across the table
        cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
            cleaning_averages_filled[variable].mean()
        )

    # TODO: if a column is entirely NA, fall back to national average values
    # (e.g. for TOTAL_FLOOR_AREA and FLOOR_HEIGHT)

    return cleaning_averages_filled
|
|
|
|
def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
    """
    Keep only the properties that have more than ``epc_minimum_count`` EPC records.

    A ``count`` column holding the number of records for the row's UPRN is added to
    self.data as part of the merge.

    :param epc_minimum_count: A UPRN is kept when its record count is strictly greater
        than this value
    """
    epc_counts = self.data.groupby("UPRN").size().rename("count").reset_index()

    # take UPRNS with multiple EPCs
    qualifying = epc_counts[epc_counts["count"] > epc_minimum_count]
    self.data = self.data.merge(qualifying, on="UPRN")
|
|
|
|
def recast_df_columns(self, column_mappings: dict) -> None:
    """
    Cast dataframe columns through the dtypes listed for them.

    :param column_mappings: Maps a column name to a sequence of dtypes; the column is
        cast to each dtype in turn, in the order given
    :raises ValueError: If a column name in the mapping is not present in self.data
    """
    available = self.data.columns

    for column, dtype_chain in column_mappings.items():
        if column not in available:
            raise ValueError("Column mapping incorrectly specified")

        for dtype in dtype_chain:
            self.data[column] = self.data[column].astype(dtype)
|
|
|
|
def confine_data(self) -> None:
    """
    Reduce the data down to the records that satisfy our modelling assumptions.

    The rules below are applied one after another, each to the output of the previous
    one, and self.data is replaced with the surviving rows.
    """
    keep_rules = [
        # UPRN is a unique identifier for a property, so EPCs without one are removed
        lambda df: ~pd.isnull(df["UPRN"]),
        # Remove EPCs lodged before EARLIEST_EPC_DATE (before the introduction of SAP09)
        lambda df: df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE,
        # New-build EPCs are performed with full SAP, which produces different results
        # to the RdSAP methodology
        lambda df: df["TRANSACTION_TYPE"] != "new dwelling",
        # "top floor" / "mid floor" floor levels are ambiguous
        lambda df: ~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"]),
        # Remove any EPCs whose score is above the maximum SAP score
        lambda df: df[RDSAP_RESPONSE] <= MAX_SAP_SCORE,
        # A small number of records have missing window, hot water or roof descriptions
        lambda df: ~pd.isnull(df["WINDOWS_DESCRIPTION"]),
        lambda df: ~pd.isnull(df["HOTWATER_DESCRIPTION"]),
        lambda df: ~pd.isnull(df["ROOF_DESCRIPTION"]),
        # Park homes are surveyed unusually (for example, the u-values of their
        # components are collected in survey and aren't reflected in EPCs), so they
        # are left out of the model
        lambda df: df["PROPERTY_TYPE"] != "Park home",
    ]

    for rule in keep_rules:
        self.data = self.data[rule(self.data)]
|
|
|
|
def clean_multi_glaze_proportion(self) -> None:
    """
    Set MULTI_GLAZE_PROPORTION to 100 where it is missing but the windows are fully glazed.

    A row counts as fully glazed when its WINDOWS_DESCRIPTION is one of
    FULLY_GLAZED_DESCRIPTIONS.
    """
    proportion_missing = self.data["MULTI_GLAZE_PROPORTION"].isna()
    fully_glazed = self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)

    self.data.loc[proportion_missing & fully_glazed, "MULTI_GLAZE_PROPORTION"] = 100
|
|
|
|
def clean_photo_supply(self) -> None:
    """
    Replace missing PHOTO_SUPPLY values with zero.
    """
    photo_supply = self.data["PHOTO_SUPPLY"]
    self.data["PHOTO_SUPPLY"] = photo_supply.fillna(0)
|
|
|
|
@staticmethod
|
|
def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on):
|
|
"""
|
|
Clean the input DataFrame using averages from a cleaning DataFrame.
|
|
|
|
:param data_to_clean: DataFrame to be cleaned.
|
|
:param cleaning_data: DataFrame containing data for cleaning.
|
|
:param cols_to_merge_on: Columns on which merging is based. We pass cols_to_merge_on to this function as this
|
|
differs depending on where the function is being used.
|
|
:return: Cleaned DataFrame.
|
|
"""
|
|
|
|
cols_to_clean = [
|
|
c for c in ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"] if
|
|
c in data_to_clean.columns
|
|
]
|
|
|
|
# Enforce data types
|
|
for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]:
|
|
data_to_clean[col] = data_to_clean[col].astype(float)
|
|
|
|
# Identify columns with non-NaN values
|
|
columns_to_merge_on = data_to_clean[cols_to_merge_on].dropna().columns.tolist()
|
|
|
|
# Calculate averages
|
|
cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg(
|
|
dict(zip(cols_to_clean, ["mean", ] * len(cols_to_clean)))
|
|
)
|
|
|
|
# Merge with the original data
|
|
data_to_clean = pd.merge(
|
|
data_to_clean,
|
|
cleaning_averages_to_merge,
|
|
on=columns_to_merge_on,
|
|
suffixes=("", "_AVERAGE"),
|
|
how='left'
|
|
)
|
|
|
|
# Fill NaN values with averages
|
|
for col in cols_to_clean:
|
|
data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"], inplace=True)
|
|
data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True)
|
|
|
|
return data_to_clean
|
|
|
|
def get_component_features(self, suffix: str) -> pd.DataFrame:
    """
    Return the property component columns (walls, roof, heating etc, as well as
    lodgement date), i.e. the features that we expect might change from one EPC to
    the next.

    For "_STARTING" the suffixed starting and efficiency columns are returned alongside
    the unsuffixed NO_SUFFIX_COMPONENT_COLS and POTENTIAL_COLUMNS. For "_ENDING" only the
    suffixed ending and efficiency columns are returned.

    :param suffix: Should be one of "_STARTING" or "_ENDING"
    :return: Pandas dataframe containing the selected columns
    :raises Exception: If the suffix is not one of the two supported values
    """
    if suffix == "_STARTING":
        suffixed = self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES].copy()
        suffixed = suffixed.add_suffix(suffix)
        unsuffixed = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy()
        return pd.concat([suffixed, unsuffixed], axis=1)

    if suffix == "_ENDING":
        ending = self.data[ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES].copy()
        return ending.add_suffix(suffix)

    raise Exception("Suffix should be one of _STARTING or _ENDING")
|
|
|
|
def get_fixed_features(self) -> pd.DataFrame:
    """
    Select the features that we don't believe should vary from one EPC to the next.

    :return: Pandas dataframe containing the columns defined in FIXED_FEATURES
    """
    fixed = self.data[FIXED_FEATURES]
    return fixed
|
|
|
|
@staticmethod
|
|
def coerce_boolean_columns(df: pd.DataFrame, cols_to_ignore: List | None = None):
|
|
"""
|
|
Coerce columns with string 'True'/'False' values to boolean columns.
|
|
|
|
:param df: Input DataFrame.
|
|
:param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
|
|
:return: DataFrame with coerced columns.
|
|
"""
|
|
object_columns = df.select_dtypes(include=['object']).columns
|
|
if cols_to_ignore:
|
|
object_columns = [c for c in object_columns if c not in cols_to_ignore]
|
|
|
|
for column in object_columns:
|
|
unique_values = df[column].dropna().unique()
|
|
# If the unique values in the column are 'True' and 'False', convert the column to boolean
|
|
if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}:
|
|
df[column] = df[column].astype(bool)
|
|
|
|
return df
|
|
|
|
@staticmethod
def calculate_days_to(lodgement_date):
    """
    Number of whole days between EARLIEST_EPC_DATE and the lodgement date(s).

    :param lodgement_date: A single date (string or datetime-like) or a pandas Series
        of dates; anything accepted by pandas.to_datetime
    :return: An integer for a single date, or a Series of day counts for a Series
    """
    elapsed = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)

    # A Series exposes the day count through the .dt accessor; a single Timedelta
    # (from a string or datetime-like scalar) exposes it directly.
    if isinstance(elapsed, pd.Series):
        return elapsed.dt.days

    return elapsed.days
|
|
|
|
@staticmethod
|
|
def clean_missings_after_description_process(df, ignore_cols=None):
|
|
missings = pd.isnull(df).sum()
|
|
missings = missings[missings > 0]
|
|
|
|
if ignore_cols:
|
|
missings = missings[~missings.index.isin(ignore_cols)]
|
|
|
|
for col in missings.index:
|
|
unique_values = df[col].unique()
|
|
if True in unique_values or False in unique_values:
|
|
df[col] = df[col].fillna(False)
|
|
if "none" in unique_values:
|
|
df[col] = df[col].fillna("none")
|
|
else:
|
|
df[col] = df[col].fillna("Unknown")
|
|
|
|
return df
|
|
|
|
@staticmethod
|
|
def clean_efficiency_variables(df):
|
|
|
|
"""
|
|
These is scope to clean this by the model per corresponding description.
|
|
E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
|
|
fill in the missing values with this.
|
|
When looking at this initially, there are a large volume of records with missing energy efficiency
|
|
values and therefore a simpler approach was taken just to test including these variables
|
|
:param df:
|
|
:return:
|
|
"""
|
|
|
|
missings = pd.isnull(df).sum()
|
|
missings = missings[missings >= 1]
|
|
|
|
if len(missings) == 0:
|
|
return df
|
|
|
|
# Make sure they are all efficiency columns
|
|
if any(~missings.index.str.contains("ENERGY_EFF")):
|
|
raise ValueError("Non efficiency columns are missing")
|
|
|
|
for m in missings.index:
|
|
df[m] = df[m].fillna("NO_RATING")
|
|
|
|
return df
|