formatting

This commit is contained in:
Michael Duong 2024-02-22 16:19:40 +00:00
parent fc237f9dfe
commit 955e72f0bb

View file

@ -1,19 +1,133 @@
import numpy as np
import pandas as pd
from typing import List
from etl.epc.Record import EPCDifferenceRecord
from ValidationConfiguration import DatasetValidationConfiguration
from etl.epc.ValidationConfiguration import DatasetValidationConfiguration
from etl.epc.settings import EARLIEST_EPC_DATE
from recommendations.rdsap_tables import england_wales_age_band_lookup
from recommendations.recommendation_utils import (
get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
get_wall_type
estimate_number_of_floors,
get_wall_u_value,
get_roof_u_value,
get_floor_u_value,
estimate_perimeter,
get_wall_type,
)
# TODO: This list could probably be produced by the property change app and
# stored in S3 instead of being maintained by hand here.
#
# Each group pairs a run of base column names with the suffixes to emit for it.
# Within a group every name is emitted with the first suffix before moving on to
# the next suffix, so "x", "y", "x_ending", "y_ending" comes out in that order.
_BOOLEAN_VARIABLE_GROUPS = (
    # Flags listed only in their unsuffixed form
    (
        (
            "is_cavity_wall",
            "is_filled_cavity",
            "is_solid_brick",
            "is_system_built",
            "is_timber_frame",
            "is_granite_or_whinstone",
            "is_as_built",
            "is_cob",
            "is_sandstone_or_limestone",
            "is_park_home",
            "external_insulation",
            "internal_insulation",
        ),
        ("",),
    ),
    # Flags from the group above that are also listed with the "_ending" suffix
    (
        (
            "is_park_home",
            "external_insulation",
            "internal_insulation",
        ),
        ("_ending",),
    ),
    # More flags listed only in their unsuffixed form
    (
        (
            "is_to_unheated_space",
            "is_to_external_air",
            "is_suspended",
            "is_solid",
            "another_property_below",
            "is_pitched",
            "is_roof_room",
            "is_loft",
            "is_flat",
            "is_thatched",
            "is_at_rafters",
            "has_dwelling_above",
        ),
        ("",),
    ),
    # Flags listed both unsuffixed and with the "_ending" suffix
    (
        (
            "has_radiators",
            "has_fan_coil_units",
            "has_pipes_in_screed_above_insulation",
            "has_pipes_in_insulated_timber_floor",
            "has_pipes_in_concrete_slab",
            "has_boiler",
            "has_air_source_heat_pump",
            "has_room_heaters",
            "has_electric_storage_heaters",
            "has_warm_air",
            "has_electric_underfloor_heating",
            "has_electric_ceiling_heating",
            "has_community_scheme",
            "has_ground_source_heat_pump",
            "has_no_system_present",
            "has_portable_electric_heaters",
            "has_water_source_heat_pump",
            "has_electric_heat_pump",
            "has_micro-cogeneration",
            "has_solar_assisted_heat_pump",
            "has_exhaust_source_heat_pump",
            "has_community_heat_pump",
            "has_electric",
            "has_mains_gas",
            "has_wood_logs",
            "has_coal",
            "has_oil",
            "has_wood_pellets",
            "has_anthracite",
            "has_dual_fuel_mineral_and_wood",
            "has_smokeless_fuel",
            "has_lpg",
            "has_b30k",
            "has_electricaire",
            "has_assumed_for_most_rooms",
            "has_underfloor_heating",
        ),
        ("", "_ending"),
    ),
    (
        ("multiple_room_thermostats",),
        ("", "_ending"),
    ),
    (
        (
            "is_community",
            "no_individual_heating_or_community_network",
        ),
        ("", "_ending"),
    ),
)

# Flat list of column names that are treated as boolean flags.
BOOLEAN_VARIABLES = [
    base_name + suffix
    for base_names, suffixes in _BOOLEAN_VARIABLE_GROUPS
    for suffix in suffixes
    for base_name in base_names
]
class BaseDataset:
"""
# Base class for all datasets
Base class for all datasets
"""
def __init__(self) -> None:
@ -33,18 +147,20 @@ class BaseDataset:
# raise ValueError(f"Pipeline type {pipeline_type} not found")
# return self.pipeline_steps[pipeline_type]
class TrainingDataset(BaseDataset):
"""
A collection of EPCDifferenceRecords can be combined into a TrainingDataset.
"""
def __init__(self, datasets: List[EPCDifferenceRecord], cleaned_lookup: dict) -> None:
def __init__(
self, datasets: List[EPCDifferenceRecord], cleaned_lookup: dict
) -> None:
# self.pipeline_steps = self.pipeline_factory("training")
self.datasets = datasets
self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])
self._feature_generation()
self._drop_features()
self._clean_efficiency_variables()
@ -59,14 +175,51 @@ class TrainingDataset(BaseDataset):
self._null_validation(information="Clean Missing Values")
self._remove_abnormal_change_in_floor_area()
self._ensure_numeric()
self._organise_starting_ending_columns()
def _organise_starting_ending_columns(self):
    """
    Organise the starting and ending columns so that they are next to each other

    The columns of self.df are reordered (no column is removed) into:
      1. columns that end with neither "_starting" nor "_ending"
      2. "_starting" columns that have no "_ending" counterpart
      3. "_ending" columns that have no "_starting" counterpart
      4. each "<name>_starting" immediately followed by its "<name>_ending"
    Within every group the original left-to-right order is preserved.
    """
    starting_suffix = "_starting"
    ending_suffix = "_ending"

    columns = list(self.df.columns)
    # Set membership keeps the partner lookups O(1) per column
    known_columns = set(columns)

    unsuffixed = []
    unpaired_starting = []
    unpaired_ending = []
    paired = []

    for col in columns:
        # Match on the suffix only: a name that merely contains "_starting" or
        # "_ending" somewhere in the middle is an ordinary column and must not
        # be split and rebuilt into a column name that does not exist.
        if col.endswith(starting_suffix):
            partner = col[: -len(starting_suffix)] + ending_suffix
            if partner in known_columns:
                paired.extend([col, partner])
            else:
                # Keep it: reordering should never silently drop data
                unpaired_starting.append(col)
        elif col.endswith(ending_suffix):
            partner = col[: -len(ending_suffix)] + starting_suffix
            # A paired ending column is emitted together with its starting column
            if partner not in known_columns:
                unpaired_ending.append(col)
        else:
            unsuffixed.append(col)

    self.df = self.df.loc[
        :, unsuffixed + unpaired_starting + unpaired_ending + paired
    ]
def _remove_abnormal_change_in_floor_area(self, max_relative_change=0.5):
    """
    Remove properties where the change in floor area is 50% or more

    The change is measured as
    |total_floor_area_ending - total_floor_area_starting| / total_floor_area_starting
    and only rows where it is strictly below max_relative_change are kept. Rows
    where the ratio is NaN or infinite (e.g. a starting area of 0 or a missing
    area) therefore get removed as well.

    :param max_relative_change: exclusive upper bound on the relative change;
        defaults to 0.5, i.e. 50%
    """
    starting_area = self.df["total_floor_area_starting"]
    ending_area = self.df["total_floor_area_ending"]

    # Work on a standalone Series so that no temporary column is ever written
    # into self.df (and none has to be dropped again afterwards)
    relative_change = (ending_area - starting_area).abs() / starting_area

    self.df = self.df[relative_change < max_relative_change]
@ -75,7 +228,9 @@ class TrainingDataset(BaseDataset):
Ensure that all columns are numeric
"""
# TODO: move into EPCRecord record
uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col]
uvalue_columns = [
col for col in self.df.columns if "thermal_transmittance" in col
]
for uvalue_col in uvalue_columns:
self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col])
@ -85,12 +240,16 @@ class TrainingDataset(BaseDataset):
Using the apply method, use the get_roof_u_value method to generate the u-value
"""
col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending"
col_name = (
"roof_insulation_thickness"
if not is_end
else "roof_insulation_thickness_ending"
)
if row["has_dwelling_above"]:
if row["roof_thermal_transmittance"] != 0:
raise ValueError("Should have 0 u-value for roof")
if row["roof_thermal_transmittance_ending"] != 0:
raise ValueError("Should have 0 u-value for roof")
@ -103,16 +262,24 @@ class TrainingDataset(BaseDataset):
is_flat=row["is_flat"],
is_pitched=row["is_pitched"],
is_at_rafters=row["is_at_rafters"],
age_band=england_wales_age_band_lookup[row["construction_age_band"]]
)
age_band=england_wales_age_band_lookup[row["construction_age_band"]],
)
@staticmethod
def _lambda_function_to_generate_wall_uvalue(row, is_end=False):
"""
Using the apply method, use the get_wall_u_value method to generate the u-value
"""
description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending"
thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending"
description_col_name = (
"walls_clean_description"
if not is_end
else "walls_clean_description_ending"
)
thermal_transistance_col_name = (
"walls_thermal_transmittance"
if not is_end
else "walls_thermal_transmittance_ending"
)
if pd.isnull(row[thermal_transistance_col_name]):
output = get_wall_u_value(
@ -125,14 +292,18 @@ class TrainingDataset(BaseDataset):
output = row[thermal_transistance_col_name]
return output
@staticmethod
def _lambda_function_to_generate_floor_uvalue(row, is_end=False):
"""
Using the apply method, use the get_floor_u_value method to generate the u-value
"""
floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending"
floor_thermal_col_name = (
"floor_thermal_transmittance"
if not is_end
else "floor_thermal_transmittance_ending"
)
if row["another_property_below"]:
if row["floor_thermal_transmittance"] != 0:
@ -145,20 +316,31 @@ class TrainingDataset(BaseDataset):
uvalue = row[floor_thermal_col_name]
if pd.isnull(uvalue):
insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending"
floor_area_col_name = "estimated_perimeter_starting" if not is_end else "estimated_perimeter_ending"
perimeter_col_name = "total_floor_area_starting" if not is_end else "total_floor_area_ending"
insulation_col_name = (
"floor_insulation_thickness"
if not is_end
else "floor_insulation_thickness_ending"
)
perimeter_col_name = (
"estimated_perimeter_starting"
if not is_end
else "estimated_perimeter_ending"
)
floor_area_col_name = (
"ground_floor_area_starting"
if not is_end
else "ground_floor_area_ending"
)
uvalue = get_floor_u_value(
floor_type=row["floor_type"],
perimeter=row[floor_area_col_name],
area=row[perimeter_col_name],
insulation_thickness=row[insulation_col_name],
wall_type=row["wall_type"],
age_band=england_wales_age_band_lookup[row["construction_age_band"]]
)
floor_type=row["floor_type"],
perimeter=row[perimeter_col_name],
area=row[floor_area_col_name],
insulation_thickness=row[insulation_col_name],
wall_type=row["wall_type"],
age_band=england_wales_age_band_lookup[row["construction_age_band"]],
)
return uvalue
def _generate_u_values_from_features(self):
@ -171,88 +353,136 @@ class TrainingDataset(BaseDataset):
# ~~~~~~~~~~~~~~~~~~
walls_starting_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_wall_uvalue(row),
axis=1
lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1
)
walls_ending_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_wall_uvalue(row, is_end=True),
axis=1
axis=1,
)
walls_starting_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_starting_uvalue)
walls_starting_equals_ending_flag = self.df['walls_clean_description'] == self.df["walls_clean_description_ending"]
walls_ending_uvalue[walls_starting_equals_ending_flag] = walls_starting_uvalue[walls_starting_equals_ending_flag]
walls_starting_uvalue = self.df["walls_thermal_transmittance"].fillna(
walls_starting_uvalue
)
walls_starting_equals_ending_flag = (
self.df["walls_clean_description"]
== self.df["walls_clean_description_ending"]
)
walls_ending_uvalue[walls_starting_equals_ending_flag] = walls_starting_uvalue[
walls_starting_equals_ending_flag
]
# ~~~~~~~~~~~~~~~~~~
# Roof
# ~~~~~~~~~~~~~~~~~~
roof_starting_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_roof_uvalue(row),
axis=1
lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1
)
roof_ending_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_roof_uvalue(row, is_end=True),
axis=1
axis=1,
)
roof_starting_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_starting_uvalue)
roof_ending_uvalue = self.df['roof_thermal_transmittance_ending'].fillna(roof_ending_uvalue)
roof_starting_uvalue = self.df["roof_thermal_transmittance"].fillna(
roof_starting_uvalue
)
roof_ending_uvalue = self.df["roof_thermal_transmittance_ending"].fillna(
roof_ending_uvalue
)
# ~~~~~~~~~~~~~~~~~~
# Floor
# ~~~~~~~~~~~~~~~~~~
self.df['estimated_perimeter_starting'] = self.df.apply(
lambda row: estimate_perimeter(row["total_floor_area_starting"], row["number_habitable_rooms"]),
axis=1
self.df["estimated_number_of_floors"] = self.df.apply(
lambda row: estimate_number_of_floors(row["property_type"]), axis=1
)
self.df['estimated_perimeter_ending'] = self.df.apply(
lambda row: estimate_perimeter(row["total_floor_area_ending"], row["number_habitable_rooms"]),
axis=1
self.df["ground_floor_area_starting"] = (
self.df["total_floor_area_starting"] / self.df["estimated_number_of_floors"]
)
self.df["ground_floor_area_ending"] = (
self.df["total_floor_area_ending"] / self.df["estimated_number_of_floors"]
)
self.df["estimated_perimeter_starting"] = self.df.apply(
lambda row: estimate_perimeter(
row["ground_floor_area_starting"],
row["number_habitable_rooms_starting"]
/ row["estimated_number_of_floors"],
),
axis=1,
)
self.df["estimated_perimeter_ending"] = self.df.apply(
lambda row: estimate_perimeter(
row["ground_floor_area_starting"],
row["number_habitable_rooms_ending"]
/ row["estimated_number_of_floors"],
),
axis=1,
)
self.df["floor_type"] = self.df["is_suspended"].replace(
{True: "suspended", False: "solid"}
)
self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"})
self.df["wall_type"] = self.df.apply(
lambda row: get_wall_type(
is_cavity_wall=row["is_cavity_wall"],
is_solid_brick=row["is_solid_brick"],
is_timber_frame=row["is_timber_frame"],
is_granite_or_whinstone=row["is_granite_or_whinstone"],
is_cob=row["is_cob"],
is_cavity_wall=row["is_cavity_wall"],
is_solid_brick=row["is_solid_brick"],
is_timber_frame=row["is_timber_frame"],
is_granite_or_whinstone=row["is_granite_or_whinstone"],
is_cob=row["is_cob"],
is_sandstone_or_limestone=row["is_sandstone_or_limestone"],
is_system_built=row["is_system_built"],
is_park_home=row["is_park_home"]
),
axis=1
is_park_home=row["is_park_home"],
),
axis=1,
)
floor_starting_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_floor_uvalue(row),
axis=1
lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1
)
floor_ending_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_floor_uvalue(row, is_end=True),
axis=1
lambda row: self._lambda_function_to_generate_floor_uvalue(
row, is_end=True
),
axis=1,
)
floor_starting_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_starting_uvalue)
floor_ending_uvalue = self.df['floor_thermal_transmittance_ending'].fillna(floor_ending_uvalue)
floor_starting_uvalue = self.df["floor_thermal_transmittance"].fillna(
floor_starting_uvalue
)
floor_ending_uvalue = self.df["floor_thermal_transmittance_ending"].fillna(
floor_ending_uvalue
)
for component in ["walls", "roof", "floor"]:
self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_starting_uvalue"))
self.df[f"{component}_thermal_transmittance_ending"] = self.df[f"{component}_thermal_transmittance_ending"].fillna(eval(f"{component}_ending_uvalue"))
self.df[f"{component}_thermal_transmittance"] = self.df[
f"{component}_thermal_transmittance"
].fillna(eval(f"{component}_starting_uvalue"))
self.df[f"{component}_thermal_transmittance_ending"] = self.df[
f"{component}_thermal_transmittance_ending"
].fillna(eval(f"{component}_ending_uvalue"))
self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description", "walls_clean_description_ending"])
self.df = self.df.drop(
columns=[
"floor_type",
"wall_type",
"walls_clean_description",
"walls_clean_description_ending",
"estimated_number_of_floors",
"ground_floor_area_starting",
"ground_floor_area_ending",
]
)
def _adjust_assumed_values_in_wall_descriptions(self):
    """
    Strip out assumed values for all wall descriptions

    Removes the literal text "(assumed)" from the starting and ending cleaned
    wall description columns of self.df and trims trailing whitespace.
    """
    for col in ("walls_clean_description", "walls_clean_description_ending"):
        # regex=False is required: read as a pattern, "(assumed)" is a capture
        # group that matches only "assumed" and would leave "()" behind. The
        # default for `regex` differs between pandas versions, so be explicit.
        without_marker = self.df[col].str.replace("(assumed)", "", regex=False)
        self.df[col] = without_marker.str.rstrip()
def _drop_inconsistent_properties(self, expanded_df: pd.DataFrame, component: str):
"""
@ -261,34 +491,57 @@ class TrainingDataset(BaseDataset):
if component == "walls":
expanded_df = expanded_df[
(expanded_df["is_cavity_wall"] == expanded_df["is_cavity_wall_ending"]) &
(expanded_df["is_solid_brick"] == expanded_df["is_solid_brick_ending"]) &
(expanded_df["is_timber_frame"] == expanded_df["is_timber_frame_ending"]) &
(expanded_df["is_granite_or_whinstone"] == expanded_df["is_granite_or_whinstone_ending"]) &
(expanded_df["is_cob"] == expanded_df["is_cob_ending"]) &
(expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"])
]
(expanded_df["is_cavity_wall"] == expanded_df["is_cavity_wall_ending"])
& (
expanded_df["is_solid_brick"]
== expanded_df["is_solid_brick_ending"]
)
& (
expanded_df["is_timber_frame"]
== expanded_df["is_timber_frame_ending"]
)
& (
expanded_df["is_granite_or_whinstone"]
== expanded_df["is_granite_or_whinstone_ending"]
)
& (expanded_df["is_cob"] == expanded_df["is_cob_ending"])
& (
expanded_df["is_sandstone_or_limestone"]
== expanded_df["is_sandstone_or_limestone_ending"]
)
]
elif component == "floor":
expanded_df = expanded_df[
(expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) &
(expanded_df["is_solid"] == expanded_df["is_solid_ending"]) &
(expanded_df["another_property_below"] == expanded_df["another_property_below_ending"]) &
(expanded_df["is_to_unheated_space"] == expanded_df["is_to_unheated_space_ending"]) &
(expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"])
]
(expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
& (expanded_df["is_solid"] == expanded_df["is_solid_ending"])
& (
expanded_df["another_property_below"]
== expanded_df["another_property_below_ending"]
)
& (
expanded_df["is_to_unheated_space"]
== expanded_df["is_to_unheated_space_ending"]
)
& (
expanded_df["is_to_external_air"]
== expanded_df["is_to_external_air_ending"]
)
]
elif component == "roof":
expanded_df = expanded_df[
(expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) &
(expanded_df["is_roof_room"] == expanded_df["is_roof_room_ending"]) &
(expanded_df["is_loft"] == expanded_df["is_loft_ending"]) &
(expanded_df["is_flat"] == expanded_df["is_flat_ending"]) &
(expanded_df["is_thatched"] == expanded_df["is_thatched_ending"]) &
(expanded_df["is_at_rafters"] == expanded_df["is_at_rafters_ending"]) &
(expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"])
]
(expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
& (expanded_df["is_roof_room"] == expanded_df["is_roof_room_ending"])
& (expanded_df["is_loft"] == expanded_df["is_loft_ending"])
& (expanded_df["is_flat"] == expanded_df["is_flat_ending"])
& (expanded_df["is_thatched"] == expanded_df["is_thatched_ending"])
& (expanded_df["is_at_rafters"] == expanded_df["is_at_rafters_ending"])
& (
expanded_df["has_dwelling_above"]
== expanded_df["has_dwelling_above_ending"]
)
]
return expanded_df
def _expand_description_to_features(self, cleaned_lookup: dict):
"""
@ -300,65 +553,111 @@ class TrainingDataset(BaseDataset):
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
# is low
# We also replace descriptions with their cleaned variants
"""
"""
cols_to_drop = {
"walls": [
# We need the cleaned descriptions for pulling out u-values
'original_description', 'thermal_transmittance_unit',
'original_description_ending',
'thermal_transmittance_unit_ending',
'is_cavity_wall_ending', 'is_filled_cavity_ending',
'is_solid_brick_ending', 'is_system_built_ending',
'is_timber_frame_ending', 'is_granite_or_whinstone_ending',
'is_as_built_ending', 'is_cob_ending', 'is_assumed_ending',
'is_sandstone_or_limestone_ending',
"original_description",
"thermal_transmittance_unit",
"original_description_ending",
"thermal_transmittance_unit_ending",
"is_cavity_wall_ending",
"is_solid_brick_ending",
"is_system_built_ending",
"is_timber_frame_ending",
"is_granite_or_whinstone_ending",
"is_as_built_ending",
"is_cob_ending",
"is_assumed_ending",
"is_sandstone_or_limestone_ending",
# Re remove the is_assumed columns
"is_assumed", "is_assumed_ending"
"is_assumed",
"is_assumed_ending",
],
"floor": [
"original_description", "clean_description", "thermal_transmittance_unit",
"no_data", "no_data_ending", "original_description_ending",
"clean_description_ending", "thermal_transmittance_unit_ending",
"is_suspended_ending", "is_solid_ending", "another_property_below_ending",
"is_to_unheated_space_ending", "is_to_external_air_ending", "is_assumed",
"is_assumed_ending"
"original_description",
"clean_description",
"thermal_transmittance_unit",
"no_data",
"no_data_ending",
"original_description_ending",
"clean_description_ending",
"thermal_transmittance_unit_ending",
"is_suspended_ending",
"is_solid_ending",
"another_property_below_ending",
"is_to_unheated_space_ending",
"is_to_external_air_ending",
"is_assumed",
"is_assumed_ending",
],
"roof": [
"original_description", "clean_description", "thermal_transmittance_unit",
"is_assumed", "is_valid", "original_description_ending", "clean_description_ending",
"thermal_transmittance_unit_ending", "is_pitched_ending", "is_roof_room_ending",
"is_loft_ending", "is_flat_ending", "is_thatched_ending", "is_at_rafters_ending",
"has_dwelling_above_ending", "is_assumed_ending", "is_valid_ending"
"original_description",
"clean_description",
"thermal_transmittance_unit",
"is_assumed",
"is_valid",
"original_description_ending",
"clean_description_ending",
"thermal_transmittance_unit_ending",
"is_pitched_ending",
"is_roof_room_ending",
"is_loft_ending",
"is_flat_ending",
"is_thatched_ending",
"has_dwelling_above_ending",
"is_assumed_ending",
"is_valid_ending",
],
"hotwater": [
"original_description", "clean_description", "assumed", "original_description_ending",
"clean_description_ending", "assumed_ending"
"original_description",
"clean_description",
"assumed",
"original_description_ending",
"clean_description_ending",
"assumed_ending",
],
"mainheat": [
"original_description", "clean_description", "original_description_ending",
"has_assumed", "original_description_ending", "clean_description_ending",
"original_description",
"clean_description",
"original_description_ending",
"has_assumed",
"original_description_ending",
"clean_description_ending",
"has_assumed_ending",
],
"mainheatcont": [
"original_description", "clean_description", "original_description_ending", "clean_description_ending"
"original_description",
"clean_description",
"original_description_ending",
"clean_description_ending",
],
"windows": [
"original_description", "clean_description", "original_description_ending", "clean_description_ending",
"original_description",
"clean_description",
"original_description_ending",
"clean_description_ending",
# We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature
"has_glazing", "glazing_coverage", "no_data", "has_glazing_ending", "glazing_coverage_ending",
"no_data_ending"
"has_glazing",
"glazing_coverage",
"no_data",
"has_glazing_ending",
"glazing_coverage_ending",
"no_data_ending",
],
"main-fuel": [
"original_description", "clean_description", "original_description_ending", "clean_description_ending"
"original_description",
"clean_description",
"original_description_ending",
"clean_description_ending",
],
}
components_to_expand = cols_to_drop.keys()
for component in components_to_expand:
# TODO: change cleaned dataframe to have underscores instead of dashes
# TODO: change cleaned dataframe to have underscores instead of dashes
if component == "main-fuel":
cleaned_key = "main-fuel"
left_on_starting = "main_fuel_starting"
@ -368,10 +667,13 @@ class TrainingDataset(BaseDataset):
cleaned_key = f"{component}-description"
left_on_starting = f"{component}_description_starting"
left_on_ending = f"{component}_description_ending"
original_cols = [f"{component}_description_starting", f"{component}_description_ending"]
original_cols = [
f"{component}_description_starting",
f"{component}_description_ending",
]
cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key])
expanded_df = self.df.merge(
cleaned_lookup_df_for_key,
how="left",
@ -382,14 +684,16 @@ class TrainingDataset(BaseDataset):
how="left",
left_on=left_on_ending,
right_on="original_description",
suffixes=("", "_ending")
suffixes=("", "_ending"),
)
# Drop inconsistent properties
# Drop properties where key material types have changed
expanded_df = self._drop_inconsistent_properties(expanded_df, component)
# Drop original cols and cols to drop
expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols)
expanded_df = expanded_df.drop(
columns=cols_to_drop[component] + original_cols
)
# Rename columns to component specific names, if they have not been dropped
expanded_df = expanded_df.rename(
@ -405,11 +709,12 @@ class TrainingDataset(BaseDataset):
}
)
self.df = expanded_df
# We don't need any lighting specific cleaning, we just drop the original description as we use
# LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING
self.df = self.df.drop(columns=["lighting_description_starting", "lighting_description_ending"])
self.df = self.df.drop(
columns=["lighting_description_starting", "lighting_description_ending"]
)
def _clean_missing_values(self, ignore_cols=None):
missings = pd.isnull(self.df).sum()
@ -420,14 +725,17 @@ class TrainingDataset(BaseDataset):
for col in missings.index:
unique_values = self.df[col].unique()
if True in unique_values or False in unique_values:
if (
(True in unique_values)
or (False in unique_values)
or (col in BOOLEAN_VARIABLES)
):
self.df[col] = self.df[col].fillna(False)
if "none" in unique_values:
self.df[col] = self.df[col].fillna("none")
else:
self.df[col] = self.df[col].fillna("Unknown")
def _null_validation(self, information: str):
print(f"Null validation after {information}")
if pd.isnull(self.df).sum().sum():
@ -437,18 +745,22 @@ class TrainingDataset(BaseDataset):
"""
Drop features that are not needed for modelling
"""
self.df = self.df.drop(columns=["lodgement_date_starting", "lodgement_date_ending"])
self.df = self.df.drop(
columns=["lodgement_date_starting", "lodgement_date_ending"]
)
def _feature_generation(self):
    """
    Generate features for modelling

    Adds two columns to self.df, "days_to_starting" and "days_to_ending", each
    obtained by passing the matching "lodgement_date_<stage>" column through
    self._calculate_days_to.
    """
    # Both stages use the same "days_to_<stage>" naming, so the starting and
    # ending features are recognised as a pair by suffix-based column handling.
    for stage in ("starting", "ending"):
        self.df[f"days_to_{stage}"] = self._calculate_days_to(
            self.df[f"lodgement_date_{stage}"]
        )
def _clean_efficiency_variables(self):
"""
There is scope to clean this by the model per corresponding description.
E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
@ -463,19 +775,17 @@ class TrainingDataset(BaseDataset):
missings = missings[missings >= 1]
if len(missings) == 0:
return
return
# Make sure they are all efficiency columns
# Make sure they are all efficiency columns
if any(~missings.index.str.contains("energy_eff")):
raise ValueError("Non efficiency columns are missing")
for m in missings.index:
self.df[m] = self.df[m].fillna("NO_RATING")
@staticmethod
def _calculate_days_to(lodgement_date):
if isinstance(lodgement_date, str):
return (
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
@ -489,7 +799,7 @@ class TrainingDataset(BaseDataset):
# if not isinstance(other, TrainingDataset):
# raise TypeError("Addition can only be performed with another instance of TrainingDataset")
# return TrainingDataset(self.datasets + other.datasets)
# def __radd__(self, other):
# """
# Required for sum() to work