From f715538c53fc40b9b3936dace6d2839d83a3ca49 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 22 Feb 2024 16:11:26 +0000 Subject: [PATCH 1/5] add record mode for testing --- etl/epc/Dataset.py | 999 ++++++++++++++++++++++---------------------- etl/epc/Pipeline.py | 62 ++- 2 files changed, 558 insertions(+), 503 deletions(-) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 7f989633..5a7e3083 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -1,133 +1,19 @@ -import numpy as np import pandas as pd from typing import List from etl.epc.Record import EPCDifferenceRecord -from etl.epc.ValidationConfiguration import DatasetValidationConfiguration +from ValidationConfiguration import DatasetValidationConfiguration from etl.epc.settings import EARLIEST_EPC_DATE from recommendations.rdsap_tables import england_wales_age_band_lookup from recommendations.recommendation_utils import ( - estimate_number_of_floors, - get_wall_u_value, - get_roof_u_value, - get_floor_u_value, - estimate_perimeter, - get_wall_type, + get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter, + get_wall_type ) -# TODO: Can probably produce this in the property change app and store in S3 -BOOLEAN_VARIABLES = [ - "is_cavity_wall", - "is_filled_cavity", - "is_solid_brick", - "is_system_built", - "is_timber_frame", - "is_granite_or_whinstone", - "is_as_built", - "is_cob", - "is_sandstone_or_limestone", - "is_park_home", - "external_insulation", - "internal_insulation", - "is_park_home_ending", - "external_insulation_ending", - "internal_insulation_ending", - "is_to_unheated_space", - "is_to_external_air", - "is_suspended", - "is_solid", - "another_property_below", - "is_pitched", - "is_roof_room", - "is_loft", - "is_flat", - "is_thatched", - "is_at_rafters", - "has_dwelling_above", - "has_radiators", - "has_fan_coil_units", - "has_pipes_in_screed_above_insulation", - "has_pipes_in_insulated_timber_floor", - "has_pipes_in_concrete_slab", - "has_boiler", - "has_air_source_heat_pump", - "has_room_heaters", - "has_electric_storage_heaters", - "has_warm_air", - "has_electric_underfloor_heating", - "has_electric_ceiling_heating", - "has_community_scheme", - "has_ground_source_heat_pump", - "has_no_system_present", - "has_portable_electric_heaters", - "has_water_source_heat_pump", - "has_electric_heat_pump", - "has_micro-cogeneration", - "has_solar_assisted_heat_pump", - "has_exhaust_source_heat_pump", - "has_community_heat_pump", - "has_electric", - "has_mains_gas", - "has_wood_logs", - "has_coal", - "has_oil", - "has_wood_pellets", - "has_anthracite", - "has_dual_fuel_mineral_and_wood", - "has_smokeless_fuel", - "has_lpg", - "has_b30k", - "has_electricaire", - "has_assumed_for_most_rooms", - "has_underfloor_heating", - "has_radiators_ending", - "has_fan_coil_units_ending", - "has_pipes_in_screed_above_insulation_ending", - "has_pipes_in_insulated_timber_floor_ending", - "has_pipes_in_concrete_slab_ending", - "has_boiler_ending", - "has_air_source_heat_pump_ending", - "has_room_heaters_ending", - "has_electric_storage_heaters_ending", - "has_warm_air_ending", - "has_electric_underfloor_heating_ending", - "has_electric_ceiling_heating_ending", - "has_community_scheme_ending", - "has_ground_source_heat_pump_ending", - "has_no_system_present_ending", - "has_portable_electric_heaters_ending", - "has_water_source_heat_pump_ending", - "has_electric_heat_pump_ending", - "has_micro-cogeneration_ending", - "has_solar_assisted_heat_pump_ending", - "has_exhaust_source_heat_pump_ending", - "has_community_heat_pump_ending", - "has_electric_ending", - "has_mains_gas_ending", - "has_wood_logs_ending", - "has_coal_ending", - "has_oil_ending", - "has_wood_pellets_ending", - "has_anthracite_ending", - "has_dual_fuel_mineral_and_wood_ending", - "has_smokeless_fuel_ending", - "has_lpg_ending", - "has_b30k_ending", - "has_electricaire_ending", - "has_assumed_for_most_rooms_ending", - "has_underfloor_heating_ending", - "multiple_room_thermostats", - "multiple_room_thermostats_ending", - "is_community", - "no_individual_heating_or_community_network", - "is_community_ending", - "no_individual_heating_or_community_network_ending", -] - class BaseDataset: """ - Base class for all datasets + # Base class for all datasets """ def __init__(self) -> None: @@ -147,20 +33,18 @@ class BaseDataset: # raise ValueError(f"Pipeline type {pipeline_type} not found") # return self.pipeline_steps[pipeline_type] - - + class TrainingDataset(BaseDataset): """ A collection of EPCDifferenceRecords can be combined into a TrainingDataset. """ - def __init__( - self, datasets: List[EPCDifferenceRecord], cleaned_lookup: dict - ) -> None: + def __init__(self, datasets: List[EPCDifferenceRecord], cleaned_lookup: dict) -> None: + # self.pipeline_steps = self.pipeline_factory("training") self.datasets = datasets self.df = pd.DataFrame([dataset.difference_record for dataset in datasets]) - + self._feature_generation() self._drop_features() self._clean_efficiency_variables() @@ -175,51 +59,14 @@ class TrainingDataset(BaseDataset): self._null_validation(information="Clean Missing Values") self._remove_abnormal_change_in_floor_area() self._ensure_numeric() - self._organise_starting_ending_columns() - - def _organise_starting_ending_columns(self): - """ - Organise the starting and ending columns so that they are next to each other - """ - no_suffix_cols = [ - col - for col in self.df.columns - if "_ending" not in col and "_starting" not in col - ] - starting_cols = [col for col in self.df.columns if "_starting" in col] - ending_cols = [col for col in self.df.columns if "_ending" in col] - - common_cols = [ - col.rsplit("_", 1)[0] - for col in starting_cols - if col.replace("_starting", "_ending") in ending_cols - ] - only_ending_cols = [ - col - for col in ending_cols - if col.replace("_ending", "_starting") not in starting_cols - ] - - common_cols = [[col + "_starting", col + "_ending"] for col in common_cols] - - self.df = self.df.loc[ - :, - no_suffix_cols - + only_ending_cols - + [col for cols in common_cols for col in cols], - ] def _remove_abnormal_change_in_floor_area(self): """ Remove properties where the change in floor area is greater than 100% """ - self.df["tfa_diff_abs"] = abs( - self.df["total_floor_area_ending"] - self.df["total_floor_area_starting"] - ) - self.df["tfa_diff_prop"] = ( - self.df["tfa_diff_abs"] / self.df["total_floor_area_starting"] - ) + self.df["tfa_diff_abs"] = abs(self.df["total_floor_area_ending"] - self.df["total_floor_area_starting"]) + self.df["tfa_diff_prop"] = self.df["tfa_diff_abs"] / self.df["total_floor_area_starting"] self.df = self.df[self.df["tfa_diff_prop"] < 0.5] self.df = self.df.drop(columns=["tfa_diff_abs", "tfa_diff_prop"]) @@ -228,9 +75,7 @@ class TrainingDataset(BaseDataset): Ensure that all columns are numeric """ # TODO: move into EPCRecord record - uvalue_columns = [ - col for col in self.df.columns if "thermal_transmittance" in col - ] + uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col] for uvalue_col in uvalue_columns: self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col]) @@ -240,16 +85,12 @@ class TrainingDataset(BaseDataset): Using the apply method, use the get_roof_u_value method to generate the u-value """ - col_name = ( - "roof_insulation_thickness" - if not is_end - else "roof_insulation_thickness_ending" - ) + col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending" if row["has_dwelling_above"]: if row["roof_thermal_transmittance"] != 0: raise ValueError("Should have 0 u-value for roof") - + if row["roof_thermal_transmittance_ending"] != 0: raise ValueError("Should have 0 u-value for roof") @@ -262,24 +103,16 @@ class TrainingDataset(BaseDataset): is_flat=row["is_flat"], is_pitched=row["is_pitched"], is_at_rafters=row["is_at_rafters"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]], - ) - + age_band=england_wales_age_band_lookup[row["construction_age_band"]] + ) + @staticmethod def _lambda_function_to_generate_wall_uvalue(row, is_end=False): """ Using the apply method, use the get_wall_u_value method to generate the u-value """ - description_col_name = ( - "walls_clean_description" - if not is_end - else "walls_clean_description_ending" - ) - thermal_transistance_col_name = ( - "walls_thermal_transmittance" - if not is_end - else "walls_thermal_transmittance_ending" - ) + description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending" + thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending" if pd.isnull(row[thermal_transistance_col_name]): output = get_wall_u_value( @@ -292,18 +125,14 @@ class TrainingDataset(BaseDataset): output = row[thermal_transistance_col_name] return output - + @staticmethod def _lambda_function_to_generate_floor_uvalue(row, is_end=False): """ Using the apply method, use the get_floor_u_value method to generate the u-value """ - floor_thermal_col_name = ( - "floor_thermal_transmittance" - if not is_end - else "floor_thermal_transmittance_ending" - ) + floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending" if row["another_property_below"]: if row["floor_thermal_transmittance"] != 0: @@ -316,31 +145,20 @@ class TrainingDataset(BaseDataset): uvalue = row[floor_thermal_col_name] if pd.isnull(uvalue): - insulation_col_name = ( - "floor_insulation_thickness" - if not is_end - else "floor_insulation_thickness_ending" - ) - perimeter_col_name = ( - "estimated_perimeter_starting" - if not is_end - else "estimated_perimeter_ending" - ) - floor_area_col_name = ( - "ground_floor_area_starting" - if not is_end - else "ground_floor_area_ending" - ) + + insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending" + floor_area_col_name = "estimated_perimeter_starting" if not is_end else "estimated_perimeter_ending" + perimeter_col_name = "total_floor_area_starting" if not is_end else "total_floor_area_ending" uvalue = get_floor_u_value( - floor_type=row["floor_type"], - perimeter=row[perimeter_col_name], - area=row[floor_area_col_name], - insulation_thickness=row[insulation_col_name], - wall_type=row["wall_type"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]], - ) - + floor_type=row["floor_type"], + perimeter=row[floor_area_col_name], + area=row[perimeter_col_name], + insulation_thickness=row[insulation_col_name], + wall_type=row["wall_type"], + age_band=england_wales_age_band_lookup[row["construction_age_band"]] + ) + return uvalue def _generate_u_values_from_features(self): @@ -353,136 +171,88 @@ class TrainingDataset(BaseDataset): # ~~~~~~~~~~~~~~~~~~ walls_starting_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1 + lambda row: self._lambda_function_to_generate_wall_uvalue(row), + axis=1 ) walls_ending_uvalue = self.df.apply( lambda row: self._lambda_function_to_generate_wall_uvalue(row, is_end=True), - axis=1, + axis=1 ) - walls_starting_uvalue = self.df["walls_thermal_transmittance"].fillna( - walls_starting_uvalue - ) - walls_starting_equals_ending_flag = ( - self.df["walls_clean_description"] - == self.df["walls_clean_description_ending"] - ) - walls_ending_uvalue[walls_starting_equals_ending_flag] = walls_starting_uvalue[ - walls_starting_equals_ending_flag - ] - + walls_starting_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_starting_uvalue) + walls_starting_equals_ending_flag = self.df['walls_clean_description'] == self.df["walls_clean_description_ending"] + walls_ending_uvalue[walls_starting_equals_ending_flag] = walls_starting_uvalue[walls_starting_equals_ending_flag] + # ~~~~~~~~~~~~~~~~~~ # Roof # ~~~~~~~~~~~~~~~~~~ - + roof_starting_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1 + lambda row: self._lambda_function_to_generate_roof_uvalue(row), + axis=1 ) roof_ending_uvalue = self.df.apply( lambda row: self._lambda_function_to_generate_roof_uvalue(row, is_end=True), - axis=1, + axis=1 ) - roof_starting_uvalue = self.df["roof_thermal_transmittance"].fillna( - roof_starting_uvalue - ) - roof_ending_uvalue = self.df["roof_thermal_transmittance_ending"].fillna( - roof_ending_uvalue - ) + roof_starting_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_starting_uvalue) + roof_ending_uvalue = self.df['roof_thermal_transmittance_ending'].fillna(roof_ending_uvalue) + # ~~~~~~~~~~~~~~~~~~ # Floor # ~~~~~~~~~~~~~~~~~~ - - self.df["estimated_number_of_floors"] = self.df.apply( - lambda row: estimate_number_of_floors(row["property_type"]), axis=1 + + self.df['estimated_perimeter_starting'] = self.df.apply( + lambda row: estimate_perimeter(row["total_floor_area_starting"], row["number_habitable_rooms"]), + axis=1 ) - - self.df["ground_floor_area_starting"] = ( - self.df["total_floor_area_starting"] / self.df["estimated_number_of_floors"] - ) - self.df["ground_floor_area_ending"] = ( - self.df["total_floor_area_ending"] / self.df["estimated_number_of_floors"] - ) - - self.df["estimated_perimeter_starting"] = self.df.apply( - lambda row: estimate_perimeter( - row["ground_floor_area_starting"], - row["number_habitable_rooms_starting"] - / row["estimated_number_of_floors"], - ), - axis=1, - ) - self.df["estimated_perimeter_ending"] = self.df.apply( - lambda row: estimate_perimeter( - row["ground_floor_area_starting"], - row["number_habitable_rooms_ending"] - / row["estimated_number_of_floors"], - ), - axis=1, - ) - self.df["floor_type"] = self.df["is_suspended"].replace( - {True: "suspended", False: "solid"} + self.df['estimated_perimeter_ending'] = self.df.apply( + lambda row: estimate_perimeter(row["total_floor_area_ending"], row["number_habitable_rooms"]), + axis=1 ) + self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"}) self.df["wall_type"] = self.df.apply( lambda row: get_wall_type( - is_cavity_wall=row["is_cavity_wall"], - is_solid_brick=row["is_solid_brick"], - is_timber_frame=row["is_timber_frame"], - is_granite_or_whinstone=row["is_granite_or_whinstone"], - is_cob=row["is_cob"], + is_cavity_wall=row["is_cavity_wall"], + is_solid_brick=row["is_solid_brick"], + is_timber_frame=row["is_timber_frame"], + is_granite_or_whinstone=row["is_granite_or_whinstone"], + is_cob=row["is_cob"], is_sandstone_or_limestone=row["is_sandstone_or_limestone"], is_system_built=row["is_system_built"], - is_park_home=row["is_park_home"], - ), - axis=1, + is_park_home=row["is_park_home"] + ), + axis=1 ) - + floor_starting_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1 + lambda row: self._lambda_function_to_generate_floor_uvalue(row), + axis=1 ) floor_ending_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_floor_uvalue( - row, is_end=True - ), - axis=1, + lambda row: self._lambda_function_to_generate_floor_uvalue(row, is_end=True), + axis=1 ) - floor_starting_uvalue = self.df["floor_thermal_transmittance"].fillna( - floor_starting_uvalue - ) - floor_ending_uvalue = self.df["floor_thermal_transmittance_ending"].fillna( - floor_ending_uvalue - ) + floor_starting_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_starting_uvalue) + floor_ending_uvalue = self.df['floor_thermal_transmittance_ending'].fillna(floor_ending_uvalue) for component in ["walls", "roof", "floor"]: - self.df[f"{component}_thermal_transmittance"] = self.df[ - f"{component}_thermal_transmittance" - ].fillna(eval(f"{component}_starting_uvalue")) - self.df[f"{component}_thermal_transmittance_ending"] = self.df[ - f"{component}_thermal_transmittance_ending" - ].fillna(eval(f"{component}_ending_uvalue")) + self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_starting_uvalue")) + self.df[f"{component}_thermal_transmittance_ending"] = self.df[f"{component}_thermal_transmittance_ending"].fillna(eval(f"{component}_ending_uvalue")) - self.df = self.df.drop( - columns=[ - "floor_type", - "wall_type", - "walls_clean_description", - "walls_clean_description_ending", - "estimated_number_of_floors", - "ground_floor_area_starting", - "ground_floor_area_ending", - ] - ) + self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description", "walls_clean_description_ending"]) + def _adjust_assumed_values_in_wall_descriptions(self): """ Strip out assumed values for all wall descriptions """ for col in ["walls_clean_description", "walls_clean_description_ending"]: - self.df[col] = ( - self.df[col].str.replace("(assumed)", "", regex=False).str.rstrip() - ) + self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip() + def _drop_inconsistent_properties(self, expanded_df: pd.DataFrame, component: str): """ @@ -491,57 +261,34 @@ class TrainingDataset(BaseDataset): if component == "walls": expanded_df = expanded_df[ - (expanded_df["is_cavity_wall"] == expanded_df["is_cavity_wall_ending"]) - & ( - expanded_df["is_solid_brick"] - == expanded_df["is_solid_brick_ending"] - ) - & ( - expanded_df["is_timber_frame"] - == expanded_df["is_timber_frame_ending"] - ) - & ( - expanded_df["is_granite_or_whinstone"] - == expanded_df["is_granite_or_whinstone_ending"] - ) - & (expanded_df["is_cob"] == expanded_df["is_cob_ending"]) - & ( - expanded_df["is_sandstone_or_limestone"] - == expanded_df["is_sandstone_or_limestone_ending"] - ) - ] + (expanded_df["is_cavity_wall"] == expanded_df["is_cavity_wall_ending"]) & + (expanded_df["is_solid_brick"] == expanded_df["is_solid_brick_ending"]) & + (expanded_df["is_timber_frame"] == expanded_df["is_timber_frame_ending"]) & + (expanded_df["is_granite_or_whinstone"] == expanded_df["is_granite_or_whinstone_ending"]) & + (expanded_df["is_cob"] == expanded_df["is_cob_ending"]) & + (expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"]) + ] elif component == "floor": expanded_df = expanded_df[ - (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) - & (expanded_df["is_solid"] == expanded_df["is_solid_ending"]) - & ( - expanded_df["another_property_below"] - == expanded_df["another_property_below_ending"] - ) - & ( - expanded_df["is_to_unheated_space"] - == expanded_df["is_to_unheated_space_ending"] - ) - & ( - expanded_df["is_to_external_air"] - == expanded_df["is_to_external_air_ending"] - ) - ] + (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) & + (expanded_df["is_solid"] == expanded_df["is_solid_ending"]) & + (expanded_df["another_property_below"] == expanded_df["another_property_below_ending"]) & + (expanded_df["is_to_unheated_space"] == expanded_df["is_to_unheated_space_ending"]) & + (expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"]) + ] elif component == "roof": expanded_df = expanded_df[ - (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) - & (expanded_df["is_roof_room"] == expanded_df["is_roof_room_ending"]) - & (expanded_df["is_loft"] == expanded_df["is_loft_ending"]) - & (expanded_df["is_flat"] == expanded_df["is_flat_ending"]) - & (expanded_df["is_thatched"] == expanded_df["is_thatched_ending"]) - & (expanded_df["is_at_rafters"] == expanded_df["is_at_rafters_ending"]) - & ( - expanded_df["has_dwelling_above"] - == expanded_df["has_dwelling_above_ending"] - ) - ] - + (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) & + (expanded_df["is_roof_room"] == expanded_df["is_roof_room_ending"]) & + (expanded_df["is_loft"] == expanded_df["is_loft_ending"]) & + (expanded_df["is_flat"] == expanded_df["is_flat_ending"]) & + (expanded_df["is_thatched"] == expanded_df["is_thatched_ending"]) & + (expanded_df["is_at_rafters"] == expanded_df["is_at_rafters_ending"]) & + (expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"]) + ] + return expanded_df + def _expand_description_to_features(self, cleaned_lookup: dict): """ @@ -553,111 +300,65 @@ class TrainingDataset(BaseDataset): # remove this record, as it indicates that the quality of the EPC conducted in the first instance # is low # We also replace descriptions with their cleaned variants - """ + """ cols_to_drop = { "walls": [ # We need to cleaned descriptions for pulling out u-values - "original_description", - "thermal_transmittance_unit", - "original_description_ending", - "thermal_transmittance_unit_ending", - "is_cavity_wall_ending", - "is_solid_brick_ending", - "is_system_built_ending", - "is_timber_frame_ending", - "is_granite_or_whinstone_ending", - "is_as_built_ending", - "is_cob_ending", - "is_assumed_ending", - "is_sandstone_or_limestone_ending", + 'original_description', 'thermal_transmittance_unit', + 'original_description_ending', + 'thermal_transmittance_unit_ending', + 'is_cavity_wall_ending', 'is_filled_cavity_ending', + 'is_solid_brick_ending', 'is_system_built_ending', + 'is_timber_frame_ending', 'is_granite_or_whinstone_ending', + 'is_as_built_ending', 'is_cob_ending', 'is_assumed_ending', + 'is_sandstone_or_limestone_ending', # Re remove the is_assumed columns - "is_assumed", - "is_assumed_ending", + "is_assumed", "is_assumed_ending" ], "floor": [ - "original_description", - "clean_description", - "thermal_transmittance_unit", - "no_data", - "no_data_ending", - "original_description_ending", - "clean_description_ending", - "thermal_transmittance_unit_ending", - "is_suspended_ending", - "is_solid_ending", - "another_property_below_ending", - "is_to_unheated_space_ending", - "is_to_external_air_ending", - "is_assumed", - "is_assumed_ending", + "original_description", "clean_description", "thermal_transmittance_unit", + "no_data", "no_data_ending", "original_description_ending", + "clean_description_ending", "thermal_transmittance_unit_ending", + "is_suspended_ending", "is_solid_ending", "another_property_below_ending", + "is_to_unheated_space_ending", "is_to_external_air_ending", "is_assumed", + "is_assumed_ending" ], "roof": [ - "original_description", - "clean_description", - "thermal_transmittance_unit", - "is_assumed", - "is_valid", - "original_description_ending", - "clean_description_ending", - "thermal_transmittance_unit_ending", - "is_pitched_ending", - "is_roof_room_ending", - "is_loft_ending", - "is_flat_ending", - "is_thatched_ending", - "has_dwelling_above_ending", - "is_assumed_ending", - "is_valid_ending", + "original_description", "clean_description", "thermal_transmittance_unit", + "is_assumed", "is_valid", "original_description_ending", "clean_description_ending", + "thermal_transmittance_unit_ending", "is_pitched_ending", "is_roof_room_ending", + "is_loft_ending", "is_flat_ending", "is_thatched_ending", "is_at_rafters_ending", + "has_dwelling_above_ending", "is_assumed_ending", "is_valid_ending" ], "hotwater": [ - "original_description", - "clean_description", - "assumed", - "original_description_ending", - "clean_description_ending", - "assumed_ending", + "original_description", "clean_description", "assumed", "original_description_ending", + "clean_description_ending", "assumed_ending" ], "mainheat": [ - "original_description", - "clean_description", - "original_description_ending", - "has_assumed", - "original_description_ending", - "clean_description_ending", + "original_description", "clean_description", "original_description_ending", + "has_assumed", "original_description_ending", "clean_description_ending", "has_assumed_ending", ], "mainheatcont": [ - "original_description", - "clean_description", - "original_description_ending", - "clean_description_ending", + "original_description", "clean_description", "original_description_ending", "clean_description_ending" ], "windows": [ - "original_description", - "clean_description", - "original_description_ending", - "clean_description_ending", + "original_description", "clean_description", "original_description_ending", "clean_description_ending", # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature - "has_glazing", - "glazing_coverage", - "no_data", - "has_glazing_ending", - "glazing_coverage_ending", - "no_data_ending", + "has_glazing", "glazing_coverage", "no_data", "has_glazing_ending", "glazing_coverage_ending", + "no_data_ending" ], "main-fuel": [ - "original_description", - "clean_description", - "original_description_ending", - "clean_description_ending", + "original_description", "clean_description", "original_description_ending", "clean_description_ending" ], } components_to_expand = cols_to_drop.keys() - + for component in components_to_expand: - # TODO: change cleaned dataframe to have underscores instead of dashes + + # TODO: change cleaned dataframe to have underscores instead of dashes if component == "main-fuel": cleaned_key = "main-fuel" left_on_starting = "main_fuel_starting" @@ -667,13 +368,10 @@ class TrainingDataset(BaseDataset): cleaned_key = f"{component}-description" left_on_starting = f"{component}_description_starting" left_on_ending = f"{component}_description_ending" - original_cols = [ - f"{component}_description_starting", - f"{component}_description_ending", - ] + original_cols = [f"{component}_description_starting", f"{component}_description_ending"] cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key]) - + expanded_df = self.df.merge( cleaned_lookup_df_for_key, how="left", @@ -684,16 +382,14 @@ class TrainingDataset(BaseDataset): how="left", left_on=left_on_ending, right_on="original_description", - suffixes=("", "_ending"), + suffixes=("", "_ending") ) - # Drop properties where key material types have changed + # Drop inconsistent properties expanded_df = self._drop_inconsistent_properties(expanded_df, component) - + # Drop original cols and cols to drop - expanded_df = expanded_df.drop( - columns=cols_to_drop[component] + original_cols - ) + expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols) # Rename columns to component specific names, if they have not been dropped expanded_df = expanded_df.rename( @@ -709,12 +405,11 @@ class TrainingDataset(BaseDataset): } ) self.df = expanded_df - + # We don't need any lighting specific cleaning, we just drop the original description as we use # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING - self.df = self.df.drop( - columns=["lighting_description_starting", "lighting_description_ending"] - ) + self.df = self.df.drop(columns=["lighting_description_starting", "lighting_description_ending"]) + def _clean_missing_values(self, ignore_cols=None): missings = pd.isnull(self.df).sum() @@ -725,17 +420,14 @@ class TrainingDataset(BaseDataset): for col in missings.index: unique_values = self.df[col].unique() - if ( - (True in unique_values) - or (False in unique_values) - or (col in BOOLEAN_VARIABLES) - ): + if True in unique_values or False in unique_values: self.df[col] = self.df[col].fillna(False) if "none" in unique_values: self.df[col] = self.df[col].fillna("none") else: self.df[col] = self.df[col].fillna("Unknown") + def _null_validation(self, information: str): print(f"Null validation after {information}") if pd.isnull(self.df).sum().sum(): @@ -745,21 +437,267 @@ class TrainingDataset(BaseDataset): """ Drop features that are not needed for modelling """ - self.df = self.df.drop( - columns=["lodgement_date_starting", "lodgement_date_ending"] - ) + self.df = self.df.drop(columns=["lodgement_date_starting", "lodgement_date_ending"]) + def _feature_generation(self): """ Generate features for modelling """ - self.df["days_to_starting"] = self._calculate_days_to( - self.df["lodgement_date_starting"] + self.df["days_to_starting"] = self._calculate_days_to(self.df["lodgement_date_starting"]) + self.df["day_to_ending"] = self._calculate_days_to(self.df["lodgement_date_ending"]) + + def _clean_efficiency_variables(self): + + """ + These is scope to clean this by the model per corresponding description. + E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and + fill in the missing values with this. + When looking at this initially, there are a large volume of records with missing energy efficiency + values and therefore a simpler approach was taken just to test including these variables + :param df: + :return: + """ + + missings = pd.isnull(self.df).sum() + missings = missings[missings >= 1] + + if len(missings) == 0: + return + + # Make sure they are all efficiency columns + if any(~missings.index.str.contains("energy_eff")): + raise ValueError("Non efficiency columns are missing") + + for m in missings.index: + self.df[m] = self.df[m].fillna("NO_RATING") + + + @staticmethod + def _calculate_days_to(lodgement_date): + + if isinstance(lodgement_date, str): + return ( + pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) + ).days + + return ( + pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) + ).dt.days + + # def __add__(self, other) -> "TrainingDataset": + # if not isinstance(other, TrainingDataset): + # raise TypeError("Addition can only be performed with another instance of TrainingDataset") + # return TrainingDataset(self.datasets + other.datasets) + + # def __radd__(self, other): + # """ + # Required for sum() to work + # """ + # if isinstance(other, int): + # return self + # else: + # return self.__add__(other) + +class RecordDataset(BaseDataset): + """ + A collection of EPCRecrods can be combined into a Dataset. + """ + + def __init__(self, datasets: pd.DataFrame, cleaned_lookup: dict) -> None: + # self.pipeline_steps = self.pipeline_factory("newdata") + self.datasets = datasets + self.df = datasets + + self._clean_efficiency_variables() + self._null_validation(information="Clean Efficiency Variables") + self._expand_description_to_features(cleaned_lookup) + self._adjust_assumed_values_in_wall_descriptions() + self._generate_u_values_from_features() + # # TODO: For some of the features that we clean, we have either a true, false or possibly null value + # # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't + # # need to + self._clean_missing_values() + self._null_validation(information="Clean Missing Values") + # self._remove_abnormal_change_in_floor_area() + self._ensure_numeric() + + + def _ensure_numeric(self): + """ + Ensure that all columns are numeric + """ + # TODO: move into EPCRecord record + uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col] + for uvalue_col in uvalue_columns: + self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col]) + + + def _clean_missing_values(self, ignore_cols=None): + missings = pd.isnull(self.df).sum() + missings = missings[missings > 0] + + if ignore_cols: + missings = missings[~missings.index.isin(ignore_cols)] + + for col in missings.index: + unique_values = self.df[col].unique() + if True in unique_values or False in unique_values: + self.df[col] = self.df[col].fillna(False) + if "none" in unique_values: + self.df[col] = self.df[col].fillna("none") + else: + self.df[col] = self.df[col].fillna("Unknown") + + + @staticmethod + def _lambda_function_to_generate_roof_uvalue(row, is_end=False): + """ + Using the apply method, use the get_roof_u_value method to generate the u-value + """ + + col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending" + + if row["has_dwelling_above"]: + if row["roof_thermal_transmittance"] != 0: + raise ValueError("Should have 0 u-value for roof") + + return get_roof_u_value( + insulation_thickness=row[col_name], + has_dwelling_above=row["has_dwelling_above"], + is_loft=row["is_loft"], + is_roof_room=row["is_roof_room"], + is_thatched=row["is_thatched"], + is_flat=row["is_flat"], + is_pitched=row["is_pitched"], + is_at_rafters=row["is_at_rafters"], + age_band=england_wales_age_band_lookup[row["construction_age_band"]] + ) + + @staticmethod + def _lambda_function_to_generate_wall_uvalue(row, is_end=False): + """ + Using the apply method, use the get_wall_u_value method to generate the u-value + """ + description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending" + thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending" + + if pd.isnull(row[thermal_transistance_col_name]): + output = get_wall_u_value( + clean_description=row[description_col_name], + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + is_granite_or_whinstone=row["is_granite_or_whinstone"], + is_sandstone_or_limestone=row["is_sandstone_or_limestone"], + ) + else: + output = row[thermal_transistance_col_name] + + return output + + @staticmethod + def _lambda_function_to_generate_floor_uvalue(row, is_end=False): + """ + Using the apply method, use the get_floor_u_value method to generate the u-value + """ + + floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending" + + if row["another_property_below"]: + if row["floor_thermal_transmittance"] != 0: + raise ValueError("Should have 0 u-value for floor") + + return 0 + else: + uvalue = row[floor_thermal_col_name] + + if pd.isnull(uvalue): + + insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending" + floor_area_col_name = "estimated_perimeter" if not is_end else "estimated_perimeter_ending" + perimeter_col_name = "total_floor_area" if not is_end else "total_floor_area_ending" + + uvalue = get_floor_u_value( + floor_type=row["floor_type"], + perimeter=row[floor_area_col_name], + area=row[perimeter_col_name], + insulation_thickness=row[insulation_col_name], + wall_type=row["wall_type"], + age_band=england_wales_age_band_lookup[row["construction_age_band"]] + ) + + return uvalue + + def _generate_u_values_from_features(self): + """ + Generate u-values from the features + """ + + # ~~~~~~~~~~~~~~~~~~ + # Walls + # ~~~~~~~~~~~~~~~~~~ + + walls_uvalue = self.df.apply( + lambda row: self._lambda_function_to_generate_wall_uvalue(row), + axis=1 ) - self.df["days_to_ending"] = self._calculate_days_to( - self.df["lodgement_date_ending"] + + walls_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_uvalue) + + # ~~~~~~~~~~~~~~~~~~ + # Roof + # ~~~~~~~~~~~~~~~~~~ + + roof_uvalue = self.df.apply( + lambda row: self._lambda_function_to_generate_roof_uvalue(row), + axis=1 ) + roof_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_uvalue) + + # ~~~~~~~~~~~~~~~~~~ + # Floor + # ~~~~~~~~~~~~~~~~~~ + + self.df['estimated_perimeter'] = self.df.apply( + lambda row: estimate_perimeter(row["total_floor_area"], row["number_habitable_rooms"]), + axis=1 + ) + + self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"}) + self.df["wall_type"] = self.df.apply( + lambda row: get_wall_type( + is_cavity_wall=row["is_cavity_wall"], + is_solid_brick=row["is_solid_brick"], + is_timber_frame=row["is_timber_frame"], + is_granite_or_whinstone=row["is_granite_or_whinstone"], + is_cob=row["is_cob"], + is_sandstone_or_limestone=row["is_sandstone_or_limestone"], + is_system_built=row["is_system_built"], + is_park_home=row["is_park_home"] + ), + axis=1 + ) + + floor_uvalue = self.df.apply( + lambda row: self._lambda_function_to_generate_floor_uvalue(row), + axis=1 + ) + + floor_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_uvalue) + + for component in ["walls", "roof", "floor"]: + self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_uvalue")) + + self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description"]) + + def _adjust_assumed_values_in_wall_descriptions(self): + """ + Strip out assumed values for all wall descriptions + """ + for col in ["walls_clean_description"]: + self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip() + + def _clean_efficiency_variables(self): """ These is scope to clean this by the model per corresponding description. @@ -775,31 +713,118 @@ class TrainingDataset(BaseDataset): missings = missings[missings >= 1] if len(missings) == 0: - return + return - # Make sure they are all efficiency columns + # Make sure they are all efficiency columns if any(~missings.index.str.contains("energy_eff")): raise ValueError("Non efficiency columns are missing") for m in missings.index: - self.df[m] = self.df[m].fillna("NO_RATING") + column_index = self.df[m].isna() + self.df.loc[column_index, m] = "NO_RATING" - @staticmethod - def _calculate_days_to(lodgement_date): - if isinstance(lodgement_date, str): - return ( - pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) - ).days - return ( - pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) - ).dt.days + def _null_validation(self, information: str): + print(f"Null validation after {information}") + if pd.isnull(self.df).sum().sum(): + raise ValueError(f"Null values found in dataset, after step {information}") - # def __add__(self, other) -> "TrainingDataset": - # if not isinstance(other, TrainingDataset): - # raise TypeError("Addition can only be performed with another instance of TrainingDataset") - # return TrainingDataset(self.datasets + other.datasets) + + def _expand_description_to_features(self, cleaned_lookup: dict): + """ + This method will merge on the cleaned lookup table and ensure that the building fabric in the + starting and ending EPC is consistent, so ensure that we are performing our modelling on the cleanest + possible dataset. + # We look for key building fabric features that have changed from one EPC to the next. + # if, for example, we see that a home has gone from being a cavity wall to a solid wall, we + # remove this record, as it indicates that the quality of the EPC conducted in the first instance + # is low + # We also replace descriptions with their cleaned variants + """ + cols_to_drop = { + "walls": [ + # We need to cleaned descriptions for pulling out u-values + 'original_description', 'thermal_transmittance_unit', + # Re remove the is_assumed columns + "is_assumed" + ], + "floor": [ + "original_description", "clean_description", "thermal_transmittance_unit", + "no_data", + "is_assumed" + ], + "roof": [ + "original_description", "clean_description", "thermal_transmittance_unit", + "is_assumed", "is_valid" + ], + "hotwater": [ + "original_description", "clean_description", "assumed", + ], + "mainheat": [ + "original_description", "clean_description", + "has_assumed", + ], + "mainheatcont": [ + "original_description", "clean_description", + ], + "windows": [ + "original_description", "clean_description", + # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature + "has_glazing", "glazing_coverage", "no_data", + ], + "main-fuel": [ + "original_description", "clean_description", + ], + } + + components_to_expand = cols_to_drop.keys() + + for component in components_to_expand: + + # TODO: change cleaned dataframe to have underscores instead of dashes + if component == "main-fuel": + cleaned_key = "main-fuel" + left_on_key = "main_fuel" + original_cols = ["main_fuel"] + else: + cleaned_key = f"{component}-description" + left_on_key = f"{component}_description" + original_cols = [f"{component}_description"] + + cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key]) + + expanded_df = self.df.merge( + cleaned_lookup_df_for_key, + how="left", + left_on=left_on_key, + right_on="original_description" + ) + + # Drop original cols and cols to drop + expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols) + + # Rename columns to component specific names, if they have not been dropped + expanded_df = expanded_df.rename( + columns={ + "insulation_thickness": f"{component}_insulation_thickness", + "thermal_transmittance": f"{component}_thermal_transmittance", + "tariff_type": f"{component}_tariff_type", + "clean_description": f"{component}_clean_description", + } + ) + self.df = expanded_df + + # We don't need any lighting specific cleaning, we just drop the original description as we use + # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING + self.df = self.df.drop(columns=["lighting_description"]) + + + # def __add__(self, other) -> "NewDataset": + # if not isinstance(other, NewDataset): + # raise TypeError("Addition can only be performed with another instance of ScoringDataset") + # return NewDataset(self.datasets + other.datasets) + # def __radd__(self, other): # """ # Required for sum() to work @@ -807,30 +832,4 @@ class TrainingDataset(BaseDataset): # if isinstance(other, int): # return self # else: - # return self.__add__(other) - - -class NewDataset(BaseDataset): - """ - A collection of EPCDifferenceRecords can be combined into a ScoringDataset. - """ - - def __init__(self, datasets: List[EPCDifferenceRecord]) -> None: - # self.pipeline_steps = self.pipeline_factory("newdata") - self.datasets = datasets - - def __add__(self, other) -> "NewDataset": - if not isinstance(other, NewDataset): - raise TypeError( - "Addition can only be performed with another instance of ScoringDataset" - ) - return NewDataset(self.datasets + other.datasets) - - def __radd__(self, other): - """ - Required for sum() to work - """ - if isinstance(other, int): - return self - else: - return self.__add__(other) + # return self.__add__(other) \ No newline at end of file diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index ba228d89..f0be3c2f 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -7,7 +7,7 @@ from tqdm import tqdm from etl.epc.DataProcessor import EPCDataProcessor from etl.epc.Record import EPCRecord, EPCDifferenceRecord -from etl.epc.Dataset import TrainingDataset +from etl.epc.Dataset import TrainingDataset, RecordDataset from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3 from etl.epc.settings import ( MANDATORY_FIXED_FEATURES, @@ -24,8 +24,8 @@ from etl.epc.settings import ( # TODO: change in setting file MANDATORY_FIXED_FEATURES = [x.lower() for x in MANDATORY_FIXED_FEATURES] -# LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES] -LATEST_FIELD = [x.lower() for x in LATEST_FIELD] +LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES] +# LATEST_FIELD = [x.lower() for x in LATEST_FIELD] COMPONENT_FEATURES = [x.lower() for x in COMPONENT_FEATURES] RDSAP_RESPONSE = RDSAP_RESPONSE.lower() HEAT_DEMAND_RESPONSE = HEAT_DEMAND_RESPONSE.lower() @@ -62,6 +62,12 @@ def get_cleaned_description_mapping(): clean_lookup = get_cleaned_description_mapping() +# import pickle +# with open("./clean_lookup.pkl", "wb") as f: +# pickle.dump(clean_lookup, f) + +# clean_lookup = pickle.load(open("./clean_lookup.pkl", "rb")) + class EPCPipeline: """ @@ -117,8 +123,58 @@ class EPCPipeline: self.run_training_dataset_pipeline() elif self.run_mode == "newdata": self.run_newdata_dataset_pipeline() + elif self.run_mode == "record": + self.run_record_dataset_pipeline() else: raise ValueError("Run mode defined needs to be in 'training' or 'newdata'") + + + def run_record_dataset_pipeline(self): + """ + Running pipeline with just the EPCRecords + """ + + if self.directories is None: + raise ValueError( + "Directories not specified - Unable to run Training pipeline" + ) + + for directory in tqdm(self.directories): + + filepath = directory / self.epc_local_file + self.epc_data_processor.prepare_data(filepath=filepath) + + constituency_data = self.epc_data_processor.data + self.compiled_cleaning_averages.append( + self.epc_data_processor.cleaning_averages + ) + + # TODO: integrate with EPCRecord + record_dataset = constituency_data[['uprn'] + VARIABLE_DATA_FEATURES + MANDATORY_FIXED_FEATURES + LATEST_FIELD] + + constituency_dataset = RecordDataset(datasets=record_dataset, cleaned_lookup=clean_lookup) + + self.compiled_dataset = pd.concat( + [self.compiled_dataset, constituency_dataset.df] + ) + + save_dataframe_to_s3_parquet( + df=self.compiled_dataset, + bucket_name=self.epc_bucket_name, + file_key=self.epc_compiled_dataset_key, + ) + + save_dataframe_to_s3_parquet( + df=pd.DataFrame(self.compiled_all_equal_rows), + bucket_name=self.epc_bucket_name, + file_key=self.epc_all_equal_rows_key, + ) + + save_dataframe_to_s3_parquet( + df=pd.concat(self.compiled_cleaning_averages), + bucket_name=self.epc_bucket_name, + file_key=self.epc_cleaning_dataset_key, + ) def run_newdata_dataset_pipeline(self): """ From 955e72f0bb087ad545bbe02c02e0c2da85e3b371 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 22 Feb 2024 16:19:40 +0000 Subject: [PATCH 2/5] formatting --- etl/epc/Dataset.py | 614 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 462 insertions(+), 152 deletions(-) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 5a7e3083..5efcae23 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -1,19 +1,133 @@ +import numpy as np import pandas as pd from typing import List from etl.epc.Record import EPCDifferenceRecord -from ValidationConfiguration import DatasetValidationConfiguration +from etl.epc.ValidationConfiguration import DatasetValidationConfiguration from etl.epc.settings import EARLIEST_EPC_DATE from recommendations.rdsap_tables import england_wales_age_band_lookup from recommendations.recommendation_utils import ( - get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter, - get_wall_type + estimate_number_of_floors, + get_wall_u_value, + get_roof_u_value, + get_floor_u_value, + estimate_perimeter, + get_wall_type, ) +# TODO: Can probably produce this in the property change app and store in S3 +BOOLEAN_VARIABLES = [ + "is_cavity_wall", + "is_filled_cavity", + "is_solid_brick", + "is_system_built", + "is_timber_frame", + "is_granite_or_whinstone", + "is_as_built", + "is_cob", + "is_sandstone_or_limestone", + "is_park_home", + "external_insulation", + "internal_insulation", + "is_park_home_ending", + "external_insulation_ending", + "internal_insulation_ending", + "is_to_unheated_space", + "is_to_external_air", + "is_suspended", + "is_solid", + "another_property_below", + "is_pitched", + "is_roof_room", + "is_loft", + "is_flat", + "is_thatched", + "is_at_rafters", + "has_dwelling_above", + "has_radiators", + "has_fan_coil_units", + "has_pipes_in_screed_above_insulation", + "has_pipes_in_insulated_timber_floor", + "has_pipes_in_concrete_slab", + "has_boiler", + "has_air_source_heat_pump", + "has_room_heaters", + "has_electric_storage_heaters", + "has_warm_air", + "has_electric_underfloor_heating", + "has_electric_ceiling_heating", + "has_community_scheme", + "has_ground_source_heat_pump", + "has_no_system_present", + "has_portable_electric_heaters", + "has_water_source_heat_pump", + "has_electric_heat_pump", + "has_micro-cogeneration", + "has_solar_assisted_heat_pump", + "has_exhaust_source_heat_pump", + "has_community_heat_pump", + "has_electric", + "has_mains_gas", + "has_wood_logs", + "has_coal", + "has_oil", + "has_wood_pellets", + "has_anthracite", + "has_dual_fuel_mineral_and_wood", + "has_smokeless_fuel", + "has_lpg", + "has_b30k", + "has_electricaire", + "has_assumed_for_most_rooms", + "has_underfloor_heating", + "has_radiators_ending", + "has_fan_coil_units_ending", + "has_pipes_in_screed_above_insulation_ending", + "has_pipes_in_insulated_timber_floor_ending", + "has_pipes_in_concrete_slab_ending", + "has_boiler_ending", + "has_air_source_heat_pump_ending", + "has_room_heaters_ending", + "has_electric_storage_heaters_ending", + "has_warm_air_ending", + "has_electric_underfloor_heating_ending", + "has_electric_ceiling_heating_ending", + "has_community_scheme_ending", + "has_ground_source_heat_pump_ending", + "has_no_system_present_ending", + "has_portable_electric_heaters_ending", + "has_water_source_heat_pump_ending", + "has_electric_heat_pump_ending", + "has_micro-cogeneration_ending", + "has_solar_assisted_heat_pump_ending", + "has_exhaust_source_heat_pump_ending", + "has_community_heat_pump_ending", + "has_electric_ending", + "has_mains_gas_ending", + "has_wood_logs_ending", + "has_coal_ending", + "has_oil_ending", + "has_wood_pellets_ending", + "has_anthracite_ending", + "has_dual_fuel_mineral_and_wood_ending", + "has_smokeless_fuel_ending", + "has_lpg_ending", + "has_b30k_ending", + "has_electricaire_ending", + "has_assumed_for_most_rooms_ending", + "has_underfloor_heating_ending", + "multiple_room_thermostats", + "multiple_room_thermostats_ending", + "is_community", + "no_individual_heating_or_community_network", + "is_community_ending", + "no_individual_heating_or_community_network_ending", +] + class BaseDataset: """ - # Base class for all datasets + Base class for all datasets """ def __init__(self) -> None: @@ -33,18 +147,20 @@ class BaseDataset: # raise ValueError(f"Pipeline type {pipeline_type} not found") # return self.pipeline_steps[pipeline_type] - + + class TrainingDataset(BaseDataset): """ A collection of EPCDifferenceRecords can be combined into a TrainingDataset. """ - def __init__(self, datasets: List[EPCDifferenceRecord], cleaned_lookup: dict) -> None: - + def __init__( + self, datasets: List[EPCDifferenceRecord], cleaned_lookup: dict + ) -> None: # self.pipeline_steps = self.pipeline_factory("training") self.datasets = datasets self.df = pd.DataFrame([dataset.difference_record for dataset in datasets]) - + self._feature_generation() self._drop_features() self._clean_efficiency_variables() @@ -59,14 +175,51 @@ class TrainingDataset(BaseDataset): self._null_validation(information="Clean Missing Values") self._remove_abnormal_change_in_floor_area() self._ensure_numeric() + self._organise_starting_ending_columns() + + def _organise_starting_ending_columns(self): + """ + Organise the starting and ending columns so that they are next to each other + """ + no_suffix_cols = [ + col + for col in self.df.columns + if "_ending" not in col and "_starting" not in col + ] + starting_cols = [col for col in self.df.columns if "_starting" in col] + ending_cols = [col for col in self.df.columns if "_ending" in col] + + common_cols = [ + col.rsplit("_", 1)[0] + for col in starting_cols + if col.replace("_starting", "_ending") in ending_cols + ] + only_ending_cols = [ + col + for col in ending_cols + if col.replace("_ending", "_starting") not in starting_cols + ] + + common_cols = [[col + "_starting", col + "_ending"] for col in common_cols] + + self.df = self.df.loc[ + :, + no_suffix_cols + + only_ending_cols + + [col for cols in common_cols for col in cols], + ] def _remove_abnormal_change_in_floor_area(self): """ Remove properties where the change in floor area is greater than 100% """ - self.df["tfa_diff_abs"] = abs(self.df["total_floor_area_ending"] - self.df["total_floor_area_starting"]) - self.df["tfa_diff_prop"] = self.df["tfa_diff_abs"] / self.df["total_floor_area_starting"] + self.df["tfa_diff_abs"] = abs( + self.df["total_floor_area_ending"] - self.df["total_floor_area_starting"] + ) + self.df["tfa_diff_prop"] = ( + self.df["tfa_diff_abs"] / self.df["total_floor_area_starting"] + ) self.df = self.df[self.df["tfa_diff_prop"] < 0.5] self.df = self.df.drop(columns=["tfa_diff_abs", "tfa_diff_prop"]) @@ -75,7 +228,9 @@ class TrainingDataset(BaseDataset): Ensure that all columns are numeric """ # TODO: move into EPCRecord record - uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col] + uvalue_columns = [ + col for col in self.df.columns if "thermal_transmittance" in col + ] for uvalue_col in uvalue_columns: self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col]) @@ -85,12 +240,16 @@ class TrainingDataset(BaseDataset): Using the apply method, use the get_roof_u_value method to generate the u-value """ - col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending" + col_name = ( + "roof_insulation_thickness" + if not is_end + else "roof_insulation_thickness_ending" + ) if row["has_dwelling_above"]: if row["roof_thermal_transmittance"] != 0: raise ValueError("Should have 0 u-value for roof") - + if row["roof_thermal_transmittance_ending"] != 0: raise ValueError("Should have 0 u-value for roof") @@ -103,16 +262,24 @@ class TrainingDataset(BaseDataset): is_flat=row["is_flat"], is_pitched=row["is_pitched"], is_at_rafters=row["is_at_rafters"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]] - ) - + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + ) + @staticmethod def _lambda_function_to_generate_wall_uvalue(row, is_end=False): """ Using the apply method, use the get_wall_u_value method to generate the u-value """ - description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending" - thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending" + description_col_name = ( + "walls_clean_description" + if not is_end + else "walls_clean_description_ending" + ) + thermal_transistance_col_name = ( + "walls_thermal_transmittance" + if not is_end + else "walls_thermal_transmittance_ending" + ) if pd.isnull(row[thermal_transistance_col_name]): output = get_wall_u_value( @@ -125,14 +292,18 @@ class TrainingDataset(BaseDataset): output = row[thermal_transistance_col_name] return output - + @staticmethod def _lambda_function_to_generate_floor_uvalue(row, is_end=False): """ Using the apply method, use the get_floor_u_value method to generate the u-value """ - floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending" + floor_thermal_col_name = ( + "floor_thermal_transmittance" + if not is_end + else "floor_thermal_transmittance_ending" + ) if row["another_property_below"]: if row["floor_thermal_transmittance"] != 0: @@ -145,20 +316,31 @@ class TrainingDataset(BaseDataset): uvalue = row[floor_thermal_col_name] if pd.isnull(uvalue): - - insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending" - floor_area_col_name = "estimated_perimeter_starting" if not is_end else "estimated_perimeter_ending" - perimeter_col_name = "total_floor_area_starting" if not is_end else "total_floor_area_ending" + insulation_col_name = ( + "floor_insulation_thickness" + if not is_end + else "floor_insulation_thickness_ending" + ) + perimeter_col_name = ( + "estimated_perimeter_starting" + if not is_end + else "estimated_perimeter_ending" + ) + floor_area_col_name = ( + "ground_floor_area_starting" + if not is_end + else "ground_floor_area_ending" + ) uvalue = get_floor_u_value( - floor_type=row["floor_type"], - perimeter=row[floor_area_col_name], - area=row[perimeter_col_name], - insulation_thickness=row[insulation_col_name], - wall_type=row["wall_type"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]] - ) - + floor_type=row["floor_type"], + perimeter=row[perimeter_col_name], + area=row[floor_area_col_name], + insulation_thickness=row[insulation_col_name], + wall_type=row["wall_type"], + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + ) + return uvalue def _generate_u_values_from_features(self): @@ -171,88 +353,136 @@ class TrainingDataset(BaseDataset): # ~~~~~~~~~~~~~~~~~~ walls_starting_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_wall_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1 ) walls_ending_uvalue = self.df.apply( lambda row: self._lambda_function_to_generate_wall_uvalue(row, is_end=True), - axis=1 + axis=1, ) - walls_starting_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_starting_uvalue) - walls_starting_equals_ending_flag = self.df['walls_clean_description'] == self.df["walls_clean_description_ending"] - walls_ending_uvalue[walls_starting_equals_ending_flag] = walls_starting_uvalue[walls_starting_equals_ending_flag] - + walls_starting_uvalue = self.df["walls_thermal_transmittance"].fillna( + walls_starting_uvalue + ) + walls_starting_equals_ending_flag = ( + self.df["walls_clean_description"] + == self.df["walls_clean_description_ending"] + ) + walls_ending_uvalue[walls_starting_equals_ending_flag] = walls_starting_uvalue[ + walls_starting_equals_ending_flag + ] + # ~~~~~~~~~~~~~~~~~~ # Roof # ~~~~~~~~~~~~~~~~~~ - + roof_starting_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_roof_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1 ) roof_ending_uvalue = self.df.apply( lambda row: self._lambda_function_to_generate_roof_uvalue(row, is_end=True), - axis=1 + axis=1, ) - roof_starting_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_starting_uvalue) - roof_ending_uvalue = self.df['roof_thermal_transmittance_ending'].fillna(roof_ending_uvalue) + roof_starting_uvalue = self.df["roof_thermal_transmittance"].fillna( + roof_starting_uvalue + ) + roof_ending_uvalue = self.df["roof_thermal_transmittance_ending"].fillna( + roof_ending_uvalue + ) - # ~~~~~~~~~~~~~~~~~~ # Floor # ~~~~~~~~~~~~~~~~~~ - - self.df['estimated_perimeter_starting'] = self.df.apply( - lambda row: estimate_perimeter(row["total_floor_area_starting"], row["number_habitable_rooms"]), - axis=1 + + self.df["estimated_number_of_floors"] = self.df.apply( + lambda row: estimate_number_of_floors(row["property_type"]), axis=1 ) - self.df['estimated_perimeter_ending'] = self.df.apply( - lambda row: estimate_perimeter(row["total_floor_area_ending"], row["number_habitable_rooms"]), - axis=1 + + self.df["ground_floor_area_starting"] = ( + self.df["total_floor_area_starting"] / self.df["estimated_number_of_floors"] + ) + self.df["ground_floor_area_ending"] = ( + self.df["total_floor_area_ending"] / self.df["estimated_number_of_floors"] + ) + + self.df["estimated_perimeter_starting"] = self.df.apply( + lambda row: estimate_perimeter( + row["ground_floor_area_starting"], + row["number_habitable_rooms_starting"] + / row["estimated_number_of_floors"], + ), + axis=1, + ) + self.df["estimated_perimeter_ending"] = self.df.apply( + lambda row: estimate_perimeter( + row["ground_floor_area_starting"], + row["number_habitable_rooms_ending"] + / row["estimated_number_of_floors"], + ), + axis=1, + ) + self.df["floor_type"] = self.df["is_suspended"].replace( + {True: "suspended", False: "solid"} ) - self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"}) self.df["wall_type"] = self.df.apply( lambda row: get_wall_type( - is_cavity_wall=row["is_cavity_wall"], - is_solid_brick=row["is_solid_brick"], - is_timber_frame=row["is_timber_frame"], - is_granite_or_whinstone=row["is_granite_or_whinstone"], - is_cob=row["is_cob"], + is_cavity_wall=row["is_cavity_wall"], + is_solid_brick=row["is_solid_brick"], + is_timber_frame=row["is_timber_frame"], + is_granite_or_whinstone=row["is_granite_or_whinstone"], + is_cob=row["is_cob"], is_sandstone_or_limestone=row["is_sandstone_or_limestone"], is_system_built=row["is_system_built"], - is_park_home=row["is_park_home"] - ), - axis=1 + is_park_home=row["is_park_home"], + ), + axis=1, ) - + floor_starting_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_floor_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1 ) floor_ending_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_floor_uvalue(row, is_end=True), - axis=1 + lambda row: self._lambda_function_to_generate_floor_uvalue( + row, is_end=True + ), + axis=1, ) - floor_starting_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_starting_uvalue) - floor_ending_uvalue = self.df['floor_thermal_transmittance_ending'].fillna(floor_ending_uvalue) + floor_starting_uvalue = self.df["floor_thermal_transmittance"].fillna( + floor_starting_uvalue + ) + floor_ending_uvalue = self.df["floor_thermal_transmittance_ending"].fillna( + floor_ending_uvalue + ) for component in ["walls", "roof", "floor"]: - self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_starting_uvalue")) - self.df[f"{component}_thermal_transmittance_ending"] = self.df[f"{component}_thermal_transmittance_ending"].fillna(eval(f"{component}_ending_uvalue")) + self.df[f"{component}_thermal_transmittance"] = self.df[ + f"{component}_thermal_transmittance" + ].fillna(eval(f"{component}_starting_uvalue")) + self.df[f"{component}_thermal_transmittance_ending"] = self.df[ + f"{component}_thermal_transmittance_ending" + ].fillna(eval(f"{component}_ending_uvalue")) - self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description", "walls_clean_description_ending"]) + self.df = self.df.drop( + columns=[ + "floor_type", + "wall_type", + "walls_clean_description", + "walls_clean_description_ending", + "estimated_number_of_floors", + "ground_floor_area_starting", + "ground_floor_area_ending", + ] + ) - def _adjust_assumed_values_in_wall_descriptions(self): """ Strip out assumed values for all wall descriptions """ for col in ["walls_clean_description", "walls_clean_description_ending"]: - self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip() - + self.df[col] = ( + self.df[col].str.replace("(assumed)", "", regex=False).str.rstrip() + ) def _drop_inconsistent_properties(self, expanded_df: pd.DataFrame, component: str): """ @@ -261,34 +491,57 @@ class TrainingDataset(BaseDataset): if component == "walls": expanded_df = expanded_df[ - (expanded_df["is_cavity_wall"] == expanded_df["is_cavity_wall_ending"]) & - (expanded_df["is_solid_brick"] == expanded_df["is_solid_brick_ending"]) & - (expanded_df["is_timber_frame"] == expanded_df["is_timber_frame_ending"]) & - (expanded_df["is_granite_or_whinstone"] == expanded_df["is_granite_or_whinstone_ending"]) & - (expanded_df["is_cob"] == expanded_df["is_cob_ending"]) & - (expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"]) - ] + (expanded_df["is_cavity_wall"] == expanded_df["is_cavity_wall_ending"]) + & ( + expanded_df["is_solid_brick"] + == expanded_df["is_solid_brick_ending"] + ) + & ( + expanded_df["is_timber_frame"] + == expanded_df["is_timber_frame_ending"] + ) + & ( + expanded_df["is_granite_or_whinstone"] + == expanded_df["is_granite_or_whinstone_ending"] + ) + & (expanded_df["is_cob"] == expanded_df["is_cob_ending"]) + & ( + expanded_df["is_sandstone_or_limestone"] + == expanded_df["is_sandstone_or_limestone_ending"] + ) + ] elif component == "floor": expanded_df = expanded_df[ - (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) & - (expanded_df["is_solid"] == expanded_df["is_solid_ending"]) & - (expanded_df["another_property_below"] == expanded_df["another_property_below_ending"]) & - (expanded_df["is_to_unheated_space"] == expanded_df["is_to_unheated_space_ending"]) & - (expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"]) - ] + (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) + & (expanded_df["is_solid"] == expanded_df["is_solid_ending"]) + & ( + expanded_df["another_property_below"] + == expanded_df["another_property_below_ending"] + ) + & ( + expanded_df["is_to_unheated_space"] + == expanded_df["is_to_unheated_space_ending"] + ) + & ( + expanded_df["is_to_external_air"] + == expanded_df["is_to_external_air_ending"] + ) + ] elif component == "roof": expanded_df = expanded_df[ - (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) & - (expanded_df["is_roof_room"] == expanded_df["is_roof_room_ending"]) & - (expanded_df["is_loft"] == expanded_df["is_loft_ending"]) & - (expanded_df["is_flat"] == expanded_df["is_flat_ending"]) & - (expanded_df["is_thatched"] == expanded_df["is_thatched_ending"]) & - (expanded_df["is_at_rafters"] == expanded_df["is_at_rafters_ending"]) & - (expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"]) - ] - + (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) + & (expanded_df["is_roof_room"] == expanded_df["is_roof_room_ending"]) + & (expanded_df["is_loft"] == expanded_df["is_loft_ending"]) + & (expanded_df["is_flat"] == expanded_df["is_flat_ending"]) + & (expanded_df["is_thatched"] == expanded_df["is_thatched_ending"]) + & (expanded_df["is_at_rafters"] == expanded_df["is_at_rafters_ending"]) + & ( + expanded_df["has_dwelling_above"] + == expanded_df["has_dwelling_above_ending"] + ) + ] + return expanded_df - def _expand_description_to_features(self, cleaned_lookup: dict): """ @@ -300,65 +553,111 @@ class TrainingDataset(BaseDataset): # remove this record, as it indicates that the quality of the EPC conducted in the first instance # is low # We also replace descriptions with their cleaned variants - """ + """ cols_to_drop = { "walls": [ # We need to cleaned descriptions for pulling out u-values - 'original_description', 'thermal_transmittance_unit', - 'original_description_ending', - 'thermal_transmittance_unit_ending', - 'is_cavity_wall_ending', 'is_filled_cavity_ending', - 'is_solid_brick_ending', 'is_system_built_ending', - 'is_timber_frame_ending', 'is_granite_or_whinstone_ending', - 'is_as_built_ending', 'is_cob_ending', 'is_assumed_ending', - 'is_sandstone_or_limestone_ending', + "original_description", + "thermal_transmittance_unit", + "original_description_ending", + "thermal_transmittance_unit_ending", + "is_cavity_wall_ending", + "is_solid_brick_ending", + "is_system_built_ending", + "is_timber_frame_ending", + "is_granite_or_whinstone_ending", + "is_as_built_ending", + "is_cob_ending", + "is_assumed_ending", + "is_sandstone_or_limestone_ending", # Re remove the is_assumed columns - "is_assumed", "is_assumed_ending" + "is_assumed", + "is_assumed_ending", ], "floor": [ - "original_description", "clean_description", "thermal_transmittance_unit", - "no_data", "no_data_ending", "original_description_ending", - "clean_description_ending", "thermal_transmittance_unit_ending", - "is_suspended_ending", "is_solid_ending", "another_property_below_ending", - "is_to_unheated_space_ending", "is_to_external_air_ending", "is_assumed", - "is_assumed_ending" + "original_description", + "clean_description", + "thermal_transmittance_unit", + "no_data", + "no_data_ending", + "original_description_ending", + "clean_description_ending", + "thermal_transmittance_unit_ending", + "is_suspended_ending", + "is_solid_ending", + "another_property_below_ending", + "is_to_unheated_space_ending", + "is_to_external_air_ending", + "is_assumed", + "is_assumed_ending", ], "roof": [ - "original_description", "clean_description", "thermal_transmittance_unit", - "is_assumed", "is_valid", "original_description_ending", "clean_description_ending", - "thermal_transmittance_unit_ending", "is_pitched_ending", "is_roof_room_ending", - "is_loft_ending", "is_flat_ending", "is_thatched_ending", "is_at_rafters_ending", - "has_dwelling_above_ending", "is_assumed_ending", "is_valid_ending" + "original_description", + "clean_description", + "thermal_transmittance_unit", + "is_assumed", + "is_valid", + "original_description_ending", + "clean_description_ending", + "thermal_transmittance_unit_ending", + "is_pitched_ending", + "is_roof_room_ending", + "is_loft_ending", + "is_flat_ending", + "is_thatched_ending", + "has_dwelling_above_ending", + "is_assumed_ending", + "is_valid_ending", ], "hotwater": [ - "original_description", "clean_description", "assumed", "original_description_ending", - "clean_description_ending", "assumed_ending" + "original_description", + "clean_description", + "assumed", + "original_description_ending", + "clean_description_ending", + "assumed_ending", ], "mainheat": [ - "original_description", "clean_description", "original_description_ending", - "has_assumed", "original_description_ending", "clean_description_ending", + "original_description", + "clean_description", + "original_description_ending", + "has_assumed", + "original_description_ending", + "clean_description_ending", "has_assumed_ending", ], "mainheatcont": [ - "original_description", "clean_description", "original_description_ending", "clean_description_ending" + "original_description", + "clean_description", + "original_description_ending", + "clean_description_ending", ], "windows": [ - "original_description", "clean_description", "original_description_ending", "clean_description_ending", + "original_description", + "clean_description", + "original_description_ending", + "clean_description_ending", # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature - "has_glazing", "glazing_coverage", "no_data", "has_glazing_ending", "glazing_coverage_ending", - "no_data_ending" + "has_glazing", + "glazing_coverage", + "no_data", + "has_glazing_ending", + "glazing_coverage_ending", + "no_data_ending", ], "main-fuel": [ - "original_description", "clean_description", "original_description_ending", "clean_description_ending" + "original_description", + "clean_description", + "original_description_ending", + "clean_description_ending", ], } components_to_expand = cols_to_drop.keys() - + for component in components_to_expand: - - # TODO: change cleaned dataframe to have underscores instead of dashes + # TODO: change cleaned dataframe to have underscores instead of dashes if component == "main-fuel": cleaned_key = "main-fuel" left_on_starting = "main_fuel_starting" @@ -368,10 +667,13 @@ class TrainingDataset(BaseDataset): cleaned_key = f"{component}-description" left_on_starting = f"{component}_description_starting" left_on_ending = f"{component}_description_ending" - original_cols = [f"{component}_description_starting", f"{component}_description_ending"] + original_cols = [ + f"{component}_description_starting", + f"{component}_description_ending", + ] cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key]) - + expanded_df = self.df.merge( cleaned_lookup_df_for_key, how="left", @@ -382,14 +684,16 @@ class TrainingDataset(BaseDataset): how="left", left_on=left_on_ending, right_on="original_description", - suffixes=("", "_ending") + suffixes=("", "_ending"), ) - # Drop inconsistent properties + # Drop properties where key material types have changed expanded_df = self._drop_inconsistent_properties(expanded_df, component) - + # Drop original cols and cols to drop - expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols) + expanded_df = expanded_df.drop( + columns=cols_to_drop[component] + original_cols + ) # Rename columns to component specific names, if they have not been dropped expanded_df = expanded_df.rename( @@ -405,11 +709,12 @@ class TrainingDataset(BaseDataset): } ) self.df = expanded_df - + # We don't need any lighting specific cleaning, we just drop the original description as we use # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING - self.df = self.df.drop(columns=["lighting_description_starting", "lighting_description_ending"]) - + self.df = self.df.drop( + columns=["lighting_description_starting", "lighting_description_ending"] + ) def _clean_missing_values(self, ignore_cols=None): missings = pd.isnull(self.df).sum() @@ -420,14 +725,17 @@ class TrainingDataset(BaseDataset): for col in missings.index: unique_values = self.df[col].unique() - if True in unique_values or False in unique_values: + if ( + (True in unique_values) + or (False in unique_values) + or (col in BOOLEAN_VARIABLES) + ): self.df[col] = self.df[col].fillna(False) if "none" in unique_values: self.df[col] = self.df[col].fillna("none") else: self.df[col] = self.df[col].fillna("Unknown") - def _null_validation(self, information: str): print(f"Null validation after {information}") if pd.isnull(self.df).sum().sum(): @@ -437,18 +745,22 @@ class TrainingDataset(BaseDataset): """ Drop features that are not needed for modelling """ - self.df = self.df.drop(columns=["lodgement_date_starting", "lodgement_date_ending"]) - + self.df = self.df.drop( + columns=["lodgement_date_starting", "lodgement_date_ending"] + ) def _feature_generation(self): """ Generate features for modelling """ - self.df["days_to_starting"] = self._calculate_days_to(self.df["lodgement_date_starting"]) - self.df["day_to_ending"] = self._calculate_days_to(self.df["lodgement_date_ending"]) + self.df["days_to_starting"] = self._calculate_days_to( + self.df["lodgement_date_starting"] + ) + self.df["days_to_ending"] = self._calculate_days_to( + self.df["lodgement_date_ending"] + ) def _clean_efficiency_variables(self): - """ These is scope to clean this by the model per corresponding description. E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and @@ -463,19 +775,17 @@ class TrainingDataset(BaseDataset): missings = missings[missings >= 1] if len(missings) == 0: - return + return - # Make sure they are all efficiency columns + # Make sure they are all efficiency columns if any(~missings.index.str.contains("energy_eff")): raise ValueError("Non efficiency columns are missing") for m in missings.index: self.df[m] = self.df[m].fillna("NO_RATING") - @staticmethod def _calculate_days_to(lodgement_date): - if isinstance(lodgement_date, str): return ( pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) @@ -489,7 +799,7 @@ class TrainingDataset(BaseDataset): # if not isinstance(other, TrainingDataset): # raise TypeError("Addition can only be performed with another instance of TrainingDataset") # return TrainingDataset(self.datasets + other.datasets) - + # def __radd__(self, other): # """ # Required for sum() to work From ed407bc98b453bedf41a152b567c7e619da96750 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 22 Feb 2024 20:22:11 +0000 Subject: [PATCH 3/5] fix weird cases for now --- etl/epc/Dataset.py | 215 ++++++++++++++++++++------------- etl/epc/Pipeline.py | 19 ++- etl/epc/property_change_app.py | 3 +- 3 files changed, 145 insertions(+), 92 deletions(-) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 5efcae23..3228668e 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -809,6 +809,7 @@ class TrainingDataset(BaseDataset): # else: # return self.__add__(other) + class RecordDataset(BaseDataset): """ A collection of EPCRecrods can be combined into a Dataset. @@ -824,25 +825,25 @@ class RecordDataset(BaseDataset): self._expand_description_to_features(cleaned_lookup) self._adjust_assumed_values_in_wall_descriptions() self._generate_u_values_from_features() - # # TODO: For some of the features that we clean, we have either a true, false or possibly null value - # # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't - # # need to + # # # TODO: For some of the features that we clean, we have either a true, false or possibly null value + # # # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't + # # # need to self._clean_missing_values() self._null_validation(information="Clean Missing Values") - # self._remove_abnormal_change_in_floor_area() + # # self._remove_abnormal_change_in_floor_area() self._ensure_numeric() - def _ensure_numeric(self): """ Ensure that all columns are numeric """ # TODO: move into EPCRecord record - uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col] + uvalue_columns = [ + col for col in self.df.columns if "thermal_transmittance" in col + ] for uvalue_col in uvalue_columns: self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col]) - def _clean_missing_values(self, ignore_cols=None): missings = pd.isnull(self.df).sum() missings = missings[missings > 0] @@ -859,17 +860,22 @@ class RecordDataset(BaseDataset): else: self.df[col] = self.df[col].fillna("Unknown") - @staticmethod def _lambda_function_to_generate_roof_uvalue(row, is_end=False): """ Using the apply method, use the get_roof_u_value method to generate the u-value """ - col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending" + col_name = ( + "roof_insulation_thickness" + if not is_end + else "roof_insulation_thickness_ending" + ) if row["has_dwelling_above"]: - if row["roof_thermal_transmittance"] != 0: + if (row["roof_thermal_transmittance"] != 0) & ( + not pd.isnull(row["roof_thermal_transmittance"]) + ): raise ValueError("Should have 0 u-value for roof") return get_roof_u_value( @@ -881,16 +887,24 @@ class RecordDataset(BaseDataset): is_flat=row["is_flat"], is_pitched=row["is_pitched"], is_at_rafters=row["is_at_rafters"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]] - ) - + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + ) + @staticmethod def _lambda_function_to_generate_wall_uvalue(row, is_end=False): """ Using the apply method, use the get_wall_u_value method to generate the u-value """ - description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending" - thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending" + description_col_name = ( + "walls_clean_description" + if not is_end + else "walls_clean_description_ending" + ) + thermal_transistance_col_name = ( + "walls_thermal_transmittance" + if not is_end + else "walls_thermal_transmittance_ending" + ) if pd.isnull(row[thermal_transistance_col_name]): output = get_wall_u_value( @@ -903,17 +917,23 @@ class RecordDataset(BaseDataset): output = row[thermal_transistance_col_name] return output - + @staticmethod def _lambda_function_to_generate_floor_uvalue(row, is_end=False): """ Using the apply method, use the get_floor_u_value method to generate the u-value """ - floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending" + floor_thermal_col_name = ( + "floor_thermal_transmittance" + if not is_end + else "floor_thermal_transmittance_ending" + ) if row["another_property_below"]: - if row["floor_thermal_transmittance"] != 0: + if (row["floor_thermal_transmittance"] != 0) & ( + not pd.isnull(row["floor_thermal_transmittance"]) + ): raise ValueError("Should have 0 u-value for floor") return 0 @@ -922,19 +942,27 @@ class RecordDataset(BaseDataset): if pd.isnull(uvalue): - insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending" - floor_area_col_name = "estimated_perimeter" if not is_end else "estimated_perimeter_ending" - perimeter_col_name = "total_floor_area" if not is_end else "total_floor_area_ending" + insulation_col_name = ( + "floor_insulation_thickness" + if not is_end + else "floor_insulation_thickness_ending" + ) + floor_area_col_name = ( + "estimated_perimeter" if not is_end else "estimated_perimeter_ending" + ) + perimeter_col_name = ( + "total_floor_area" if not is_end else "total_floor_area_ending" + ) uvalue = get_floor_u_value( - floor_type=row["floor_type"], - perimeter=row[floor_area_col_name], - area=row[perimeter_col_name], - insulation_thickness=row[insulation_col_name], - wall_type=row["wall_type"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]] - ) - + floor_type=row["floor_type"], + perimeter=row[floor_area_col_name], + area=row[perimeter_col_name], + insulation_thickness=row[insulation_col_name], + wall_type=row["wall_type"], + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + ) + return uvalue def _generate_u_values_from_features(self): @@ -947,58 +975,63 @@ class RecordDataset(BaseDataset): # ~~~~~~~~~~~~~~~~~~ walls_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_wall_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1 ) - walls_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_uvalue) - + walls_uvalue = self.df["walls_thermal_transmittance"].fillna(walls_uvalue) + # ~~~~~~~~~~~~~~~~~~ # Roof # ~~~~~~~~~~~~~~~~~~ - + roof_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_roof_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1 ) - roof_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_uvalue) + roof_uvalue = self.df["roof_thermal_transmittance"].fillna(roof_uvalue) # ~~~~~~~~~~~~~~~~~~ # Floor # ~~~~~~~~~~~~~~~~~~ - - self.df['estimated_perimeter'] = self.df.apply( - lambda row: estimate_perimeter(row["total_floor_area"], row["number_habitable_rooms"]), - axis=1 + + self.df["estimated_perimeter"] = self.df.apply( + lambda row: estimate_perimeter( + row["total_floor_area"], row["number_habitable_rooms"] + ), + axis=1, ) - self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"}) + self.df["floor_type"] = self.df["is_suspended"].replace( + {True: "suspended", False: "solid"} + ) self.df["wall_type"] = self.df.apply( lambda row: get_wall_type( - is_cavity_wall=row["is_cavity_wall"], - is_solid_brick=row["is_solid_brick"], - is_timber_frame=row["is_timber_frame"], - is_granite_or_whinstone=row["is_granite_or_whinstone"], - is_cob=row["is_cob"], + is_cavity_wall=row["is_cavity_wall"], + is_solid_brick=row["is_solid_brick"], + is_timber_frame=row["is_timber_frame"], + is_granite_or_whinstone=row["is_granite_or_whinstone"], + is_cob=row["is_cob"], is_sandstone_or_limestone=row["is_sandstone_or_limestone"], is_system_built=row["is_system_built"], - is_park_home=row["is_park_home"] - ), - axis=1 - ) - - floor_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_floor_uvalue(row), - axis=1 + is_park_home=row["is_park_home"], + ), + axis=1, ) - floor_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_uvalue) + floor_uvalue = self.df.apply( + lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1 + ) + + floor_uvalue = self.df["floor_thermal_transmittance"].fillna(floor_uvalue) for component in ["walls", "roof", "floor"]: - self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_uvalue")) + self.df[f"{component}_thermal_transmittance"] = self.df[ + f"{component}_thermal_transmittance" + ].fillna(eval(f"{component}_uvalue")) - self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description"]) + self.df = self.df.drop( + columns=["floor_type", "wall_type", "walls_clean_description"] + ) def _adjust_assumed_values_in_wall_descriptions(self): """ @@ -1007,7 +1040,6 @@ class RecordDataset(BaseDataset): for col in ["walls_clean_description"]: self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip() - def _clean_efficiency_variables(self): """ These is scope to clean this by the model per corresponding description. @@ -1023,7 +1055,7 @@ class RecordDataset(BaseDataset): missings = missings[missings >= 1] if len(missings) == 0: - return + return # Make sure they are all efficiency columns if any(~missings.index.str.contains("energy_eff")): @@ -1033,13 +1065,11 @@ class RecordDataset(BaseDataset): column_index = self.df[m].isna() self.df.loc[column_index, m] = "NO_RATING" - def _null_validation(self, information: str): print(f"Null validation after {information}") if pd.isnull(self.df).sum().sum(): raise ValueError(f"Null values found in dataset, after step {information}") - def _expand_description_to_features(self, cleaned_lookup: dict): """ This method will merge on the cleaned lookup table and ensure that the building fabric in the @@ -1050,49 +1080,63 @@ class RecordDataset(BaseDataset): # remove this record, as it indicates that the quality of the EPC conducted in the first instance # is low # We also replace descriptions with their cleaned variants - """ + """ cols_to_drop = { "walls": [ # We need to cleaned descriptions for pulling out u-values - 'original_description', 'thermal_transmittance_unit', + "original_description", + "thermal_transmittance_unit", # Re remove the is_assumed columns - "is_assumed" + "is_assumed", ], "floor": [ - "original_description", "clean_description", "thermal_transmittance_unit", - "no_data", - "is_assumed" + "original_description", + "clean_description", + "thermal_transmittance_unit", + "no_data", + "is_assumed", ], "roof": [ - "original_description", "clean_description", "thermal_transmittance_unit", - "is_assumed", "is_valid" + "original_description", + "clean_description", + "thermal_transmittance_unit", + "is_assumed", + "is_valid", ], "hotwater": [ - "original_description", "clean_description", "assumed", + "original_description", + "clean_description", + "assumed", ], "mainheat": [ - "original_description", "clean_description", + "original_description", + "clean_description", "has_assumed", ], "mainheatcont": [ - "original_description", "clean_description", + "original_description", + "clean_description", ], "windows": [ - "original_description", "clean_description", + "original_description", + "clean_description", # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature - "has_glazing", "glazing_coverage", "no_data", + "has_glazing", + "glazing_coverage", + "no_data", ], "main-fuel": [ - "original_description", "clean_description", + "original_description", + "clean_description", ], } components_to_expand = cols_to_drop.keys() - + for component in components_to_expand: - - # TODO: change cleaned dataframe to have underscores instead of dashes + + # TODO: change cleaned dataframe to have underscores instead of dashes if component == "main-fuel": cleaned_key = "main-fuel" left_on_key = "main_fuel" @@ -1108,11 +1152,13 @@ class RecordDataset(BaseDataset): cleaned_lookup_df_for_key, how="left", left_on=left_on_key, - right_on="original_description" + right_on="original_description", ) # Drop original cols and cols to drop - expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols) + expanded_df = expanded_df.drop( + columns=cols_to_drop[component] + original_cols + ) # Rename columns to component specific names, if they have not been dropped expanded_df = expanded_df.rename( @@ -1124,17 +1170,16 @@ class RecordDataset(BaseDataset): } ) self.df = expanded_df - + # We don't need any lighting specific cleaning, we just drop the original description as we use # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING self.df = self.df.drop(columns=["lighting_description"]) - # def __add__(self, other) -> "NewDataset": # if not isinstance(other, NewDataset): # raise TypeError("Addition can only be performed with another instance of ScoringDataset") # return NewDataset(self.datasets + other.datasets) - + # def __radd__(self, other): # """ # Required for sum() to work @@ -1142,4 +1187,4 @@ class RecordDataset(BaseDataset): # if isinstance(other, int): # return self # else: - # return self.__add__(other) \ No newline at end of file + # return self.__add__(other) diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index f0be3c2f..f8be16b4 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -87,9 +87,9 @@ class EPCPipeline: run_mode="training", epc_local_file="certificates.csv", epc_bucket_name="retrofit-data-dev", - epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet", - epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet", - epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet", + epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_record.parquet", + epc_all_equal_rows_key="sap_change_model/all_equal_rows_record.parquet", + epc_compiled_dataset_key="sap_change_model/dataset_record.parquet", ): """ :param directories: List of directories to process @@ -127,7 +127,6 @@ class EPCPipeline: self.run_record_dataset_pipeline() else: raise ValueError("Run mode defined needs to be in 'training' or 'newdata'") - def run_record_dataset_pipeline(self): """ @@ -150,9 +149,17 @@ class EPCPipeline: ) # TODO: integrate with EPCRecord - record_dataset = constituency_data[['uprn'] + VARIABLE_DATA_FEATURES + MANDATORY_FIXED_FEATURES + LATEST_FIELD] + record_dataset = constituency_data[ + ["uprn"] + + [RDSAP_RESPONSE] + + VARIABLE_DATA_FEATURES + + MANDATORY_FIXED_FEATURES + + LATEST_FIELD + ].rename(columns={RDSAP_RESPONSE: "sap"}) - constituency_dataset = RecordDataset(datasets=record_dataset, cleaned_lookup=clean_lookup) + constituency_dataset = RecordDataset( + datasets=record_dataset, cleaned_lookup=clean_lookup + ) self.compiled_dataset = pd.concat( [self.compiled_dataset, constituency_dataset.df] diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index c8923d6d..8c97bff4 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -12,10 +12,11 @@ def main(): """ directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] - # directories = directories[0:3] + # directories = directories[202:203] epc_pipeline = EPCPipeline( directories=directories, + run_mode="record", epc_data_processor=EPCDataProcessor(run_mode="training"), ) From 1ba73c8115b8ee7024f1f648d42be93090272060 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 15 May 2024 09:00:16 +0000 Subject: [PATCH 4/5] ignore env --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 63884ad7..4a204ac3 100644 --- a/.gitignore +++ b/.gitignore @@ -268,4 +268,6 @@ adhoc adhoc/* etl-router-venv/ -refactor_datasets/ \ No newline at end of file +refactor_datasets/ +etl-router-*/ +.vscode/ \ No newline at end of file From d8f418e55e812b31d94aeab0f707a48123da474c Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 15 May 2024 09:01:32 +0000 Subject: [PATCH 5/5] ignore env --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 63884ad7..4a204ac3 100644 --- a/.gitignore +++ b/.gitignore @@ -268,4 +268,6 @@ adhoc adhoc/* etl-router-venv/ -refactor_datasets/ \ No newline at end of file +refactor_datasets/ +etl-router-*/ +.vscode/ \ No newline at end of file