From 8894ef7d6c55e9deefafc339f5e6a6ac0f06c7ae Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Sep 2023 15:39:32 +0100 Subject: [PATCH] added process and prune description to rdsap data generation and integrated updates to data --- .../generate_rdsap_change.py | 233 +++++++++--------- 1 file changed, 121 insertions(+), 112 deletions(-) diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index b3961ce1..42c2f878 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -38,6 +38,126 @@ def get_cleaned(): return cleaned +def process_and_prune_desriptions(df, cleaned_lookup): + """ + This method will merge on the cleaned lookup table and ensure that the building fabric in the + starting and ending EPC is consistent, so ensure that we are performing our modelling on the cleanest + possible dataset. + :param df: + :param cleaned_lookup: + :return: + """ + + # TODO: In a future iteration, we can test using the binary features and the insulation thickness + # estimates, we well as estimated U-values + + cols_to_drop = { + "walls": [ + 'original_description', 'thermal_transmittance', + 'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity', + 'is_solid_brick', 'is_system_built', 'is_timber_frame', + 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed', + 'is_sandstone_or_limestone', 'insulation_thickness', + 'external_insulation', 'internal_insulation', + 'original_description_ENDING', + 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', + 'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING', + 'is_solid_brick_ENDING', 'is_system_built_ENDING', + 'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING', + 'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING', + 'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING', + 'external_insulation_ENDING', 'internal_insulation_ENDING', + ], + "floor": [ + 'original_description', 'thermal_transmittance', + 'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space', + 'is_to_external_air', 'is_suspended', 'is_solid', + 'another_property_below', 'insulation_thickness', 'no_data', + 'original_description_ENDING', + 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', + 'is_assumed_ENDING', 'is_to_unheated_space_ENDING', + 'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING', + 'another_property_below_ENDING', 'insulation_thickness_ENDING', + 'no_data_ENDING', + ], + "roof": [ + 'original_description', 'clean_description', 'thermal_transmittance', + 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft', + 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', + 'has_dwelling_above', 'is_valid', 'insulation_thickness', + 'original_description_ENDING', 'clean_description_ENDING', + 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', + 'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING', + 'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING', + 'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING', + 'insulation_thickness_ENDING', + ] + + } + + for component in ["walls", "floor", "roof"]: + component_upper = component.upper() + + df = df.merge( + pd.DataFrame(cleaned_lookup[f"{component}-description"]), + how="left", + left_on=f"{component_upper}_DESCRIPTION_STARTING", + right_on="original_description", + ).merge( + pd.DataFrame(cleaned_lookup[f"{component}-description"]), + how="left", + left_on=f"{component_upper}_DESCRIPTION_ENDING", + right_on="original_description", + suffixes=("", "_ENDING") + ) + + if component == "walls": + # We make sure the wall construction hasn't changed + df = df[ + (df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) & + (df["is_solid_brick"] == df["is_solid_brick_ENDING"]) & + (df["is_timber_frame"] == df["is_timber_frame_ENDING"]) & + (df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) & + (df["is_cob"] == df["is_cob_ENDING"]) & + (df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"]) + ] + elif component == "floor": + df = df[ + (df["is_suspended"] == df["is_suspended_ENDING"]) & + (df["is_solid"] == df["is_solid_ENDING"]) & + (df["another_property_below"] == df["another_property_below_ENDING"]) & + (df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"]) + ] + else: + df = df[ + (df["is_pitched"] == df["is_pitched_ENDING"]) & + (df["is_roof_room"] == df["is_roof_room_ENDING"]) & + (df["is_loft"] == df["is_loft_ENDING"]) & + (df["is_flat"] == df["is_flat_ENDING"]) & + (df["is_thatched"] == df["is_thatched_ENDING"]) & + (df["is_at_rafters"] == df["is_at_rafters_ENDING"]) & + (df["has_dwelling_above"] == df["has_dwelling_above_ENDING"]) + ] + + # Drop the binary indicators and replace the original description with the cleaned version + + # Drop original cols + original_cols = [ + f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING" + ] + + df = df.drop( + columns=cols_to_drop[component] + original_cols + ).rename( + columns={ + "clean_description": f"{component_upper}_DESCRIPTION_STARTING", + "clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING", + } + ) + + return df + + def app(): # Get all the files in the directory @@ -61,7 +181,7 @@ def app(): # TODO [x] : Have a look at temporal features # TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value. # TODO [x]: Same as floor area for floor height - # TODO []: If fundamental building fabric changes, we should proabably discard the record + # TODO [x]: If fundamental building fabric changes, we should proabably discard the record # TODO [x]: Should we prune records that have an exceptionally large amount of time between them? # - leave for now and check performance after temporal features # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections? @@ -188,117 +308,6 @@ def app(): # is low # We also replace descriptions with their cleaned variants - def process_and_prune_desriptions(df, cleaned_lookup): - - # TODO: In a future iteration, we can test using the binary features and the insulation thickness - # estimates, we well as estimated U-values - - cols_to_drop = { - "walls": [ - 'original_description', 'thermal_transmittance', - 'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity', - 'is_solid_brick', 'is_system_built', 'is_timber_frame', - 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed', - 'is_sandstone_or_limestone', 'insulation_thickness', - 'external_insulation', 'internal_insulation', - 'original_description_ENDING', - 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', - 'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING', - 'is_solid_brick_ENDING', 'is_system_built_ENDING', - 'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING', - 'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING', - 'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING', - 'external_insulation_ENDING', 'internal_insulation_ENDING', - ], - "floor": [ - 'original_description', 'thermal_transmittance', - 'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space', - 'is_to_external_air', 'is_suspended', 'is_solid', - 'another_property_below', 'insulation_thickness', 'no_data', - 'original_description_ENDING', - 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', - 'is_assumed_ENDING', 'is_to_unheated_space_ENDING', - 'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING', - 'another_property_below_ENDING', 'insulation_thickness_ENDING', - 'no_data_ENDING', - ], - "roof": [ - 'original_description', 'clean_description', 'thermal_transmittance', - 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft', - 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', - 'has_dwelling_above', 'is_valid', 'insulation_thickness', - 'original_description_ENDING', 'clean_description_ENDING', - 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', - 'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING', - 'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING', - 'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING', - 'insulation_thickness_ENDING', - ] - - } - - for component in ["walls", "floor", "roof"]: - component_upper = component.upper() - - df = df.merge( - pd.DataFrame(cleaned_lookup[f"{component}-description"]), - how="left", - left_on=f"{component_upper}_DESCRIPTION_STARTING", - right_on="original_description", - ).merge( - pd.DataFrame(cleaned_lookup[f"{component}-description"]), - how="left", - left_on=f"{component_upper}_DESCRIPTION_ENDING", - right_on="original_description", - suffixes=("", "_ENDING") - ) - - if component == "walls": - # We make sure the wall construction hasn't changed - df = df[ - (df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) & - (df["is_solid_brick"] == df["is_solid_brick_ENDING"]) & - (df["is_timber_frame"] == df["is_timber_frame_ENDING"]) & - (df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) & - (df["is_cob"] == df["is_cob_ENDING"]) & - (df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"]) - ] - elif component == "floor": - df = df[ - (df["is_suspended"] == df["is_suspended_ENDING"]) & - (df["is_solid"] == df["is_solid_ENDING"]) & - (df["another_property_below"] == df["another_property_below_ENDING"]) & - (df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"]) - ] - else: - df = df[ - (df["is_pitched"] == df["is_pitched_ENDING"]) & - (df["is_roof_room"] == df["is_roof_room_ENDING"]) & - (df["is_loft"] == df["is_loft_ENDING"]) & - (df["is_flat"] == df["is_flat_ENDING"]) & - (df["is_thatched"] == df["is_thatched_ENDING"]) & - (df["is_at_rafters"] == df["is_at_rafters_ENDING"]) & - (df["has_dwelling_above"] == df["has_dwelling_above_ENDING"]) - ] - - # Drop the binary indicators and replace the original description with the cleaned version - - # Drop original cols - original_cols = [ - f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING" - ] - - df = df.drop( - columns=cols_to_drop[component] + original_cols - ).rename( - columns={ - "clean_description": f"{component_upper}_DESCRIPTION_STARTING", - "clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING", - } - ) - - return df - data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup) dataset.append(data_by_urpn_df)