added process and prune description to rdsap data generation and integrated updates to data

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-15 15:39:32 +01:00
parent e0019662c9
commit 8894ef7d6c

View file

@ -38,6 +38,126 @@ def get_cleaned():
return cleaned
def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Replace raw fabric descriptions with cleaned ones and prune inconsistent rows.

    For each of walls, floor and roof, the cleaned lookup table is merged onto the
    starting and onto the ending description. Rows whose construction indicators
    differ between the two are removed, the lookup's helper columns are dropped and
    the `<COMPONENT>_DESCRIPTION_STARTING` / `<COMPONENT>_DESCRIPTION_ENDING`
    columns are replaced by the lookup's `clean_description`.

    Rows whose description has no match in the lookup are removed as well: the
    left merge leaves missing values in the indicator columns and pandas never
    treats missing values as equal.

    :param df: DataFrame holding `WALLS_DESCRIPTION_STARTING`,
        `WALLS_DESCRIPTION_ENDING` and the equivalent `FLOOR_` / `ROOF_` columns.
    :param cleaned_lookup: Mapping with the keys `walls-description`,
        `floor-description` and `roof-description`. Each value must be accepted by
        `pd.DataFrame` and provide `original_description`, `clean_description` and
        the helper columns listed below for that component.
    :return: The pruned DataFrame with cleaned description columns.
    :raises KeyError: If an expected description or lookup column is missing.
    """
    # TODO: In a future iteration, we can test using the binary features and the insulation thickness
    #   estimates, as well as estimated U-values

    # Lookup columns that are only needed for pruning. Each is dropped twice: once
    # under its own name (starting EPC) and once with the "_ENDING" suffix.
    # `clean_description` is deliberately absent for every component (including
    # roof) because it becomes the new description column.
    lookup_helper_cols = {
        "walls": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_cavity_wall", "is_filled_cavity",
            "is_solid_brick", "is_system_built", "is_timber_frame",
            "is_granite_or_whinstone", "is_as_built", "is_cob", "is_assumed",
            "is_sandstone_or_limestone", "insulation_thickness",
            "external_insulation", "internal_insulation",
        ],
        "floor": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_assumed", "is_to_unheated_space",
            "is_to_external_air", "is_suspended", "is_solid",
            "another_property_below", "insulation_thickness", "no_data",
        ],
        "roof": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_pitched", "is_roof_room", "is_loft",
            "is_flat", "is_thatched", "is_at_rafters", "is_assumed",
            "has_dwelling_above", "is_valid", "insulation_thickness",
        ],
    }

    # Indicators that must be identical in the starting and ending EPC; a
    # difference means the construction itself changed, so the row is discarded.
    fabric_cols = {
        "walls": [
            "is_cavity_wall", "is_solid_brick", "is_timber_frame",
            "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
        ],
        "floor": [
            "is_suspended", "is_solid", "another_property_below",
            "is_to_unheated_space",
        ],
        "roof": [
            "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
            "is_at_rafters", "has_dwelling_above",
        ],
    }

    for component in ["walls", "floor", "roof"]:
        component_upper = component.upper()
        starting_col = f"{component_upper}_DESCRIPTION_STARTING"
        ending_col = f"{component_upper}_DESCRIPTION_ENDING"

        # Build the lookup frame once and reuse it for both merges.
        lookup = pd.DataFrame(cleaned_lookup[f"{component}-description"])

        df = df.merge(
            lookup,
            how="left",
            left_on=starting_col,
            right_on="original_description",
        ).merge(
            lookup,
            how="left",
            left_on=ending_col,
            right_on="original_description",
            # Lookup columns from the second merge clash with the first; only
            # the ending ones receive a suffix.
            suffixes=("", "_ENDING"),
        )

        # Keep only rows where every fabric indicator is unchanged.
        comparisons = [
            df[col] == df[f"{col}_ENDING"] for col in fabric_cols[component]
        ]
        unchanged = comparisons[0]
        for same in comparisons[1:]:
            unchanged = unchanged & same
        df = df[unchanged]

        # Drop the helper columns and the raw descriptions, then promote the
        # cleaned descriptions to the original column names.
        helper_cols = lookup_helper_cols[component]
        df = df.drop(
            columns=(
                helper_cols
                + [f"{col}_ENDING" for col in helper_cols]
                + [starting_col, ending_col]
            )
        ).rename(
            columns={
                "clean_description": starting_col,
                "clean_description_ENDING": ending_col,
            }
        )

    return df
def app():
# Get all the files in the directory
@ -61,7 +181,7 @@ def app():
# TODO [x] : Have a look at temporal features
# TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
# TODO [x]: Same as floor area for floor height
# TODO []: If fundamental building fabric changes, we should probably discard the record
# TODO [x]: If fundamental building fabric changes, we should probably discard the record
# TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
# - leave for now and check performance after temporal features
# TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
@ -188,117 +308,6 @@ def app():
# is low
# We also replace descriptions with their cleaned variants
def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Replace raw fabric descriptions with cleaned ones and prune inconsistent rows.

    For each of walls, floor and roof, the cleaned lookup table is merged onto the
    starting and onto the ending description. Rows whose construction indicators
    differ between the two are removed, the lookup's helper columns are dropped and
    the `<COMPONENT>_DESCRIPTION_STARTING` / `<COMPONENT>_DESCRIPTION_ENDING`
    columns are replaced by the lookup's `clean_description`.

    Rows whose description has no match in the lookup are removed as well: the
    left merge leaves missing values in the indicator columns and pandas never
    treats missing values as equal.

    :param df: DataFrame holding `WALLS_DESCRIPTION_STARTING`,
        `WALLS_DESCRIPTION_ENDING` and the equivalent `FLOOR_` / `ROOF_` columns.
    :param cleaned_lookup: Mapping with the keys `walls-description`,
        `floor-description` and `roof-description`. Each value must be accepted by
        `pd.DataFrame` and provide `original_description`, `clean_description` and
        the helper columns listed below for that component.
    :return: The pruned DataFrame with cleaned description columns.
    :raises KeyError: If an expected description or lookup column is missing.
    """
    # TODO: In a future iteration, we can test using the binary features and the insulation thickness
    #   estimates, as well as estimated U-values

    # Lookup columns that are only needed for pruning. Each is dropped twice: once
    # under its own name (starting EPC) and once with the "_ENDING" suffix.
    # `clean_description` is deliberately absent for every component (including
    # roof) because it becomes the new description column.
    lookup_helper_cols = {
        "walls": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_cavity_wall", "is_filled_cavity",
            "is_solid_brick", "is_system_built", "is_timber_frame",
            "is_granite_or_whinstone", "is_as_built", "is_cob", "is_assumed",
            "is_sandstone_or_limestone", "insulation_thickness",
            "external_insulation", "internal_insulation",
        ],
        "floor": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_assumed", "is_to_unheated_space",
            "is_to_external_air", "is_suspended", "is_solid",
            "another_property_below", "insulation_thickness", "no_data",
        ],
        "roof": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_pitched", "is_roof_room", "is_loft",
            "is_flat", "is_thatched", "is_at_rafters", "is_assumed",
            "has_dwelling_above", "is_valid", "insulation_thickness",
        ],
    }

    # Indicators that must be identical in the starting and ending EPC; a
    # difference means the construction itself changed, so the row is discarded.
    fabric_cols = {
        "walls": [
            "is_cavity_wall", "is_solid_brick", "is_timber_frame",
            "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
        ],
        "floor": [
            "is_suspended", "is_solid", "another_property_below",
            "is_to_unheated_space",
        ],
        "roof": [
            "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
            "is_at_rafters", "has_dwelling_above",
        ],
    }

    for component in ["walls", "floor", "roof"]:
        component_upper = component.upper()
        starting_col = f"{component_upper}_DESCRIPTION_STARTING"
        ending_col = f"{component_upper}_DESCRIPTION_ENDING"

        # Build the lookup frame once and reuse it for both merges.
        lookup = pd.DataFrame(cleaned_lookup[f"{component}-description"])

        df = df.merge(
            lookup,
            how="left",
            left_on=starting_col,
            right_on="original_description",
        ).merge(
            lookup,
            how="left",
            left_on=ending_col,
            right_on="original_description",
            # Lookup columns from the second merge clash with the first; only
            # the ending ones receive a suffix.
            suffixes=("", "_ENDING"),
        )

        # Keep only rows where every fabric indicator is unchanged.
        comparisons = [
            df[col] == df[f"{col}_ENDING"] for col in fabric_cols[component]
        ]
        unchanged = comparisons[0]
        for same in comparisons[1:]:
            unchanged = unchanged & same
        df = df[unchanged]

        # Drop the helper columns and the raw descriptions, then promote the
        # cleaned descriptions to the original column names.
        helper_cols = lookup_helper_cols[component]
        df = df.drop(
            columns=(
                helper_cols
                + [f"{col}_ENDING" for col in helper_cols]
                + [starting_col, ending_col]
            )
        ).rename(
            columns={
                "clean_description": starting_col,
                "clean_description_ENDING": ending_col,
            }
        )

    return df
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
dataset.append(data_by_urpn_df)