mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added process and prune description to rdsap data generation and integrated updates to data
This commit is contained in:
parent
e0019662c9
commit
8894ef7d6c
1 changed files with 121 additions and 112 deletions
|
|
@ -38,6 +38,126 @@ def get_cleaned():
|
|||
return cleaned
|
||||
|
||||
|
||||
# Columns contributed by each component's cleaned lookup table that are removed
# again once the consistency check has run. The same names suffixed with
# "_ENDING" (produced by the second merge) are removed as well.
# NOTE(review): unlike walls and floor, the roof list contains
# "clean_description", so for roof the cleaned description is dropped too and
# no ROOF_DESCRIPTION_* column survives. Confirm this is intentional.
_LOOKUP_COLS_TO_DROP = {
    "walls": [
        "original_description", "thermal_transmittance",
        "thermal_transmittance_unit", "is_cavity_wall", "is_filled_cavity",
        "is_solid_brick", "is_system_built", "is_timber_frame",
        "is_granite_or_whinstone", "is_as_built", "is_cob", "is_assumed",
        "is_sandstone_or_limestone", "insulation_thickness",
        "external_insulation", "internal_insulation",
    ],
    "floor": [
        "original_description", "thermal_transmittance",
        "thermal_transmittance_unit", "is_assumed", "is_to_unheated_space",
        "is_to_external_air", "is_suspended", "is_solid",
        "another_property_below", "insulation_thickness", "no_data",
    ],
    "roof": [
        "original_description", "clean_description", "thermal_transmittance",
        "thermal_transmittance_unit", "is_pitched", "is_roof_room", "is_loft",
        "is_flat", "is_thatched", "is_at_rafters", "is_assumed",
        "has_dwelling_above", "is_valid", "insulation_thickness",
    ],
}

# Indicator columns that must be equal between the starting and ending EPC for
# a record to be kept (i.e. the fundamental construction has not changed).
_FABRIC_CONSISTENCY_COLS = {
    "walls": [
        "is_cavity_wall", "is_solid_brick", "is_timber_frame",
        "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
    ],
    "floor": [
        "is_suspended", "is_solid", "another_property_below",
        "is_to_unheated_space",
    ],
    "roof": [
        "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
        "is_at_rafters", "has_dwelling_above",
    ],
}


def _fabric_unchanged_mask(df, indicator_cols):
    """Return a boolean mask of rows where every indicator equals its _ENDING twin."""
    comparisons = [df[col] == df[f"{col}_ENDING"] for col in indicator_cols]
    unchanged = comparisons[0]
    for comparison in comparisons[1:]:
        unchanged = unchanged & comparison
    return unchanged


def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Merge the cleaned lookup tables onto the data and keep only the records whose building
    fabric is consistent between the starting and ending EPC, so that modelling is performed
    on the cleanest possible dataset.

    For each of walls, floor and roof, the starting and ending descriptions are joined
    (left join) to the component's lookup, rows whose construction indicators differ are
    removed, the lookup columns and the raw description columns are dropped, and the
    cleaned descriptions (where they are not dropped) take the raw columns' names.

    Rows whose description has no match in the lookup carry missing indicators after the
    left join; a comparison involving a missing value is False, so those rows are pruned too.

    :param df: DataFrame containing ``{WALLS,FLOOR,ROOF}_DESCRIPTION_STARTING`` and
        ``{WALLS,FLOOR,ROOF}_DESCRIPTION_ENDING`` columns.
    :param cleaned_lookup: Mapping with keys ``"walls-description"``, ``"floor-description"``
        and ``"roof-description"``; each value must be accepted by ``pd.DataFrame`` and yield
        an ``original_description`` column plus the lookup columns listed for that component.
    :return: The filtered DataFrame. Walls and floor description columns hold the cleaned
        descriptions; roof description columns are removed (see the note on the drop list).
    """

    # TODO: In a future iteration, we can test using the binary features and the insulation thickness
    #  estimates, as well as estimated U-values

    for component in ("walls", "floor", "roof"):
        component_upper = component.upper()
        starting_col = f"{component_upper}_DESCRIPTION_STARTING"
        ending_col = f"{component_upper}_DESCRIPTION_ENDING"

        # Built once and reused for both joins; merge does not modify its inputs.
        lookup_df = pd.DataFrame(cleaned_lookup[f"{component}-description"])

        # First join attaches the starting EPC's attributes under the lookup's own names.
        df = df.merge(
            lookup_df,
            how="left",
            left_on=starting_col,
            right_on="original_description",
        )
        # Second join attaches the ending EPC's attributes; every clashing lookup column
        # from the right-hand side gets the "_ENDING" suffix.
        df = df.merge(
            lookup_df,
            how="left",
            left_on=ending_col,
            right_on="original_description",
            suffixes=("", "_ENDING"),
        )

        # We make sure the fundamental construction of the component hasn't changed
        df = df[_fabric_unchanged_mask(df, _FABRIC_CONSISTENCY_COLS[component])]

        # Drop the lookup columns and the raw descriptions, then give the cleaned
        # descriptions the names of the raw description columns they replace.
        lookup_cols = _LOOKUP_COLS_TO_DROP[component]
        ending_lookup_cols = [f"{col}_ENDING" for col in lookup_cols]

        df = df.drop(
            columns=lookup_cols + ending_lookup_cols + [starting_col, ending_col]
        ).rename(
            columns={
                "clean_description": starting_col,
                "clean_description_ENDING": ending_col,
            }
        )

    return df
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
|
|
@ -61,7 +181,7 @@ def app():
|
|||
# TODO [x] : Have a look at temporal features
|
||||
# TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
|
||||
# TODO [x]: Same as floor area for floor height
|
||||
# TODO []: If fundamental building fabric changes, we should probably discard the record
|
||||
# TODO [x]: If fundamental building fabric changes, we should probably discard the record
|
||||
# TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
|
||||
# - leave for now and check performance after temporal features
|
||||
# TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
|
||||
|
|
@ -188,117 +308,6 @@ def app():
|
|||
# is low
|
||||
# We also replace descriptions with their cleaned variants
|
||||
|
||||
def process_and_prune_desriptions(df, cleaned_lookup):
|
||||
|
||||
# TODO: In a future iteration, we can test using the binary features and the insulation thickness
|
||||
# estimates, as well as estimated U-values
|
||||
|
||||
cols_to_drop = {
|
||||
"walls": [
|
||||
'original_description', 'thermal_transmittance',
|
||||
'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
|
||||
'is_solid_brick', 'is_system_built', 'is_timber_frame',
|
||||
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
|
||||
'is_sandstone_or_limestone', 'insulation_thickness',
|
||||
'external_insulation', 'internal_insulation',
|
||||
'original_description_ENDING',
|
||||
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
|
||||
'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING',
|
||||
'is_solid_brick_ENDING', 'is_system_built_ENDING',
|
||||
'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING',
|
||||
'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING',
|
||||
'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING',
|
||||
'external_insulation_ENDING', 'internal_insulation_ENDING',
|
||||
],
|
||||
"floor": [
|
||||
'original_description', 'thermal_transmittance',
|
||||
'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
|
||||
'is_to_external_air', 'is_suspended', 'is_solid',
|
||||
'another_property_below', 'insulation_thickness', 'no_data',
|
||||
'original_description_ENDING',
|
||||
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
|
||||
'is_assumed_ENDING', 'is_to_unheated_space_ENDING',
|
||||
'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING',
|
||||
'another_property_below_ENDING', 'insulation_thickness_ENDING',
|
||||
'no_data_ENDING',
|
||||
],
|
||||
"roof": [
|
||||
'original_description', 'clean_description', 'thermal_transmittance',
|
||||
'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
|
||||
'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
|
||||
'has_dwelling_above', 'is_valid', 'insulation_thickness',
|
||||
'original_description_ENDING', 'clean_description_ENDING',
|
||||
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
|
||||
'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
|
||||
'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
|
||||
'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
|
||||
'insulation_thickness_ENDING',
|
||||
]
|
||||
|
||||
}
|
||||
|
||||
for component in ["walls", "floor", "roof"]:
|
||||
component_upper = component.upper()
|
||||
|
||||
df = df.merge(
|
||||
pd.DataFrame(cleaned_lookup[f"{component}-description"]),
|
||||
how="left",
|
||||
left_on=f"{component_upper}_DESCRIPTION_STARTING",
|
||||
right_on="original_description",
|
||||
).merge(
|
||||
pd.DataFrame(cleaned_lookup[f"{component}-description"]),
|
||||
how="left",
|
||||
left_on=f"{component_upper}_DESCRIPTION_ENDING",
|
||||
right_on="original_description",
|
||||
suffixes=("", "_ENDING")
|
||||
)
|
||||
|
||||
if component == "walls":
|
||||
# We make sure the wall construction hasn't changed
|
||||
df = df[
|
||||
(df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) &
|
||||
(df["is_solid_brick"] == df["is_solid_brick_ENDING"]) &
|
||||
(df["is_timber_frame"] == df["is_timber_frame_ENDING"]) &
|
||||
(df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) &
|
||||
(df["is_cob"] == df["is_cob_ENDING"]) &
|
||||
(df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"])
|
||||
]
|
||||
elif component == "floor":
|
||||
df = df[
|
||||
(df["is_suspended"] == df["is_suspended_ENDING"]) &
|
||||
(df["is_solid"] == df["is_solid_ENDING"]) &
|
||||
(df["another_property_below"] == df["another_property_below_ENDING"]) &
|
||||
(df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"])
|
||||
]
|
||||
else:
|
||||
df = df[
|
||||
(df["is_pitched"] == df["is_pitched_ENDING"]) &
|
||||
(df["is_roof_room"] == df["is_roof_room_ENDING"]) &
|
||||
(df["is_loft"] == df["is_loft_ENDING"]) &
|
||||
(df["is_flat"] == df["is_flat_ENDING"]) &
|
||||
(df["is_thatched"] == df["is_thatched_ENDING"]) &
|
||||
(df["is_at_rafters"] == df["is_at_rafters_ENDING"]) &
|
||||
(df["has_dwelling_above"] == df["has_dwelling_above_ENDING"])
|
||||
]
|
||||
|
||||
# Drop the binary indicators and replace the original description with the cleaned version
|
||||
|
||||
# Drop original cols
|
||||
original_cols = [
|
||||
f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING"
|
||||
]
|
||||
|
||||
df = df.drop(
|
||||
columns=cols_to_drop[component] + original_cols
|
||||
).rename(
|
||||
columns={
|
||||
"clean_description": f"{component_upper}_DESCRIPTION_STARTING",
|
||||
"clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING",
|
||||
}
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
|
||||
|
||||
dataset.append(data_by_urpn_df)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue