# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00 (328 lines, 14 KiB, Python)
import pandas as pd
|
|
from tqdm import tqdm
|
|
import msgpack
|
|
|
|
from pathlib import Path
|
|
from model_data.simulation_system.core.Settings import (
|
|
MANDATORY_FIXED_FEATURES,
|
|
LATEST_FIELD,
|
|
COMPONENT_FEATURES,
|
|
RDSAP_RESPONSE,
|
|
HEAT_DEMAND_RESPONSE,
|
|
COLUMNS_TO_MERGE_ON,
|
|
EARLIEST_EPC_DATE,
|
|
CARBON_RESPONSE,
|
|
)
|
|
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
|
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
|
|
|
|
# Root folder of the EPC certificate extracts, resolved relative to this file. app() walks
# every sub-directory of this folder and reads the "certificates.csv" found inside each one.
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
|
|
|
|
|
def get_cleaned():
    """
    Download the cleaned EPC description lookup from S3 and decode it.

    The object lives in the ``retrofit-data-dev`` bucket under the key
    ``cleaned_epc_data/cleaned.bson``. Despite the ``.bson`` extension, the payload is
    unpacked with MessagePack.

    :return: the decoded MessagePack payload
    """
    payload = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev",
    )

    # raw=False asks msgpack to decode strings to ``str`` rather than leaving them as bytes
    return msgpack.unpackb(payload, raw=False)
|
|
|
|
|
|
def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Join the cleaned description lookup onto every starting/ending EPC pair, keep only the
    pairs whose building fabric is the same in both certificates, and replace the raw
    description columns with their cleaned counterparts.

    For each of walls, floor and roof the lookup is merged twice: once against
    ``<COMPONENT>_DESCRIPTION_STARTING`` and once against ``<COMPONENT>_DESCRIPTION_ENDING``
    (the second merge suffixes its columns with ``_ENDING``). Rows where any of the fabric
    indicators differ between the two sides are removed. Because both merges are left
    joins, a description missing from the lookup yields NaN indicators, which never
    compare equal, so those rows are removed too.

    :param df: frame holding ``WALLS``/``FLOOR``/``ROOF`` ``_DESCRIPTION_STARTING`` and
        ``_DESCRIPTION_ENDING`` columns
    :param cleaned_lookup: mapping with the keys ``walls-description``,
        ``floor-description`` and ``roof-description``; each value must be accepted by
        ``pd.DataFrame`` and provide ``original_description`` plus the columns listed below
    :return: the filtered frame with the lookup columns removed
    """

    # TODO: In a future iteration, we can test using the binary features and the insulation
    # thickness estimates, as well as estimated U-values

    ending = "_ENDING"

    # Lookup columns to discard after filtering, named as they appear on the starting side;
    # the matching "<name>_ENDING" columns are discarded alongside them.
    # NOTE(review): only the roof list contains 'clean_description', so for roofs the cleaned
    # description is dropped as well and no ROOF_DESCRIPTION_* column survives - confirm
    # this is intended.
    lookup_columns = {
        "walls": [
            'original_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
            'is_solid_brick', 'is_system_built', 'is_timber_frame',
            'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
            'is_sandstone_or_limestone', 'insulation_thickness',
            'external_insulation', 'internal_insulation',
        ],
        "floor": [
            'original_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
            'is_to_external_air', 'is_suspended', 'is_solid',
            'another_property_below', 'insulation_thickness', 'no_data',
        ],
        "roof": [
            'original_description', 'clean_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
            'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
            'has_dwelling_above', 'is_valid', 'insulation_thickness',
        ],
    }

    # Fabric indicators that must be identical in the starting and ending certificate
    must_match = {
        "walls": [
            "is_cavity_wall", "is_solid_brick", "is_timber_frame",
            "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
        ],
        "floor": [
            "is_suspended", "is_solid", "another_property_below", "is_to_unheated_space",
        ],
        "roof": [
            "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
            "is_at_rafters", "has_dwelling_above",
        ],
    }

    for component, discard in lookup_columns.items():
        prefix = component.upper()
        starting_col = f"{prefix}_DESCRIPTION_STARTING"
        ending_col = f"{prefix}_DESCRIPTION_ENDING"

        lookup = pd.DataFrame(cleaned_lookup[f"{component}-description"])

        # Starting side first (plain column names), then the ending side ("_ENDING" suffix)
        df = df.merge(
            lookup,
            how="left",
            left_on=starting_col,
            right_on="original_description",
        )
        df = df.merge(
            lookup,
            how="left",
            left_on=ending_col,
            right_on="original_description",
            suffixes=("", "_ENDING"),
        )

        # Keep a row only if every fabric indicator agrees between the two certificates
        unchanged = None
        for indicator in must_match[component]:
            same = df[indicator] == df[indicator + ending]
            unchanged = same if unchanged is None else unchanged & same
        df = df[unchanged]

        # Remove the lookup columns and the raw descriptions, then let the cleaned
        # descriptions (where they were not dropped) take over the raw columns' names
        discard_all = discard + [name + ending for name in discard] + [starting_col, ending_col]
        df = df.drop(columns=discard_all)
        df = df.rename(
            columns={
                "clean_description": starting_col,
                "clean_description_ENDING": ending_col,
            }
        )

    return df
|
|
|
|
|
|
def _build_property_records(uprn, property_data, cleaning_averages):
    """
    Build the SAP-change records for one property from its consecutive EPC pairs.

    :param uprn: the UPRN group key; copied onto every record
    :param property_data: the pre-processed EPC rows belonging to this UPRN
    :param cleaning_averages: cleaning table handed to ``DataProcessor.apply_averages_cleaning``
    :return: list of record dicts; empty when the mandatory fixed features vary across the
        rows, when there are fewer than two rows, or when no pair changes its SAP score
    """
    # If a property has changed building type we cannot trust its EPCs, so the mandatory
    # fixed features must collapse to a single unique value across all of its rows
    if (property_data[MANDATORY_FIXED_FEATURES].nunique() > 1).any():
        return []

    # Take the last row for both the LATEST_FIELD and MANDATORY_FIXED_FEATURES columns.
    # NOTE(review): this treats the last row as the most recent EPC - presumably
    # pre_process() orders the rows chronologically; confirm.
    latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
    mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

    modified_property_data = DataProcessor.apply_averages_cleaning(
        data_to_clean=property_data,
        cleaning_data=cleaning_averages,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON,
    )

    # Fixed features - property attributes that shouldn't change over time. Latest fields
    # are applied second, so they win on any key shared with the mandatory fields.
    fixed_data = {**mandatory_field_data, **latest_field_data}
    fixed_data = DataProcessor.apply_averages_cleaning(
        data_to_clean=pd.DataFrame([fixed_data]),
        cleaning_data=cleaning_averages,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON,
    ).to_dict("records")[0]

    # We include the lodgement date here as we probably need to factor time into the
    # model, since EPC standards and rigour have changed over time
    feature_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    response_columns = [RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
    variable_data = modified_property_data[feature_columns + response_columns]

    # Note: we look at changes between subsequent EPCs, however we could look at other
    # permutations e.g. first vs second, second vs third and also first vs third
    records = []
    for idx in range(variable_data.shape[0] - 1):
        earlier = variable_data.iloc[idx]
        later = variable_data.iloc[idx + 1]

        # Orient every pair so the record with the lower SAP is the starting point; a pair
        # whose SAP fell over time is therefore read in reverse
        if earlier[RDSAP_RESPONSE] <= later[RDSAP_RESPONSE]:
            start, end = earlier, later
        else:
            start, end = later, earlier

        rdsap_change = end[RDSAP_RESPONSE] - start[RDSAP_RESPONSE]
        if rdsap_change == 0:
            continue

        features = pd.concat(
            [
                start[feature_columns].add_suffix("_STARTING"),
                end[feature_columns].add_suffix("_ENDING"),
            ]
        )

        records.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": rdsap_change,
                "HEAT_DEMAND_CHANGE": end[HEAT_DEMAND_RESPONSE] - start[HEAT_DEMAND_RESPONSE],
                "CARBON_CHANGE": end[CARBON_RESPONSE] - start[CARBON_RESPONSE],
                "SAP_STARTING": start[RDSAP_RESPONSE],
                "HEAT_DEMAND_STARTING": start[HEAT_DEMAND_RESPONSE],
                "CARBON_STARTING": start[CARBON_RESPONSE],
                **fixed_data,
                **features.to_dict(),
            }
        )

    return records


def _process_directory(directory, cleaned_lookup):
    """
    Turn one data directory into its model records and its cleaning averages.

    :param directory: folder containing a ``certificates.csv``
    :param cleaned_lookup: cleaned description lookup, as returned by ``get_cleaned``
    :return: ``(model_df, cleaning_averages)``; ``model_df`` is ``None`` when the directory
        yields no SAP-change records at all
    :raises ValueError: if the assembled records contain null values
    """
    data_processor = DataProcessor(filepath=directory / "certificates.csv")

    df = data_processor.pre_process()
    cleaning_averages = data_processor.make_cleaning_averages()

    records = []
    for uprn, property_data in df.groupby("UPRN", observed=True):
        records.extend(_build_property_records(uprn, property_data, cleaning_averages))

    model_df = None
    # An empty record list would give a column-less frame and a KeyError on the lodgement
    # date columns below, so a directory without any usable pair contributes no model rows
    if records:
        model_df = pd.DataFrame(records)

        # Add some temporal features - we look at the days from the standard starting point
        # in time for the starting and ending date so all records are from a fixed point
        epoch = pd.to_datetime(EARLIEST_EPC_DATE)
        model_df["DAYS_TO_STARTING"] = (
            pd.to_datetime(model_df["LODGEMENT_DATE_STARTING"]) - epoch
        ).dt.days
        model_df["DAYS_TO_ENDING"] = (
            pd.to_datetime(model_df["LODGEMENT_DATE_ENDING"]) - epoch
        ).dt.days

        # TODO: We need to pre-process the data. For instance, rather than using static
        # descriptions for roofs, walls and floors, we may want to use the U-value. We may
        # also want to handle the (assumed) tags within descriptions

        if model_df.isnull().values.any():
            raise ValueError("Null values found in dataset")

        # We look for key building fabric features that have changed from one EPC to the
        # next. If, for example, a home has gone from being a cavity wall to a solid wall,
        # we remove this record, as it indicates that the quality of the EPC conducted in
        # the first instance is low. We also replace descriptions with their cleaned variants
        model_df = process_and_prune_desriptions(model_df, cleaned_lookup)

    # The cleaning averages are tagged only after they have been used for cleaning above,
    # so the extra column never takes part in apply_averages_cleaning
    cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]

    return model_df, cleaning_averages


def app():
    """
    Build the SAP change modelling dataset from every directory under ``DATA_DIRECTORY``
    and store the results in S3.

    Two parquet files are written to the ``retrofit-data-dev`` bucket:
    ``sap_change_model/cleaning_dataset.parquet`` (the concatenated cleaning averages) and
    ``sap_change_model/dataset.parquet`` (the concatenated model records).

    Data glossary: https://epc.opendatacommunities.org/docs/guidance#glossary

    :raises ValueError: if there are no data directories, if no directory produces any
        model records, or if null values are found in a directory's records
    """
    cleaned_lookup = get_cleaned()

    # List all subdirectories; sorted because iterdir() yields entries in arbitrary order
    # and the output row order should be reproducible between runs
    directories = sorted(entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir())
    if not directories:
        raise ValueError(f"No data directories found in {DATA_DIRECTORY}")

    dataset = []
    cleaning_dataset = []

    for directory in tqdm(directories):
        model_df, cleaning_averages = _process_directory(directory, cleaned_lookup)

        if model_df is not None:
            dataset.append(model_df)
        cleaning_dataset.append(cleaning_averages)

    # Fail before uploading anything rather than inside pd.concat, which would otherwise
    # raise an unhelpful "No objects to concatenate"
    if not dataset:
        raise ValueError("No SAP change records were produced from any directory")

    # Store cleaning dataset in s3 as a parquet file
    save_dataframe_to_s3_parquet(
        df=pd.concat(cleaning_dataset),
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    save_dataframe_to_s3_parquet(
        df=pd.concat(dataset),
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/dataset.parquet",
    )
|
|
|
|
|
|
# Run the full dataset build only when this file is executed directly as a script
if __name__ == "__main__":
    app()
|