mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
489 lines
20 KiB
Python
489 lines
20 KiB
Python
import pandas as pd
|
|
from tqdm import tqdm
|
|
import msgpack
|
|
|
|
from pathlib import Path
|
|
from model_data.simulation_system.core.Settings import (
|
|
MANDATORY_FIXED_FEATURES,
|
|
LATEST_FIELD,
|
|
COMPONENT_FEATURES,
|
|
RDSAP_RESPONSE,
|
|
HEAT_DEMAND_RESPONSE,
|
|
COLUMNS_TO_MERGE_ON,
|
|
EARLIEST_EPC_DATE,
|
|
CARBON_RESPONSE,
|
|
)
|
|
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
|
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
|
|
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
|
from recommendations.recommendation_utils import get_wall_u_value, get_roof_u_value
|
|
|
|
# Root folder of the downloaded EPC certificates, resolved relative to this file.
# app() iterates its sub-directories and reads "certificates.csv" from each one.
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
|
|
|
|
|
def get_cleaned():
    """
    Fetch the cleaned EPC description lookup from S3 and decode it.

    The object is read from the ``retrofit-data-dev`` bucket and, despite the
    ``.bson`` extension in its key, is decoded here as MessagePack.

    :return: the decoded MessagePack payload (indexed by component key in
        process_and_prune_desriptions)
    """
    packed_payload = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev",
    )

    # raw=False so that packed strings come back as str rather than bytes
    return msgpack.unpackb(packed_payload, raw=False)
|
|
|
|
|
|
def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Merge the cleaned description lookup onto the EPC pairs and prune inconsistent records.

    For every building component the starting and ending description columns are
    left-joined onto that component's cleaned lookup (matched on the lookup's
    ``original_description``). For walls, floor and roof, rows whose construction
    flags differ between the starting and ending EPC are removed, so that modelling
    is only performed on records with a consistent building fabric. The raw
    description columns and the helper columns that are no longer needed are then
    dropped, and the remaining generic lookup columns are prefixed with the
    component name so that later components do not collide with them.

    :param df: DataFrame of EPC pairs containing ``<COMPONENT>_DESCRIPTION_STARTING`` /
        ``<COMPONENT>_DESCRIPTION_ENDING`` columns (``MAIN_FUEL_STARTING`` /
        ``MAIN_FUEL_ENDING`` for the fuel) and the two ``LIGHTING_DESCRIPTION_*`` columns
    :param cleaned_lookup: mapping of ``"<component>-description"`` (``"main-fuel"`` for the
        fuel) to a list of records, each holding an ``original_description`` key plus the
        cleaned attributes for that description
    :return: a new DataFrame with the cleaned attributes merged on and inconsistent rows removed
    :raises KeyError: if an expected column is missing from ``df`` or from a lookup
    """

    # Helper columns to discard once each component has been merged and filtered
    cols_to_drop = {
        "walls": [
            # We need the cleaned descriptions for pulling out u-values, so they are kept
            "original_description", "thermal_transmittance_unit",
            "original_description_ENDING",
            "thermal_transmittance_unit_ENDING",
            "is_cavity_wall_ENDING", "is_filled_cavity_ENDING",
            "is_solid_brick_ENDING", "is_system_built_ENDING",
            "is_timber_frame_ENDING", "is_granite_or_whinstone_ENDING",
            "is_as_built_ENDING", "is_cob_ENDING",
            "is_sandstone_or_limestone_ENDING",
            # We remove the is_assumed columns
            "is_assumed", "is_assumed_ENDING",
        ],
        "floor": [
            "original_description", "clean_description", "thermal_transmittance_unit",
            "no_data", "no_data_ENDING", "original_description_ENDING",
            "clean_description_ENDING", "thermal_transmittance_unit_ENDING",
            "is_suspended_ENDING", "is_solid_ENDING", "another_property_below_ENDING",
            "is_to_unheated_space_ENDING", "is_to_external_air_ENDING", "is_assumed",
            "is_assumed_ENDING",
        ],
        "roof": [
            "original_description", "clean_description", "thermal_transmittance_unit",
            "is_assumed", "is_valid", "original_description_ENDING", "clean_description_ENDING",
            "thermal_transmittance_unit_ENDING", "is_pitched_ENDING", "is_roof_room_ENDING",
            "is_loft_ENDING", "is_flat_ENDING", "is_thatched_ENDING", "is_at_rafters_ENDING",
            "has_dwelling_above_ENDING", "is_assumed_ENDING", "is_valid_ENDING",
        ],
        "hotwater": [
            "original_description", "clean_description", "assumed", "original_description_ENDING",
            "clean_description_ENDING", "assumed_ENDING",
        ],
        "mainheat": [
            "original_description", "clean_description", "original_description_ENDING",
            "has_assumed", "clean_description_ENDING", "has_assumed_ENDING",
        ],
        "mainheatcont": [
            "original_description", "clean_description", "original_description_ENDING", "clean_description_ENDING",
        ],
        "windows": [
            "original_description", "clean_description", "original_description_ENDING", "clean_description_ENDING",
            # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature
            "has_glazing", "glazing_coverage", "no_data", "has_glazing_ENDING", "glazing_coverage_ENDING",
            "no_data_ENDING",
        ],
        "main-fuel": [
            "original_description", "clean_description", "original_description_ENDING", "clean_description_ENDING",
        ],
    }

    # Construction flags that must be identical in the starting and ending EPC.
    # NOTE(review): is_system_built is not compared for walls - confirm whether that is intended.
    flags_that_must_match = {
        "walls": [
            "is_cavity_wall", "is_solid_brick", "is_timber_frame",
            "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
        ],
        "floor": [
            "is_suspended", "is_solid", "another_property_below",
            "is_to_unheated_space", "is_to_external_air",
        ],
        "roof": [
            "is_pitched", "is_roof_room", "is_loft", "is_flat",
            "is_thatched", "is_at_rafters", "has_dwelling_above",
        ],
    }

    # Generic lookup columns that get a component prefix when the lookup provides them
    prefixed_columns = ("insulation_thickness", "thermal_transmittance", "tariff_type")

    for component, helper_cols in cols_to_drop.items():
        if component == "main-fuel":
            # The fuel columns have no "_DESCRIPTION" part and the lookup key has no suffix
            cleaned_key = "main-fuel"
            left_on_starting = "MAIN_FUEL_STARTING"
            left_on_ending = "MAIN_FUEL_ENDING"
        else:
            cleaned_key = f"{component}-description"
            left_on_starting = f"{component.upper()}_DESCRIPTION_STARTING"
            left_on_ending = f"{component.upper()}_DESCRIPTION_ENDING"

        # Build the lookup frame once and reuse it for both joins
        lookup = pd.DataFrame(cleaned_lookup[cleaned_key])

        df = df.merge(
            lookup,
            how="left",
            left_on=left_on_starting,
            right_on="original_description",
        ).merge(
            lookup,
            how="left",
            left_on=left_on_ending,
            right_on="original_description",
            # Every lookup column already exists from the first join, so the
            # ending copies all receive the _ENDING suffix
            suffixes=("", "_ENDING"),
        )

        # We make sure the construction hasn't changed between the two EPCs
        flags = flags_that_must_match.get(component)
        if flags:
            keep = df[flags[0]] == df[f"{flags[0]}_ENDING"]
            for flag in flags[1:]:
                keep = keep & (df[flag] == df[f"{flag}_ENDING"])
            df = df[keep]

        # Drop the helper columns together with the original description columns
        df = df.drop(columns=helper_cols + [left_on_starting, left_on_ending])

        # Prefix the generic columns. The lookup frame's columns are checked rather than
        # only the first record, so an optional key missing from the first record is
        # still renamed and cannot clash with the next component's merge.
        renames = {}
        for base in prefixed_columns:
            if base in lookup.columns:
                renames[base] = f"{component}_{base}"
                renames[f"{base}_ENDING"] = f"{component}_{base}_ENDING"

        # We need the walls descriptions so we rename them to distinguish them
        if component == "walls":
            renames["clean_description"] = f"{component}_clean_description"
            renames["clean_description_ENDING"] = f"{component}_clean_description_ENDING"

        if renames:
            df = df.rename(columns=renames)

    # We don't need any lighting specific cleaning, we just drop the original description as we use
    # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING
    df = df.drop(columns=["LIGHTING_DESCRIPTION_STARTING", "LIGHTING_DESCRIPTION_ENDING"])

    return df
|
|
|
|
|
|
# Columns appended by make_uvalues, in output order
_UVALUE_COLUMNS = [
    "starting_wall_uvalue",
    "ending_wall_uvalue",
    "starting_roof_uvalue",
    "ending_roof_uvalue",
]


def _lookup_wall_u_value(row, clean_description):
    """Look up the wall u-value for ``clean_description`` using the row's age band and stone flags."""
    return get_wall_u_value(
        clean_description=clean_description,
        age_band=england_wales_age_band_lookup[row["CONSTRUCTION_AGE_BAND"]],
        is_granite_or_whinstone=row["is_granite_or_whinstone"],
        is_sandstone_or_limestone=row["is_sandstone_or_limestone"],
    )


def _lookup_roof_u_value(row, insulation_thickness):
    """Look up the roof u-value for ``insulation_thickness`` using the row's roof flags and age band."""
    return get_roof_u_value(
        insulation_thickness=insulation_thickness,
        has_dwelling_above=row["has_dwelling_above"],
        is_loft=row["is_loft"],
        is_roof_room=row["is_roof_room"],
        is_thatched=row["is_thatched"],
        is_flat=row["is_flat"],
        is_pitched=row["is_pitched"],
        is_at_rafters=row["is_at_rafters"],
        age_band=england_wales_age_band_lookup[row["CONSTRUCTION_AGE_BAND"]],
    )


def make_uvalues(df):
    """
    Append starting and ending wall and roof u-values to every row of ``df``.

    The thermal transmittance recorded on the row is used when it is present;
    otherwise the value is looked up with get_wall_u_value / get_roof_u_value.
    When the ending wall transmittance is missing and the wall description has not
    changed, the starting wall u-value is reused.

    The input frame is not modified, and an empty frame yields an empty result that
    still carries the four u-value columns.

    :param df: DataFrame produced by process_and_prune_desriptions; must contain the
        ``walls_thermal_transmittance`` / ``roof_thermal_transmittance`` columns and their
        ``_ENDING`` counterparts, plus the description, insulation, flag and
        ``CONSTRUCTION_AGE_BAND`` columns for any row that needs a lookup
    :return: a new DataFrame with a fresh 0..n-1 index holding the columns of ``df`` followed by
        ``starting_wall_uvalue``, ``ending_wall_uvalue``, ``starting_roof_uvalue`` and
        ``ending_roof_uvalue``
    """
    records = []
    for _, row in df.iterrows():

        # ~~~~~~~~~~~~~~~~~~
        # Walls
        # ~~~~~~~~~~~~~~~~~~
        starting_wall_uvalue = row["walls_thermal_transmittance"]
        if pd.isnull(starting_wall_uvalue):
            starting_wall_uvalue = _lookup_wall_u_value(row, row["walls_clean_description"])

        ending_wall_uvalue = row["walls_thermal_transmittance_ENDING"]
        if pd.isnull(ending_wall_uvalue):
            if row["walls_clean_description"] != row["walls_clean_description_ENDING"]:
                ending_wall_uvalue = _lookup_wall_u_value(row, row["walls_clean_description_ENDING"])
            else:
                # Same wall description at both ends, so the wall u-value is unchanged
                ending_wall_uvalue = starting_wall_uvalue

        # ~~~~~~~~~~~~~~~~~~
        # Roof
        # ~~~~~~~~~~~~~~~~~~
        starting_roof_uvalue = row["roof_thermal_transmittance"]
        if pd.isnull(starting_roof_uvalue):
            starting_roof_uvalue = _lookup_roof_u_value(row, row["roof_insulation_thickness"])

        ending_roof_uvalue = row["roof_thermal_transmittance_ENDING"]
        if pd.isnull(ending_roof_uvalue):
            ending_roof_uvalue = _lookup_roof_u_value(row, row["roof_insulation_thickness_ENDING"])

        # ~~~~~~~~~~~~~~~~~~
        # Floor
        # ~~~~~~~~~~~~~~~~~~
        # No floor u-values are derived yet

        records.append(
            {
                "starting_wall_uvalue": starting_wall_uvalue,
                "ending_wall_uvalue": ending_wall_uvalue,
                "starting_roof_uvalue": starting_roof_uvalue,
                "ending_roof_uvalue": ending_roof_uvalue,
            }
        )

    # Naming the columns explicitly keeps them present even when there are no rows
    uvalues = pd.DataFrame(records, columns=_UVALUE_COLUMNS)

    # The records are in row order, so attach them positionally. This avoids a key-based
    # merge, which would multiply rows if the incoming index held duplicates.
    return pd.concat([df.reset_index(drop=True), uvalues], axis=1)
|
|
|
|
|
|
def _build_change_record(uprn, earliest_record, latest_record, fixed_data):
    """Build one model row for a pair of EPC records, or return None when the SAP score is unchanged."""
    # Orient the pair so that the SAP score never decreases from start to end
    if earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]:
        start, end = earliest_record, latest_record
    else:
        start, end = latest_record, earliest_record

    starting_sap = start[RDSAP_RESPONSE]
    starting_heat_demand = start[HEAT_DEMAND_RESPONSE]
    starting_carbon = start[CARBON_RESPONSE]

    rdsap_change = end[RDSAP_RESPONSE] - starting_sap
    if rdsap_change == 0:
        return None

    # We include the lodgement date here as we probably need to factor time into the
    # model, since EPC standards and rigour have changed over time
    feature_cols = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    features = pd.concat(
        [
            start[feature_cols].add_suffix("_STARTING"),
            end[feature_cols].add_suffix("_ENDING"),
        ]
    )

    return {
        "UPRN": uprn,
        "RDSAP_CHANGE": rdsap_change,
        "HEAT_DEMAND_CHANGE": end[HEAT_DEMAND_RESPONSE] - starting_heat_demand,
        "CARBON_CHANGE": end[CARBON_RESPONSE] - starting_carbon,
        "SAP_STARTING": starting_sap,
        "HEAT_DEMAND_STARTING": starting_heat_demand,
        "CARBON_STARTING": starting_carbon,
        **fixed_data,
        **features.to_dict(),
    }


def _property_change_records(uprn, property_data):
    """Return the model rows for every pair of consecutive EPC rows of a single property."""
    mandatory = property_data[MANDATORY_FIXED_FEATURES]

    # If a property has changed building type (or a mandatory feature is missing) we ignore it,
    # i.e. the mandatory fixed features should collapse to one unique row
    if (mandatory.nunique() > 1).any() or pd.isnull(mandatory).sum().sum() > 0:
        return []

    # Fixed features - these are property attributes that shouldn't change over time.
    # Take the latest row for both the MANDATORY_FIXED_FEATURES and the LATEST_FIELD columns.
    fixed_data = {}
    fixed_data.update(mandatory.iloc[-1].to_dict())
    fixed_data.update(property_data[LATEST_FIELD].iloc[-1].to_dict())

    variable_data = property_data[
        COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
    ]

    # Note: we look at changes between subsequent EPCs, however we could look at other permutations
    # e.g. first vs second, second vs third and also first vs third.
    # NOTE(review): this relies on the rows already being in lodgement order - confirm pre_process sorts them.
    records = []
    for idx in range(property_data.shape[0] - 1):
        record = _build_change_record(
            uprn, variable_data.iloc[idx], variable_data.iloc[idx + 1], fixed_data
        )
        if record is not None:
            records.append(record)

    return records


def app():
    """
    Build the SAP-change modelling dataset from the EPC certificates and store it in S3.

    For every sub-directory of DATA_DIRECTORY the ``certificates.csv`` file is pre-processed
    and cleaned with DataProcessor. Each pair of consecutive EPCs of a property is turned into
    one row, the descriptions are replaced by their cleaned variants, and wall and roof
    u-values are added. The combined dataset and the per-directory cleaning averages are both
    written to the ``retrofit-data-dev`` bucket as parquet files.

    Data glossary: https://epc.opendatacommunities.org/docs/guidance#glossary

    :raises ValueError: if null values are found in a directory's dataset
    """
    cleaned_lookup = get_cleaned()

    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
    cleaning_dataset = []

    # Fixed reference point for the temporal features; identical for every directory
    earliest_epc_date = pd.to_datetime(EARLIEST_EPC_DATE)

    for directory in tqdm(directories):

        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)

        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

        # We have some odd cases with missing constituency so we fill
        df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})

        df = DataProcessor.apply_averages_cleaning(
            data_to_clean=df,
            cleaning_data=cleaning_averages,
            cols_to_merge_on=COLUMNS_TO_MERGE_ON
        )

        # The cleaning averages do not depend on the EPC pairs, so they are recorded
        # even if this directory ends up contributing no rows
        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
        cleaning_dataset.append(cleaning_averages)

        change_records = []
        for uprn, property_data in df.groupby("UPRN", observed=True):
            change_records.extend(_property_change_records(uprn, property_data))

        if not change_records:
            # An empty frame has no LODGEMENT_DATE_* columns, so skip rather than fail below
            tqdm.write(f"No usable EPC pairs found in {directory.name}; skipping")
            continue

        data_by_urpn_df = pd.DataFrame(change_records)

        # Add some temporal features - we look at the days from the standard starting point in time
        # for the starting and ending date so all records are from a fixed point
        data_by_urpn_df["DAYS_TO_STARTING"] = (
            pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - earliest_epc_date
        ).dt.days
        data_by_urpn_df["DAYS_TO_ENDING"] = (
            pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - earliest_epc_date
        ).dt.days

        if pd.isnull(data_by_urpn_df).sum().sum():
            raise ValueError("Null values found in dataset")

        # We look for key building fabric features that have changed from one EPC to the next.
        # If, for example, we see that a home has gone from being a cavity wall to a solid wall, we
        # remove this record, as it indicates that the quality of the EPC conducted in the first
        # instance is low.
        # We also replace descriptions with their cleaned variants
        data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)

        if data_by_urpn_df.empty:
            tqdm.write(f"No consistent EPC pairs left in {directory.name} after pruning; skipping")
            continue

        ########################
        # Apply u-values
        ########################

        for col in ["walls_clean_description", "walls_clean_description_ENDING"]:
            # regex=False: "(assumed)" must be removed literally. As a regular expression the
            # parentheses would form a group and be left behind in the description.
            data_by_urpn_df[col] = data_by_urpn_df[col].str.replace("(assumed)", "", regex=False).str.rstrip()

        data_by_urpn_df = make_uvalues(data_by_urpn_df).drop(
            columns=["walls_clean_description", "walls_clean_description_ENDING"]
        )

        if pd.isnull(data_by_urpn_df).sum().sum():
            raise ValueError("Null values found in dataset after process_and_prune_desriptions")

        dataset.append(data_by_urpn_df)

    # Store cleaning dataset in s3 as a parquet file
    cleaning_dataset = pd.concat(cleaning_dataset)
    save_dataframe_to_s3_parquet(
        df=cleaning_dataset,
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    output = pd.concat(dataset)
    save_dataframe_to_s3_parquet(
        df=output,
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/dataset_new.parquet",
    )
|
|
|
|
|
|
# Build and upload the dataset only when this file is executed as a script
if __name__ == "__main__":
    app()
|