fixed bug in epc record cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2026-03-18 18:16:57 +00:00
parent 5e8847d028
commit f45260706e
8 changed files with 163 additions and 110 deletions

1
.idea/Model.iml generated
View file

@ -6,6 +6,7 @@
<sourceFolder url="file://$MODULE_DIR$/model_data" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/infrastructure/terraform/.terraform" />
</content>
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />

View file

@ -73,25 +73,24 @@ def app():
Property UPRN
"""
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lifespace Rentals/Missed"
# data_filename = "For Modelling - Final - reviewed.xlsx"
data_filename = "Missed Properties - with address.xlsx"
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/March 2026 SAL"
data_filename = "Domna System Review - Livewest.xlsx"
sheet_name = "Sheet1"
postcode_column = "Postcode"
address1_column = "address1"
address1_method = None
fulladdress_column = "address1"
address1_column = None
address1_method = "house_number_extraction"
fulladdress_column = "Address"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = "UPRN"
landlord_property_type = "Type"
landlord_built_form = None
landlord_os_uprn = "gov UPRN"
landlord_property_type = "AssetType"
landlord_built_form = "AssetType"
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "Reference"
landlord_property_id = "landlord_uprn"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None

View file

@ -173,6 +173,7 @@ def get_data(
errors = []
no_epc = []
for _, home in tqdm(df.iterrows(), total=len(df)):
try:
# If we have a block of flats, we cannot retrieve this data

View file

@ -20,7 +20,7 @@ def _get_associated_records(results, uprn, uprn_key="UPRN"):
return matched_record
def get_associated_uprns(postcode_search: PostcodeSearch, uprn: str | int):
def get_associated_uprns(postcode_search: Optional[PostcodeSearch], uprn: str | int):
"""
Given a postcode and UPRN, for a remote assessment, fetch all associated UPRNs, based
on parent UPRN. This will be properties in the same building

View file

@ -147,6 +147,10 @@ class PropertyModel(Base):
is_sap_points_adjusted_for_installed_measures = Column(Boolean, default=False)
original_sap_points = Column(Float)
# New for re-scoring - we will need to delete some of the redundant fields but there is a ticket for this
lodged_sap_points = Column(Float)
lodged_epc_rating = Column(Enum(Epc))
class FeatureRating(enum.Enum):
VERY_GOOD = 5
@ -253,6 +257,12 @@ class PropertyDetailsEpcModel(Base):
installed_measures_heat_demand_adjustment = Column(Float)
is_epc_adjusted_for_installed_measures = Column(Boolean, default=False)
# New columns - we'll need to delete some of the redundant fields, associated to "already installed" but
# we have a ticket for this piece of work
lodged_co2_emissions = Column(Float)
lodged_heat_demand = Column(Float)
has_been_remodelled = Column(Boolean, default=False)
class PropertyDetailsSpatial(Base):
__tablename__ = "property_details_spatial"

View file

@ -837,41 +837,41 @@ async def model_engine(body: PlanTriggerRequest):
extract_uprn=True
)
for idx, rebaselined_prediction in rebaselining_response["retrofit-sap-baseline-predictions"].iterrows():
property_instance = next(p for p in input_properties if p.uprn == int(rebaselined_prediction["uprn"]))
new_rating = rebaselined_prediction["predictions"]
new_epc_rating = sap_to_epc(new_rating)
# Insert
# TODO: TEMP: Compare values
compare_scores = []
for x in rebaselining_scoring_data["uprn"].unique():
record = [p for p in input_properties if p.uprn == x][0].epc_record
original_sap = record.current_energy_efficiency
new_sap = rebaselining_response["retrofit-sap-baseline-predictions"][
rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == x
]["predictions"].values[0]
lodgement_date = record.lodgement_date
compare_scores.append({
"uprn": x,
"original_sap": original_sap,
"new_sap": new_sap,
"lodgement_date": lodgement_date
})
compare_scores = pd.DataFrame(compare_scores)
# property_instance.data["current-energy-efficiency"] = sap_to_epc(new_rating)
for uprn in rebaselining_scoring_data["uprn"].unique():
# Get the predictions
sap_prediction = rebaselining_response["retrofit-sap-baseline-predictions"][
rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == uprn
]["predictions"].values[0]
addr = [a for a in addresses if a.uprn == property_instance.uprn][0]
landlord_remapping = {
"total-floor-area": addr.landlord_total_floor_area_m2, # 1m tolerance on floor area to perform remap
"property-type": addr.landlord_property_type,
"built-form": addr.landlord_built_form,
# Components
"walls-description": addr.landlord_wall_construction,
"roof-description": addr.landlord_roof_construction,
"floor-description": addr.landlord_floor_construction,
"windows-description": addr.landlord_windows_type,
"main-fuel": addr.landlord_fuel_type,
"mainheatcont-description": addr.landlord_heating_controls,
"hotwater-description": addr.landlord_hot_water_system,
# Efficiency
"walls-energy-eff": addr.landlord_wall_efficiency,
"roof-energy-eff": addr.landlord_roof_efficiency,
"windows-energy-eff": addr.landlord_windows_efficiency,
"mainheat-energy-eff": addr.landlord_heating_efficiency,
"mainheatc-energy-eff": addr.landlord_heating_controls_efficiency,
"hot-water-energy-eff": addr.landlord_hot_water_efficiency,
"multi-glaze-proportion": addr.landlord_multi_glaze_proportion * 100, # TODO: Fix this!
"construction-age-band": addr.landlord_construction_age_band,
}
carbon_prediction = 1337
heat_demand_prediction = 1337
# Insert the re-baselined scores into the property data
for p in input_properties:
property_rebaselined_sap = rebaselining_response["retrofit-sap-baseline-predictions"]
epc_prediction = sap_to_epc(sap_prediction)
# We now need to insert the new values into the epc_record
property_instance = next(p for p in input_properties if p.uprn == int(uprn))
property_instance.epc_record.insert_new_performance_values(
new_sap=sap_prediction,
new_epc=epc_prediction,
new_carbon=carbon_prediction,
new_heat_demand=heat_demand_prediction,
)
kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True)
@ -924,26 +924,6 @@ async def model_engine(body: PlanTriggerRequest):
# We also make a tweak - if the property has been flagged for solar but doesn't contain
# any panel performance, we ensure that we have a 3kWp and 4kWp option for the property
# TODO: Temp - test re-baselining
p = input_properties[0]
p.create_base_difference_epc_record(cleaned_lookup=cleaned)
scoring_data = p.base_difference_record.df
# We just need a recent date to trigger the right models,
# as we are only interested in the deltas
scoring_data["is_post_sap10_starting"] = True
# Score model - SAP re-baselining model
model_api.MODEL_URLS["retrofit-sap-baseline-predictions"] = "sapbaselinemodel"
model_api.prediction_buckets["retrofit-sap-baseline-predictions"] = "retrofit-sap-baseline-predictions-dev"
example_response = model_api.predict_all(
df=scoring_data,
bucket=get_settings().DATA_BUCKET,
model_prefixes=["retrofit-sap-baseline-predictions"],
extract_ids=False
)
input_properties[0].data["current-energy-efficiency"] = 58.8
input_properties[0].data["current-energy-rating"] = "D"
logger.info("Identifying property recommendations")
recommendations, recommendations_scoring_data, representative_recommendations = {}, [], {}
for p in tqdm(input_properties):

View file

@ -1,4 +1,5 @@
import re
from dataclasses import fields
import pandas as pd
import numpy as np
from datetime import datetime
@ -14,24 +15,24 @@ logger = setup_logger()
class KwhData:
COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"]
CATEGORICAL_COLUMNS = [
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
"built-form",
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
"lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms",
"number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type",
"built_form",
"construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff",
"walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description",
"county",
"windows-description", "windows-energy-eff", "flat-top-storey",
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
"floor-level"
"windows_description", "windows_energy_eff", "flat_top_storey",
"flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation",
"low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating",
"floor_level"
]
NUMERICAL_COLUMNS = [
'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current',
'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency'
]
def __init__(self, bucket=None, read_consumption_data=False):
@ -106,6 +107,16 @@ class KwhData:
# If no match is found, return None or raise an exception
return None
@staticmethod
def _normalise_epc_keys(data):
if isinstance(data, dict):
return {key.replace("-", "_"): value for key, value in data.items()}
if isinstance(data, pd.DataFrame):
return data.rename(columns=lambda column: column.replace("-", "_"))
raise TypeError("Expected dict or DataFrame")
def combine(self):
"""
Given the data that is collected containing the kwh values for heating and hot water, this method will combine
@ -128,9 +139,9 @@ class KwhData:
# We check that the retrieved energy consumption sufficiently matches the EPC data
internal_dataset = []
for x in data:
epc_data = x["epc"]
epc_sap = epc_data["current-energy-efficiency"]
epc_potential_sap = epc_data["potential-energy-efficiency"]
epc_data = self._normalise_epc_keys(x["epc"])
epc_sap = epc_data["current_energy_efficiency"]
epc_potential_sap = epc_data["potential_energy_efficiency"]
# Make sure this matches the extracted sap
if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
x["potential_epc_efficiency"]
@ -171,7 +182,7 @@ class KwhData:
# We also estimate the energy consumption reduction from this data, by band
df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index()
df = df.drop(columns=["total_consumption"])
self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
@ -203,9 +214,11 @@ class KwhData:
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
# in anticipation of the new model
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
data["lodgement-year"] = data["lodgement-date"].dt.year
data["lodgement-month"] = data["lodgement-date"].dt.month
data = self._normalise_epc_keys(data.copy())
data["lodgement_date"] = pd.to_datetime(data["lodgement_date"])
data["lodgement_year"] = data["lodgement_date"].dt.year
data["lodgement_month"] = data["lodgement_date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many
# categories
@ -231,8 +244,10 @@ class KwhData:
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data
for feature in ["walls-description", "roof-description", "floor-description"]:
cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
for feature in ["walls_description", "roof_description", "floor_description"]:
cleaned_df = pd.DataFrame(
cleaned[feature.replace("_", "-")]
)[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
@ -261,10 +276,10 @@ class KwhData:
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
# Create new features:
data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area']
# Ensure this is string, because we could have mixed types
data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)
data["lodgement_datetime"] = data["lodgement_datetime"].astype(str)
if save:
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
@ -286,29 +301,39 @@ class KwhData:
data is in the format required by the model
:return:
"""
epc = p.data.copy()
numeric_cols = [
'current-energy-efficiency',
'potential-energy-efficiency', 'environment-impact-current',
'environment-impact-potential', 'energy-consumption-current',
'energy-consumption-potential', 'co2-emissions-current',
'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
'lighting-cost-current', 'lighting-cost-potential',
'heating-cost-current', 'heating-cost-potential',
'hot-water-cost-current', 'hot-water-cost-potential',
'total-floor-area', 'multi-glaze-proportion',
'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
'low-energy-lighting', 'number-open-fireplaces',
'wind-turbine-count', 'unheated-corridor-length',
'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
'low-energy-fixed-light-count',
'current_energy_efficiency',
'potential_energy_efficiency', 'environment_impact_current',
'environment_impact_potential', 'energy_consumption_current',
'energy_consumption_potential', 'co2_emissions_current',
'co2_emiss_curr_per_floor_area', 'co2_emissions_potential',
'lighting_cost_current', 'lighting_cost_potential',
'heating_cost_current', 'heating_cost_potential',
'hot_water_cost_current', 'hot_water_cost_potential',
'total_floor_area', 'multi_glaze_proportion',
'extension_count', 'number_habitable_rooms', 'number_heated_rooms',
'low_energy_lighting', 'number_open_fireplaces',
'wind_turbine_count', 'unheated_corridor_length',
'floor_height', 'photo_supply', 'fixed_lighting_outlets_count',
'low_energy_fixed_light_count',
]
required_cols = set(numeric_cols + KwhData.CATEGORICAL_COLUMNS + [
"uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff"
])
epc_record = p.epc_record
available_fields = {field.name for field in fields(epc_record)}
missing_fields = required_cols - available_fields
if missing_fields:
raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}")
epc = {field_name: getattr(epc_record, field_name) for field_name in required_cols}
for v in numeric_cols:
if epc[v] is not None:
epc[v] = float(epc[v])
bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
bools_to_remap = ['mains_gas_flag', 'flat_top_storey']
bool_map = {
True: "Y",
False: "N",
@ -320,8 +345,8 @@ class KwhData:
epc[v] = bool_map[epc[v]]
no_data = {
"floor-level": "NODATA!",
"floor-energy-eff": "NO DATA!"
"floor_level": "NODATA!",
"floor_energy_eff": "NO DATA!"
}
for v, fill_val in no_data.items():
if pd.isnull(epc[v]):
@ -331,8 +356,8 @@ class KwhData:
def prepare_epc(self, input_properties: list[Property]):
scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month
scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year
scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month
scoring_data["id"] = scoring_data["uprn"].copy()

View file

@ -309,6 +309,7 @@ class EPCRecord:
# Indicates if the EPC record has been predicted. By default, false
estimated: Optional[bool] = False
sap_05_overwritten: Optional[bool] = False
has_been_remodelled: Optional[bool] = False
# ------------------------------------------------------------------
# MODEL FLAGS
@ -386,6 +387,35 @@ class EPCRecord:
return
def insert_new_performance_values(
    self, new_sap: float, new_epc: float, new_carbon: float, new_heat_demand: float,
):
    """
    Given re-modelling for this property, insert the new performance values into the prepared EPC and
    keep a record of the fact that re-modelling has taken place.

    The fields are validated before anything is changed, so a failed call leaves the record untouched
    (including the has_been_remodelled flag).

    :param new_sap: value written to "current_energy_efficiency"
    :param new_epc: value written to "current_energy_rating"
    :param new_carbon: value written to "co2_emissions_current"
    :param new_heat_demand: value written to "energy_consumption_current"
    :return: None
    :raises ValueError: if any of the fields to update is not present in the prepared EPC
    """
    # NOTE(review): new_epc is annotated as float but is written to the rating field - confirm the
    # intended type against the caller
    update_data = {
        "current_energy_efficiency": new_sap,
        "current_energy_rating": new_epc,
        "co2_emissions_current": new_carbon,
        "energy_consumption_current": new_heat_demand,
    }

    # Validate we're updating correct fields before mutating any state
    # NOTE(review): other code in this class reads hyphenated keys from the prepared EPC
    # (e.g. "property-type") - confirm that underscore keys are what it holds at this point
    for k in update_data:
        if k not in self._prepared_epc:
            raise ValueError(f"Attempting to update unknown field '{k}' in prepared EPC")

    # Only flag the record once we know the update will go ahead
    self.has_been_remodelled = True

    # Update prepared epc
    self._prepared_epc.update(update_data)

    # Update dataclass attributes
    self._expand_prepared_epc_to_attributes()
def _apply_averages_cleaning(self) -> None:
"""
Fills missing property dimension values using medians from cleaning_data.
@ -626,6 +656,10 @@ class EPCRecord:
# Ignore keys that are not part of the dataclass schema
continue
if value is None:
setattr(self, key, None)
continue
try:
cast_value = self._cast_value(value, field_map[key].type)
setattr(self, key, cast_value)
@ -812,14 +846,17 @@ class EPCRecord:
(property_dimensions["PROPERTY_TYPE"] == self._prepared_epc["property-type"])
]
if self.construction_age_band not in DATA_ANOMALY_MATCHES:
if (
(self.construction_age_band not in DATA_ANOMALY_MATCHES) and
(self.construction_age_band in result["CONSTRUCTION_AGE_BAND"].values)
):
result = result[
(result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)
]
if (
self._prepared_epc["built-form"] not in DATA_ANOMALY_MATCHES
and self._prepared_epc["built-form"] in result["BUILT_FORM"]
and self._prepared_epc["built-form"] in result["BUILT_FORM"].values
):
result = result[(result["BUILT_FORM"] == self._prepared_epc["built-form"])]
@ -935,7 +972,7 @@ class EPCRecord:
self._prepared_epc["unheated-corridor-length"] = (
float(self._prepared_epc["unheated-corridor-length"])
if self._prepared_epc["unheated-corridor-length"] not in ["", None]
if self._prepared_epc["unheated-corridor-length"] not in DATA_ANOMALY_MATCHES
else None
)