diff --git a/.idea/Model.iml b/.idea/Model.iml
index 1e51ede4..4d94187d 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -6,6 +6,7 @@
+
diff --git a/asset_list/app.py b/asset_list/app.py
index a97bb8e0..8becbd3e 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -73,25 +73,24 @@ def app():
Property UPRN
"""
- data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lifespace Rentals/Missed"
- # data_filename = "For Modelling - Final - reviewed.xlsx"
- data_filename = "Missed Properties - with address.xlsx"
+ data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/March 2026 SAL"
+ data_filename = "Domna System Review - Livewest.xlsx"
sheet_name = "Sheet1"
postcode_column = "Postcode"
- address1_column = "address1"
- address1_method = None
- fulladdress_column = "address1"
+ address1_column = None
+ address1_method = "house_number_extraction"
+ fulladdress_column = "Address"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
- landlord_os_uprn = "UPRN"
- landlord_property_type = "Type"
- landlord_built_form = None
+ landlord_os_uprn = "gov UPRN"
+ landlord_property_type = "AssetType"
+ landlord_built_form = "AssetType"
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
- landlord_property_id = "Reference"
+ landlord_property_id = "landlord_uprn"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
diff --git a/asset_list/utils.py b/asset_list/utils.py
index d83a35f2..9d3ae1b6 100644
--- a/asset_list/utils.py
+++ b/asset_list/utils.py
@@ -173,6 +173,7 @@ def get_data(
errors = []
no_epc = []
for _, home in tqdm(df.iterrows(), total=len(df)):
+
try:
# If we have a block of flats, we cannot retrieve this data
diff --git a/backend/app/db/functions/address_functions.py b/backend/app/db/functions/address_functions.py
index 4b8ad5f2..42fcdcfa 100644
--- a/backend/app/db/functions/address_functions.py
+++ b/backend/app/db/functions/address_functions.py
@@ -20,7 +20,7 @@ def _get_associated_records(results, uprn, uprn_key="UPRN"):
return matched_record
-def get_associated_uprns(postcode_search: PostcodeSearch, uprn: str | int):
+def get_associated_uprns(postcode_search: PostcodeSearch | None, uprn: str | int):
"""
Given a postcode and UPRN, for a remote assessment, fetch all associated UPRNs, based
on parent UPRN. This will be properties in the same building
diff --git a/backend/app/db/models/portfolio.py b/backend/app/db/models/portfolio.py
index 9eb26597..c511b6c9 100644
--- a/backend/app/db/models/portfolio.py
+++ b/backend/app/db/models/portfolio.py
@@ -147,6 +147,10 @@ class PropertyModel(Base):
is_sap_points_adjusted_for_installed_measures = Column(Boolean, default=False)
original_sap_points = Column(Float)
+ # Added for re-scoring. Some of the existing fields are now redundant and will be deleted under a separate ticket.
+ lodged_sap_points = Column(Float)
+ lodged_epc_rating = Column(Enum(Epc))
+
class FeatureRating(enum.Enum):
VERY_GOOD = 5
@@ -253,6 +257,12 @@ class PropertyDetailsEpcModel(Base):
installed_measures_heat_demand_adjustment = Column(Float)
is_epc_adjusted_for_installed_measures = Column(Boolean, default=False)
+ # New columns. Some of the fields associated with "already installed" are now redundant and will be
+ # deleted under a separate ticket.
+ lodged_co2_emissions = Column(Float)
+ lodged_heat_demand = Column(Float)
+ has_been_remodelled = Column(Boolean, default=False)
+
class PropertyDetailsSpatial(Base):
__tablename__ = "property_details_spatial"
diff --git a/backend/engine/engine.py b/backend/engine/engine.py
index 4f698e18..e1e45b47 100644
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@@ -837,41 +837,41 @@ async def model_engine(body: PlanTriggerRequest):
extract_uprn=True
)
- for idx, rebaselined_prediction in rebaselining_response["retrofit-sap-baseline-predictions"].iterrows():
- property_instance = next(p for p in input_properties if p.uprn == int(rebaselined_prediction["uprn"]))
- new_rating = rebaselined_prediction["predictions"]
- new_epc_rating = sap_to_epc(new_rating)
- # Insert
+ # TODO: TEMP: Compare values
+ compare_scores = []
+ for x in rebaselining_scoring_data["uprn"].unique():
+ record = [p for p in input_properties if p.uprn == x][0].epc_record
+ original_sap = record.current_energy_efficiency
+ new_sap = rebaselining_response["retrofit-sap-baseline-predictions"][
+ rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == x
+ ]["predictions"].values[0]
+ lodgement_date = record.lodgement_date
+ compare_scores.append({
+ "uprn": x,
+ "original_sap": original_sap,
+ "new_sap": new_sap,
+ "lodgement_date": lodgement_date
+ })
+ compare_scores = pd.DataFrame(compare_scores)
- # property_instance.data["current-energy-efficiency"] = sap_to_epc(new_rating)
+ for uprn in rebaselining_scoring_data["uprn"].unique():
+ # Get the predictions
+ sap_prediction = rebaselining_response["retrofit-sap-baseline-predictions"][
+ rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == uprn
+ ]["predictions"].values[0]
- addr = [a for a in addresses if a.uprn == property_instance.uprn][0]
- landlord_remapping = {
- "total-floor-area": addr.landlord_total_floor_area_m2, # 1m tolerance on floor area to perform remap
- "property-type": addr.landlord_property_type,
- "built-form": addr.landlord_built_form,
- # Components
- "walls-description": addr.landlord_wall_construction,
- "roof-description": addr.landlord_roof_construction,
- "floor-description": addr.landlord_floor_construction,
- "windows-description": addr.landlord_windows_type,
- "main-fuel": addr.landlord_fuel_type,
- "mainheatcont-description": addr.landlord_heating_controls,
- "hotwater-description": addr.landlord_hot_water_system,
- # Efficiency
- "walls-energy-eff": addr.landlord_wall_efficiency,
- "roof-energy-eff": addr.landlord_roof_efficiency,
- "windows-energy-eff": addr.landlord_windows_efficiency,
- "mainheat-energy-eff": addr.landlord_heating_efficiency,
- "mainheatc-energy-eff": addr.landlord_heating_controls_efficiency,
- "hot-water-energy-eff": addr.landlord_hot_water_efficiency,
- "multi-glaze-proportion": addr.landlord_multi_glaze_proportion * 100, # TODO: Fix this!
- "construction-age-band": addr.landlord_construction_age_band,
- }
+ carbon_prediction = 1337  # NOTE(review): looks like a placeholder, not a model output; it is passed to insert_new_performance_values below - confirm
+ heat_demand_prediction = 1337  # NOTE(review): same constant as carbon_prediction, also passed to insert_new_performance_values below - confirm
- # Insert the re-baselined scores into the property data
- for p in input_properties:
- property_rebaselined_sap = rebaselining_response["retrofit-sap-baseline-predictions"]
+ epc_prediction = sap_to_epc(sap_prediction)
+ # We now need to insert the new values into the epc_record
+ property_instance = next(p for p in input_properties if p.uprn == int(uprn))
+ property_instance.epc_record.insert_new_performance_values(
+ new_sap=sap_prediction,
+ new_epc=epc_prediction,
+ new_carbon=carbon_prediction,
+ new_heat_demand=heat_demand_prediction,
+ )
kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True)
@@ -924,26 +924,6 @@ async def model_engine(body: PlanTriggerRequest):
# We also make a tweak - if the property has been flagged for solar but doesn't contain
# any panel performance, we ensure that we have a 3kWp and 4kWp option for the property
- # TODO: Temp - test re-baselining
- p = input_properties[0]
- p.create_base_difference_epc_record(cleaned_lookup=cleaned)
- scoring_data = p.base_difference_record.df
- # We just need a recent date to trigger the right models,
- # as we are only interested in the deltas
- scoring_data["is_post_sap10_starting"] = True
- # Score model - SAP re-baselining model
- model_api.MODEL_URLS["retrofit-sap-baseline-predictions"] = "sapbaselinemodel"
- model_api.prediction_buckets["retrofit-sap-baseline-predictions"] = "retrofit-sap-baseline-predictions-dev"
- example_response = model_api.predict_all(
- df=scoring_data,
- bucket=get_settings().DATA_BUCKET,
- model_prefixes=["retrofit-sap-baseline-predictions"],
- extract_ids=False
- )
-
- input_properties[0].data["current-energy-efficiency"] = 58.8
- input_properties[0].data["current-energy-rating"] = "D"
-
logger.info("Identifying property recommendations")
recommendations, recommendations_scoring_data, representative_recommendations = {}, [], {}
for p in tqdm(input_properties):
diff --git a/etl/bill_savings/KwhData.py b/etl/bill_savings/KwhData.py
index b4bb979d..dfb0be85 100644
--- a/etl/bill_savings/KwhData.py
+++ b/etl/bill_savings/KwhData.py
@@ -1,4 +1,5 @@
import re
+from dataclasses import fields
import pandas as pd
import numpy as np
from datetime import datetime
@@ -14,24 +15,24 @@ logger = setup_logger()
class KwhData:
- COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
+ COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"]
CATEGORICAL_COLUMNS = [
- "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
- "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
- "built-form",
- "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
- "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
+ "lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms",
+ "number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type",
+ "built_form",
+ "construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff",
+ "walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description",
"county",
- "windows-description", "windows-energy-eff", "flat-top-storey",
- "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
- "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
- "floor-level"
+ "windows_description", "windows_energy_eff", "flat_top_storey",
+ "flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation",
+ "low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating",
+ "floor_level"
]
NUMERICAL_COLUMNS = [
- 'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
- 'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
+ 'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current',
+ 'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency'
]
def __init__(self, bucket=None, read_consumption_data=False):
@@ -106,6 +107,16 @@ class KwhData:
# If no match is found, return None or raise an exception
return None
+    @staticmethod
+    def _normalise_epc_keys(data):
+        """Return ``data`` with "-" replaced by "_" in its dict keys or DataFrame column names."""
+        if isinstance(data, pd.DataFrame):
+            return data.rename(columns=lambda name: name.replace("-", "_"))
+        if isinstance(data, dict):
+            return {name.replace("-", "_"): item for name, item in data.items()}
+        # Any other container is a caller error, so fail loudly instead of guessing
+        raise TypeError("Expected dict or DataFrame")
+
def combine(self):
"""
Given the data that is collected containing the kwh values for heating and hot water, this method will combine
@@ -128,9 +139,9 @@ class KwhData:
# We check that the retrieved energy consumption sufficiently matches the EPC data
internal_dataset = []
for x in data:
- epc_data = x["epc"]
- epc_sap = epc_data["current-energy-efficiency"]
- epc_potential_sap = epc_data["potential-energy-efficiency"]
+ epc_data = self._normalise_epc_keys(x["epc"])
+ epc_sap = epc_data["current_energy_efficiency"]
+ epc_potential_sap = epc_data["potential_energy_efficiency"]
# Make sure this matches the extracted sap
if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
x["potential_epc_efficiency"]
@@ -171,7 +182,7 @@ class KwhData:
# We also estimate the energy consumption reduction from this data, by band
df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
- consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
+ consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index()
df = df.drop(columns=["total_consumption"])
self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
@@ -203,9 +214,11 @@ class KwhData:
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
# in anticipation of the new model
- data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
- data["lodgement-year"] = data["lodgement-date"].dt.year
- data["lodgement-month"] = data["lodgement-date"].dt.month
+ data = self._normalise_epc_keys(data.copy())
+
+ data["lodgement_date"] = pd.to_datetime(data["lodgement_date"])
+ data["lodgement_year"] = data["lodgement_date"].dt.year
+ data["lodgement_month"] = data["lodgement_date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many
# categories
@@ -231,8 +244,10 @@ class KwhData:
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data
- for feature in ["walls-description", "roof-description", "floor-description"]:
- cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
+ for feature in ["walls_description", "roof_description", "floor_description"]:
+ cleaned_df = pd.DataFrame(
+ cleaned[feature.replace("_", "-")]
+ )[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
@@ -261,10 +276,10 @@ class KwhData:
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
# Create new features:
- data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
+ data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area']
# Ensure this is string, because we could have mixed types
- data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)
+ data["lodgement_datetime"] = data["lodgement_datetime"].astype(str)
if save:
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
@@ -286,29 +301,39 @@ class KwhData:
data is in the format required by the model
:return:
"""
-
- epc = p.data.copy()
numeric_cols = [
- 'current-energy-efficiency',
- 'potential-energy-efficiency', 'environment-impact-current',
- 'environment-impact-potential', 'energy-consumption-current',
- 'energy-consumption-potential', 'co2-emissions-current',
- 'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
- 'lighting-cost-current', 'lighting-cost-potential',
- 'heating-cost-current', 'heating-cost-potential',
- 'hot-water-cost-current', 'hot-water-cost-potential',
- 'total-floor-area', 'multi-glaze-proportion',
- 'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
- 'low-energy-lighting', 'number-open-fireplaces',
- 'wind-turbine-count', 'unheated-corridor-length',
- 'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
- 'low-energy-fixed-light-count',
+ 'current_energy_efficiency',
+ 'potential_energy_efficiency', 'environment_impact_current',
+ 'environment_impact_potential', 'energy_consumption_current',
+ 'energy_consumption_potential', 'co2_emissions_current',
+ 'co2_emiss_curr_per_floor_area', 'co2_emissions_potential',
+ 'lighting_cost_current', 'lighting_cost_potential',
+ 'heating_cost_current', 'heating_cost_potential',
+ 'hot_water_cost_current', 'hot_water_cost_potential',
+ 'total_floor_area', 'multi_glaze_proportion',
+ 'extension_count', 'number_habitable_rooms', 'number_heated_rooms',
+ 'low_energy_lighting', 'number_open_fireplaces',
+ 'wind_turbine_count', 'unheated_corridor_length',
+ 'floor_height', 'photo_supply', 'fixed_lighting_outlets_count',
+ 'low_energy_fixed_light_count',
]
+        # lodgement_year/lodgement_month are derived from lodgement_date in prepare_epc, so they are not
+        # read from the record; mains_gas_flag is read because bools_to_remap below refers to it
+        derived_cols = {"lodgement_year", "lodgement_month"}
+        required_cols = (set(numeric_cols + KwhData.CATEGORICAL_COLUMNS) - derived_cols) | {
+            "uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff", "mains_gas_flag",
+        }
+        epc_record = p.epc_record
+        missing_fields = required_cols - {field.name for field in fields(epc_record)}
+        if missing_fields:
+            raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}")
+        epc = {field_name: getattr(epc_record, field_name) for field_name in sorted(required_cols)}
+
for v in numeric_cols:
if epc[v] is not None:
epc[v] = float(epc[v])
- bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
+ bools_to_remap = ['mains_gas_flag', 'flat_top_storey']
bool_map = {
True: "Y",
False: "N",
@@ -320,8 +345,8 @@ class KwhData:
epc[v] = bool_map[epc[v]]
no_data = {
- "floor-level": "NODATA!",
- "floor-energy-eff": "NO DATA!"
+ "floor_level": "NODATA!",
+ "floor_energy_eff": "NO DATA!"
}
for v, fill_val in no_data.items():
if pd.isnull(epc[v]):
@@ -331,8 +356,8 @@ class KwhData:
def prepare_epc(self, input_properties: list[Property]):
scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
- scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
- scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month
+ scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year
+ scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month
scoring_data["id"] = scoring_data["uprn"].copy()
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 84d4d19a..0428542c 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -309,6 +309,7 @@ class EPCRecord:
# Indicates if the EPC record has been predicted. By default, false
estimated: Optional[bool] = False
sap_05_overwritten: Optional[bool] = False
+ has_been_remodelled: Optional[bool] = False
# ------------------------------------------------------------------
# MODEL FLAGS
@@ -386,6 +387,35 @@ class EPCRecord:
return
+    def insert_new_performance_values(
+        self, new_sap: float, new_epc: str, new_carbon: float, new_heat_demand: float,
+    ):
+        """
+        Overwrite this record's current performance values with re-modelled ones and flag the record as
+        re-modelled. All fields are validated first, so a failure leaves the record and its flag untouched.
+        :param new_sap: written to current_energy_efficiency
+        :param new_epc: written to current_energy_rating (presumably a band such as "D" - confirm)
+        :param new_carbon: written to co2_emissions_current
+        :param new_heat_demand: written to energy_consumption_current
+        :raises ValueError: if a target field is missing from the prepared EPC
+        """
+        update_data = {
+            "current_energy_efficiency": new_sap,
+            "current_energy_rating": new_epc,
+            "co2_emissions_current": new_carbon,
+            "energy_consumption_current": new_heat_demand,
+        }
+        resolved = {}
+        for k, value in update_data.items():
+            # Other methods of this class read hyphenated keys from the prepared EPC; accept either form
+            key = k if k in self._prepared_epc else k.replace("_", "-")
+            if key not in self._prepared_epc:
+                raise ValueError(f"Attempting to update unknown field '{k}' in prepared EPC")
+            resolved[key] = value
+        self._prepared_epc.update(resolved)
+        self._expand_prepared_epc_to_attributes()
+        self.has_been_remodelled = True
+
def _apply_averages_cleaning(self) -> None:
"""
Fills missing property dimension values using medians from cleaning_data.
@@ -626,6 +656,10 @@ class EPCRecord:
# Ignore keys that are not part of the dataclass schema
continue
+ if value is None:
+ setattr(self, key, None)
+ continue
+
try:
cast_value = self._cast_value(value, field_map[key].type)
setattr(self, key, cast_value)
@@ -812,14 +846,17 @@ class EPCRecord:
(property_dimensions["PROPERTY_TYPE"] == self._prepared_epc["property-type"])
]
- if self.construction_age_band not in DATA_ANOMALY_MATCHES:
+ if (
+ (self.construction_age_band not in DATA_ANOMALY_MATCHES) and
+ (self.construction_age_band in result["CONSTRUCTION_AGE_BAND"].values)
+ ):
result = result[
(result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)
]
if (
self._prepared_epc["built-form"] not in DATA_ANOMALY_MATCHES
- and self._prepared_epc["built-form"] in result["BUILT_FORM"]
+ and self._prepared_epc["built-form"] in result["BUILT_FORM"].values
):
result = result[(result["BUILT_FORM"] == self._prepared_epc["built-form"])]
@@ -935,7 +972,7 @@ class EPCRecord:
self._prepared_epc["unheated-corridor-length"] = (
float(self._prepared_epc["unheated-corridor-length"])
- if self._prepared_epc["unheated-corridor-length"] not in ["", None]
+ if self._prepared_epc["unheated-corridor-length"] not in DATA_ANOMALY_MATCHES
else None
)