removing data

This commit is contained in:
Khalim Conn-Kowlessar 2026-03-18 19:17:22 +00:00
parent f45260706e
commit 84d4263d9a
6 changed files with 84 additions and 81 deletions

View file

@ -182,8 +182,8 @@ class GoogleSolarApi:
self.exclude_north_facing_segments(property_instance=property_instance) self.exclude_north_facing_segments(property_instance=property_instance)
# If a property is semi-detached, it's possible for us to include segments from an attached unit # If a property is semi-detached, it's possible for us to include segments from an attached unit
if property_instance is not None: if property_instance is not None:
if (property_instance.data["built-form"] == "Semi-Detached") and ( if (property_instance.epc_record.built_form == "Semi-Detached") and (
property_instance.data["extension-count"] == 0 property_instance.epc_record.extension_count == 0
): ):
self.exclude_likely_duplicate_surfaces() self.exclude_likely_duplicate_surfaces()
@ -708,7 +708,7 @@ class GoogleSolarApi:
# We set the target rating to EPC C, which is the typical EPC rating we would expect the # We set the target rating to EPC C, which is the typical EPC rating we would expect the
# property to achieve post retrofit of just the fabric # property to achieve post retrofit of just the fabric
"energy_consumption": cls.estimate_new_consumption( "energy_consumption": cls.estimate_new_consumption(
current_energy_efficiency=min(p.data["current-energy-efficiency"], 100), current_energy_efficiency=min(p.epc_record.current_energy_efficiency, 100),
target_efficiency="69", target_efficiency="69",
current_consumption=p.estimate_electrical_consumption( current_consumption=p.estimate_electrical_consumption(
assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions
@ -727,7 +727,7 @@ class GoogleSolarApi:
# We set the target rating to EPC C, which is the typical EPC rating we would expect the # We set the target rating to EPC C, which is the typical EPC rating we would expect the
# property to achieve post retrofit of just the fabric # property to achieve post retrofit of just the fabric
"energy_consumption": cls.estimate_new_consumption( "energy_consumption": cls.estimate_new_consumption(
current_energy_efficiency=min(int(p.data["current-energy-efficiency"]), 100), current_energy_efficiency=min(p.epc_record.current_energy_efficiency, 100),
target_efficiency="69", target_efficiency="69",
current_consumption=p.estimate_electrical_consumption( current_consumption=p.estimate_electrical_consumption(
assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions

View file

@ -1,5 +1,5 @@
from typing import Optional
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import func from sqlalchemy import func
from backend.app.db.models.addresses import PostcodeSearch from backend.app.db.models.addresses import PostcodeSearch
from utils.logger import setup_logger from utils.logger import setup_logger

View file

@ -1,5 +1,4 @@
import re import re
from dataclasses import fields
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from datetime import datetime from datetime import datetime
@ -15,24 +14,24 @@ logger = setup_logger()
class KwhData: class KwhData:
COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"] COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
CATEGORICAL_COLUMNS = [ CATEGORICAL_COLUMNS = [
"lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms", "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type", "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
"built_form", "built-form",
"construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff", "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
"walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description", "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
"county", "county",
"windows_description", "windows_energy_eff", "flat_top_storey", "windows-description", "windows-energy-eff", "flat-top-storey",
"flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation", "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating", "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
"floor_level" "floor-level"
] ]
NUMERICAL_COLUMNS = [ NUMERICAL_COLUMNS = [
'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current', 'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency' 'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
] ]
def __init__(self, bucket=None, read_consumption_data=False): def __init__(self, bucket=None, read_consumption_data=False):
@ -107,16 +106,6 @@ class KwhData:
# If no match is found, return None or raise an exception # If no match is found, return None or raise an exception
return None return None
@staticmethod
def _normalise_epc_keys(data):
    """
    Return the EPC data with every hyphen in its keys replaced by an underscore.

    :param data: Either a dict (keys are rewritten) or a pandas DataFrame (column labels are rewritten).
    :return: A new dict or DataFrame; values are passed through unchanged.
    :raises TypeError: If data is neither a dict nor a DataFrame.
    """
    def to_snake(label):
        # Only string labels can contain a hyphen. Anything else (e.g. an integer column label) is
        # left untouched instead of failing on a missing .replace method.
        return label.replace("-", "_") if isinstance(label, str) else label

    if isinstance(data, dict):
        return {to_snake(key): value for key, value in data.items()}
    if isinstance(data, pd.DataFrame):
        return data.rename(columns=to_snake)
    raise TypeError("Expected dict or DataFrame")
def combine(self): def combine(self):
""" """
Given the data that is collected containing the kwh values for heating and hot water, this method will combine Given the data that is collected containing the kwh values for heating and hot water, this method will combine
@ -139,9 +128,9 @@ class KwhData:
# We check that the retrieved energy consumption sufficiently matches the EPC data # We check that the retrieved energy consumption sufficiently matches the EPC data
internal_dataset = [] internal_dataset = []
for x in data: for x in data:
epc_data = self._normalise_epc_keys(x["epc"]) epc_data = x["epc"]
epc_sap = epc_data["current_energy_efficiency"] epc_sap = epc_data["current-energy-efficiency"]
epc_potential_sap = epc_data["potential_energy_efficiency"] epc_potential_sap = epc_data["potential-energy-efficiency"]
# Make sure this matches the extracted sap # Make sure this matches the extracted sap
if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int( if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
x["potential_epc_efficiency"] x["potential_epc_efficiency"]
@ -182,7 +171,7 @@ class KwhData:
# We also estimate the energy consumption reduction from this data, by band # We also estimate the energy consumption reduction from this data, by band
df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"] df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index() consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
df = df.drop(columns=["total_consumption"]) df = df.drop(columns=["total_consumption"])
self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet" self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
@ -214,11 +203,9 @@ class KwhData:
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
# in anticipation of the new model # in anticipation of the new model
data = self._normalise_epc_keys(data.copy()) data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
data["lodgement-year"] = data["lodgement-date"].dt.year
data["lodgement_date"] = pd.to_datetime(data["lodgement_date"]) data["lodgement-month"] = data["lodgement-date"].dt.month
data["lodgement_year"] = data["lodgement_date"].dt.year
data["lodgement_month"] = data["lodgement_date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many # For walls, roof, floor description where we have average thermal transmittance, to avoid too many
# categories # categories
@ -244,10 +231,8 @@ class KwhData:
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data # Apply the lookup table to the data
for feature in ["walls_description", "roof_description", "floor_description"]: for feature in ["walls-description", "roof-description", "floor-description"]:
cleaned_df = pd.DataFrame( cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
cleaned[feature.replace("_", "-")]
)[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string # Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
@ -276,10 +261,10 @@ class KwhData:
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str) data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
# Create new features: # Create new features:
data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area'] data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
# Ensure this is string, because we could have mixed types # Ensure this is string, because we could have mixed types
data["lodgement_datetime"] = data["lodgement_datetime"].astype(str) data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)
if save: if save:
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet" self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
@ -301,39 +286,29 @@ class KwhData:
data is in the format required by the model data is in the format required by the model
:return: :return:
""" """
epc = p.epc_record.to_dict(case="kebab", source="prepared")
numeric_cols = [ numeric_cols = [
'current_energy_efficiency', 'current-energy-efficiency',
'potential_energy_efficiency', 'environment_impact_current', 'potential-energy-efficiency', 'environment-impact-current',
'environment_impact_potential', 'energy_consumption_current', 'environment-impact-potential', 'energy-consumption-current',
'energy_consumption_potential', 'co2_emissions_current', 'energy-consumption-potential', 'co2-emissions-current',
'co2_emiss_curr_per_floor_area', 'co2_emissions_potential', 'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
'lighting_cost_current', 'lighting_cost_potential', 'lighting-cost-current', 'lighting-cost-potential',
'heating_cost_current', 'heating_cost_potential', 'heating-cost-current', 'heating-cost-potential',
'hot_water_cost_current', 'hot_water_cost_potential', 'hot-water-cost-current', 'hot-water-cost-potential',
'total_floor_area', 'multi_glaze_proportion', 'total-floor-area', 'multi-glaze-proportion',
'extension_count', 'number_habitable_rooms', 'number_heated_rooms', 'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
'low_energy_lighting', 'number_open_fireplaces', 'low-energy-lighting', 'number-open-fireplaces',
'wind_turbine_count', 'unheated_corridor_length', 'wind-turbine-count', 'unheated-corridor-length',
'floor_height', 'photo_supply', 'fixed_lighting_outlets_count', 'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
'low_energy_fixed_light_count', 'low-energy-fixed-light-count',
] ]
required_cols = set(numeric_cols + KwhData.CATEGORICAL_COLUMNS + [
"uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff"
])
epc_record = p.epc_record
available_fields = {field.name for field in fields(epc_record)}
missing_fields = required_cols - available_fields
if missing_fields:
raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}")
epc = {field_name: getattr(epc_record, field_name) for field_name in required_cols}
for v in numeric_cols: for v in numeric_cols:
if epc[v] is not None: if epc[v] is not None:
epc[v] = float(epc[v]) epc[v] = float(epc[v])
bools_to_remap = ['mains_gas_flag', 'flat_top_storey'] bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
bool_map = { bool_map = {
True: "Y", True: "Y",
False: "N", False: "N",
@ -345,8 +320,8 @@ class KwhData:
epc[v] = bool_map[epc[v]] epc[v] = bool_map[epc[v]]
no_data = { no_data = {
"floor_level": "NODATA!", "floor-level": "NODATA!",
"floor_energy_eff": "NO DATA!" "floor-energy-eff": "NO DATA!"
} }
for v, fill_val in no_data.items(): for v, fill_val in no_data.items():
if pd.isnull(epc[v]): if pd.isnull(epc[v]):
@ -356,8 +331,8 @@ class KwhData:
def prepare_epc(self, input_properties: list[Property]): def prepare_epc(self, input_properties: list[Property]):
scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties]) scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month
scoring_data["id"] = scoring_data["uprn"].copy() scoring_data["id"] = scoring_data["uprn"].copy()

View file

@ -1245,6 +1245,34 @@ class EPCRecord:
return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE] return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE]
def to_dict(
    self,
    case: Literal["snake", "kebab"] = "kebab",
    source: Literal["prepared", "attributes"] = "prepared",
) -> dict[str, Any]:
    """
    Return the record as a plain dict with its keys rewritten to the requested case.

    :param case: "snake" replaces every "-" in a key with "_"; "kebab" replaces every "_" with "-".
    :param source: "prepared" takes a copy of ``self._prepared_epc``; "attributes" collects the
        instance attributes whose names do not start with an underscore.
    :return: A new dict; values are passed through unchanged.
    :raises ValueError: If ``case`` or ``source`` is not one of the supported values, or if
        ``source`` is "prepared" and ``self._prepared_epc`` is None.
    """
    # (character to replace, replacement) for each supported key style
    separators = {"snake": ("-", "_"), "kebab": ("_", "-")}
    # Reject an unsupported case the same way an unsupported source is rejected, rather than
    # silently handing back keys in whatever case they happened to be stored in.
    if case not in separators:
        raise ValueError(f"Unknown case: {case}")

    if source == "prepared":
        if self._prepared_epc is None:
            raise ValueError("Prepared EPC not available")
        data = self._prepared_epc.copy()
    elif source == "attributes":
        data = {
            name: value for name, value in vars(self).items()
            if not name.startswith("_")
        }
    else:
        raise ValueError(f"Unknown source: {source}")

    old, new = separators[case]
    return {key.replace(old, new): value for key, value in data.items()}
def get( def get(
self, self,
key: str | list[str], key: str | list[str],

View file

@ -200,20 +200,20 @@ class Costs:
self.property = property_instance self.property = property_instance
self.regional_labour_variations = regional_labour_variations self.regional_labour_variations = regional_labour_variations
self.region = county_to_region_map.get(self.property.data["county"], None) self.region = county_to_region_map.get(self.property.epc_record.county, None)
if self.region is None: if self.region is None:
# Try and grab using the local-authority-label # Try and grab using the local-authority-label
self.region = county_to_region_map.get(self.property.data["local-authority-label"], None) self.region = county_to_region_map.get(self.property.epc_record.local_authority_label, None)
if self.region is None: if self.region is None:
# Try and get the region after converting the keys to lower # Try and get the region after converting the keys to lower
self.region = { self.region = {
k.lower(): v for k, v in county_to_region_map.items() k.lower(): v for k, v in county_to_region_map.items()
}.get(self.property.data["local-authority-label"].lower(), None) }.get(self.property.epc_record.local_authority_label.lower(), None)
if self.region is None: if self.region is None:
logger.warning("No region found for county %s, defaulting to South East England", logger.warning("No region found for county %s, defaulting to South East England",
self.property.data["county"]) self.property.epc_record.county)
self.region = "South East England" self.region = "South East England"
self.labour_adjustment_factor = [ self.labour_adjustment_factor = [
@ -858,8 +858,8 @@ class Costs:
n_radiators = self._estimate_n_radiators( n_radiators = self._estimate_n_radiators(
number_habitable_rooms=n_rooms, number_habitable_rooms=n_rooms,
total_floor_area=self.property.floor_area, total_floor_area=self.property.floor_area,
property_type=self.property.data["property-type"], property_type=self.property.epc_record.property_type,
built_form=self.property.data["built-form"] built_form=self.property.epc_record.built_form
) )
additionals_labour_cost = labour_rate * self.labour_adjustment_factor additionals_labour_cost = labour_rate * self.labour_adjustment_factor

View file

@ -76,7 +76,7 @@ class FloorRecommendations(Definitions):
return return
u_value = self.property.floor["thermal_transmittance"] u_value = self.property.floor["thermal_transmittance"]
property_type = self.property.data["property-type"] property_type = self.property.epc_record.property_type
floor_area = self.property.insulation_floor_area floor_area = self.property.insulation_floor_area
if self.property.floor["another_property_below"] | (self.property.floor["insulation_thickness"] in [ if self.property.floor["another_property_below"] | (self.property.floor["insulation_thickness"] in [