removing data

This commit is contained in:
Khalim Conn-Kowlessar 2026-03-18 19:17:22 +00:00
parent f45260706e
commit 84d4263d9a
6 changed files with 84 additions and 81 deletions

View file

@ -182,8 +182,8 @@ class GoogleSolarApi:
self.exclude_north_facing_segments(property_instance=property_instance)
# If a property is semi-detached, it's possible for us to include segments from an attached unit
if property_instance is not None:
if (property_instance.data["built-form"] == "Semi-Detached") and (
property_instance.data["extension-count"] == 0
if (property_instance.epc_record.built_form == "Semi-Detached") and (
property_instance.epc_record.extension_count == 0
):
self.exclude_likely_duplicate_surfaces()
@ -708,7 +708,7 @@ class GoogleSolarApi:
# We set the target rating to EPC C, which is the typical EPC rating we would expect the
# property to achieve post retrofit of just the fabric
"energy_consumption": cls.estimate_new_consumption(
current_energy_efficiency=min(p.data["current-energy-efficiency"], 100),
current_energy_efficiency=min(p.epc_record.current_energy_efficiency, 100),
target_efficiency="69",
current_consumption=p.estimate_electrical_consumption(
assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions
@ -727,7 +727,7 @@ class GoogleSolarApi:
# We set the target rating to EPC C, which is the typical EPC rating we would expect the
# property to achieve post retrofit of just the fabric
"energy_consumption": cls.estimate_new_consumption(
current_energy_efficiency=min(int(p.data["current-energy-efficiency"]), 100),
current_energy_efficiency=min(p.epc_record.current_energy_efficiency, 100),
target_efficiency="69",
current_consumption=p.estimate_electrical_consumption(
assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions

View file

@ -1,5 +1,5 @@
from typing import Optional
from sqlalchemy.orm import Session
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import func
from backend.app.db.models.addresses import PostcodeSearch
from utils.logger import setup_logger

View file

@ -1,5 +1,4 @@
import re
from dataclasses import fields
import pandas as pd
import numpy as np
from datetime import datetime
@ -15,24 +14,24 @@ logger = setup_logger()
class KwhData:
COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"]
COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
CATEGORICAL_COLUMNS = [
"lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms",
"number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type",
"built_form",
"construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff",
"walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description",
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
"built-form",
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
"county",
"windows_description", "windows_energy_eff", "flat_top_storey",
"flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation",
"low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating",
"floor_level"
"windows-description", "windows-energy-eff", "flat-top-storey",
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
"floor-level"
]
NUMERICAL_COLUMNS = [
'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current',
'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency'
'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
]
def __init__(self, bucket=None, read_consumption_data=False):
@ -107,16 +106,6 @@ class KwhData:
# If no match is found, return None or raise an exception
return None
# Helper that rewrites kebab-case EPC keys to snake_case by replacing every "-" with "_".
# Accepts either a dict (keys are rewritten) or a pandas DataFrame (column labels are rewritten);
# any other input type raises TypeError.
# NOTE(review): keys / column labels are assumed to be strings, since `.replace` is called on each
# one - confirm against callers.
@staticmethod
def _normalise_epc_keys(data):
if isinstance(data, dict):
# Build a new dict with renamed keys; values are carried over unchanged.
return {key.replace("-", "_"): value for key, value in data.items()}
if isinstance(data, pd.DataFrame):
# Return the result of DataFrame.rename applied to the column labels.
return data.rename(columns=lambda column: column.replace("-", "_"))
# Neither a dict nor a DataFrame: reject explicitly.
raise TypeError("Expected dict or DataFrame")
def combine(self):
"""
Given the data that is collected containing the kwh values for heating and hot water, this method will combine
@ -139,9 +128,9 @@ class KwhData:
# We check that the retrieved energy consumption sufficiently matches the EPC data
internal_dataset = []
for x in data:
epc_data = self._normalise_epc_keys(x["epc"])
epc_sap = epc_data["current_energy_efficiency"]
epc_potential_sap = epc_data["potential_energy_efficiency"]
epc_data = x["epc"]
epc_sap = epc_data["current-energy-efficiency"]
epc_potential_sap = epc_data["potential-energy-efficiency"]
# Make sure this matches the extracted sap
if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
x["potential_epc_efficiency"]
@ -182,7 +171,7 @@ class KwhData:
# We also estimate the energy consumption reduction from this data, by band
df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index()
consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
df = df.drop(columns=["total_consumption"])
self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
@ -214,11 +203,9 @@ class KwhData:
# TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
# in anticipation of the new model
data = self._normalise_epc_keys(data.copy())
data["lodgement_date"] = pd.to_datetime(data["lodgement_date"])
data["lodgement_year"] = data["lodgement_date"].dt.year
data["lodgement_month"] = data["lodgement_date"].dt.month
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
data["lodgement-year"] = data["lodgement-date"].dt.year
data["lodgement-month"] = data["lodgement-date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many
# categories
@ -244,10 +231,8 @@ class KwhData:
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data
for feature in ["walls_description", "roof_description", "floor_description"]:
cleaned_df = pd.DataFrame(
cleaned[feature.replace("_", "-")]
)[["original_description", "thermal_transmittance"]]
for feature in ["walls-description", "roof-description", "floor-description"]:
cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
@ -276,10 +261,10 @@ class KwhData:
data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)
# Create new features:
data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area']
data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
# Ensure this is string, because we could have mixed types
data["lodgement_datetime"] = data["lodgement_datetime"].astype(str)
data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)
if save:
self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
@ -301,39 +286,29 @@ class KwhData:
data is in the format required by the model
:return:
"""
epc = p.epc_record.to_dict(case="kebab", source="prepared")
numeric_cols = [
'current_energy_efficiency',
'potential_energy_efficiency', 'environment_impact_current',
'environment_impact_potential', 'energy_consumption_current',
'energy_consumption_potential', 'co2_emissions_current',
'co2_emiss_curr_per_floor_area', 'co2_emissions_potential',
'lighting_cost_current', 'lighting_cost_potential',
'heating_cost_current', 'heating_cost_potential',
'hot_water_cost_current', 'hot_water_cost_potential',
'total_floor_area', 'multi_glaze_proportion',
'extension_count', 'number_habitable_rooms', 'number_heated_rooms',
'low_energy_lighting', 'number_open_fireplaces',
'wind_turbine_count', 'unheated_corridor_length',
'floor_height', 'photo_supply', 'fixed_lighting_outlets_count',
'low_energy_fixed_light_count',
'current-energy-efficiency',
'potential-energy-efficiency', 'environment-impact-current',
'environment-impact-potential', 'energy-consumption-current',
'energy-consumption-potential', 'co2-emissions-current',
'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
'lighting-cost-current', 'lighting-cost-potential',
'heating-cost-current', 'heating-cost-potential',
'hot-water-cost-current', 'hot-water-cost-potential',
'total-floor-area', 'multi-glaze-proportion',
'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
'low-energy-lighting', 'number-open-fireplaces',
'wind-turbine-count', 'unheated-corridor-length',
'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
'low-energy-fixed-light-count',
]
required_cols = set(numeric_cols + KwhData.CATEGORICAL_COLUMNS + [
"uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff"
])
epc_record = p.epc_record
available_fields = {field.name for field in fields(epc_record)}
missing_fields = required_cols - available_fields
if missing_fields:
raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}")
epc = {field_name: getattr(epc_record, field_name) for field_name in required_cols}
for v in numeric_cols:
if epc[v] is not None:
epc[v] = float(epc[v])
bools_to_remap = ['mains_gas_flag', 'flat_top_storey']
bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
bool_map = {
True: "Y",
False: "N",
@ -345,8 +320,8 @@ class KwhData:
epc[v] = bool_map[epc[v]]
no_data = {
"floor_level": "NODATA!",
"floor_energy_eff": "NO DATA!"
"floor-level": "NODATA!",
"floor-energy-eff": "NO DATA!"
}
for v, fill_val in no_data.items():
if pd.isnull(epc[v]):
@ -356,8 +331,8 @@ class KwhData:
def prepare_epc(self, input_properties: list[Property]):
scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year
scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month
scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month
scoring_data["id"] = scoring_data["uprn"].copy()

View file

@ -1245,6 +1245,34 @@ class EPCRecord:
return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE]
def to_dict(
    self,
    case: Literal["snake", "kebab"] = "kebab",
    source: Literal["prepared", "attributes"] = "prepared",
) -> dict[str, Any]:
    """Export this record as a plain dict whose keys share one casing style.

    Args:
        case: ``"kebab"`` rewrites every ``_`` in a key to ``-``;
            ``"snake"`` rewrites every ``-`` in a key to ``_``.
        source: ``"prepared"`` exports the entries of ``self._prepared_epc``;
            ``"attributes"`` exports the instance attributes whose names do
            not start with ``_``.

    Returns:
        A newly built dict; the underlying mapping / instance is not modified.

    Raises:
        ValueError: if ``case`` or ``source`` is not one of the supported
            values, or if ``source="prepared"`` is requested while
            ``self._prepared_epc`` is ``None``.
    """
    # (old separator, new separator) for each supported key style.
    separators = {"snake": ("-", "_"), "kebab": ("_", "-")}
    # Reject an unknown style up front instead of silently returning keys in
    # whatever mixed casing the source happens to use; this mirrors the
    # handling of an unknown ``source`` below.
    if case not in separators:
        raise ValueError(f"Unknown case: {case}")

    if source == "prepared":
        if self._prepared_epc is None:
            raise ValueError("Prepared EPC not available")
        # No defensive copy is needed: the comprehension below builds a new dict.
        data = self._prepared_epc
    elif source == "attributes":
        # Private attributes (leading underscore) are internal state, not EPC fields.
        data = {k: v for k, v in vars(self).items() if not k.startswith("_")}
    else:
        raise ValueError(f"Unknown source: {source}")

    old, new = separators[case]
    return {k.replace(old, new): v for k, v in data.items()}
def get(
self,
key: str | list[str],

View file

@ -200,20 +200,20 @@ class Costs:
self.property = property_instance
self.regional_labour_variations = regional_labour_variations
self.region = county_to_region_map.get(self.property.data["county"], None)
self.region = county_to_region_map.get(self.property.epc_record.county, None)
if self.region is None:
# Try and grab using the local-authority-label
self.region = county_to_region_map.get(self.property.data["local-authority-label"], None)
self.region = county_to_region_map.get(self.property.epc_record.local_authority_label, None)
if self.region is None:
# Try and get the region after converting the keys to lower
self.region = {
k.lower(): v for k, v in county_to_region_map.items()
}.get(self.property.data["local-authority-label"].lower(), None)
}.get(self.property.epc_record.local_authority_label.lower(), None)
if self.region is None:
logger.warning("No region found for county %s, defaulting to South East England",
self.property.data["county"])
self.property.epc_record.county)
self.region = "South East England"
self.labour_adjustment_factor = [
@ -858,8 +858,8 @@ class Costs:
n_radiators = self._estimate_n_radiators(
number_habitable_rooms=n_rooms,
total_floor_area=self.property.floor_area,
property_type=self.property.data["property-type"],
built_form=self.property.data["built-form"]
property_type=self.property.epc_record.property_type,
built_form=self.property.epc_record.built_form
)
additionals_labour_cost = labour_rate * self.labour_adjustment_factor

View file

@ -76,7 +76,7 @@ class FloorRecommendations(Definitions):
return
u_value = self.property.floor["thermal_transmittance"]
property_type = self.property.data["property-type"]
property_type = self.property.epc_record.property_type
floor_area = self.property.insulation_floor_area
if self.property.floor["another_property_below"] | (self.property.floor["insulation_thickness"] in [