From 84d4263d9af4de27718237d47b2e92e8d6378004 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 18 Mar 2026 19:17:22 +0000 Subject: [PATCH] removing data --- backend/apis/GoogleSolarApi.py | 8 +- backend/app/db/functions/address_functions.py | 2 +- etl/bill_savings/KwhData.py | 113 +++++++----------- etl/epc/Record.py | 28 +++++ recommendations/Costs.py | 12 +- recommendations/FloorRecommendations.py | 2 +- 6 files changed, 84 insertions(+), 81 deletions(-) diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index bf07b5e5..6fc5daa6 100644 --- a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -182,8 +182,8 @@ class GoogleSolarApi: self.exclude_north_facing_segments(property_instance=property_instance) # If a property is semi-detached, it's possible for us to include segments from an attached unit if property_instance is not None: - if (property_instance.data["built-form"] == "Semi-Detached") and ( - property_instance.data["extension-count"] == 0 + if (property_instance.epc_record.built_form == "Semi-Detached") and ( + property_instance.epc_record.extension_count == 0 ): self.exclude_likely_duplicate_surfaces() @@ -708,7 +708,7 @@ class GoogleSolarApi: # We set the target rating to EPC C, which is the typical EPC rating we would expect the # property to achieve post retrofit of just the fabric "energy_consumption": cls.estimate_new_consumption( - current_energy_efficiency=min(p.data["current-energy-efficiency"], 100), + current_energy_efficiency=min(p.epc_record.current_energy_efficiency, 100), target_efficiency="69", current_consumption=p.estimate_electrical_consumption( assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions @@ -727,7 +727,7 @@ class GoogleSolarApi: # We set the target rating to EPC C, which is the typical EPC rating we would expect the # property to achieve post retrofit of just the fabric "energy_consumption": cls.estimate_new_consumption( - 
current_energy_efficiency=min(int(p.data["current-energy-efficiency"]), 100), + current_energy_efficiency=min(p.epc_record.current_energy_efficiency, 100), target_efficiency="69", current_consumption=p.estimate_electrical_consumption( assumed_ashp_efficiency=assumptions.AVERAGE_ASHP_EFFICIENCY, exclusions=body.exclusions diff --git a/backend/app/db/functions/address_functions.py b/backend/app/db/functions/address_functions.py index 42fcdcfa..dbe2bf46 100644 --- a/backend/app/db/functions/address_functions.py +++ b/backend/app/db/functions/address_functions.py @@ -1,5 +1,5 @@ +from typing import Optional from sqlalchemy.orm import Session -from sqlalchemy.exc import SQLAlchemyError from sqlalchemy import func from backend.app.db.models.addresses import PostcodeSearch from utils.logger import setup_logger diff --git a/etl/bill_savings/KwhData.py b/etl/bill_savings/KwhData.py index dfb0be85..266f4b72 100644 --- a/etl/bill_savings/KwhData.py +++ b/etl/bill_savings/KwhData.py @@ -1,5 +1,4 @@ import re -from dataclasses import fields import pandas as pd import numpy as np from datetime import datetime @@ -15,24 +14,24 @@ logger = setup_logger() class KwhData: - COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"] + COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"] CATEGORICAL_COLUMNS = [ - "lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms", - "number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type", - "built_form", - "construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff", - "walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description", + "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", + "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", + "built-form", + "construction-age-band", 
"secondheat-description", "hotwater-description", "hot-water-energy-eff", + "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description", "county", - "windows_description", "windows_energy_eff", "flat_top_storey", - "flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation", - "low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating", - "floor_level" + "windows-description", "windows-energy-eff", "flat-top-storey", + "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", + "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating", + "floor-level" ] NUMERICAL_COLUMNS = [ - 'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current', - 'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency' + 'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current', + 'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency' ] def __init__(self, bucket=None, read_consumption_data=False): @@ -107,16 +106,6 @@ class KwhData: # If no match is found, return None or raise an exception return None - @staticmethod - def _normalise_epc_keys(data): - if isinstance(data, dict): - return {key.replace("-", "_"): value for key, value in data.items()} - - if isinstance(data, pd.DataFrame): - return data.rename(columns=lambda column: column.replace("-", "_")) - - raise TypeError("Expected dict or DataFrame") - def combine(self): """ Given the data that is collected containing the kwh values for heating and hot water, this method will combine @@ -139,9 +128,9 @@ class KwhData: # We check that the retrieved energy consumption sufficiently matches the EPC data internal_dataset = [] for x in data: - epc_data = self._normalise_epc_keys(x["epc"]) - epc_sap = 
epc_data["current_energy_efficiency"] - epc_potential_sap = epc_data["potential_energy_efficiency"] + epc_data = x["epc"] + epc_sap = epc_data["current-energy-efficiency"] + epc_potential_sap = epc_data["potential-energy-efficiency"] # Make sure this matches the extracted sap if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int( x["potential_epc_efficiency"] @@ -182,7 +171,7 @@ class KwhData: # We also estimate the energy consumption reduction from this data, by band df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"] - consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index() + consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index() df = df.drop(columns=["total_consumption"]) self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet" @@ -214,11 +203,9 @@ class KwhData: # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features # in anticipation of the new model - data = self._normalise_epc_keys(data.copy()) - - data["lodgement_date"] = pd.to_datetime(data["lodgement_date"]) - data["lodgement_year"] = data["lodgement_date"].dt.year - data["lodgement_month"] = data["lodgement_date"].dt.month + data["lodgement-date"] = pd.to_datetime(data["lodgement-date"]) + data["lodgement-year"] = data["lodgement-date"].dt.year + data["lodgement-month"] = data["lodgement-date"].dt.month # For walls, roof, floor description where we have average thermal transmittance, to avoid too many # categories @@ -244,10 +231,8 @@ class KwhData: thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) # Apply the lookup table to the data - for feature in ["walls_description", "roof_description", "floor_description"]: - cleaned_df = pd.DataFrame( - cleaned[feature.replace("_", "-")] - )[["original_description", 
"thermal_transmittance"]] + for feature in ["walls-description", "roof-description", "floor-description"]: + cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]] # Round to 2 decimal places and convert to string cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) @@ -276,10 +261,10 @@ class KwhData: data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str) # Create new features: - data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area'] + data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area'] # Ensure this is string, because we could have mixed types - data["lodgement_datetime"] = data["lodgement_datetime"].astype(str) + data["lodgement-datetime"] = data["lodgement-datetime"].astype(str) if save: self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet" @@ -301,39 +286,29 @@ class KwhData: data is in the format required by the model :return: """ + + epc = p.epc_record.to_dict(case="kebab", source="prepared") numeric_cols = [ - 'current_energy_efficiency', - 'potential_energy_efficiency', 'environment_impact_current', - 'environment_impact_potential', 'energy_consumption_current', - 'energy_consumption_potential', 'co2_emissions_current', - 'co2_emiss_curr_per_floor_area', 'co2_emissions_potential', - 'lighting_cost_current', 'lighting_cost_potential', - 'heating_cost_current', 'heating_cost_potential', - 'hot_water_cost_current', 'hot_water_cost_potential', - 'total_floor_area', 'multi_glaze_proportion', - 'extension_count', 'number_habitable_rooms', 'number_heated_rooms', - 'low_energy_lighting', 'number_open_fireplaces', - 'wind_turbine_count', 'unheated_corridor_length', - 'floor_height', 'photo_supply', 'fixed_lighting_outlets_count', - 'low_energy_fixed_light_count', + 'current-energy-efficiency', + 'potential-energy-efficiency', 
'environment-impact-current', + 'environment-impact-potential', 'energy-consumption-current', + 'energy-consumption-potential', 'co2-emissions-current', + 'co2-emiss-curr-per-floor-area', 'co2-emissions-potential', + 'lighting-cost-current', 'lighting-cost-potential', + 'heating-cost-current', 'heating-cost-potential', + 'hot-water-cost-current', 'hot-water-cost-potential', + 'total-floor-area', 'multi-glaze-proportion', + 'extension-count', 'number-habitable-rooms', 'number-heated-rooms', + 'low-energy-lighting', 'number-open-fireplaces', + 'wind-turbine-count', 'unheated-corridor-length', + 'floor-height', 'photo-supply', 'fixed-lighting-outlets-count', + 'low-energy-fixed-light-count', ] - required_cols = set(numeric_cols + KwhData.CATEGORICAL_COLUMNS + [ - "uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff" - ]) - - epc_record = p.epc_record - available_fields = {field.name for field in fields(epc_record)} - missing_fields = required_cols - available_fields - if missing_fields: - raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}") - - epc = {field_name: getattr(epc_record, field_name) for field_name in required_cols} - for v in numeric_cols: if epc[v] is not None: epc[v] = float(epc[v]) - bools_to_remap = ['mains_gas_flag', 'flat_top_storey'] + bools_to_remap = ['mains-gas-flag', 'flat-top-storey'] bool_map = { True: "Y", False: "N", @@ -345,8 +320,8 @@ class KwhData: epc[v] = bool_map[epc[v]] no_data = { - "floor_level": "NODATA!", - "floor_energy_eff": "NO DATA!" + "floor-level": "NODATA!", + "floor-energy-eff": "NO DATA!" 
} for v, fill_val in no_data.items(): if pd.isnull(epc[v]): @@ -356,8 +331,8 @@ class KwhData: def prepare_epc(self, input_properties: list[Property]): scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties]) - scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year - scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month + scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year + scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month scoring_data["id"] = scoring_data["uprn"].copy() diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 0428542c..10968edc 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -1245,6 +1245,34 @@ class EPCRecord: return self.__dict__[RDSAP_RESPONSE] <= other.__dict__[RDSAP_RESPONSE] + def to_dict( + self, + case: Literal["snake", "kebab"] = "kebab", + source: Literal["prepared", "attributes"] = "prepared", + ) -> dict[str, Any]: + """Return the prepared EPC or the public attributes as a dict with snake/kebab keys; raise ValueError on an unknown option.""" + if source == "prepared": + if self._prepared_epc is None: + raise ValueError("Prepared EPC not available") + data = self._prepared_epc.copy() + + elif source == "attributes": + data = { + k: v for k, v in vars(self).items() + if not k.startswith("_") + } + + else: + raise ValueError(f"Unknown source: {source}") + + if case == "snake": + return {k.replace("-", "_"): v for k, v in data.items()} + + if case == "kebab": + return {k.replace("_", "-"): v for k, v in data.items()} + + raise ValueError(f"Unknown case: {case}") + def get( self, key: str | list[str], diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 5f312f63..2bcc67df 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -200,20 +200,20 @@ class Costs: self.property = property_instance self.regional_labour_variations = regional_labour_variations - self.region = county_to_region_map.get(self.property.data["county"], None) + self.region =
county_to_region_map.get(self.property.epc_record.county, None) if self.region is None: # Try and grab using the local-authority-label - self.region = county_to_region_map.get(self.property.data["local-authority-label"], None) + self.region = county_to_region_map.get(self.property.epc_record.local_authority_label, None) if self.region is None: # Try and get the region after converting the keys to lower self.region = { k.lower(): v for k, v in county_to_region_map.items() - }.get(self.property.data["local-authority-label"].lower(), None) + }.get(self.property.epc_record.local_authority_label.lower(), None) if self.region is None: logger.warning("No region found for county %s, defaulting to South East England", - self.property.data["county"]) + self.property.epc_record.county) self.region = "South East England" self.labour_adjustment_factor = [ @@ -858,8 +858,8 @@ class Costs: n_radiators = self._estimate_n_radiators( number_habitable_rooms=n_rooms, total_floor_area=self.property.floor_area, - property_type=self.property.data["property-type"], - built_form=self.property.data["built-form"] + property_type=self.property.epc_record.property_type, + built_form=self.property.epc_record.built_form ) additionals_labour_cost = labour_rate * self.labour_adjustment_factor diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index 7469031c..df86c497 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -76,7 +76,7 @@ class FloorRecommendations(Definitions): return u_value = self.property.floor["thermal_transmittance"] - property_type = self.property.data["property-type"] + property_type = self.property.epc_record.property_type floor_area = self.property.insulation_floor_area if self.property.floor["another_property_below"] | (self.property.floor["insulation_thickness"] in [