Improve basic typing of EPCRecord

This commit is contained in:
Khalim Conn-Kowlessar 2026-03-06 14:42:43 +00:00
parent fb2a69faff
commit 8f0cd7f98c

View file

@ -1,4 +1,4 @@
from typing import Optional, get_origin, get_args, TypedDict, Dict from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias
from dataclasses import fields from dataclasses import fields
from datetime import datetime from datetime import datetime
from dataclasses import dataclass from dataclasses import dataclass
@ -45,11 +45,15 @@ DATA_BUCKET = os.environ.get(
pd.set_option("future.no_silent_downcasting", True) pd.set_option("future.no_silent_downcasting", True)
RawEpcRow: TypeAlias = dict[str, str | None]
PreparedEpcValue: TypeAlias = str | int | float | bool | None
PreparedEpcRow: TypeAlias = dict[str, PreparedEpcValue]
class InputEpcRecords(TypedDict): class InputEpcRecords(TypedDict):
original_epc: Dict[str, Any] original_epc: RawEpcRow
full_sap_epc: Dict[str, Any] full_sap_epc: RawEpcRow
old_data: List[Dict[str, Any]] old_data: list[RawEpcRow]
@dataclass @dataclass
@ -231,22 +235,33 @@ class EPCRecord:
run_mode: str = "training" run_mode: str = "training"
# ------------------------------------------------------------------
# INPUT DATA STRUCTURES
# ------------------------------------------------------------------
epc_records: Optional[InputEpcRecords] = None epc_records: Optional[InputEpcRecords] = None
full_sap_epc: Optional[dict] = None # Raw EPC input (immutable)
old_data: list[dict] = None original_epc: Optional[RawEpcRow] = None
original_epc: Optional[dict] = None
prepared_epc: Optional[dict] = None # Working dictionary that gets cleaned
prepared_epc: Optional[PreparedEpcRow] = None
# Supporting
full_sap_epc: Optional[RawEpcRow] = None
old_data: Optional[list[RawEpcRow]] = None
# Metadata generated during processing
prepared_epc_delta_metadata: pd.DataFrame = None prepared_epc_delta_metadata: pd.DataFrame = None
cleaning_data: pd.DataFrame = None cleaning_data: pd.DataFrame = None
# Not used in training mode but used in newdata mode # Not used in training mode but used in newdata mode
age_band: str = None age_band: Optional[str] = None
construction_age_band: str = None construction_age_band: Optional[str] = None
year_built: int = None year_built: Optional[int] = None
number_of_floors: int = None number_of_floors: Optional[int] = None
number_of_open_fireplaces: int = None number_of_open_fireplaces: Optional[int] = None
heat_loss_corridor_bool: bool = None heat_loss_corridor_bool: Optional[bool] = None
solar_water_heating_flag_bool: bool = None solar_water_heating_flag_bool: Optional[bool] = None
def __post_init__(self): def __post_init__(self):
# We can have validation and cleaning steps for each of the fields # We can have validation and cleaning steps for each of the fields
@ -255,15 +270,18 @@ class EPCRecord:
if self.run_mode == "training": if self.run_mode == "training":
self.validation_configuration = EPCRecordValidationConfiguration self.validation_configuration = EPCRecordValidationConfiguration
# self._field_validation()
return return
# We are running in newdata mode # We are running in newdata mode
if self.epc_records is None: if self.epc_records is None:
raise ValueError("Must provide epc records if running in newdata mode") raise ValueError("Must provide epc records if running in newdata mode")
self.prepared_epc = self.epc_records["original_epc"] # Immutable copy; raw record
self.original_epc = self.epc_records["original_epc"].copy() self.original_epc = self.epc_records["original_epc"].copy()
# Working copy that we will clean and manipulate
self.prepared_epc = self.epc_records["original_epc"].copy()
self.full_sap_epc = self.epc_records["full_sap_epc"] self.full_sap_epc = self.epc_records["full_sap_epc"]
self.old_data = self.epc_records["old_data"] self.old_data = self.epc_records["old_data"]
@ -299,9 +317,12 @@ class EPCRecord:
) )
epc_data_processor.prepare_data() epc_data_processor.prepare_data()
self.prepared_epc = epc_data_processor.data.to_dict(orient="records")[0] record = epc_data_processor.data.to_dict(orient="records")[0]
def _cast_value(self, value, type_hint): self.prepared_epc = cast(RawEpcRow, record)
@staticmethod
def _cast_value(value, type_hint):
origin = get_origin(type_hint) origin = get_origin(type_hint)
args = get_args(type_hint) args = get_args(type_hint)
@ -396,14 +417,6 @@ class EPCRecord:
self._clean_constituency() self._clean_constituency()
self._clean_new_build_descriptions() self._clean_new_build_descriptions()
# self._clean_potential_energy_efficiency()
# self._clean_environment_impact_potential()
# self._clean_energy_consumption_potential()
# self._clean_co2_emissions_potential()
# self._clean_current_energy_efficiency()
# self._clean_energy_consumption_current()
# self._clean_co2_emissions_current()
def epc_record_as_dataframe( def epc_record_as_dataframe(
self, self,
epc_type: str = "prepared_epc", epc_type: str = "prepared_epc",
@ -524,9 +537,7 @@ class EPCRecord:
cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0] cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0]
) )
else: else:
self.prepared_epc["fixed-lighting-outlets-count"] = float( self.prepared_epc["fixed-lighting-outlets-count"] = float(self.prepared_epc["fixed-lighting-outlets-count"])
self.prepared_epc["fixed-lighting-outlets-count"]
)
def _filter_property_dimensions(self, property_dimensions): def _filter_property_dimensions(self, property_dimensions):
""" """
@ -604,15 +615,6 @@ class EPCRecord:
self.prepared_epc["property-type"] self.prepared_epc["property-type"]
) )
# if self.prepared_epc["property-type"] == "House":
# self.number_of_floors = 2
# elif self.prepared_epc["property-type"] in ["Flat", "Bungalow"]:
# self.number_of_floors = 1
# elif self.prepared_epc["property-type"] == "Maisonette":
# self.number_of_floors = 2
# else:
# raise NotImplementedError("Implement me")
if ( if (
self.prepared_epc["floor-height"] == "" self.prepared_epc["floor-height"] == ""
or self.prepared_epc["floor-height"] in DATA_ANOMALY_MATCHES or self.prepared_epc["floor-height"] in DATA_ANOMALY_MATCHES
@ -859,9 +861,12 @@ class EPCRecord:
This method will clean the year built, if empty or invalid This method will clean the year built, if empty or invalid
""" """
if self.full_sap_epc: if self.full_sap_epc:
self.year_built = datetime.strptime( lodgement_date = self.full_sap_epc["lodgement-date"]
self.full_sap_epc["lodgement-date"], "%Y-%m-%d"
).year if lodgement_date is None:
raise ValueError("full_sap_epc lodgement-date is missing")
self.year_built = datetime.strptime(str(lodgement_date), "%Y-%m-%d").year
return return