diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 4aa14a4d..0b5ad31b 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -1,4 +1,4 @@ -from typing import Optional, get_origin, get_args, TypedDict, Dict +from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias from dataclasses import fields from datetime import datetime from dataclasses import dataclass @@ -45,11 +45,15 @@ DATA_BUCKET = os.environ.get( pd.set_option("future.no_silent_downcasting", True) +RawEpcRow: TypeAlias = dict[str, str | None] +PreparedEpcValue: TypeAlias = str | int | float | bool | None +PreparedEpcRow: TypeAlias = dict[str, PreparedEpcValue] + class InputEpcRecords(TypedDict): - original_epc: Dict[str, Any] - full_sap_epc: Dict[str, Any] - old_data: List[Dict[str, Any]] + original_epc: RawEpcRow + full_sap_epc: RawEpcRow + old_data: list[RawEpcRow] @dataclass @@ -231,22 +235,33 @@ class EPCRecord: run_mode: str = "training" + # ------------------------------------------------------------------ + # INPUT DATA STRUCTURES + # ------------------------------------------------------------------ + epc_records: Optional[InputEpcRecords] = None - full_sap_epc: Optional[dict] = None - old_data: list[dict] = None - original_epc: Optional[dict] = None - prepared_epc: Optional[dict] = None + # Raw EPC input (immutable) + original_epc: Optional[RawEpcRow] = None + + # Working dictionary that gets cleaned + prepared_epc: Optional[PreparedEpcRow] = None + + # Supporting + full_sap_epc: Optional[RawEpcRow] = None + old_data: Optional[list[RawEpcRow]] = None + + # # Metadata generated during processing prepared_epc_delta_metadata: pd.DataFrame = None cleaning_data: pd.DataFrame = None # Not used in training mod but used in newdata mode - age_band: str = None - construction_age_band: str = None - year_built: int = None - number_of_floors: int = None - number_of_open_fireplaces: int = None - heat_loss_corridor_bool: bool = None - solar_water_heating_flag_bool: bool = None + age_band: Optional[str] = None + construction_age_band: Optional[str] = None + year_built: Optional[int] = None + number_of_floors: Optional[int] = None + number_of_open_fireplaces: Optional[int] = None + heat_loss_corridor_bool: Optional[bool] = None + solar_water_heating_flag_bool: Optional[bool] = None def __post_init__(self): # We can have validation and cleaning steps for each of the fields @@ -255,15 +270,18 @@ class EPCRecord: if self.run_mode == "training": self.validation_configuration = EPCRecordValidationConfiguration - # self._field_validation() return # We are running in newdata mode if self.epc_records is None: raise ValueError("Must provide epc records if running in newdata mode") - self.prepared_epc = self.epc_records["original_epc"] + # Immutable copy; raw record self.original_epc = self.epc_records["original_epc"].copy() + + # Working copy that we will clean and manipulate + self.prepared_epc = self.epc_records["original_epc"].copy() + self.full_sap_epc = self.epc_records["full_sap_epc"] self.old_data = self.epc_records["old_data"] @@ -299,9 +317,12 @@ class EPCRecord: ) epc_data_processor.prepare_data() - self.prepared_epc = epc_data_processor.data.to_dict(orient="records")[0] + record = epc_data_processor.data.to_dict(orient="records")[0] - def _cast_value(self, value, type_hint): + self.prepared_epc = cast(RawEpcRow, record) + + @staticmethod + def _cast_value(value, type_hint): origin = get_origin(type_hint) args = get_args(type_hint) @@ -396,14 +417,6 @@ class EPCRecord: self._clean_constituency() self._clean_new_build_descriptions() - # self._clean_potential_energy_efficiency() - # self._clean_environment_impact_potential() - # self._clean_energy_consumption_potential() - # self._clean_co2_emissions_potential() - # self._clean_current_energy_efficiency() - # self._clean_energy_consumption_current() - # self._clean_co2_emissions_current() - def epc_record_as_dataframe( self, epc_type: str = "prepared_epc", @@ -524,9 +537,7 @@ class EPCRecord: cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0] ) else: - self.prepared_epc["fixed-lighting-outlets-count"] = float( - self.prepared_epc["fixed-lighting-outlets-count"] - ) + self.prepared_epc["fixed-lighting-outlets-count"] = float(self.prepared_epc["fixed-lighting-outlets-count"]) def _filter_property_dimensions(self, property_dimensions): """ @@ -604,15 +615,6 @@ class EPCRecord: self.prepared_epc["property-type"] ) - # if self.prepared_epc["property-type"] == "House": - # self.number_of_floors = 2 - # elif self.prepared_epc["property-type"] in ["Flat", "Bungalow"]: - # self.number_of_floors = 1 - # elif self.prepared_epc["property-type"] == "Maisonette": - # self.number_of_floors = 2 - # else: - # raise NotImplementedError("Implement me") - if ( self.prepared_epc["floor-height"] == "" or self.prepared_epc["floor-height"] in DATA_ANOMALY_MATCHES @@ -859,9 +861,12 @@ class EPCRecord: This method will clean the year built, if empty or invalid """ if self.full_sap_epc: - self.year_built = datetime.strptime( - self.full_sap_epc["lodgement-date"], "%Y-%m-%d" - ).year + lodgement_date = self.full_sap_epc["lodgement-date"] + + if lodgement_date is None: + raise ValueError("full_sap_epc lodgement-date is missing") + + self.year_built = datetime.strptime(str(lodgement_date), "%Y-%m-%d").year return