From ec146cba77b18d95eb12b735992427dc373418be Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 11 Mar 2026 19:25:21 +0000 Subject: [PATCH] reducing cleaning code --- etl/epc/Record.py | 209 ++++++++++++++++++++++++++++------------------ 1 file changed, 129 insertions(+), 80 deletions(-) diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 75188707..0c420399 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -1,5 +1,5 @@ import warnings -from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias, Literal +from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias, Literal, Callable from backend.addresses.Address import Address from dataclasses import fields from datetime import datetime @@ -58,6 +58,65 @@ class InputEpcRecords(TypedDict): old_data: list[RawEpcRow] +class CleaningRule(TypedDict, total=False): + cast: Callable[[Any], Any] + map: dict[Any, Any] + default: Any + anomaly_to: Any + + +CLEANING_RULES: dict[str, CleaningRule] = { + + # ----------------------------- + # BOOLEAN FLAGS + # ----------------------------- + + "mains-gas-flag": { + "map": {"Y": True, "N": False, True: True, False: False}, + "anomaly_to": None, + }, + + "solar-water-heating-flag": { + "map": {"Y": "Y", "N": "N", "": "N", None: "N"}, + }, + + # ----------------------------- + # NUMERIC CASTS + # ----------------------------- + + "photo-supply": { + "cast": float, + "anomaly_to": None, + }, + + "energy-consumption-current": { + "cast": float, + }, + + "co2-emissions-current": { + "cast": float, + }, + + "wind-turbine-count": { + "cast": int, + "anomaly_to": None, + }, + + "extension-count": { + "cast": int, + "default": 0 + }, + + # ----------------------------- + # TO NONE + # ----------------------------- + "mechanical-ventilation": { + "anomaly_to": None + }, + +} + + @dataclass class EPCRecord: """ @@ -293,6 +352,10 @@ class EPCRecord: if self.cleaning_data is None: raise ValueError("Must provide cleaning data if running in newdata mode") + invalid_rules = [k for k in CLEANING_RULES if k not in self._prepared_epc] + if invalid_rules: + logger.warning(f"Cleaning rules for unknown fields: {invalid_rules}") + self._clean_records_using_epc_records() self._clean_with_data_processor() self._inject_address_metadata() @@ -301,6 +364,58 @@ class EPCRecord: return + def _apply_cleaning_rules(self) -> None: + """ + Apply simple field-level cleaning rules defined in CLEANING_RULES. + """ + + if not self._prepared_epc: + raise ValueError("EPCRecord does not contain prepared EPC data") + + for field, rule in CLEANING_RULES.items(): + + if field not in self._prepared_epc: + logger.warning(f"Cleaning rule defined for missing field '{field}'") + continue + + value = self._prepared_epc[field] + + # ------------------------------------------------ + # 1. Mapping rules (highest priority) + # ------------------------------------------------ + + if "map" in rule and value in rule["map"]: + self._prepared_epc[field] = rule["map"][value] + continue + + # ------------------------------------------------ + # 2. Handle anomaly values + # ------------------------------------------------ + + if value in DATA_ANOMALY_MATCHES: + + if "anomaly_to" in rule: + self._prepared_epc[field] = rule["anomaly_to"] + continue + + if "default" in rule: + self._prepared_epc[field] = rule["default"] + continue + + continue + + # ------------------------------------------------ + # 3. Casting rules + # ------------------------------------------------ + + if "cast" in rule and value is not None: + try: + self._prepared_epc[field] = rule["cast"](value) + except Exception as e: + logger.warning( + f"Failed casting field '{field}' value '{value}': {e}" + ) + def _inject_address_metadata(self): """ Given metadata about an address, provided by the landlord on input, this method will inject it into the prepared @@ -341,14 +456,15 @@ class EPCRecord: "construction_age_band": addr.landlord_construction_age_band, } - # Saniry check - ensure valid keys - if any(k for k in landlord_remapping.keys() if k not in self._prepared_epc): + # Sanity check - ensure valid keys + if any(k not in self._prepared_epc for k in landlord_remapping): raise ValueError("Landlord remapping contains keys that are not in the EPC record") self.landlord_differences = {} # Anything actaully changed for k, v in landlord_remapping.items(): if k == "total_floor_area": - if abs(self._prepared_epc.get(k) - v) > 1: # 1m tolerance + existing = self._prepared_epc.get(k) + if existing is not None and v is not None and abs(existing - v) > 1: # 1m tolerance self.landlord_differences[k] = v else: if v != self._prepared_epc.get(k) and (not pd.isnull(v)) and (not pd.isnull(self._prepared_epc.get(k))): @@ -380,7 +496,7 @@ class EPCRecord: record = epc_data_processor.data.to_dict(orient="records")[0] - self._prepared_epc = cast(RawEpcRow, record) + self._prepared_epc = cast(PreparedEpcRow, record) @staticmethod def _cast_value(value: PreparedEpcValue, type_hint: Any) -> PreparedEpcValue: @@ -388,8 +504,11 @@ class EPCRecord: origin = get_origin(type_hint) args = get_args(type_hint) + # Handle Optional[T] / Union[T, None] if origin is Union: - type_hint = [a for a in args if a is not type(None)][0] + args = [a for a in get_args(type_hint) if a is not type(None)] + if len(args) == 1: + type_hint = args[0] if type_hint is int: return int(value) @@ -458,16 +577,12 @@ class EPCRecord: This method will clean the records """ - # TODO: Move all the cleaning steps in the Property class into here + self._apply_cleaning_rules() + self._clean_built_form() - self._clean_energy() - self._clean_ventilation() - self._clean_solar_pv() self._clean_solar_hot_water() - self._clean_wind_turbine() self._clean_count_variables() self._clean_heat_loss_corridor() - self._clean_mains_gas() self._clean_age_band() self._clean_year_built() self._clean_floor_area() @@ -492,6 +607,8 @@ class EPCRecord: raise ValueError(f"Invalid epc_type: {epc_type}") source = getattr(self, epc_type) + if source is None: + raise ValueError(f"{epc_type} is None") df = pd.DataFrame.from_dict(source, orient="index").T @@ -716,24 +833,6 @@ class EPCRecord: ) self._prepared_epc["total-floor-area"] = None - def _clean_mains_gas(self) -> None: - """ - This method will clean the mains gas, if empty or invalid - """ - if not self._prepared_epc: - raise ValueError("EPC Recrod doesn not contain epc data") - - mains_gas_map = {"Y": True, "N": False, True: True, False: False} - - self._prepared_epc["mains-gas-flag"] = ( - None - if ( - self._prepared_epc["mains-gas-flag"] == "" - or self._prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES - ) - else mains_gas_map[self._prepared_epc["mains-gas-flag"]] - ) - def _clean_heat_loss_corridor(self) -> None: """ This method will clean the heat loss corridor, if empty or invalid @@ -796,19 +895,6 @@ class EPCRecord: self._prepared_epc[attribute] = value - def _clean_wind_turbine(self) -> None: - """ - This method will clean the wind turbine, if empty or invalid - """ - if not self._prepared_epc: - raise ValueError("EPC Recrod doesn not contain epc data") - - self._prepared_epc["wind-turbine-count"] = ( - int(self._prepared_epc["wind-turbine-count"]) - if self._prepared_epc["wind-turbine-count"] not in DATA_ANOMALY_MATCHES - else None - ) - def _clean_solar_hot_water(self) -> None: """ This method will clean the solar hot water, if empty or invalid @@ -832,33 +918,6 @@ class EPCRecord: self._prepared_epc["solar-water-heating-flag"] ] - def _clean_solar_pv(self) -> None: - """ - This method will clean the solar pv, if empty or invalid - """ - if not self._prepared_epc: - raise ValueError("EPC Recrod doesn not contain epc data") - - self._prepared_epc["photo-supply"] = ( - float(self._prepared_epc["photo-supply"]) - if (self._prepared_epc["photo-supply"] not in DATA_ANOMALY_MATCHES) - else None - ) - - def _clean_energy(self) -> None: - """ - This method will clean the energy, if empty or invalid - """ - if not self._prepared_epc: - raise ValueError("EPC Recrod doesn not contain epc data") - - self._prepared_epc["energy-consumption-current"] = float( - self._prepared_epc["energy-consumption-current"] - ) - self._prepared_epc["co2-emissions-current"] = float( - self._prepared_epc["co2-emissions-current"] - ) - def _clean_built_form(self) -> None: """ This method will clean the build form, if empty or invalid @@ -954,16 +1013,6 @@ class EPCRecord: # We don't know when the property was built self.year_built = None - def _clean_ventilation(self) -> None: - """ - This method will clean the ventilation, if empty or invalid - """ - self._prepared_epc["mechanical-ventilation"] = ( - None - if (self._prepared_epc["mechanical-ventilation"] in DATA_ANOMALY_MATCHES) - else (self._prepared_epc["mechanical-ventilation"]) - ) - def _field_validation(self) -> None: """ This method will validate each of the fields in the EPC record