reducing cleaning code

This commit is contained in:
Khalim Conn-Kowlessar 2026-03-11 19:25:21 +00:00
parent ec4959b58a
commit ec146cba77

View file

@ -1,5 +1,5 @@
import warnings
from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias, Literal
from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias, Literal, Callable
from backend.addresses.Address import Address
from dataclasses import fields
from datetime import datetime
@ -58,6 +58,65 @@ class InputEpcRecords(TypedDict):
old_data: list[RawEpcRow]
class CleaningRule(TypedDict, total=False):
cast: Callable[[Any], Any]
map: dict[Any, Any]
default: Any
anomaly_to: Any
CLEANING_RULES: dict[str, CleaningRule] = {
# -----------------------------
# BOOLEAN FLAGS
# -----------------------------
"mains-gas-flag": {
"map": {"Y": True, "N": False, True: True, False: False},
"anomaly_to": None,
},
"solar-water-heating-flag": {
"map": {"Y": "Y", "N": "N", "": "N", None: "N"},
},
# -----------------------------
# NUMERIC CASTS
# -----------------------------
"photo-supply": {
"cast": float,
"anomaly_to": None,
},
"energy-consumption-current": {
"cast": float,
},
"co2-emissions-current": {
"cast": float,
},
"wind-turbine-count": {
"cast": int,
"anomaly_to": None,
},
"extension-count": {
"cast": int,
"default": 0
},
# -----------------------------
# TO NONE
# -----------------------------
"mechanical-ventilation": {
"anomaly_to": None
},
}
@dataclass
class EPCRecord:
"""
@ -293,6 +352,10 @@ class EPCRecord:
if self.cleaning_data is None:
raise ValueError("Must provide cleaning data if running in newdata mode")
invalid_rules = [k for k in CLEANING_RULES if k not in self._prepared_epc]
if invalid_rules:
logger.warning(f"Cleaning rules for unknown fields: {invalid_rules}")
self._clean_records_using_epc_records()
self._clean_with_data_processor()
self._inject_address_metadata()
@ -301,6 +364,58 @@ class EPCRecord:
return
def _apply_cleaning_rules(self) -> None:
"""
Apply simple field-level cleaning rules defined in CLEANING_RULES.
"""
if not self._prepared_epc:
raise ValueError("EPCRecord does not contain prepared EPC data")
for field, rule in CLEANING_RULES.items():
if field not in self._prepared_epc:
logger.warning(f"Cleaning rule defined for missing field '{field}'")
continue
value = self._prepared_epc[field]
# ------------------------------------------------
# 1. Mapping rules (highest priority)
# ------------------------------------------------
if "map" in rule and value in rule["map"]:
self._prepared_epc[field] = rule["map"][value]
continue
# ------------------------------------------------
# 2. Handle anomaly values
# ------------------------------------------------
if value in DATA_ANOMALY_MATCHES:
if "anomaly_to" in rule:
self._prepared_epc[field] = rule["anomaly_to"]
continue
if "default" in rule:
self._prepared_epc[field] = rule["default"]
continue
continue
# ------------------------------------------------
# 3. Casting rules
# ------------------------------------------------
if "cast" in rule and value is not None:
try:
self._prepared_epc[field] = rule["cast"](value)
except Exception as e:
logger.warning(
f"Failed casting field '{field}' value '{value}': {e}"
)
def _inject_address_metadata(self):
"""
Given metadata about an address, provided by the landlord on input, this method will inject it into the prepared
@ -341,14 +456,15 @@ class EPCRecord:
"construction_age_band": addr.landlord_construction_age_band,
}
# Saniry check - ensure valid keys
if any(k for k in landlord_remapping.keys() if k not in self._prepared_epc):
# Sanity check - ensure valid keys
if any(k not in self._prepared_epc for k in landlord_remapping):
raise ValueError("Landlord remapping contains keys that are not in the EPC record")
self.landlord_differences = {} # Anything actaully changed
for k, v in landlord_remapping.items():
if k == "total_floor_area":
if abs(self._prepared_epc.get(k) - v) > 1: # 1m tolerance
existing = self._prepared_epc.get(k)
if existing is not None and v is not None and abs(existing - v) > 1: # 1m tolerance
self.landlord_differences[k] = v
else:
if v != self._prepared_epc.get(k) and (not pd.isnull(v)) and (not pd.isnull(self._prepared_epc.get(k))):
@ -380,7 +496,7 @@ class EPCRecord:
record = epc_data_processor.data.to_dict(orient="records")[0]
self._prepared_epc = cast(RawEpcRow, record)
self._prepared_epc = cast(PreparedEpcRow, record)
@staticmethod
def _cast_value(value: PreparedEpcValue, type_hint: Any) -> PreparedEpcValue:
@ -388,8 +504,11 @@ class EPCRecord:
origin = get_origin(type_hint)
args = get_args(type_hint)
# Handle Optional[T] / Union[T, None]
if origin is Union:
type_hint = [a for a in args if a is not type(None)][0]
args = [a for a in get_args(type_hint) if a is not type(None)]
if len(args) == 1:
type_hint = args[0]
if type_hint is int:
return int(value)
@ -458,16 +577,12 @@ class EPCRecord:
This method will clean the records
"""
# TODO: Move all the cleaning steps in the Property class into here
self._apply_cleaning_rules()
self._clean_built_form()
self._clean_energy()
self._clean_ventilation()
self._clean_solar_pv()
self._clean_solar_hot_water()
self._clean_wind_turbine()
self._clean_count_variables()
self._clean_heat_loss_corridor()
self._clean_mains_gas()
self._clean_age_band()
self._clean_year_built()
self._clean_floor_area()
@ -492,6 +607,8 @@ class EPCRecord:
raise ValueError(f"Invalid epc_type: {epc_type}")
source = getattr(self, epc_type)
if source is None:
raise ValueError(f"{epc_type} is None")
df = pd.DataFrame.from_dict(source, orient="index").T
@ -716,24 +833,6 @@ class EPCRecord:
)
self._prepared_epc["total-floor-area"] = None
def _clean_mains_gas(self) -> None:
"""
This method will clean the mains gas, if empty or invalid
"""
if not self._prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
mains_gas_map = {"Y": True, "N": False, True: True, False: False}
self._prepared_epc["mains-gas-flag"] = (
None
if (
self._prepared_epc["mains-gas-flag"] == ""
or self._prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES
)
else mains_gas_map[self._prepared_epc["mains-gas-flag"]]
)
def _clean_heat_loss_corridor(self) -> None:
"""
This method will clean the heat loss corridor, if empty or invalid
@ -796,19 +895,6 @@ class EPCRecord:
self._prepared_epc[attribute] = value
def _clean_wind_turbine(self) -> None:
"""
This method will clean the wind turbine, if empty or invalid
"""
if not self._prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self._prepared_epc["wind-turbine-count"] = (
int(self._prepared_epc["wind-turbine-count"])
if self._prepared_epc["wind-turbine-count"] not in DATA_ANOMALY_MATCHES
else None
)
def _clean_solar_hot_water(self) -> None:
"""
This method will clean the solar hot water, if empty or invalid
@ -832,33 +918,6 @@ class EPCRecord:
self._prepared_epc["solar-water-heating-flag"]
]
def _clean_solar_pv(self) -> None:
"""
This method will clean the solar pv, if empty or invalid
"""
if not self._prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self._prepared_epc["photo-supply"] = (
float(self._prepared_epc["photo-supply"])
if (self._prepared_epc["photo-supply"] not in DATA_ANOMALY_MATCHES)
else None
)
def _clean_energy(self) -> None:
"""
This method will clean the energy, if empty or invalid
"""
if not self._prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self._prepared_epc["energy-consumption-current"] = float(
self._prepared_epc["energy-consumption-current"]
)
self._prepared_epc["co2-emissions-current"] = float(
self._prepared_epc["co2-emissions-current"]
)
def _clean_built_form(self) -> None:
"""
This method will clean the build form, if empty or invalid
@ -954,16 +1013,6 @@ class EPCRecord:
# We don't know when the property was built
self.year_built = None
def _clean_ventilation(self) -> None:
"""
This method will clean the ventilation, if empty or invalid
"""
self._prepared_epc["mechanical-ventilation"] = (
None
if (self._prepared_epc["mechanical-ventilation"] in DATA_ANOMALY_MATCHES)
else (self._prepared_epc["mechanical-ventilation"])
)
def _field_validation(self) -> None:
"""
This method will validate each of the fields in the EPC record