mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
reducing cleaning code
This commit is contained in:
parent
ec4959b58a
commit
ec146cba77
1 changed files with 129 additions and 80 deletions
|
|
@ -1,5 +1,5 @@
|
|||
import warnings
|
||||
from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias, Literal
|
||||
from typing import Optional, get_origin, get_args, TypedDict, cast, TypeAlias, Literal, Callable
|
||||
from backend.addresses.Address import Address
|
||||
from dataclasses import fields
|
||||
from datetime import datetime
|
||||
|
|
@ -58,6 +58,65 @@ class InputEpcRecords(TypedDict):
|
|||
old_data: list[RawEpcRow]
|
||||
|
||||
|
||||
class CleaningRule(TypedDict, total=False):
    """Declarative cleaning instructions for a single EPC field.

    All keys are optional (``total=False``); a rule lists only the steps it
    needs.
    """

    # Callable applied to a non-None value to coerce its type
    # (CLEANING_RULES uses ``float`` and ``int``).
    cast: Callable[[Any], Any]
    # Lookup table from raw values to their cleaned replacements.
    map: dict[Any, Any]
    # Replacement for an anomalous value when the rule has no ``anomaly_to``.
    default: Any
    # Replacement for a value found in DATA_ANOMALY_MATCHES.
    anomaly_to: Any
|
||||
|
||||
|
||||
# Declarative per-field cleaning rules, keyed by EPC field name and consumed by
# EPCRecord._apply_cleaning_rules. For each field the steps are tried in order:
# "map" lookup, then anomaly replacement ("anomaly_to", falling back to
# "default"), then "cast".
CLEANING_RULES: dict[str, CleaningRule] = {
    # --- Boolean flags ---
    "mains-gas-flag": {
        # "" is mapped explicitly so that an empty flag always becomes None,
        # whether or not DATA_ANOMALY_MATCHES contains the empty string.
        "map": {"Y": True, "N": False, True: True, False: False, "": None},
        "anomaly_to": None,
    },
    "solar-water-heating-flag": {
        # Missing / empty values are treated as "no solar water heating".
        "map": {"Y": "Y", "N": "N", "": "N", None: "N"},
    },
    # --- Numeric casts ---
    "photo-supply": {
        "cast": float,
        "anomaly_to": None,
    },
    "energy-consumption-current": {
        "cast": float,
    },
    "co2-emissions-current": {
        "cast": float,
    },
    "wind-turbine-count": {
        "cast": int,
        "anomaly_to": None,
    },
    "extension-count": {
        "cast": int,
        "default": 0,
    },
    # --- Anomalies to None (no cast) ---
    "mechanical-ventilation": {
        "anomaly_to": None,
    },
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class EPCRecord:
|
||||
"""
|
||||
|
|
@ -293,6 +352,10 @@ class EPCRecord:
|
|||
if self.cleaning_data is None:
|
||||
raise ValueError("Must provide cleaning data if running in newdata mode")
|
||||
|
||||
invalid_rules = [k for k in CLEANING_RULES if k not in self._prepared_epc]
|
||||
if invalid_rules:
|
||||
logger.warning(f"Cleaning rules for unknown fields: {invalid_rules}")
|
||||
|
||||
self._clean_records_using_epc_records()
|
||||
self._clean_with_data_processor()
|
||||
self._inject_address_metadata()
|
||||
|
|
@ -301,6 +364,58 @@ class EPCRecord:
|
|||
|
||||
return
|
||||
|
||||
def _apply_cleaning_rules(self) -> None:
    """
    Run every rule in CLEANING_RULES against the prepared EPC record, in place.

    For each field exactly one of the following is applied, in priority order:

    1. ``map``: if the current value is a key of the rule's map, it is
       replaced by the mapped value.
    2. Anomaly handling: if the value is in DATA_ANOMALY_MATCHES it is
       replaced by ``anomaly_to`` or, failing that, ``default``; with
       neither key present the value is left untouched.
    3. ``cast``: a non-None value is passed through the rule's callable. A
       failing cast is logged as a warning and the value is left as it was.

    Rules naming a field that is absent from the record are skipped with a
    warning.

    :raises ValueError: if the prepared EPC record is empty or unset
    """
    record = self._prepared_epc
    if not record:
        raise ValueError("EPCRecord does not contain prepared EPC data")

    for name, rule in CLEANING_RULES.items():
        if name not in record:
            logger.warning(f"Cleaning rule defined for missing field '{name}'")
            continue

        raw = record[name]

        # 1. Explicit mapping wins over everything else.
        if "map" in rule and raw in rule["map"]:
            record[name] = rule["map"][raw]
            continue

        # 2. Anomalous values: first configured fallback wins, and the cast
        #    step is never attempted on them.
        if raw in DATA_ANOMALY_MATCHES:
            for key in ("anomaly_to", "default"):
                if key in rule:
                    record[name] = rule[key]
                    break
            continue

        # 3. Type coercion for regular, non-None values.
        if raw is None or "cast" not in rule:
            continue
        try:
            record[name] = rule["cast"](raw)
        except Exception as exc:
            logger.warning(
                f"Failed casting field '{name}' value '{raw}': {exc}"
            )
|
||||
|
||||
def _inject_address_metadata(self):
|
||||
"""
|
||||
Given metadata about an address, provided by the landlord on input, this method will inject it into the prepared
|
||||
|
|
@ -341,14 +456,15 @@ class EPCRecord:
|
|||
"construction_age_band": addr.landlord_construction_age_band,
|
||||
}
|
||||
|
||||
# Saniry check - ensure valid keys
|
||||
if any(k for k in landlord_remapping.keys() if k not in self._prepared_epc):
|
||||
# Sanity check - ensure valid keys
|
||||
if any(k not in self._prepared_epc for k in landlord_remapping):
|
||||
raise ValueError("Landlord remapping contains keys that are not in the EPC record")
|
||||
|
||||
self.landlord_differences = {} # Anything actaully changed
|
||||
for k, v in landlord_remapping.items():
|
||||
if k == "total_floor_area":
|
||||
if abs(self._prepared_epc.get(k) - v) > 1: # 1m tolerance
|
||||
existing = self._prepared_epc.get(k)
|
||||
if existing is not None and v is not None and abs(existing - v) > 1: # 1m tolerance
|
||||
self.landlord_differences[k] = v
|
||||
else:
|
||||
if v != self._prepared_epc.get(k) and (not pd.isnull(v)) and (not pd.isnull(self._prepared_epc.get(k))):
|
||||
|
|
@ -380,7 +496,7 @@ class EPCRecord:
|
|||
|
||||
record = epc_data_processor.data.to_dict(orient="records")[0]
|
||||
|
||||
self._prepared_epc = cast(RawEpcRow, record)
|
||||
self._prepared_epc = cast(PreparedEpcRow, record)
|
||||
|
||||
@staticmethod
|
||||
def _cast_value(value: PreparedEpcValue, type_hint: Any) -> PreparedEpcValue:
|
||||
|
|
@ -388,8 +504,11 @@ class EPCRecord:
|
|||
origin = get_origin(type_hint)
|
||||
args = get_args(type_hint)
|
||||
|
||||
# Handle Optional[T] / Union[T, None]
|
||||
if origin is Union:
|
||||
type_hint = [a for a in args if a is not type(None)][0]
|
||||
args = [a for a in get_args(type_hint) if a is not type(None)]
|
||||
if len(args) == 1:
|
||||
type_hint = args[0]
|
||||
|
||||
if type_hint is int:
|
||||
return int(value)
|
||||
|
|
@ -458,16 +577,12 @@ class EPCRecord:
|
|||
This method will clean the records
|
||||
"""
|
||||
|
||||
# TODO: Move all the cleaning steps in the Property class into here
|
||||
self._apply_cleaning_rules()
|
||||
|
||||
self._clean_built_form()
|
||||
self._clean_energy()
|
||||
self._clean_ventilation()
|
||||
self._clean_solar_pv()
|
||||
self._clean_solar_hot_water()
|
||||
self._clean_wind_turbine()
|
||||
self._clean_count_variables()
|
||||
self._clean_heat_loss_corridor()
|
||||
self._clean_mains_gas()
|
||||
self._clean_age_band()
|
||||
self._clean_year_built()
|
||||
self._clean_floor_area()
|
||||
|
|
@ -492,6 +607,8 @@ class EPCRecord:
|
|||
raise ValueError(f"Invalid epc_type: {epc_type}")
|
||||
|
||||
source = getattr(self, epc_type)
|
||||
if source is None:
|
||||
raise ValueError(f"{epc_type} is None")
|
||||
|
||||
df = pd.DataFrame.from_dict(source, orient="index").T
|
||||
|
||||
|
|
@ -716,24 +833,6 @@ class EPCRecord:
|
|||
)
|
||||
self._prepared_epc["total-floor-area"] = None
|
||||
|
||||
def _clean_mains_gas(self) -> None:
    """
    Normalise the ``mains-gas-flag`` field of the prepared EPC record.

    An empty string or a value in DATA_ANOMALY_MATCHES becomes None; "Y"/True
    become True and "N"/False become False. Any other value raises KeyError
    from the lookup.

    :raises ValueError: if the prepared EPC record is empty or unset
    """
    if not self._prepared_epc:
        raise ValueError("EPC Recrod doesn not contain epc data")

    raw_flag = self._prepared_epc["mains-gas-flag"]
    if raw_flag == "" or raw_flag in DATA_ANOMALY_MATCHES:
        cleaned = None
    else:
        cleaned = {"Y": True, "N": False, True: True, False: False}[raw_flag]

    self._prepared_epc["mains-gas-flag"] = cleaned
|
||||
|
||||
def _clean_heat_loss_corridor(self) -> None:
|
||||
"""
|
||||
This method will clean the heat loss corridor, if empty or invalid
|
||||
|
|
@ -796,19 +895,6 @@ class EPCRecord:
|
|||
|
||||
self._prepared_epc[attribute] = value
|
||||
|
||||
def _clean_wind_turbine(self) -> None:
    """
    Normalise the ``wind-turbine-count`` field of the prepared EPC record.

    A value in DATA_ANOMALY_MATCHES becomes None; anything else is converted
    with ``int``.

    :raises ValueError: if the prepared EPC record is empty or unset
    """
    if not self._prepared_epc:
        raise ValueError("EPC Recrod doesn not contain epc data")

    raw_count = self._prepared_epc["wind-turbine-count"]
    if raw_count in DATA_ANOMALY_MATCHES:
        cleaned = None
    else:
        cleaned = int(raw_count)

    self._prepared_epc["wind-turbine-count"] = cleaned
|
||||
|
||||
def _clean_solar_hot_water(self) -> None:
|
||||
"""
|
||||
This method will clean the solar hot water, if empty or invalid
|
||||
|
|
@ -832,33 +918,6 @@ class EPCRecord:
|
|||
self._prepared_epc["solar-water-heating-flag"]
|
||||
]
|
||||
|
||||
def _clean_solar_pv(self) -> None:
    """
    Normalise the ``photo-supply`` field of the prepared EPC record.

    A value in DATA_ANOMALY_MATCHES becomes None; anything else is converted
    with ``float``.

    :raises ValueError: if the prepared EPC record is empty or unset
    """
    if not self._prepared_epc:
        raise ValueError("EPC Recrod doesn not contain epc data")

    raw_supply = self._prepared_epc["photo-supply"]
    if raw_supply in DATA_ANOMALY_MATCHES:
        cleaned = None
    else:
        cleaned = float(raw_supply)

    self._prepared_epc["photo-supply"] = cleaned
|
||||
|
||||
def _clean_energy(self) -> None:
|
||||
"""
|
||||
This method will clean the energy, if empty or invalid
|
||||
"""
|
||||
if not self._prepared_epc:
|
||||
raise ValueError("EPC Recrod doesn not contain epc data")
|
||||
|
||||
self._prepared_epc["energy-consumption-current"] = float(
|
||||
self._prepared_epc["energy-consumption-current"]
|
||||
)
|
||||
self._prepared_epc["co2-emissions-current"] = float(
|
||||
self._prepared_epc["co2-emissions-current"]
|
||||
)
|
||||
|
||||
def _clean_built_form(self) -> None:
|
||||
"""
|
||||
This method will clean the build form, if empty or invalid
|
||||
|
|
@ -954,16 +1013,6 @@ class EPCRecord:
|
|||
# We don't know when the property was built
|
||||
self.year_built = None
|
||||
|
||||
def _clean_ventilation(self) -> None:
    """
    Blank out an anomalous ``mechanical-ventilation`` value.

    A value in DATA_ANOMALY_MATCHES is replaced by None; any other value is
    kept exactly as it is.
    """
    if self._prepared_epc["mechanical-ventilation"] in DATA_ANOMALY_MATCHES:
        self._prepared_epc["mechanical-ventilation"] = None
|
||||
|
||||
def _field_validation(self) -> None:
|
||||
"""
|
||||
This method will validate each of the fields in the EPC record
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue