created _apply_averages_cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2026-03-11 20:52:20 +00:00
parent 6c89b07624
commit 3bf641f49d
3 changed files with 75 additions and 143 deletions

View file

@ -487,7 +487,7 @@ class Property:
# CO₂ emissions per square metre floor area per year in kg/m². Since CO₂ emissions are in tonnes
# per year, we multiply by 1000 to get kg/m²
"co2-emiss-curr-per-floor-area": round(
1000 * (rec_impact["carbon"] / self.data["total-floor-area"])
1000 * (rec_impact["carbon"] / self.epc_record.get("total_floor_area"))
),
"co2-emissions-current": rec_impact["carbon"],
"current-energy-rating": sap_to_epc(rec_impact["sap"]),
@ -594,21 +594,21 @@ class Property:
if not cleaned:
raise ValueError("Cleaner does not contain cleaned data")
if not self.data:
if not self.epc_record:
raise ValueError("Property does not contain data")
for description, attribute in cleaned.items():
cleaner_cls = all_cleaner_map[description]
if self.data[description] in self.DATA_ANOMALY_MATCHES:
if self.epc_record.get(description) in self.DATA_ANOMALY_MATCHES:
if description == "lighting-description":
cleaner_cls = cleaner_cls("", averages=None)
else:
cleaner_cls = cleaner_cls("")
fill_dict = {
"original_description": self.data[description],
"clean_description": self.data[description],
"original_description": self.epc_record.get(description),
"clean_description": self.epc_record.get(description),
**cleaner_cls.process()
}
setattr(self, self.ATTRIBUTE_MAP[description], fill_dict)
@ -617,7 +617,7 @@ class Property:
attributes = [
x
for x in cleaned[description]
if x["original_description"] == self.data[description]
if x["original_description"] == self.epc_record.get(description)
]
if len(attributes) > 1:
@ -628,11 +628,11 @@ class Property:
if len(attributes) == 0:
# We attempt to perform the clean on the fly
if description == "lighting-description":
cleaner_cls = cleaner_cls(self.data[description], averages=None)
cleaner_cls = cleaner_cls(self.epc_record.get(description), averages=None)
else:
cleaner_cls = cleaner_cls(self.data[description])
cleaner_cls = cleaner_cls(self.epc_record.get(description))
processed = {
"original_description": self.data[description],
"original_description": self.epc_record.get(description),
"clean_description": cleaner_cls.description.replace(
"(assumed)", ""
)
@ -672,7 +672,7 @@ class Property:
# Today's costs
todays_lighting_cost = kwh_client.convert_cost_to_today(
original_cost=float(self.data["lighting-cost-current"]),
lodgement_date=pd.Timestamp(self.epc_record.prepared_epc["lodgement_date"]).tz_localize(None)
lodgement_date=pd.Timestamp(self.epc_record.get("lodgement_date")).tz_localize(None)
)
# If we have the kwh figures, we don't need to predict them
@ -1299,7 +1299,7 @@ class Property:
valid for 10 years.
:return: boolean indicating whether the EPC is expired
"""
lodgement_date = self.data["lodgement-date"]
lodgement_date = self.epc_record.get("lodgement-date")
return (datetime.now() - pd.to_datetime(lodgement_date)) > timedelta(days=3650)
@property
@ -1308,4 +1308,4 @@ class Property:
This property indicates that the EPC is estimated, based on the presence of the "estimated" flag in the data
:return: boolean indicating whether the EPC is estimated
"""
return self.data.get("estimated", False)
return self.epc_record.get("estimated")

View file

@ -413,94 +413,6 @@ def check_duplicate_property_ids(input_properties):
return True
def averages_cleaning(prepared_epc: "EPCRecord", cleaning_data: pd.DataFrame):
    """
    Fill missing room counts and floor height on an EPC record using medians.

    Placeholder cleaning function for records where number of habitable rooms,
    number of heated rooms or floor height is missing. The medians are taken
    from the rows of ``cleaning_data`` with the same property type, narrowed
    (only when that leaves at least one row) to the same local authority and
    then to a total floor area within +/-10% of the record's.
    This need was born out of the Peabody project.

    :param prepared_epc: record exposing a ``prepared_epc`` mapping plus
        matching attributes; it is updated in place
    :param cleaning_data: frame with the columns ``property_type``,
        ``local_authority``, ``total_floor_area``, ``number_habitable_rooms``,
        ``number_heated_rooms`` and ``floor_height``
    :return: the same ``prepared_epc`` object
    """
    variables_to_clean = (
        "number_habitable_rooms",
        "number_heated_rooms",
        "floor_height",
    )
    record = prepared_epc.prepared_epc

    if not any(pd.isnull(record[k]) for k in variables_to_clean):
        # Nothing to do
        return prepared_epc

    # Start from every row sharing the property type
    clean_with = cleaning_data[
        cleaning_data["property_type"] == record["property_type"]
    ]

    # Narrow to the same local authority only if that keeps some rows
    if record["local_authority"] in clean_with["local_authority"].values:
        clean_with = clean_with[
            clean_with["local_authority"] == record["local_authority"]
        ]

    # Narrow to similar floor areas; a missing floor area cannot define a band
    floor_area = record["total_floor_area"]
    if not pd.isnull(floor_area):
        floor_area_clean_with = clean_with[
            (clean_with["total_floor_area"] <= floor_area * 1.1)
            & (clean_with["total_floor_area"] >= floor_area * 0.9)
        ]
        if not floor_area_clean_with.empty:
            clean_with = floor_area_clean_with

    def _median_or_none(column):
        # The median is NaN when there are no comparable rows or the column
        # is entirely null; report that as None rather than letting
        # int(round(nan)) raise ValueError further down.
        value = clean_with[column].median()
        return None if pd.isnull(value) else value

    clean_n_habitable_rooms = _median_or_none("number_habitable_rooms")
    clean_n_heated_rooms = _median_or_none("number_heated_rooms")
    clean_floor_height = _median_or_none("floor_height")

    if clean_n_habitable_rooms is not None:
        clean_n_habitable_rooms = int(round(clean_n_habitable_rooms))
    if clean_n_heated_rooms is not None:
        clean_n_heated_rooms = int(round(clean_n_heated_rooms))

    # Heated rooms should never exceed habitable rooms
    if (
        clean_n_habitable_rooms is not None
        and clean_n_heated_rooms is not None
        and clean_n_heated_rooms > clean_n_habitable_rooms
    ):
        clean_n_heated_rooms = clean_n_habitable_rooms

    # We now fill, touching only fields that are missing on the record and
    # for which a median could actually be computed
    fills = {
        "number_habitable_rooms": clean_n_habitable_rooms,
        "number_heated_rooms": clean_n_heated_rooms,
        "floor_height": clean_floor_height,
    }
    for field, value in fills.items():
        if value is not None and pd.isnull(record[field]):
            record[field] = value
            setattr(prepared_epc, field, value)

    # NOTE: cost and energy-consumption fields are intentionally not filled here.
    return prepared_epc
def extract_address_data(config, body):
"""
Simple helper to grab address data from the config
@ -828,10 +740,6 @@ async def model_engine(body: PlanTriggerRequest):
epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data, address_metadata=addr
)
# TODO: This is a temp function to handle a specific edge case with Peabody. We should
# factor this into EPCRecord as part of the cleaning however we need some more testing
prepared_epc = averages_cleaning(prepared_epc, cleaning_data)
input_properties.append(
Property(
id=property_id,
@ -906,45 +814,6 @@ async def model_engine(body: PlanTriggerRequest):
# 2) Missing EPC
# 3) Materially different information from landlord vs EPC
# make the landlord remapping dictionary
addr = next((a for a in addresses if a.uprn == p.uprn), None)
if addr is None:
raise ValueError("Could not find address for property with UPRN: %s", p.uprn)
landlord_remapping = {
"total_floor_area": addr.landlord_total_floor_area_m2, # 1m tolerance on floor area to perform remap
"property_type": addr.landlord_property_type,
"built_form": addr.landlord_built_form,
# Components
"walls_description": addr.landlord_wall_construction,
"roof_description": addr.landlord_roof_construction,
"floor_description": addr.landlord_floor_construction,
"windows_description": addr.landlord_windows_type,
"main_fuel": addr.landlord_fuel_type,
"mainheat_description": addr.landlord_heating_system,
"mainheatcont_description": addr.landlord_heating_controls,
"hotwater_description": addr.landlord_hot_water_system,
# Efficiency
"walls_energy_eff": addr.landlord_wall_efficiency,
"roof_energy_eff": addr.landlord_roof_efficiency,
"windows_energy_eff": addr.landlord_windows_efficiency,
"mainheat_energy_eff": addr.landlord_heating_efficiency,
"mainheatc_energy_eff": addr.landlord_heating_controls_efficiency,
"hot_water_energy_eff": addr.landlord_hot_water_efficiency,
"multi_glaze_proportion": addr.landlord_multi_glaze_proportion * 100, # TODO: Fix this!
"construction_age_band": addr.landlord_construction_age_band,
}
# Find differences between EPC and landlord data
differences = {}
for k, v in landlord_remapping.items():
if k == "total_floor_area":
if abs(p.epc_record.prepared_epc.get(k) - v) > 1: # 1m tolerance
differences[k] = v
else:
if v != p.epc_record.get(k) and (not pd.isnull(v)) and (not pd.isnull(p.epc_record.get(k))):
differences[k] = v
needs_rebaselining = p.epc_is_expired | p.epc_is_estimated | (len(differences) > 0)

View file

@ -303,6 +303,12 @@ class EPCRecord:
glazed_type: Optional[str] = None
multi_glaze_proportion: Optional[float] = None
# ------------------------------------------------------------------
# CLEANING FLAG
# ------------------------------------------------------------------
# Indicates if the EPC record has been predicted. By default, false
estimated: Optional[bool] = False
# ------------------------------------------------------------------
# MODEL FLAGS
# ------------------------------------------------------------------
@ -379,6 +385,63 @@ class EPCRecord:
return
def _apply_averages_cleaning(self) -> None:
"""
Fills missing property dimension values using medians from cleaning_data.
"""
if self._prepared_epc is None:
raise ValueError("Prepared EPC missing")
if self.cleaning_data is None:
raise ValueError("Cleaning data required for averages cleaning")
variables = [
"number-habitable-rooms",
"number-heated-rooms",
"floor-height",
]
if not any(pd.isnull(self._prepared_epc.get(v)) for v in variables):
return
cleaning_data: pd.DataFrame = self.cleaning_data
clean_with = cleaning_data[
(cleaning_data["property_type"] == self._prepared_epc["property-type"])
]
if self._prepared_epc["local-authority"] in clean_with["local_authority"].values:
clean_with = clean_with[
clean_with["local_authority"] == self._prepared_epc["local-authority"]
]
floor_area = self._prepared_epc.get("total-floor-area")
if floor_area is not None:
subset = clean_with[
(
(clean_with["total_floor_area"].astype(float) <= floor_area * 1.1) &
(clean_with["total_floor_area"].astype(float) >= floor_area * 0.9)
)
]
if not subset.empty:
clean_with = subset
medians = {
"number-habitable-rooms": int(round(clean_with["number_habitable_rooms"].median())),
"number-heated-rooms": int(round(clean_with["number_heated_rooms"].median())),
"floor-height": float(clean_with["floor_height"].median()),
}
# heated rooms should never exceed habitable
if medians["number-heated-rooms"] > medians["number-habitable-rooms"]:
medians["number-heated-rooms"] = medians["number-habitable-rooms"]
for key, value in medians.items():
if pd.isnull(self._prepared_epc.get(key)):
self._prepared_epc[key] = value
def _apply_cleaning_rules(self) -> None:
"""
Apply simple field-level cleaning rules defined in CLEANING_RULES.