mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
created _apply_averages_cleaning
This commit is contained in:
parent
6c89b07624
commit
3bf641f49d
3 changed files with 75 additions and 143 deletions
|
|
@ -487,7 +487,7 @@ class Property:
|
|||
# CO₂ emissions per square metre floor area per year in kg/m². Since CO₂ emissions are in tonnes
|
||||
# per year, we multiply by 1000 to get kg/m²
|
||||
"co2-emiss-curr-per-floor-area": round(
|
||||
1000 * (rec_impact["carbon"] / self.data["total-floor-area"])
|
||||
1000 * (rec_impact["carbon"] / self.epc_record.get("total_floor_area"))
|
||||
),
|
||||
"co2-emissions-current": rec_impact["carbon"],
|
||||
"current-energy-rating": sap_to_epc(rec_impact["sap"]),
|
||||
|
|
@ -594,21 +594,21 @@ class Property:
|
|||
if not cleaned:
|
||||
raise ValueError("Cleaner does not contain cleaned data")
|
||||
|
||||
if not self.data:
|
||||
if not self.epc_record:
|
||||
raise ValueError("Property does not contain data")
|
||||
|
||||
for description, attribute in cleaned.items():
|
||||
|
||||
cleaner_cls = all_cleaner_map[description]
|
||||
|
||||
if self.data[description] in self.DATA_ANOMALY_MATCHES:
|
||||
if self.epc_record.get(description) in self.DATA_ANOMALY_MATCHES:
|
||||
if description == "lighting-description":
|
||||
cleaner_cls = cleaner_cls("", averages=None)
|
||||
else:
|
||||
cleaner_cls = cleaner_cls("")
|
||||
fill_dict = {
|
||||
"original_description": self.data[description],
|
||||
"clean_description": self.data[description],
|
||||
"original_description": self.epc_record.get(description),
|
||||
"clean_description": self.epc_record.get(description),
|
||||
**cleaner_cls.process()
|
||||
}
|
||||
setattr(self, self.ATTRIBUTE_MAP[description], fill_dict)
|
||||
|
|
@ -617,7 +617,7 @@ class Property:
|
|||
attributes = [
|
||||
x
|
||||
for x in cleaned[description]
|
||||
if x["original_description"] == self.data[description]
|
||||
if x["original_description"] == self.epc_record.get(description)
|
||||
]
|
||||
|
||||
if len(attributes) > 1:
|
||||
|
|
@ -628,11 +628,11 @@ class Property:
|
|||
if len(attributes) == 0:
|
||||
# We attempt to perform the clean on the fly
|
||||
if description == "lighting-description":
|
||||
cleaner_cls = cleaner_cls(self.data[description], averages=None)
|
||||
cleaner_cls = cleaner_cls(self.epc_record.get(description), averages=None)
|
||||
else:
|
||||
cleaner_cls = cleaner_cls(self.data[description])
|
||||
cleaner_cls = cleaner_cls(self.epc_record.get(description))
|
||||
processed = {
|
||||
"original_description": self.data[description],
|
||||
"original_description": self.epc_record.get(description),
|
||||
"clean_description": cleaner_cls.description.replace(
|
||||
"(assumed)", ""
|
||||
)
|
||||
|
|
@ -672,7 +672,7 @@ class Property:
|
|||
# Today's costs
|
||||
todays_lighting_cost = kwh_client.convert_cost_to_today(
|
||||
original_cost=float(self.data["lighting-cost-current"]),
|
||||
lodgement_date=pd.Timestamp(self.epc_record.prepared_epc["lodgement_date"]).tz_localize(None)
|
||||
lodgement_date=pd.Timestamp(self.epc_record.get("lodgement_date")).tz_localize(None)
|
||||
)
|
||||
|
||||
# If we have the kwh figures, we don't need to predict them
|
||||
|
|
@ -1299,7 +1299,7 @@ class Property:
|
|||
valid for 10 years.
|
||||
:return: boolean indicating whether the EPC is expired
|
||||
"""
|
||||
lodgement_date = self.data["lodgement-date"]
|
||||
lodgement_date = self.epc_record.get("lodgement-date")
|
||||
return (datetime.now() - pd.to_datetime(lodgement_date)) > timedelta(days=3650)
|
||||
|
||||
@property
|
||||
|
|
@ -1308,4 +1308,4 @@ class Property:
|
|||
This property indicates that the EPC is estimated, based on the presence of the "estimated" flag in the data
|
||||
:return: boolean indicating whether the EPC is estimated
|
||||
"""
|
||||
return self.data.get("estimated", False)
|
||||
return self.epc_record.get("estimated")
|
||||
|
|
|
|||
|
|
@ -413,94 +413,6 @@ def check_duplicate_property_ids(input_properties):
|
|||
return True
|
||||
|
||||
|
||||
def averages_cleaning(prepared_epc: "EPCRecord", cleaning_data: pd.DataFrame):
    """
    Placeholder cleaning function to handle edge cases where we have missing data for
    number of habitable rooms, number of heated rooms and floor height. Missing values are
    filled with the median of comparable rows in cleaning_data.

    Comparable rows share the record's property type. They are narrowed to the record's
    local authority when that authority appears in the data, and narrowed again to rows
    within +/-10% of the record's total floor area when at least one such row exists.

    This need was born out of the Peabody project

    :param prepared_epc: record exposing a ``prepared_epc`` mapping and same-named attributes;
        both are updated in place for every value that gets filled
    :param cleaning_data: frame with property_type, local_authority, total_floor_area,
        number_habitable_rooms, number_heated_rooms and floor_height columns
    :return: the same prepared_epc object that was passed in
    """

    record = prepared_epc.prepared_epc
    variables_to_clean = (
        "number_habitable_rooms",
        "number_heated_rooms",
        "floor_height",
    )

    if not any(pd.isnull(record[k]) for k in variables_to_clean):
        # Nothing to do
        return prepared_epc

    # NOTE(review): the previous version ANDed this mask with an identical copy of itself,
    # so a second criterion may have been intended here - confirm with the data owners.
    clean_with = cleaning_data[cleaning_data["property_type"] == record["property_type"]]

    if record["local_authority"] in clean_with["local_authority"].values:
        clean_with = clean_with[clean_with["local_authority"] == record["local_authority"]]

    floor_area_clean_with = clean_with[
        (clean_with["total_floor_area"] <= record["total_floor_area"] * 1.1)
        & (clean_with["total_floor_area"] >= record["total_floor_area"] * 0.9)
    ]

    # Only tighten to similar floor areas when that still leaves something to average
    if not floor_area_clean_with.empty:
        clean_with = floor_area_clean_with

    # The median is NaN when no comparable rows exist (or the column is all-NaN). Converting
    # NaN with int(round(...)) raises, so NaN medians are dropped and the field stays unfilled.
    fills = {}
    for key in ("number_habitable_rooms", "number_heated_rooms"):
        median = clean_with[key].median()
        if not pd.isnull(median):
            fills[key] = int(round(median))

    floor_height_median = clean_with["floor_height"].median()
    if not pd.isnull(floor_height_median):
        fills["floor_height"] = floor_height_median

    # Heated rooms should never exceed habitable rooms
    if (
        "number_habitable_rooms" in fills
        and "number_heated_rooms" in fills
        and fills["number_heated_rooms"] > fills["number_habitable_rooms"]
    ):
        fills["number_heated_rooms"] = fills["number_habitable_rooms"]

    # We now fill, keeping the mapping and the attribute in step and never overwriting
    # a value the record already has
    for key, value in fills.items():
        if pd.isnull(record[key]):
            record[key] = value
            setattr(prepared_epc, key, value)

    # NOTE: fallbacks for lighting / heating / hot water costs and potential energy
    # consumption were sketched here previously but were never enabled.

    return prepared_epc
|
||||
|
||||
|
||||
def extract_address_data(config, body):
|
||||
"""
|
||||
Simple helper to grab address data from the config
|
||||
|
|
@ -828,10 +740,6 @@ async def model_engine(body: PlanTriggerRequest):
|
|||
epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data, address_metadata=addr
|
||||
)
|
||||
|
||||
# TODO: This is a temp function to handle a specific edge case with Peabody. We should
|
||||
# factor this into EPCRecord as part of the cleaning however we need some more testing
|
||||
prepared_epc = averages_cleaning(prepared_epc, cleaning_data)
|
||||
|
||||
input_properties.append(
|
||||
Property(
|
||||
id=property_id,
|
||||
|
|
@ -906,45 +814,6 @@ async def model_engine(body: PlanTriggerRequest):
|
|||
# 2) Missing EPC
|
||||
# 3) Materially different information from landlord vs EPC
|
||||
# make the landlord remapping dictionary
|
||||
addr = next((a for a in addresses if a.uprn == p.uprn), None)
|
||||
if addr is None:
|
||||
raise ValueError("Could not find address for property with UPRN: %s", p.uprn)
|
||||
|
||||
landlord_remapping = {
|
||||
"total_floor_area": addr.landlord_total_floor_area_m2, # 1m tolerance on floor area to perform remap
|
||||
"property_type": addr.landlord_property_type,
|
||||
"built_form": addr.landlord_built_form,
|
||||
|
||||
# Components
|
||||
"walls_description": addr.landlord_wall_construction,
|
||||
"roof_description": addr.landlord_roof_construction,
|
||||
"floor_description": addr.landlord_floor_construction,
|
||||
"windows_description": addr.landlord_windows_type,
|
||||
"main_fuel": addr.landlord_fuel_type,
|
||||
"mainheat_description": addr.landlord_heating_system,
|
||||
"mainheatcont_description": addr.landlord_heating_controls,
|
||||
"hotwater_description": addr.landlord_hot_water_system,
|
||||
|
||||
# Efficiency
|
||||
"walls_energy_eff": addr.landlord_wall_efficiency,
|
||||
"roof_energy_eff": addr.landlord_roof_efficiency,
|
||||
"windows_energy_eff": addr.landlord_windows_efficiency,
|
||||
"mainheat_energy_eff": addr.landlord_heating_efficiency,
|
||||
"mainheatc_energy_eff": addr.landlord_heating_controls_efficiency,
|
||||
"hot_water_energy_eff": addr.landlord_hot_water_efficiency,
|
||||
|
||||
"multi_glaze_proportion": addr.landlord_multi_glaze_proportion * 100, # TODO: Fix this!
|
||||
"construction_age_band": addr.landlord_construction_age_band,
|
||||
}
|
||||
# Find differences between EPC and landlord data
|
||||
differences = {}
|
||||
for k, v in landlord_remapping.items():
|
||||
if k == "total_floor_area":
|
||||
if abs(p.epc_record.prepared_epc.get(k) - v) > 1: # 1m tolerance
|
||||
differences[k] = v
|
||||
else:
|
||||
if v != p.epc_record.get(k) and (not pd.isnull(v)) and (not pd.isnull(p.epc_record.get(k))):
|
||||
differences[k] = v
|
||||
|
||||
needs_rebaselining = p.epc_is_expired | p.epc_is_estimated | (len(differences) > 0)
|
||||
|
||||
|
|
|
|||
|
|
@ -303,6 +303,12 @@ class EPCRecord:
|
|||
glazed_type: Optional[str] = None
|
||||
multi_glaze_proportion: Optional[float] = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# CLEANING FLAG
|
||||
# ------------------------------------------------------------------
|
||||
# Indicates if the EPC record has been predicted. By default, false
|
||||
estimated: Optional[bool] = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# MODEL FLAGS
|
||||
# ------------------------------------------------------------------
|
||||
|
|
@ -379,6 +385,63 @@ class EPCRecord:
|
|||
|
||||
return
|
||||
|
||||
def _apply_averages_cleaning(self) -> None:
    """
    Fills missing property dimension values using medians from cleaning_data.

    The fields considered are "number-habitable-rooms", "number-heated-rooms" and
    "floor-height" on the prepared EPC. Medians are taken over rows of cleaning_data
    that share the record's property type. Those rows are narrowed to the record's
    local authority when it appears in the data, and narrowed again to rows within
    +/-10% of the record's total floor area when at least one such row exists.

    Only values that are currently null are filled; existing values are never
    overwritten. A field is left untouched when no median can be computed for it.

    :raises ValueError: if the prepared EPC or the cleaning data is missing
    """

    if self._prepared_epc is None:
        raise ValueError("Prepared EPC missing")

    if self.cleaning_data is None:
        raise ValueError("Cleaning data required for averages cleaning")

    # Prepared-EPC key -> matching cleaning_data column
    columns = {
        "number-habitable-rooms": "number_habitable_rooms",
        "number-heated-rooms": "number_heated_rooms",
        "floor-height": "floor_height",
    }

    if not any(pd.isnull(self._prepared_epc.get(key)) for key in columns):
        return

    cleaning_data: pd.DataFrame = self.cleaning_data

    clean_with = cleaning_data[
        cleaning_data["property_type"] == self._prepared_epc["property-type"]
    ]

    local_authority = self._prepared_epc["local-authority"]
    if local_authority in clean_with["local_authority"].values:
        clean_with = clean_with[clean_with["local_authority"] == local_authority]

    floor_area = self._prepared_epc.get("total-floor-area")

    # pd.isnull covers both None and NaN; a NaN floor area could never match any row
    if not pd.isnull(floor_area):
        areas = clean_with["total_floor_area"].astype(float)
        subset = clean_with[(areas <= floor_area * 1.1) & (areas >= floor_area * 0.9)]
        # Only tighten to similar floor areas when that still leaves something to average
        if not subset.empty:
            clean_with = subset

    # The median is NaN when no comparable rows exist (or the column has no usable
    # numbers). int(round(NaN)) raises, so NaN medians are skipped and the field is
    # simply left unfilled. to_numeric tolerates numeric values stored as strings.
    medians = {}
    for key, column in columns.items():
        median = pd.to_numeric(clean_with[column], errors="coerce").median()
        if pd.isnull(median):
            continue
        medians[key] = float(median) if key == "floor-height" else int(round(median))

    # heated rooms should never exceed habitable
    # NOTE(review): this compares the two medians only, not a habitable-room count the
    # record may already hold - confirm whether that stricter check is wanted.
    if (
        "number-heated-rooms" in medians
        and "number-habitable-rooms" in medians
        and medians["number-heated-rooms"] > medians["number-habitable-rooms"]
    ):
        medians["number-heated-rooms"] = medians["number-habitable-rooms"]

    for key, value in medians.items():
        if pd.isnull(self._prepared_epc.get(key)):
            self._prepared_epc[key] = value
|
||||
|
||||
def _apply_cleaning_rules(self) -> None:
|
||||
"""
|
||||
Apply simple field-level cleaning rules defined in CLEANING_RULES.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue