add post sap 10 feature

This commit is contained in:
Michael Duong 2025-11-02 09:44:41 +00:00
parent 92fcbe8cdb
commit 6aefd1eb3c
4 changed files with 40 additions and 16 deletions

View file

@ -4,6 +4,7 @@ import pandas as pd
from etl.epc.settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
POST_SAP10_DATE,
# IGNORED_TRANSACTION_TYPES,
IGNORED_FLOOR_LEVELS,
IGNORED_PROPERTY_TYPES,
@ -159,6 +160,9 @@ class EPCDataProcessor:
# colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
# )
# Create post sap10 flag
self.create_post_sap10_flag()
# When running in newdata mode, cleaning_averages is in lower case, so we coerce it back to upper case
cleaning_averages = self.cleaning_averages.copy()
if self.run_mode == "newdata":
@ -175,6 +179,13 @@ class EPCDataProcessor:
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
self.cast_data_columns_to_lower()
def create_post_sap10_flag(self):
    """
    Add a boolean-style ``is_post_sap10`` column to ``self.data``.

    A row is flagged when its ``LODGEMENT_DATE`` is greater than or equal to
    ``POST_SAP10_DATE``, i.e. the cut-off date itself counts as post SAP10.

    NOTE(review): the comparison is made against the constant exactly as it is
    imported from settings; presumably ``LODGEMENT_DATE`` is a datetime column
    or an ISO ``YYYY-MM-DD`` string column - confirm against the data loader.
    """
    lodgement_dates = self.data["LODGEMENT_DATE"]
    on_or_after_cutoff = lodgement_dates >= POST_SAP10_DATE
    self.data["is_post_sap10"] = on_or_after_cutoff
def cast_data_columns_to_lower(self):
"""
Convert all columns names to lower

View file

@ -23,6 +23,7 @@ from etl.epc.settings import (
POTENTIAL_COLUMNS,
ROOM_FEATURES,
COST_FEATURES,
POST_SAP10_FEATURE,
)
# TODO: change in setting file
@ -325,7 +326,9 @@ class EPCPipeline:
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES]
variable_data = property_data[
VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE
]
uprn = str(uprn)
epc_records = [

View file

@ -20,6 +20,7 @@ from etl.epc.settings import (
COMPONENT_FEATURES,
EFFICIENCY_FEATURES,
ROOM_FEATURES,
POST_SAP10_FEATURE,
)
from recommendations.recommendation_utils import estimate_number_of_floors
from utils.s3 import read_dataframe_from_s3_parquet
@ -89,6 +90,7 @@ class EPCRecord:
co2_emissions_current: float = None
number_habitable_rooms: float = None
number_heated_rooms: float = None
is_post_sap10: bool = None
# u_values_walls = None
# u_values_roof = None
@ -277,6 +279,7 @@ class EPCRecord:
self.number_heated_rooms: float = float(
self.prepared_epc["number_heated_rooms"]
)
self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"])
def _identify_delta_between_prepared_and_original_records(self):
"""
@ -385,11 +388,11 @@ class EPCRecord:
return df
def _clean_floor_height(self):
""" Remaps anomalies in floor height to the average floor height for the property type """
"""Remaps anomalies in floor height to the average floor height for the property type"""
floor_height_data = self.cleaning_data[
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) &
(self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
]
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"])
& (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
]
average = floor_height_data["floor_height"].mean()
sd = floor_height_data["floor_height"].std()
# If we're in the top 0.5 percentile of floor heights, we'll set it to the average
@ -399,14 +402,16 @@ class EPCRecord:
self.prepared_epc["floor-height"] = average
def _clean_new_build_descriptions(self):
for col in ['roof-description', 'walls-description', 'floor-description']:
for col in ["roof-description", "walls-description", "floor-description"]:
self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K")
def _clean_constituency(self):
"""
We handle the single case of finding a missing constituency by using the local authority
"""
if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""):
if pd.isnull(self.prepared_epc["constituency"]) or (
self.prepared_epc["constituency"] == ""
):
if self.prepared_epc["local-authority"] != "E06000044":
raise NotImplementedError(
"This function is only implemented for Portsmouth, in the single edgecase seen"
@ -595,12 +600,12 @@ class EPCRecord:
# We handle the edge case of floor area being 0. We set it to None and it is cleaned by
# _clean_with_data_processor
if self.prepared_epc['total-floor-area'] == 0:
if self.prepared_epc["total-floor-area"] == 0:
print(
"Edge case of floor area being zero - will set to none and will be cleaned in "
"_clean_with_data_processor"
)
self.prepared_epc['total-floor-area'] = None
self.prepared_epc["total-floor-area"] = None
def _clean_mains_gas(self):
"""
@ -609,12 +614,7 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
mains_gas_map = {
"Y": True,
"N": False,
True: True,
False: False
}
mains_gas_map = {"Y": True, "N": False, True: True, False: False}
self.prepared_epc["mains-gas-flag"] = (
None
@ -1064,7 +1064,12 @@ class EPCDifferenceRecord:
CARBON_RESPONSE
)
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES
component_variables = (
COMPONENT_FEATURES
+ EFFICIENCY_FEATURES
+ ROOM_FEATURES
+ POST_SAP10_FEATURE
)
ending_record = self.record2.get(
component_variables + ["lodgement_date"],
return_asdict=True,

View file

@ -52,6 +52,9 @@ DATA_ANOMALY_MATCHES = {
"Unknown",
}
# Cut-off date for the post-SAP10 flag: EPCs lodged on or after this date are flagged as post SAP10
POST_SAP10_DATE = "2025-06-22"
DATA_ANOMALY_SUBSTRINGS = {
# Where values in a pick list that have been superseded by another value. For example, where a value for
# pitched roof has been replaced by three sub-categories of pitched roof. The original value is retained
@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [
ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]
POST_SAP10_FEATURE = ["is_post_sap10"]
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
"TRANSACTION_TYPE",
"ENERGY_TARIFF", # Not sure if this is relevant