Add is_post_sap10 feature flag to the EPC pipeline

2026-07-27 23:35:01 +00:00 · 2025-11-02 09:44:41 +00:00 · 2025-11-02 09:44:41 +00:00 · 6aefd1eb3c
commit 6aefd1eb3c
parent 92fcbe8cdb
4 changed files with 40 additions and 16 deletions
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@ -4,6 +4,7 @@ import pandas as pd
 from etl.epc.settings import (
    DATA_PROCESSOR_SETTINGS,
    EARLIEST_EPC_DATE,
+    POST_SAP10_DATE,
    # IGNORED_TRANSACTION_TYPES,
    IGNORED_FLOOR_LEVELS,
    IGNORED_PROPERTY_TYPES,
@ -159,6 +160,9 @@ class EPCDataProcessor:
        #     colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
        # )

+        # Add the is_post_sap10 column (lodged on or after POST_SAP10_DATE)
+        self.create_post_sap10_flag()
+
        # When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper
        cleaning_averages = self.cleaning_averages.copy()
        if self.run_mode == "newdata":
@ -175,6 +179,13 @@ class EPCDataProcessor:
        self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
        self.cast_data_columns_to_lower()

+    def create_post_sap10_flag(self):
+        """
+        Add an `is_post_sap10` column: True where LODGEMENT_DATE >= POST_SAP10_DATE
+        """
+
+        self.data["is_post_sap10"] = self.data["LODGEMENT_DATE"] >= POST_SAP10_DATE
+
    def cast_data_columns_to_lower(self):
        """
        Convert all columns names to lower
--- a/etl/epc/Pipeline.py
+++ b/etl/epc/Pipeline.py
@ -23,6 +23,7 @@ from etl.epc.settings import (
    POTENTIAL_COLUMNS,
    ROOM_FEATURES,
    COST_FEATURES,
+    POST_SAP10_FEATURE,
 )

 # TODO: change in setting file
@ -325,7 +326,9 @@ class EPCPipeline:

        # We include the lodgement date here as we probably need to factor time into the
        # model, since EPC standards and rigour have changed over time
-        variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES]
+        variable_data = property_data[
+            VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE
+        ]

        uprn = str(uprn)
        epc_records = [
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@ -20,6 +20,7 @@ from etl.epc.settings import (
    COMPONENT_FEATURES,
    EFFICIENCY_FEATURES,
    ROOM_FEATURES,
+    POST_SAP10_FEATURE,
 )
 from recommendations.recommendation_utils import estimate_number_of_floors
 from utils.s3 import read_dataframe_from_s3_parquet
@ -89,6 +90,7 @@ class EPCRecord:
    co2_emissions_current: float = None
    number_habitable_rooms: float = None
    number_heated_rooms: float = None
+    is_post_sap10: bool = None

    # u_values_walls = None
    # u_values_roof = None
@ -277,6 +279,7 @@ class EPCRecord:
        self.number_heated_rooms: float = float(
            self.prepared_epc["number_heated_rooms"]
        )
+        self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"])

    def _identify_delta_between_prepared_and_original_records(self):
        """
@ -385,11 +388,11 @@ class EPCRecord:
        return df

    def _clean_floor_height(self):
-        """ Remaps anomalies in floor height to the average floor height for the property type """
+        """Remaps anomalies in floor height to the average floor height for the property type"""
        floor_height_data = self.cleaning_data[
-            (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) &
-            (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
-            ]
+            (self.cleaning_data["property_type"] == self.prepared_epc["property-type"])
+            & (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
+        ]
        average = floor_height_data["floor_height"].mean()
        sd = floor_height_data["floor_height"].std()
        # If we're in the top 0.5 percentile of floor heights, we'll set it to the average
@ -399,14 +402,16 @@ class EPCRecord:
            self.prepared_epc["floor-height"] = average

    def _clean_new_build_descriptions(self):
-        for col in ['roof-description', 'walls-description', 'floor-description']:
+        for col in ["roof-description", "walls-description", "floor-description"]:
            self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K")

    def _clean_constituency(self):
        """
        We handle the single case of finding a missing constituency by using the local authority
        """
-        if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""):
+        if pd.isnull(self.prepared_epc["constituency"]) or (
+            self.prepared_epc["constituency"] == ""
+        ):
            if self.prepared_epc["local-authority"] != "E06000044":
                raise NotImplementedError(
                    "This function is only implemented for Portsmouth, in the single edgecase seen"
@ -595,12 +600,12 @@ class EPCRecord:

        # We handle the edge case of floor area being 0. We set it to zero and it is cleaned by
        # _clean_with_data_processor
-        if self.prepared_epc['total-floor-area'] == 0:
+        if self.prepared_epc["total-floor-area"] == 0:
            print(
                "Edge case of floor area being zero - will set to none and will be cleaned in "
                "_clean_with_data_processor"
            )
-            self.prepared_epc['total-floor-area'] = None
+            self.prepared_epc["total-floor-area"] = None

    def _clean_mains_gas(self):
        """
@ -609,12 +614,7 @@ class EPCRecord:
        if not self.prepared_epc:
            raise ValueError("EPC Recrod doesn not contain epc data")

-        mains_gas_map = {
-            "Y": True,
-            "N": False,
-            True: True,
-            False: False
-        }
+        mains_gas_map = {"Y": True, "N": False, True: True, False: False}

        self.prepared_epc["mains-gas-flag"] = (
            None
@ -1064,7 +1064,12 @@ class EPCDifferenceRecord:
            CARBON_RESPONSE
        )

-        component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES
+        component_variables = (
+            COMPONENT_FEATURES
+            + EFFICIENCY_FEATURES
+            + ROOM_FEATURES
+            + POST_SAP10_FEATURE
+        )
        ending_record = self.record2.get(
            component_variables + ["lodgement_date"],
            return_asdict=True,
--- a/etl/epc/settings.py
+++ b/etl/epc/settings.py
@ -52,6 +52,9 @@ DATA_ANOMALY_MATCHES = {
    "Unknown",
 }

+# Inclusive cutoff date: EPCs lodged on or after this date are flagged as post-SAP10
+POST_SAP10_DATE = "2025-06-22"
+
 DATA_ANOMALY_SUBSTRINGS = {
    # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for
    # ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained
@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [

 ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]

+POST_SAP10_FEATURE = ["is_post_sap10"]
+
 COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
    "TRANSACTION_TYPE",
    "ENERGY_TARIFF",  # Not sure if this is relevant