diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 682e9e78..5e5d0872 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -4,6 +4,7 @@ import pandas as pd from etl.epc.settings import ( DATA_PROCESSOR_SETTINGS, EARLIEST_EPC_DATE, + POST_SAP10_DATE, # IGNORED_TRANSACTION_TYPES, IGNORED_FLOOR_LEVELS, IGNORED_PROPERTY_TYPES, @@ -159,6 +160,9 @@ class EPCDataProcessor: # colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], # ) + # Create post sap10 flag + self.create_post_sap10_flag() + # When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper cleaning_averages = self.cleaning_averages.copy() if self.run_mode == "newdata": @@ -175,6 +179,13 @@ class EPCDataProcessor: self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step) self.cast_data_columns_to_lower() + def create_post_sap10_flag(self): + """ + Create a flag to indicate if the epc is post sap10 + """ + + self.data["is_post_sap10"] = self.data["LODGEMENT_DATE"] >= POST_SAP10_DATE + def cast_data_columns_to_lower(self): """ Convert all columns names to lower diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 9f427c59..fac58cd9 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -23,6 +23,7 @@ from etl.epc.settings import ( POTENTIAL_COLUMNS, ROOM_FEATURES, COST_FEATURES, + POST_SAP10_FEATURE, ) # TODO: change in setting file @@ -325,7 +326,9 @@ class EPCPipeline: # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES] + variable_data = property_data[ + VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE + ] uprn = str(uprn) epc_records = [ diff --git a/etl/epc/Record.py b/etl/epc/Record.py index d0816034..7552a0c4 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -20,6 +20,7 @@ from etl.epc.settings import ( COMPONENT_FEATURES, EFFICIENCY_FEATURES, ROOM_FEATURES, + POST_SAP10_FEATURE, ) from recommendations.recommendation_utils import estimate_number_of_floors from utils.s3 import read_dataframe_from_s3_parquet @@ -89,6 +90,7 @@ class EPCRecord: co2_emissions_current: float = None number_habitable_rooms: float = None number_heated_rooms: float = None + is_post_sap10: bool = None # u_values_walls = None # u_values_roof = None @@ -277,6 +279,7 @@ class EPCRecord: self.number_heated_rooms: float = float( self.prepared_epc["number_heated_rooms"] ) + self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"]) def _identify_delta_between_prepared_and_original_records(self): """ @@ -385,11 +388,11 @@ class EPCRecord: return df def _clean_floor_height(self): - """ Remaps anomalies in floor height to the average floor height for the property type """ + """Remaps anomalies in floor height to the average floor height for the property type""" floor_height_data = self.cleaning_data[ - (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) & - (self.cleaning_data["built_form"] == self.prepared_epc["built-form"]) - ] + (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) + & (self.cleaning_data["built_form"] == self.prepared_epc["built-form"]) + ] average = floor_height_data["floor_height"].mean() sd = floor_height_data["floor_height"].std() # If we're in the top 0.5 percentile of floor heights, we'll set it to the average @@ -399,14 +402,16 @@ class EPCRecord: self.prepared_epc["floor-height"] = average def _clean_new_build_descriptions(self): - for col in ['roof-description', 'walls-description', 'floor-description']: + for col in ["roof-description", "walls-description", "floor-description"]: self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K") def _clean_constituency(self): """ We handle the single case of finding a missing constituency by using the local authority """ - if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""): + if pd.isnull(self.prepared_epc["constituency"]) or ( + self.prepared_epc["constituency"] == "" + ): if self.prepared_epc["local-authority"] != "E06000044": raise NotImplementedError( "This function is only implemented for Portsmouth, in the single edgecase seen" @@ -595,12 +600,12 @@ class EPCRecord: # We handle the edge case of floor area being 0. We set it to zero and it is cleaned by # _clean_with_data_processor - if self.prepared_epc['total-floor-area'] == 0: + if self.prepared_epc["total-floor-area"] == 0: print( "Edge case of floor area being zero - will set to none and will be cleaned in " "_clean_with_data_processor" ) - self.prepared_epc['total-floor-area'] = None + self.prepared_epc["total-floor-area"] = None def _clean_mains_gas(self): """ @@ -609,12 +614,7 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - mains_gas_map = { - "Y": True, - "N": False, - True: True, - False: False - } + mains_gas_map = {"Y": True, "N": False, True: True, False: False} self.prepared_epc["mains-gas-flag"] = ( None @@ -1064,7 +1064,12 @@ class EPCDifferenceRecord: CARBON_RESPONSE ) - component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES + component_variables = ( + COMPONENT_FEATURES + + EFFICIENCY_FEATURES + + ROOM_FEATURES + + POST_SAP10_FEATURE + ) ending_record = self.record2.get( component_variables + ["lodgement_date"], return_asdict=True, diff --git a/etl/epc/settings.py b/etl/epc/settings.py index ecc56552..47a75def 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -52,6 +52,9 @@ DATA_ANOMALY_MATCHES = { "Unknown", } +# Add the post_sap10 date to indicate if the epc is post sap10 +POST_SAP10_DATE = "2025-06-22" + DATA_ANOMALY_SUBSTRINGS = { # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for # ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained @@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [ ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"] +POST_SAP10_FEATURE = ["is_post_sap10"] + COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [ "TRANSACTION_TYPE", "ENERGY_TARIFF", # Not sure if this is relevant