From 723beaf1041168461220da87ebdee8477d9c3f8c Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 26 Aug 2025 10:44:31 +0100 Subject: [PATCH 1/3] fix pipeline for July 2025 data, keep lodgement date to do analysis on new rdsap standard in ML stage --- .gitignore | 4 +- BaseUtility.py | 21 +- etl/epc/DataProcessor.py | 8 +- etl/epc/Dataset.py | 39 +- recommendations/rdsap_tables.py | 864 +++++++++++++++++++++++++++----- 5 files changed, 775 insertions(+), 161 deletions(-) diff --git a/.gitignore b/.gitignore index 5e247d77..a6538116 100644 --- a/.gitignore +++ b/.gitignore @@ -275,4 +275,6 @@ cache/ */.idea *.png -*.pptx \ No newline at end of file +*.pptx + +local_data* \ No newline at end of file diff --git a/BaseUtility.py b/BaseUtility.py index e799144d..2f990695 100644 --- a/BaseUtility.py +++ b/BaseUtility.py @@ -30,24 +30,25 @@ class Definitions: # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER # was only lodged on the register from 7 March 2010. "Blank" - # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to - # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB) - # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested - # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, - # etc). These records are being published for completeness. An ongoing process to manage these manually added + # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to + # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB) + # lookup tables or OS MasterMap Address Layer 2 data. 
The majority of these addresses have been requested + # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, + # etc). These records are being published for completeness. An ongoing process to manage these manually added # addresses will take time to develop to deal with these and future anomalies. # - # There are several fields within the lodged data where it is possible to enter multiple entries to cater for - # different data_types of build within a single property, i.e. extensions. This results in multiple entries for - # the description fields for floor, roof and wall. For the purposes of this data release only the information - # contained within the first of these multiple entries is being provided. As there are no restrictions on the - # value in this first field it means that sometimes the first field in a multiple entry description field may + # There are several fields within the lodged data where it is possible to enter multiple entries to cater for + # different types of build within a single property, i.e. extensions. This results in multiple entries for + # the description fields for floor, roof and wall. For the purposes of this data release only the information + # contained within the first of these multiple entries is being provided. As there are no restrictions on the + # value in this first field it means that sometimes the first field in a multiple entry description field may # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. "NULL", # We sometimes see fields populated with just an empty string. "", # An older value which rarely shows up but has been seen in the data. 
"UNKNOWN", + "Unknown", } DATA_ANOMALY_SUBSTRINGS = { diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 9655cf77..99987f48 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -48,6 +48,8 @@ construction_age_bounds_map = { "England and Wales: 2003-2006": {"l": 2003, "u": 2006}, "England and Wales: 2007-2011": {"l": 2007, "u": 2011}, "England and Wales: 2012 onwards": {"l": 2012, "u": 3000}, + "England and Wales: 2012-2021": {"l": 2012, "u": 2021}, + "England and Wales: 2022 onwards": {"l": 2022, "u": 3000}, } construction_age_remap = { @@ -384,7 +386,7 @@ class EPCDataProcessor: has_missings = pd.isnull(self.data[col]).sum() while has_missings: self.data = apply_clean( - data=self.data, matching_columns=matching_columns[0: to_index + 1] + data=self.data, matching_columns=matching_columns[0 : to_index + 1] ) has_missings = pd.isnull(self.data[col]).sum() @@ -858,7 +860,9 @@ class EPCDataProcessor: # Fill NaN values with averages for col in cols_to_clean: - data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"]) + data_to_clean[col] = data_to_clean[col].fillna( + data_to_clean[f"{col}_AVERAGE"] + ) data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"]) # If we still have missings data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean()) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 5d3720fc..35bc108e 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -8,7 +8,9 @@ from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes -from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes +from etl.epc_clean.epc_attributes.MainheatControlAttributes import ( + MainheatControlAttributes, +) from 
etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes @@ -169,7 +171,7 @@ class TrainingDataset(BaseDataset): self.df = pd.DataFrame([dataset.difference_record for dataset in datasets]) self._feature_generation() - self._drop_features() + # self._drop_features() self._clean_efficiency_variables() self._null_validation(information="Clean Efficiency Variables") self._expand_description_to_features(cleaned_lookup) @@ -210,11 +212,11 @@ class TrainingDataset(BaseDataset): common_cols = [[col + "_starting", col + "_ending"] for col in common_cols] self.df = self.df.loc[ - :, - no_suffix_cols - + only_ending_cols - + [col for cols in common_cols for col in cols], - ] + :, + no_suffix_cols + + only_ending_cols + + [col for cols in common_cols for col in cols], + ] def _remove_abnormal_change_in_floor_area(self): """ @@ -519,7 +521,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"] ) - ] + ] elif component == "floor": expanded_df = expanded_df[ (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) @@ -536,7 +538,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"] ) - ] + ] elif component == "roof": expanded_df = expanded_df[ (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) @@ -549,7 +551,7 @@ class TrainingDataset(BaseDataset): expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"] ) - ] + ] return expanded_df @@ -695,10 +697,14 @@ class TrainingDataset(BaseDataset): cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key]) # We handle a specific edge case where we're missing information for the original description - descriptions = [x for x in self.df[left_on_starting].unique() if pd.notnull(x)] + descriptions = [ + x for x in self.df[left_on_starting].unique() if 
pd.notnull(x) + ] # take any not in the cleaned lookup missing_descriptions = [ - x for x in descriptions if x not in cleaned_lookup_df_for_key["original_description"].values + x + for x in descriptions + if x not in cleaned_lookup_df_for_key["original_description"].values ] if missing_descriptions: # We handle them here @@ -710,9 +716,12 @@ class TrainingDataset(BaseDataset): cleaned_data.append( { "original_description": x, - "clean_description": desc_cleaner.description.replace("(assumed)", - "").rstrip().capitalize(), - **cleaned + "clean_description": desc_cleaner.description.replace( + "(assumed)", "" + ) + .rstrip() + .capitalize(), + **cleaned, } ) cleaned_lookup_df_for_key = pd.concat( diff --git a/recommendations/rdsap_tables.py b/recommendations/rdsap_tables.py index e56faf7c..14c7f247 100644 --- a/recommendations/rdsap_tables.py +++ b/recommendations/rdsap_tables.py @@ -3,6 +3,7 @@ This script contains standard tables which are defined in rdsap. The most recent based on the 2012 version, however the government is currently working on releasing a new version, and there we will need to re-visit this """ + import pandas as pd age_band_data = [ @@ -11,84 +12,91 @@ age_band_data = [ "England_Wales": "before 1900", "Scotland": "before 1919", "Northern_Ireland": "before 1919", - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "B", "England_Wales": "1900-1929", "Scotland": "1919-1929", "Northern_Ireland": "1919-1929", - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "C", "England_Wales": "1930-1949", "Scotland": "1930-1949", "Northern_Ireland": "1930-1949", - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "D", "England_Wales": "1950-1966", "Scotland": "1950-1964", "Northern_Ireland": "1950-1973", - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "E", "England_Wales": "1967-1975", "Scotland": "1965-1975", "Northern_Ireland": "1974-1977", - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "F", 
"England_Wales": "1976-1982", "Scotland": "1976-1983", "Northern_Ireland": "1978-1985", - "Park_home_UK": "before 1983" + "Park_home_UK": "before 1983", }, { "age_band": "G", "England_Wales": "1983-1990", "Scotland": "1984-1991", "Northern_Ireland": "1986-1991", - "Park_home_UK": "1983-1995" + "Park_home_UK": "1983-1995", }, { "age_band": "H", "England_Wales": "1991-1995", "Scotland": "1992-1998", "Northern_Ireland": "1992-1999", - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "I", "England_Wales": "1996-2002", "Scotland": "1999-2002", "Northern_Ireland": "2000-2006", - "Park_home_UK": "1996-2005" + "Park_home_UK": "1996-2005", }, { "age_band": "J", "England_Wales": "2003-2006", "Scotland": "2003-2007", "Northern_Ireland": None, - "Park_home_UK": None + "Park_home_UK": None, }, { "age_band": "K", "England_Wales": "2007-2011", "Scotland": "2008-2011", "Northern_Ireland": "2007-2013", - "Park_home_UK": "2006 onwards" + "Park_home_UK": "2006 onwards", }, { "age_band": "L", "England_Wales": "2012 onwards", "Scotland": "2012 onwards", "Northern_Ireland": "2014 onwards", - "Park_home_UK": None + "Park_home_UK": None, + }, + { + "age_band": "L", + "England_Wales": "2012-2021", + "Scotland": "2012-2023", + "Northern_Ireland": "2014-2022", + "Park_home_UK": None, }, ] @@ -102,32 +110,109 @@ england_wales_age_band_lookup = { ######################################################################################################################## default_wall_thickness = [ { - "type": "stone", "A": 500, "B": 500, "C": 500, "D": 500, "E": 450, "F": 420, "G": 420, "H": 420, - "I": 450, "J": 450, "K": 450, "L": 450 + "type": "stone", + "A": 500, + "B": 500, + "C": 500, + "D": 500, + "E": 450, + "F": 420, + "G": 420, + "H": 420, + "I": 450, + "J": 450, + "K": 450, + "L": 450, }, { - "type": "solid brick", "A": 220, "B": 220, "C": 220, "D": 220, "E": 240, "F": 250, "G": 270, "H": 270, - "I": 300, "J": 300, "K": 300, "L": 300 + "type": "solid brick", + "A": 220, + 
"B": 220, + "C": 220, + "D": 220, + "E": 240, + "F": 250, + "G": 270, + "H": 270, + "I": 300, + "J": 300, + "K": 300, + "L": 300, }, { - "type": "cavity", "A": 250, "B": 250, "C": 250, "D": 250, "E": 250, "F": 260, "G": 270, "H": 270, - "I": 300, "J": 300, "K": 300, "L": 300 + "type": "cavity", + "A": 250, + "B": 250, + "C": 250, + "D": 250, + "E": 250, + "F": 260, + "G": 270, + "H": 270, + "I": 300, + "J": 300, + "K": 300, + "L": 300, }, { - "type": "timber frame", "A": 150, "B": 150, "C": 150, "D": 250, "E": 270, "F": 270, "G": 270, "H": 270, - "I": 300, "J": 300, "K": 300, "L": 300 + "type": "timber frame", + "A": 150, + "B": 150, + "C": 150, + "D": 250, + "E": 270, + "F": 270, + "G": 270, + "H": 270, + "I": 300, + "J": 300, + "K": 300, + "L": 300, }, { - "type": "cob", "A": 540, "B": 540, "C": 540, "D": 540, "E": 540, "F": 540, "G": 560, "H": 560, "I": 590, - "J": 590, "K": 590, "L": 590 + "type": "cob", + "A": 540, + "B": 540, + "C": 540, + "D": 540, + "E": 540, + "F": 540, + "G": 560, + "H": 560, + "I": 590, + "J": 590, + "K": 590, + "L": 590, }, { - "type": "system build", "A": 250, "B": 250, "C": 250, "D": 250, "E": 250, "F": 300, "G": 300, "H": 300, - "I": 300, "J": 300, "K": 300, "L": 300 + "type": "system build", + "A": 250, + "B": 250, + "C": 250, + "D": 250, + "E": 250, + "F": 300, + "G": 300, + "H": 300, + "I": 300, + "J": 300, + "K": 300, + "L": 300, }, { - "type": "park home", "A": None, "B": None, "C": None, "D": None, "E": None, "F": 50, "G": 50, - "H": None, "I": 75, "J": 100, "K": 100, "L": 100 + "type": "park home", + "A": None, + "B": None, + "C": None, + "D": None, + "E": None, + "F": 50, + "G": 50, + "H": None, + "I": 75, + "J": 100, + "K": 100, + "L": 100, }, ] @@ -170,33 +255,384 @@ wall_types = [ u_values = [ ["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"], ["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"], - ["1.7", "1.7", "1.7", "1.7", "1.7", "1.0", "0.60", "0.60", 
"0.45", "0.35", "0.30", "0.28"], - ["0.55", "0.55", "0.55", "0.55", "0.55", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"], - ["0.32", "0.32", "0.32", "0.32", "0.32", "0.28", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"], - ["0.23", "0.23", "0.23", "0.23", "0.23", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"], - ["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"], - ["0.80", "0.80", "0.80", "0.80", "0.80", "0.80", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"], - ["0.40", "0.40", "0.40", "0.40", "0.40", "0.40", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"], - ["0.26", "0.26", "0.26", "0.26", "0.26", "0.26", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"], - ["0.20", "0.20", "0.20", "0.20", "0.20", "0.20", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"], - ["0.16", "0.16", "0.16", "0.16", "0.16", "0.16", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"], - ["1.5", "1.5", "1.5", "1.5", "1.5", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"], - ["0.53", "0.53", "0.53", "0.53", "0.53", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"], - ["0.32", "0.32", "0.32", "0.32", "0.32", "0.30", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"], - ["0.23", "0.23", "0.23", "0.23", "0.23", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"], - ["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"], - ["0.7", "0.7", "0.7", "0.7", "0.7", "0.40", "0.35", "0.35", "0.45", "0.35", "0.30", "0.28"], - ["0.37", "0.37", "0.37", "0.37", "0.37", "0.27", "0.25", "0.25", "0.25", "0.25", "0.21", "0.21"], - ["0.25", "0.25", "0.25", "0.25", "0.25", "0.20", "0.19", "0.19", "0.19", "0.19", "0.17", "0.16"], - ["0.19", "0.19", "0.19", "0.19", "0.19", "0.16", "0.15", "0.15", "0.15", "0.15", "0.14", "0.14"], - ["0.16", "0.16", "0.16", "0.16", "0.16", "0.13", "0.13", "0.13", "0.13", "0.13", "0.12", "0.12"], - ["2.5", "1.9", "1.9", "1.0", "0.80", "0.45", "0.40", "0.40", "0.40", 
"0.35", "0.30", "0.28"], - ["0.60", "0.55", "0.55", "0.40", "0.40", "0.40", "0.40", "0.40", "0.40", "0.35", "0.30", "0.28"], - ["2.0", "2.0", "2.0", "2.0", "1.7", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"], - ["0.60", "0.60", "0.60", "0.60", "0.55", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"], - ["0.35", "0.35", "0.35", "0.35", "0.35", "0.32", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"], - ["0.25", "0.25", "0.25", "0.25", "0.25", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"], - ["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"], + [ + "1.7", + "1.7", + "1.7", + "1.7", + "1.7", + "1.0", + "0.60", + "0.60", + "0.45", + "0.35", + "0.30", + "0.28", + ], + [ + "0.55", + "0.55", + "0.55", + "0.55", + "0.55", + "0.45", + "0.35", + "0.35", + "0.30", + "0.25", + "0.21", + "0.21", + ], + [ + "0.32", + "0.32", + "0.32", + "0.32", + "0.32", + "0.28", + "0.24", + "0.24", + "0.21", + "0.19", + "0.17", + "0.16", + ], + [ + "0.23", + "0.23", + "0.23", + "0.23", + "0.23", + "0.21", + "0.18", + "0.18", + "0.17", + "0.15", + "0.14", + "0.14", + ], + [ + "0.18", + "0.18", + "0.18", + "0.18", + "0.18", + "0.17", + "0.15", + "0.15", + "0.14", + "0.13", + "0.12", + "0.12", + ], + [ + "0.80", + "0.80", + "0.80", + "0.80", + "0.80", + "0.80", + "0.60", + "0.60", + "0.45", + "0.35", + "0.30", + "0.28", + ], + [ + "0.40", + "0.40", + "0.40", + "0.40", + "0.40", + "0.40", + "0.35", + "0.35", + "0.30", + "0.25", + "0.21", + "0.21", + ], + [ + "0.26", + "0.26", + "0.26", + "0.26", + "0.26", + "0.26", + "0.24", + "0.24", + "0.21", + "0.19", + "0.17", + "0.16", + ], + [ + "0.20", + "0.20", + "0.20", + "0.20", + "0.20", + "0.20", + "0.18", + "0.18", + "0.17", + "0.15", + "0.14", + "0.14", + ], + [ + "0.16", + "0.16", + "0.16", + "0.16", + "0.16", + "0.16", + "0.15", + "0.15", + "0.14", + "0.13", + "0.12", + "0.12", + ], + [ + "1.5", + "1.5", + "1.5", + "1.5", + "1.5", + "1.0", + "0.60", + "0.60", + "0.45", + 
"0.35", + "0.30", + "0.28", + ], + [ + "0.53", + "0.53", + "0.53", + "0.53", + "0.53", + "0.45", + "0.35", + "0.35", + "0.30", + "0.25", + "0.21", + "0.21", + ], + [ + "0.32", + "0.32", + "0.32", + "0.32", + "0.32", + "0.30", + "0.24", + "0.24", + "0.21", + "0.19", + "0.17", + "0.16", + ], + [ + "0.23", + "0.23", + "0.23", + "0.23", + "0.23", + "0.21", + "0.18", + "0.18", + "0.17", + "0.15", + "0.14", + "0.14", + ], + [ + "0.18", + "0.18", + "0.18", + "0.18", + "0.18", + "0.17", + "0.15", + "0.15", + "0.14", + "0.13", + "0.12", + "0.12", + ], + [ + "0.7", + "0.7", + "0.7", + "0.7", + "0.7", + "0.40", + "0.35", + "0.35", + "0.45", + "0.35", + "0.30", + "0.28", + ], + [ + "0.37", + "0.37", + "0.37", + "0.37", + "0.37", + "0.27", + "0.25", + "0.25", + "0.25", + "0.25", + "0.21", + "0.21", + ], + [ + "0.25", + "0.25", + "0.25", + "0.25", + "0.25", + "0.20", + "0.19", + "0.19", + "0.19", + "0.19", + "0.17", + "0.16", + ], + [ + "0.19", + "0.19", + "0.19", + "0.19", + "0.19", + "0.16", + "0.15", + "0.15", + "0.15", + "0.15", + "0.14", + "0.14", + ], + [ + "0.16", + "0.16", + "0.16", + "0.16", + "0.16", + "0.13", + "0.13", + "0.13", + "0.13", + "0.13", + "0.12", + "0.12", + ], + [ + "2.5", + "1.9", + "1.9", + "1.0", + "0.80", + "0.45", + "0.40", + "0.40", + "0.40", + "0.35", + "0.30", + "0.28", + ], + [ + "0.60", + "0.55", + "0.55", + "0.40", + "0.40", + "0.40", + "0.40", + "0.40", + "0.40", + "0.35", + "0.30", + "0.28", + ], + [ + "2.0", + "2.0", + "2.0", + "2.0", + "1.7", + "1.0", + "0.60", + "0.60", + "0.45", + "0.35", + "0.30", + "0.28", + ], + [ + "0.60", + "0.60", + "0.60", + "0.60", + "0.55", + "0.45", + "0.35", + "0.35", + "0.30", + "0.25", + "0.21", + "0.21", + ], + [ + "0.35", + "0.35", + "0.35", + "0.35", + "0.35", + "0.32", + "0.24", + "0.24", + "0.21", + "0.19", + "0.17", + "0.16", + ], + [ + "0.25", + "0.25", + "0.25", + "0.25", + "0.25", + "0.21", + "0.18", + "0.18", + "0.17", + "0.15", + "0.14", + "0.14", + ], + [ + "0.18", + "0.18", + "0.18", + "0.18", + 
"0.18", + "0.17", + "0.15", + "0.15", + "0.14", + "0.13", + "0.12", + "0.12", + ], ] age_bands = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"] @@ -210,8 +646,13 @@ for i, wall_type in enumerate(wall_types): parkhome_wall_uvalues = [ {"Wall_type": "Park home as built", "F": "1.7", "G": "1.2", "I": "0.7", "K": "0.6"}, - {"Wall_type": "Park home with additional insulation", "F": "s1.1.2", "G": "s1.1.2", "I": "s1.1.2", - "K": "s1.1.2"} + { + "Wall_type": "Park home with additional insulation", + "F": "s1.1.2", + "G": "s1.1.2", + "I": "s1.1.2", + "K": "s1.1.2", + }, ] wall_uvalues.extend(parkhome_wall_uvalues) @@ -229,16 +670,12 @@ epc_wall_description_map = { "Cavity wall, as built, insulated": "Filled cavity", "Cavity wall, with external insulation": "Unfilled cavity with 100 mm external or internal insulation", "Cavity wall, insulated": "Filled cavity", - 'Cavity wall, partial insulation': "Filled cavity", - + "Cavity wall, partial insulation": "Filled cavity", "Cavity wall,": "Cavity as built", # General case of cavity wall without further details - "Cavity wall, filled cavity and external insulation": - "Filled cavity with 100 mm external or internal insulation", - "Cavity wall, filled cavity and internal insulation": - "Filled cavity with 100 mm external or internal insulation", + "Cavity wall, filled cavity and external insulation": "Filled cavity with 100 mm external or internal insulation", + "Cavity wall, filled cavity and internal insulation": "Filled cavity with 100 mm external or internal insulation", "Cavity wall, with internal insulation": "Unfilled cavity with 100 mm external or internal insulation", "Cavity wall, no insulation": "Cavity as built", - ############################ # Solid brick wall mappings ############################ @@ -247,7 +684,6 @@ epc_wall_description_map = { "Solid brick, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation", "Solid brick, with external insulation": "Stone/solid 
brick with 100 mm external or internal insulation", "Solid brick, as built, partial insulation": "Stone/solid brick with 50 mm external or internal insulation", - ############################ # Timber frame wall mappings ############################ @@ -262,33 +698,29 @@ epc_wall_description_map = { # Sandstone/limestones wall mappings ############################ "Sandstone or limestone, as built, no insulation": "Stone: sandstone or limestone as built", - "Sandstone or limestone, with internal insulation": - "Stone/solid brick with 100 mm external or internal insulation", + "Sandstone or limestone, with internal insulation": "Stone/solid brick with 100 mm external or internal insulation", "Sandstone or limestone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal " - "insulation", + "insulation", "Sandstone, as built, no insulation": "Stone: sandstone or limestone as built", - "Sandstone or limestone, as built, insulated": - "Stone/solid brick with 100 mm external or internal insulation", + "Sandstone or limestone, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation", "Sandstone, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation", "Sandstone, with internal insulation": "Stone/solid brick with 100 mm external or internal insulation", "Sandstone or limestone, with external insulation": "Stone/solid brick with 100 mm external or internal " - "insulation", + "insulation", "Sandstone, with external insulation": "Stone/solid brick with 100 mm external or internal insulation", "Sandstone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal insulation", - ############################ # Granite/whinstone wall mappings ############################ "Granite or whinstone, as built, no insulation": "Stone: granite or whinstone as built", "Granite or whinstone, with internal insulation": "Stone/solid brick with 100 mm external or internal " - 
"insulation", + "insulation", "Granite or whinstone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal " - "insulation", + "insulation", "Granite or whinstone, as built, insulated": "Stone/solid brick with 100 mm external or internal " - "insulation", + "insulation", "Granite or whinstone, with external insulation": "Stone/solid brick with 100 mm external or internal " - "insulation", - + "insulation", ############################ # System built wall mappings ############################ @@ -297,15 +729,13 @@ epc_wall_description_map = { "System built, with internal insulation": "System build with 100 mm external or internal insulation", "System built, with external insulation": "System build with 100 mm external or internal insulation", "System built, as built, insulated": "System build with 100 mm external or internal insulation", - ############################ # Cob wall mappings ############################ "Cob, as built": "Cob as built", "Cob, with external insulation": "Cob with 100 mm external or internal insulation", "Cob, with internal insulation": "Cob with 100 mm external or internal insulation", - 'Cob,': "Cob as built", - + "Cob,": "Cob as built", ############################ # Park home mappings ############################ @@ -321,20 +751,71 @@ epc_wall_description_map = { ######################################################################################################################## s9_list = [ - {"Insulation_thickness_mm": None, "Slates_or_tiles_U_value_W_m2K": 2.3, "Thatched_roof_U_value_W_m2K": 0.35}, - {"Insulation_thickness_mm": 12, "Slates_or_tiles_U_value_W_m2K": 1.5, "Thatched_roof_U_value_W_m2K": 0.32}, - {"Insulation_thickness_mm": 25, "Slates_or_tiles_U_value_W_m2K": 1.0, "Thatched_roof_U_value_W_m2K": 0.30}, - {"Insulation_thickness_mm": 50, "Slates_or_tiles_U_value_W_m2K": 0.68, "Thatched_roof_U_value_W_m2K": 0.25}, - {"Insulation_thickness_mm": 75, "Slates_or_tiles_U_value_W_m2K": 0.50, 
"Thatched_roof_U_value_W_m2K": 0.22}, - {"Insulation_thickness_mm": 100, "Slates_or_tiles_U_value_W_m2K": 0.40, "Thatched_roof_U_value_W_m2K": 0.20}, - {"Insulation_thickness_mm": 150, "Slates_or_tiles_U_value_W_m2K": 0.30, "Thatched_roof_U_value_W_m2K": 0.17}, - {"Insulation_thickness_mm": 200, "Slates_or_tiles_U_value_W_m2K": 0.21, "Thatched_roof_U_value_W_m2K": 0.14}, - {"Insulation_thickness_mm": 250, "Slates_or_tiles_U_value_W_m2K": 0.17, "Thatched_roof_U_value_W_m2K": 0.12}, - {"Insulation_thickness_mm": 270, "Slates_or_tiles_U_value_W_m2K": 0.16, "Thatched_roof_U_value_W_m2K": 0.12}, - {"Insulation_thickness_mm": 300, "Slates_or_tiles_U_value_W_m2K": 0.14, "Thatched_roof_U_value_W_m2K": 0.11}, - {"Insulation_thickness_mm": 350, "Slates_or_tiles_U_value_W_m2K": 0.12, "Thatched_roof_U_value_W_m2K": 0.10}, - {"Insulation_thickness_mm": 400, "Slates_or_tiles_U_value_W_m2K": 0.11, - "Thatched_roof_U_value_W_m2K": 0.09}, + { + "Insulation_thickness_mm": None, + "Slates_or_tiles_U_value_W_m2K": 2.3, + "Thatched_roof_U_value_W_m2K": 0.35, + }, + { + "Insulation_thickness_mm": 12, + "Slates_or_tiles_U_value_W_m2K": 1.5, + "Thatched_roof_U_value_W_m2K": 0.32, + }, + { + "Insulation_thickness_mm": 25, + "Slates_or_tiles_U_value_W_m2K": 1.0, + "Thatched_roof_U_value_W_m2K": 0.30, + }, + { + "Insulation_thickness_mm": 50, + "Slates_or_tiles_U_value_W_m2K": 0.68, + "Thatched_roof_U_value_W_m2K": 0.25, + }, + { + "Insulation_thickness_mm": 75, + "Slates_or_tiles_U_value_W_m2K": 0.50, + "Thatched_roof_U_value_W_m2K": 0.22, + }, + { + "Insulation_thickness_mm": 100, + "Slates_or_tiles_U_value_W_m2K": 0.40, + "Thatched_roof_U_value_W_m2K": 0.20, + }, + { + "Insulation_thickness_mm": 150, + "Slates_or_tiles_U_value_W_m2K": 0.30, + "Thatched_roof_U_value_W_m2K": 0.17, + }, + { + "Insulation_thickness_mm": 200, + "Slates_or_tiles_U_value_W_m2K": 0.21, + "Thatched_roof_U_value_W_m2K": 0.14, + }, + { + "Insulation_thickness_mm": 250, + "Slates_or_tiles_U_value_W_m2K": 0.17, + 
"Thatched_roof_U_value_W_m2K": 0.12, + }, + { + "Insulation_thickness_mm": 270, + "Slates_or_tiles_U_value_W_m2K": 0.16, + "Thatched_roof_U_value_W_m2K": 0.12, + }, + { + "Insulation_thickness_mm": 300, + "Slates_or_tiles_U_value_W_m2K": 0.14, + "Thatched_roof_U_value_W_m2K": 0.11, + }, + { + "Insulation_thickness_mm": 350, + "Slates_or_tiles_U_value_W_m2K": 0.12, + "Thatched_roof_U_value_W_m2K": 0.10, + }, + { + "Insulation_thickness_mm": 400, + "Slates_or_tiles_U_value_W_m2K": 0.11, + "Thatched_roof_U_value_W_m2K": 0.09, + }, ] s10_list = [ @@ -347,7 +828,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 2.3, "Thatched_roof": 0.35, "Thatched_roof_room_in_roof": 0.25, - "Park_home": None + "Park_home": None, }, { "Age_band": "E", @@ -358,7 +839,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 1.5, "Thatched_roof": 0.35, "Thatched_roof_room_in_roof": 0.25, - "Park_home": None + "Park_home": None, }, { "Age_band": "F", @@ -369,7 +850,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 0.80, "Thatched_roof": 0.35, "Thatched_roof_room_in_roof": 0.25, - "Park_home": 1.7 + "Park_home": 1.7, }, { "Age_band": "G", @@ -380,7 +861,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": "0.50", "Thatched_roof": 0.35, "Thatched_roof_room_in_roof": 0.25, - "Park_home": 0.6 + "Park_home": 0.6, }, { "Age_band": "H", @@ -391,7 +872,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 0.35, "Thatched_roof": 0.35, "Thatched_roof_room_in_roof": 0.25, - "Park_home": None + "Park_home": None, }, { "Age_band": "I", @@ -402,7 +883,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 0.35, "Thatched_roof": 0.35, "Thatched_roof_room_in_roof": 0.25, - "Park_home": 0.35 + "Park_home": 0.35, }, { "Age_band": "J", @@ -413,7 +894,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 0.30, "Thatched_roof": 0.30, "Thatched_roof_room_in_roof": 0.25, - "Park_home": None + "Park_home": None, }, { "Age_band": "K", @@ -424,7 +905,7 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 0.25, "Thatched_roof": 0.25, 
"Thatched_roof_room_in_roof": 0.25, - "Park_home": 0.30 + "Park_home": 0.30, }, { "Age_band": "L", @@ -435,8 +916,8 @@ s10_list = [ "Room_in_roof_slates_or_tiles": 0.18, "Thatched_roof": 0.18, "Thatched_roof_room_in_roof": 0.18, - "Park_home": None - } + "Park_home": None, + }, ] table_s9 = pd.DataFrame(s9_list) @@ -452,22 +933,70 @@ table_s10 = pd.DataFrame(s10_list) ######################################################################################################################## s11_list = [ - {"Age_band": "A, B", "Floor_construction": "suspended timber", "England_Wales": 0, "Scotland": 0, - "Northern_Ireland": 0, "Park_home": 0}, - {"Age_band": "C to F", "Floor_construction": "solid", "England_Wales": 0, "Scotland": 0, - "Northern_Ireland": 0, "Park_home": 0}, - {"Age_band": "G", "Floor_construction": "solid", "England_Wales": 0, "Scotland": 0, - "Northern_Ireland": 0, "Park_home": 25}, - {"Age_band": "H", "Floor_construction": "solid", "England_Wales": 0, "Scotland": 25, - "Northern_Ireland": 25, "Park_home": 0}, - {"Age_band": "I", "Floor_construction": "solid", "England_Wales": 25, "Scotland": 50, - "Northern_Ireland": 50, "Park_home": 50}, - {"Age_band": "J", "Floor_construction": "solid", "England_Wales": 75, "Scotland": 75, - "Northern_Ireland": 0, "Park_home": 0}, - {"Age_band": "K", "Floor_construction": "solid", "England_Wales": 100, "Scotland": 100, - "Northern_Ireland": 100, "Park_home": 70}, - {"Age_band": "L", "Floor_construction": "solid", "England_Wales": 100, "Scotland": 120, - "Northern_Ireland": 100, "Park_home": 0}, + { + "Age_band": "A, B", + "Floor_construction": "suspended timber", + "England_Wales": 0, + "Scotland": 0, + "Northern_Ireland": 0, + "Park_home": 0, + }, + { + "Age_band": "C to F", + "Floor_construction": "solid", + "England_Wales": 0, + "Scotland": 0, + "Northern_Ireland": 0, + "Park_home": 0, + }, + { + "Age_band": "G", + "Floor_construction": "solid", + "England_Wales": 0, + "Scotland": 0, + "Northern_Ireland": 0, + 
"Park_home": 25, + }, + { + "Age_band": "H", + "Floor_construction": "solid", + "England_Wales": 0, + "Scotland": 25, + "Northern_Ireland": 25, + "Park_home": 0, + }, + { + "Age_band": "I", + "Floor_construction": "solid", + "England_Wales": 25, + "Scotland": 50, + "Northern_Ireland": 50, + "Park_home": 50, + }, + { + "Age_band": "J", + "Floor_construction": "solid", + "England_Wales": 75, + "Scotland": 75, + "Northern_Ireland": 0, + "Park_home": 0, + }, + { + "Age_band": "K", + "Floor_construction": "solid", + "England_Wales": 100, + "Scotland": 100, + "Northern_Ireland": 100, + "Park_home": 70, + }, + { + "Age_band": "L", + "Floor_construction": "solid", + "England_Wales": 100, + "Scotland": 120, + "Northern_Ireland": 100, + "Park_home": 0, + }, ] table_s11 = pd.DataFrame(s11_list) @@ -481,21 +1010,90 @@ table_s11 = pd.DataFrame(s11_list) ######################################################################################################################## s12_list = [ - {"age_band": "A", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "B", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "C", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "D", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "E", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "F", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "G", "insulation_0": 1.2, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - - {"age_band": "H", "insulation_0": 0.51, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - {"age_band": "I", "insulation_0": 0.51, "insulation_50": 0.5, "insulation_100": 0.3, "insulation_150": 0.22}, - - 
{"age_band": "J", "insulation_0": 0.25, "insulation_50": 0.25, "insulation_100": 0.25, "insulation_150": 0.22}, - - {"age_band": "K", "insulation_0": 0.22, "insulation_50": 0.22, "insulation_100": 0.22, "insulation_150": 0.22}, - {"age_band": "L", "insulation_0": 0.22, "insulation_50": 0.22, "insulation_100": 0.22, "insulation_150": 0.22}, + { + "age_band": "A", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "B", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "C", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "D", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "E", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "F", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "G", + "insulation_0": 1.2, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "H", + "insulation_0": 0.51, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "I", + "insulation_0": 0.51, + "insulation_50": 0.5, + "insulation_100": 0.3, + "insulation_150": 0.22, + }, + { + "age_band": "J", + "insulation_0": 0.25, + "insulation_50": 0.25, + "insulation_100": 0.25, + "insulation_150": 0.22, + }, + { + "age_band": "K", + "insulation_0": 0.22, + "insulation_50": 0.22, + "insulation_100": 0.22, + "insulation_150": 0.22, + }, + { + "age_band": "L", + "insulation_0": 0.22, + "insulation_50": 0.22, + "insulation_100": 0.22, + "insulation_150": 0.22, + }, ] table_s12 = pd.DataFrame(s12_list) From 92fcbe8cdb86b58be282cb0c97ae8bb183307954 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: 
Sat, 1 Nov 2025 15:34:45 +0000 Subject: [PATCH 2/3] amend etl code for new october data --- etl/epc/DataProcessor.py | 11 +- etl/epc/Pipeline.py | 10 ++ etl/epc/property_change_app.py | 2 +- .../epc_attributes/FloorAttributes.py | 59 ++++--- .../epc_attributes/RoofAttributes.py | 89 ++++++++--- .../epc_attributes/WallAttributes.py | 146 ++++++++++-------- recommendations/rdsap_tables.py | 102 ++++++++++-- recommendations/recommendation_utils.py | 4 +- 8 files changed, 295 insertions(+), 128 deletions(-) diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 41dca943..682e9e78 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -21,7 +21,7 @@ from etl.epc.settings import ( ENDING_SUFFIX_COMPONENT_COLS, POTENTIAL_COLUMNS, EFFICIENCY_FEATURES, - DATA_ANOMALY_MATCHES + DATA_ANOMALY_MATCHES, ) from recommendations.rdsap_tables import FLOOR_LEVEL_MAP @@ -249,7 +249,8 @@ class EPCDataProcessor: # Map all anomaly values to None data_anomaly_map = dict( zip( - DATA_ANOMALY_MATCHES, [None] * len(DATA_ANOMALY_MATCHES), + DATA_ANOMALY_MATCHES, + [None] * len(DATA_ANOMALY_MATCHES), ) ) @@ -749,6 +750,12 @@ class EPCDataProcessor: self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])] self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])] + # Remove any walls described as Basement walls since these are non-standard + # TODO: CHECK IF WE SHOULD MAP THESE U VALUES INSTEAD + index_to_remove = self.data["WALLS_DESCRIPTION"] == "Basement wall" + print(f"Removing {index_to_remove.sum()} records with basement walls") + self.data = self.data[~index_to_remove] + # Because park homes are surveyed unusually (for example, we don't have u-values to # look up for their different components, they need to be collected in survey and aren't reflected in # EPCs) we'll ignore them from the model diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index c03abfcf..9f427c59 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ 
-112,6 +112,16 @@ clean_lookup["mainheatcont-description"] = new_mainheatcont_mapping.to_dict( orient="records" ) +# TEMP FIX - GRANITE OR WHINSTONE BOOLEAN ISSUE +new_walls_description_mapping = pd.DataFrame(clean_lookup["walls-description"]) +new_walls_description_mapping.loc[ + new_walls_description_mapping["original_description"].str.contains("Granite"), + "is_granite_or_whinstone", +] = True +clean_lookup["walls-description"] = new_walls_description_mapping.to_dict( + orient="records" +) + class EPCPipeline: """ diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index c985567d..cdb7cfb8 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -12,7 +12,7 @@ def main(): """ directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] - # directories = directories[0:3] + # directories = directories[235:275] epc_pipeline = EPCPipeline( directories=directories, diff --git a/etl/epc_clean/epc_attributes/FloorAttributes.py b/etl/epc_clean/epc_attributes/FloorAttributes.py index 6def93f0..23c7dd8e 100644 --- a/etl/epc_clean/epc_attributes/FloorAttributes.py +++ b/etl/epc_clean/epc_attributes/FloorAttributes.py @@ -1,17 +1,26 @@ import re from typing import Dict, Union from BaseUtility import Definitions -from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types +from etl.epc_clean.epc_attributes.attribute_utils import ( + extract_thermal_transmittance, + extract_component_types, +) class FloorAttributes(Definitions): DWELLING_BELOW = ["another dwelling below", "other premises below"] - FLOOR_TYPES = ["assumed", "to unheated space", "to external air", "suspended", "solid"] + FLOOR_TYPES = [ + "assumed", + "to unheated space", + "to external air", + "suspended", + "solid", + ] # For the short term, while we are still exploring the data, we maintain a list of error cases which # we want to ignore and consider as no data. 
- OBSERVED_ERRORS = ["Conservatory", "insulated"] + OBSERVED_ERRORS = ["Conservatory", "insulated", "Basement"] WELSH_TEXT = { "(anheddiad arall islaw)": "(another dwelling below)", @@ -35,32 +44,40 @@ class FloorAttributes(Definitions): "i ofod heb ei wresogi, heb ei inswleiddio (rhagdybiaeth)": "to unheated space, no insulation (assumed)", "i ofod heb ei wresogi, dim inswleiddio": "to unheated space, no insulation", "igçör awyr y tu allan, wedigçöi inswleiddio (rhagdybiaeth)": "to external air, insulated (assumed)", - "crog, inswleiddio cyfyngedig (rhagdybiaeth)": "suspended, limited insulation (assumed)" + "crog, inswleiddio cyfyngedig (rhagdybiaeth)": "suspended, limited insulation (assumed)", } def __init__(self, description: str): self.description: str = description.lower() - self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or ( - description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor") + self.nodata = ( + (not description) + or (description in self.DATA_ANOMALY_MATCHES) + or (description in self.OBSERVED_ERRORS) + or (self.description == "sap05:floor") + ) # Try and perform a translation, incase it's in welsh self.translate_welsh_text() if not self.nodata and not any( - rt in self.description for rt in - self.FLOOR_TYPES + self.DWELLING_BELOW + ["average thermal transmittance"] + rt in self.description + for rt in self.FLOOR_TYPES + + self.DWELLING_BELOW + + ["average thermal transmittance"] ): - raise ValueError('Invalid description') + raise ValueError("Invalid description") def translate_welsh_text(self): uvalue_match = re.search( - r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k', self.description + r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k", + self.description, ) uvalue_match2 = re.search( - r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k', self.description + r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k", + self.description, ) # Step 2: Generalized 
translation with placeholder @@ -69,7 +86,7 @@ class FloorAttributes(Definitions): uvalue = uvalue_match.group(1) else: uvalue = uvalue_match2.group(1) - self.description = f'average thermal transmittance {uvalue} w/m-¦k' + self.description = f"average thermal transmittance {uvalue} w/m-¦k" else: translation = self.WELSH_TEXT.get(self.description) @@ -89,11 +106,15 @@ class FloorAttributes(Definitions): result, description = extract_thermal_transmittance(result, description) # floor type - result, description = extract_component_types(result, description, list_of_components=self.FLOOR_TYPES) + result, description = extract_component_types( + result, description, list_of_components=self.FLOOR_TYPES + ) # check if there is another dwelling below - result['another_property_below'] = "(another dwelling below)" in description or "(other premises below)" in \ - description + result["another_property_below"] = ( + "(another dwelling below)" in description + or "(other premises below)" in description + ) thickness_map = { "external insulation": "average", @@ -102,17 +123,17 @@ class FloorAttributes(Definitions): "partial insulation": "below average", "no insulation": "none", "additional insulation": "above average", - "insulated": "average" + "insulated": "average", } for key, value in thickness_map.items(): if key in description: - result['insulation_thickness'] = value + result["insulation_thickness"] = value break else: - result['insulation_thickness'] = None + result["insulation_thickness"] = None if result["another_property_below"]: result["thermal_transmittance"] = 0 - result["thermal_transmittance_unit"] = 'w/m-¦k' + result["thermal_transmittance_unit"] = "w/m-¦k" return result diff --git a/etl/epc_clean/epc_attributes/RoofAttributes.py b/etl/epc_clean/epc_attributes/RoofAttributes.py index 2eacc951..153fb548 100644 --- a/etl/epc_clean/epc_attributes/RoofAttributes.py +++ b/etl/epc_clean/epc_attributes/RoofAttributes.py @@ -1,12 +1,28 @@ import re from typing import 
Dict, Union from BaseUtility import Definitions -from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance +from etl.epc_clean.epc_attributes.attribute_utils import ( + extract_component_types, + extract_thermal_transmittance, +) class RoofAttributes(Definitions): - ROOF_TYPES = ['pitched', 'roof room', 'loft', 'flat', 'thatched', 'at rafters', 'assumed'] - DWELLING_ABOVE = ["another dwelling above", "other premises above", "other dwelling above"] + ROOF_TYPES = [ + "pitched", + "roof room", + "loft", + "flat", + "thatched", + "at rafters", + "assumed", + ] + DWELLING_ABOVE = [ + "another dwelling above", + "other premises above", + "other dwelling above", + "(same dwelling above)", + ] WELSH_TEXT = { "ar oleddf, dim inswleiddio": "pitched, no insulation", @@ -18,10 +34,10 @@ class RoofAttributes(Definitions): "ar oleddf, wedi?i inswleiddio": "pitched, insulated", "ar oleddf, inswleiddio cyfyngedig (rhagdybiaeth)": "pitched, limited insulation (assumed)", "ar oleddf, inswleiddio cyfyngedig": "pitched, limited insulation", - "ar oleddf, wedigçöi inswleiddio wrth y trawstiau": 'pitched, insulated at rafters', - "ar oleddf, wedi?i inswleiddio wrth y trawstiau": 'pitched, insulated at rafters', - "ar oleddf, wedi?i inswleiddio wrth y trawstia": 'pitched, insulated at rafters', - "ar oleddf, wedigçöi inswleiddio wrth y trawstia": 'pitched, insulated at rafters', + "ar oleddf, wedigçöi inswleiddio wrth y trawstiau": "pitched, insulated at rafters", + "ar oleddf, wedi?i inswleiddio wrth y trawstiau": "pitched, insulated at rafters", + "ar oleddf, wedi?i inswleiddio wrth y trawstia": "pitched, insulated at rafters", + "ar oleddf, wedigçöi inswleiddio wrth y trawstia": "pitched, insulated at rafters", "yn wastad, inswleiddio cyfyngedig (rhagdybiaeth)": "flat, limited insulation (assumed)", "yn wastad, inswleiddio cyfyngedig": "flat, limited insulation", "yn wastad, dim inswleiddio (rhagdybiaeth)": "flat, no insulation 
(assumed)", @@ -43,9 +59,18 @@ class RoofAttributes(Definitions): } DEFAULT_KEYS = [ - 'thermal_transmittance', 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', - 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', 'has_dwelling_above', - 'is_valid', 'insulation_thickness' + "thermal_transmittance", + "thermal_transmittance_unit", + "is_pitched", + "is_roof_room", + "is_loft", + "is_flat", + "is_thatched", + "is_at_rafters", + "is_assumed", + "has_dwelling_above", + "is_valid", + "insulation_thickness", ] def __init__(self, description: str): @@ -54,14 +79,21 @@ class RoofAttributes(Definitions): """ self.description: str = description.lower().strip() - self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or self.description == "sap05:roof" + self.nodata = ( + not description + or description in self.DATA_ANOMALY_MATCHES + or self.description == "sap05:roof" + ) self.welsh_translation_search() if not self.nodata and not any( - rt in self.description for rt in self.ROOF_TYPES + self.DWELLING_ABOVE + ["average thermal transmittance"] + rt in self.description + for rt in self.ROOF_TYPES + + self.DWELLING_ABOVE + + ["average thermal transmittance"] ): - raise ValueError('Invalid description') + raise ValueError("Invalid description") def welsh_translation_search(self): """ @@ -76,7 +108,7 @@ class RoofAttributes(Definitions): r"ar oleddf, (\d+ mm) lo inswleiddio yn y llof", r"ar oleddf, (\d+\+ mm) lo inswleiddio yn y llof", r"ar oleddf, (\d+mm) o inswleiddio yn y llofft", - r"ar oleddf, (\d+\+ mm) o inswleiddio yn y llofft" + r"ar oleddf, (\d+\+ mm) o inswleiddio yn y llofft", ] li_thickness_match = None for regex in loft_insulation_regexes: @@ -84,9 +116,14 @@ class RoofAttributes(Definitions): if li_thickness_match: break - uvalue_search = re.search(r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k", self.description) + uvalue_search = re.search( + r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k", + 
self.description, + ) uvalue_search2 = re.search( - r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k', self.description, re.IGNORECASE + r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k", + self.description, + re.IGNORECASE, ) # Step 2: Generalized translation with placeholder @@ -121,9 +158,13 @@ class RoofAttributes(Definitions): result, description = extract_thermal_transmittance(result, description) # roof type - result, description = extract_component_types(result, description, list_of_components=self.ROOF_TYPES) + result, description = extract_component_types( + result, description, list_of_components=self.ROOF_TYPES + ) - result["has_dwelling_above"] = any([x in description for x in self.DWELLING_ABOVE]) + result["has_dwelling_above"] = any( + [x in description for x in self.DWELLING_ABOVE] + ) for dwelling_above in self.DWELLING_ABOVE: description = description.replace(dwelling_above, "") @@ -136,7 +177,7 @@ class RoofAttributes(Definitions): # Search for a regular expression that matches 150 insulation match = re.search(r"(\d+\+?)\s*insulation", description) if match: - result['insulation_thickness'] = match.group(1) + result["insulation_thickness"] = match.group(1) # insulation thickness thickness_map = { @@ -149,21 +190,21 @@ class RoofAttributes(Definitions): } for key, value in thickness_map.items(): if key in description: - result['insulation_thickness'] = value + result["insulation_thickness"] = value # Remove the match from the description # description = description.replace(key, "") break # Extract insulation thickness in mm, if present - match = re.search(r'(\d+\+?)\s*mm', description) + match = re.search(r"(\d+\+?)\s*mm", description) if match: - result['insulation_thickness'] = match.group(1) + result["insulation_thickness"] = match.group(1) if "insulation_thickness" not in result: - result['insulation_thickness'] = None + result["insulation_thickness"] = None if result["has_dwelling_above"]: result["thermal_transmittance"] = 
0 - result["thermal_transmittance_unit"] = 'w/m-¦k' + result["thermal_transmittance_unit"] = "w/m-¦k" return result diff --git a/etl/epc_clean/epc_attributes/WallAttributes.py b/etl/epc_clean/epc_attributes/WallAttributes.py index 49252552..8cf32a0b 100644 --- a/etl/epc_clean/epc_attributes/WallAttributes.py +++ b/etl/epc_clean/epc_attributes/WallAttributes.py @@ -3,76 +3,78 @@ from typing import Dict, Union from BaseUtility import Definitions from etl.epc_clean.epc_attributes.attribute_utils import ( extract_component_types, - extract_thermal_transmittance + extract_thermal_transmittance, ) class WallAttributes(Definitions): - WALL_TYPES = ['cavity wall', 'filled cavity', 'solid brick', 'system built', 'timber frame', 'granite or whinstone', - 'as built', 'cob', 'assumed', 'sandstone or limestone', "park home"] + WALL_TYPES = [ + "cavity wall", + "filled cavity", + "solid brick", + "system built", + "timber frame", + "granite or whinstone", + "as built", + "cob", + "assumed", + "sandstone or limestone", + "park home", + ] WELSH_TEXT = { - "Briciau solet, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": - "Solid brick, as built, no insulation (assumed)", - 'Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)': - 'Cavity wall, as built, partial insulation (assumed)', - 'Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol': - 'Cavity wall, as built, partial insulation', - 'Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)': - 'Cavity wall, as built, no insulation (assumed)', - 'Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio': - 'Cavity wall, as built, no insulation', - 'Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)': - 'Sandstone or limestone, as built, no insulation (assumed)', - 'Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio': - 'Sandstone or limestone, as built, no insulation', - 'Waliau ceudod, ceudod wediGÇÖi lenwi': 'Cavity wall, filled cavity', - 'Waliau ceudod, fel yGÇÖu 
hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)': - 'Cavity wall, as built, insulated (assumed)', - 'Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio': - 'Cavity wall, as built, insulated', - 'Gwenithfaen neu risgraig, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)': - 'Granite or whinstone, as built, no insulation (assumed)', - 'Waliau ceudod,': 'Cavity wall, as built, no insulation', - 'Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)': - 'Timber frame, as built, insulated (assumed)', - 'Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio': - 'Timber frame, as built, insulated', - 'Gwenithfaen neu risgraig, gydag inswleiddio allanol': 'Granite or whinstone, with external insulation', - 'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)': - 'System built, as built, no insulation (assumed)', - 'Tywodfaen, gydag inswleiddio mewnol': 'Sandstone or limestone, with internal insulation', - 'Waliau ceudod, ynysydd allanol a llenwi ceudod': 'Cavity wall, filled cavity and external insulation', - 'Gwenithfaen neu risgraig, gydag inswleiddio mewnol': 'Granite or whinstone, with internal insulation', - 'Ffr+óm bren, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)': - 'Timber frame, as built, partial insulation (assumed)', - 'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)': - 'System built, as built, insulated (assumed)', - 'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio': - 'System built, as built, insulated', - 'WediGÇÖu hadeiladu yn +¦l system, gydag inswleiddio allanol': 'System built, with external insulation', - 'Briciau solet, gydag inswleiddio mewnol': 'Solid brick, with internal insulation', - 'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)': - 'System built, as built, partial insulation (assumed)', - 'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu 
hadeiladwyd, inswleiddio rhannol': - 'System built, as built, partial insulation', - 'Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)': - 'Timber frame, as built, no insulation (assumed)', - 'Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio': - 'Timber frame, as built, no insulation', - 'Tywodfaen, gydag inswleiddio allanol': 'Sandstone or limestone, with external insulation', - 'Waliau ceudod, gydag inswleiddio allanol': 'Cavity wall, with external insulation', - 'Briciau solet, gydag inswleiddio allanol': 'Solid brick, with external insulation', + "Briciau solet, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Solid brick, as built, no insulation (assumed)", + "Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)": "Cavity wall, as built, partial insulation (assumed)", + "Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol": "Cavity wall, as built, partial insulation", + "Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Cavity wall, as built, no insulation (assumed)", + "Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio": "Cavity wall, as built, no insulation", + "Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Sandstone or limestone, as built, no insulation (assumed)", + "Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio": "Sandstone or limestone, as built, no insulation", + "Waliau ceudod, ceudod wediGÇÖi lenwi": "Cavity wall, filled cavity", + "Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)": "Cavity wall, as built, insulated (assumed)", + "Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio": "Cavity wall, as built, insulated", + "Gwenithfaen neu risgraig, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Granite or whinstone, as built, no insulation (assumed)", + "Waliau ceudod,": "Cavity wall, as built, no insulation", + "Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)": "Timber frame, as 
built, insulated (assumed)", + "Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio": "Timber frame, as built, insulated", + "Gwenithfaen neu risgraig, gydag inswleiddio allanol": "Granite or whinstone, with external insulation", + "WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "System built, as built, no insulation (assumed)", + "Tywodfaen, gydag inswleiddio mewnol": "Sandstone or limestone, with internal insulation", + "Waliau ceudod, ynysydd allanol a llenwi ceudod": "Cavity wall, filled cavity and external insulation", + "Gwenithfaen neu risgraig, gydag inswleiddio mewnol": "Granite or whinstone, with internal insulation", + "Ffr+óm bren, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)": "Timber frame, as built, partial insulation (assumed)", + "WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)": "System built, as built, insulated (assumed)", + "WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio": "System built, as built, insulated", + "WediGÇÖu hadeiladu yn +¦l system, gydag inswleiddio allanol": "System built, with external insulation", + "Briciau solet, gydag inswleiddio mewnol": "Solid brick, with internal insulation", + "WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)": "System built, as built, partial insulation (assumed)", + "WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol": "System built, as built, partial insulation", + "Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Timber frame, as built, no insulation (assumed)", + "Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio": "Timber frame, as built, no insulation", + "Tywodfaen, gydag inswleiddio allanol": "Sandstone or limestone, with external insulation", + "Waliau ceudod, gydag inswleiddio allanol": "Cavity wall, with external insulation", + "Briciau solet, gydag inswleiddio 
allanol": "Solid brick, with external insulation", # Add in some corrections: - 'Co with external insulation': 'Cob, with external insulation', - 'Cowith external insulation': 'Cob, with external insulation', + "Co with external insulation": "Cob, with external insulation", + "Cowith external insulation": "Cob, with external insulation", } DEFAULT_KEYS = [ - 'thermal_transmittance', 'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity', - 'is_solid_brick', 'is_system_built', 'is_timber_frame', 'is_granite_or_whinstone', - 'is_as_built', 'is_cob', 'is_assumed', 'is_sandstone_or_limestone', - 'insulation_thickness', 'external_insulation', 'internal_insulation' + "thermal_transmittance", + "thermal_transmittance_unit", + "is_cavity_wall", + "is_filled_cavity", + "is_solid_brick", + "is_system_built", + "is_timber_frame", + "is_granite_or_whinstone", + "is_as_built", + "is_cob", + "is_assumed", + "is_sandstone_or_limestone", + "insulation_thickness", + "external_insulation", + "internal_insulation", ] CORRECTIONS = { @@ -98,7 +100,9 @@ class WallAttributes(Definitions): :return: """ - uvalue_search = re.search(r"Trawsyriannedd thermol cyfartalog (\d+\.?\d*)", self.description) + uvalue_search = re.search( + r"Trawsyriannedd thermol cyfartalog (\d+\.?\d*)", self.description + ) if uvalue_search: uvalue = uvalue_search.group(1) @@ -123,7 +127,9 @@ class WallAttributes(Definitions): result, description = extract_thermal_transmittance(result, description) # wall type - result, description = extract_component_types(result, description, list_of_components=self.WALL_TYPES) + result, description = extract_component_types( + result, description, list_of_components=self.WALL_TYPES + ) # Handle some edge cases if "sandstone" in description and not result["is_sandstone_or_limestone"]: @@ -137,18 +143,18 @@ class WallAttributes(Definitions): "partial insulation": "below average", "no insulation": "none", "additional insulation": "above average", - "insulated": 
"average" + "insulated": "average", } for key, value in thickness_map.items(): if key in description: - result['insulation_thickness'] = value + result["insulation_thickness"] = value break else: - result['insulation_thickness'] = None + result["insulation_thickness"] = None # insulation type - result['external_insulation'] = 'external insulation' in description - result['internal_insulation'] = 'internal insulation' in description + result["external_insulation"] = "external insulation" in description + result["internal_insulation"] = "internal insulation" in description if result["is_filled_cavity"]: # If it has a filled cavity + internal/external insulation, it's deemed to have above average insulation @@ -159,7 +165,11 @@ class WallAttributes(Definitions): else: result["insulation_thickness"] = "average" - if result["is_cavity_wall"] & result["is_as_built"] & (result["insulation_thickness"] == "average"): + if ( + result["is_cavity_wall"] + & result["is_as_built"] + & (result["insulation_thickness"] == "average") + ): result["is_filled_cavity"] = True return result diff --git a/recommendations/rdsap_tables.py b/recommendations/rdsap_tables.py index 14c7f247..46e7d083 100644 --- a/recommendations/rdsap_tables.py +++ b/recommendations/rdsap_tables.py @@ -98,6 +98,13 @@ age_band_data = [ "Northern_Ireland": "2014-2022", "Park_home_UK": None, }, + { + "age_band": "M", + "England_Wales": "2022 onwards", + "Scotland": "2024 onwards", + "Northern_Ireland": "2023 onwards", + "Park_home_UK": None, + }, ] england_wales_age_band_lookup = { @@ -123,6 +130,7 @@ default_wall_thickness = [ "J": 450, "K": 450, "L": 450, + "M": 450, }, { "type": "solid brick", @@ -138,6 +146,7 @@ default_wall_thickness = [ "J": 300, "K": 300, "L": 300, + "M": 300, }, { "type": "cavity", @@ -153,6 +162,7 @@ default_wall_thickness = [ "J": 300, "K": 300, "L": 300, + "M": 300, }, { "type": "timber frame", @@ -168,6 +178,7 @@ default_wall_thickness = [ "J": 300, "K": 300, "L": 300, + "M": 300, }, { 
"type": "cob", @@ -183,6 +194,7 @@ default_wall_thickness = [ "J": 590, "K": 590, "L": 590, + "M": 590, }, { "type": "system build", @@ -198,6 +210,7 @@ default_wall_thickness = [ "J": 300, "K": 300, "L": 300, + "M": 300, }, { "type": "park home", @@ -213,6 +226,7 @@ default_wall_thickness = [ "J": 100, "K": 100, "L": 100, + "M": 100, }, ] @@ -253,8 +267,36 @@ wall_types = [ ] u_values = [ - ["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"], - ["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"], + [ + "a", + "a", + "a", + "a", + "1.7b", + "1.0", + "0.6", + "0.60", + "0.45", + "0.35", + "0.30", + "0.28", + "0.26", + ], + [ + "a", + "a", + "a", + "a", + "1.7b", + "1.0", + "0.6", + "0.60", + "0.45", + "0.35", + "0.30", + "0.28", + "0.26", + ], [ "1.7", "1.7", @@ -268,6 +310,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "0.55", @@ -282,6 +325,7 @@ u_values = [ "0.25", "0.21", "0.21", + "0.20", ], [ "0.32", @@ -296,6 +340,7 @@ u_values = [ "0.19", "0.17", "0.16", + "0.15", ], [ "0.23", @@ -310,6 +355,7 @@ u_values = [ "0.15", "0.14", "0.14", + "0.13", ], [ "0.18", @@ -324,6 +370,7 @@ u_values = [ "0.13", "0.12", "0.12", + "0.11", ], [ "0.80", @@ -338,6 +385,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "0.40", @@ -352,6 +400,7 @@ u_values = [ "0.25", "0.21", "0.21", + "0.20", ], [ "0.26", @@ -366,6 +415,7 @@ u_values = [ "0.19", "0.17", "0.16", + "0.15", ], [ "0.20", @@ -380,6 +430,7 @@ u_values = [ "0.15", "0.14", "0.14", + "0.13", ], [ "0.16", @@ -394,6 +445,7 @@ u_values = [ "0.13", "0.12", "0.12", + "0.11", ], [ "1.5", @@ -408,6 +460,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "0.53", @@ -422,6 +475,7 @@ u_values = [ "0.25", "0.21", "0.21", + "0.20", ], [ "0.32", @@ -436,6 +490,7 @@ u_values = [ "0.19", "0.17", "0.16", + "0.15", ], [ "0.23", @@ -450,6 +505,7 @@ u_values = [ "0.15", "0.14", "0.14", + "0.13", ], [ "0.18", @@ -464,6 +520,7 @@ u_values = [ "0.13", "0.12", 
"0.12", + "0.11", ], [ "0.7", @@ -478,6 +535,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "0.37", @@ -492,6 +550,7 @@ u_values = [ "0.25", "0.21", "0.21", + "0.20", ], [ "0.25", @@ -506,6 +565,7 @@ u_values = [ "0.19", "0.17", "0.16", + "0.15", ], [ "0.19", @@ -520,6 +580,7 @@ u_values = [ "0.15", "0.14", "0.14", + "0.13", ], [ "0.16", @@ -534,6 +595,7 @@ u_values = [ "0.13", "0.12", "0.12", + "0.11", ], [ "2.5", @@ -548,6 +610,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "0.60", @@ -562,6 +625,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "2.0", @@ -576,6 +640,7 @@ u_values = [ "0.35", "0.30", "0.28", + "0.26", ], [ "0.60", @@ -590,6 +655,7 @@ u_values = [ "0.25", "0.21", "0.21", + "0.20", ], [ "0.35", @@ -604,6 +670,7 @@ u_values = [ "0.19", "0.17", "0.16", + "0.15", ], [ "0.25", @@ -618,6 +685,7 @@ u_values = [ "0.15", "0.14", "0.14", + "0.13", ], [ "0.18", @@ -632,10 +700,11 @@ u_values = [ "0.13", "0.12", "0.12", + "0.11", ], ] -age_bands = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"] +age_bands = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"] wall_uvalues = [] for i, wall_type in enumerate(wall_types): @@ -645,13 +714,23 @@ for i, wall_type in enumerate(wall_types): wall_uvalues.append(row) parkhome_wall_uvalues = [ - {"Wall_type": "Park home as built", "F": "1.7", "G": "1.2", "I": "0.7", "K": "0.6"}, + { + "Wall_type": "Park home as built", + "F": "1.7", + "G": "1.2", + "I": "0.7", + "K": "0.6", + "L": "0.6", + "M": "0.6", + }, { "Wall_type": "Park home with additional insulation", "F": "s1.1.2", "G": "s1.1.2", "I": "s1.1.2", "K": "s1.1.2", + "L": "s1.1.2", + "M": "s1.1.2", }, ] @@ -713,14 +792,13 @@ epc_wall_description_map = { # Granite/whinstone wall mappings ############################ "Granite or whinstone, as built, no insulation": "Stone: granite or whinstone as built", - "Granite or whinstone, with internal insulation": "Stone/solid brick with 100 mm external or internal " - 
"insulation", - "Granite or whinstone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal " - "insulation", - "Granite or whinstone, as built, insulated": "Stone/solid brick with 100 mm external or internal " - "insulation", - "Granite or whinstone, with external insulation": "Stone/solid brick with 100 mm external or internal " - "insulation", + "Granite or whinstone, with internal insulation": "Stone/solid brick with 100 mm external or internal insulation", + "Granite or whin, with internal insulation": "Stone/solid brick with 100 mm external or internal insulation", + "Granite or whinstone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal insulation", + "Granite or whinstone, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation", + "Granite or whin, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation", + "Granite or whinstone, with external insulation": "Stone/solid brick with 100 mm external or internal insulation", + "Granite or whin, with external insulation": "Stone/solid brick with 100 mm external or internal insulation", ############################ # System built wall mappings ############################ diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 602684cf..7c39668a 100644 --- a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -205,7 +205,7 @@ def get_wall_u_value( mapped_value = wall_uvalues_df[ wall_uvalues_df["Wall_type"] == mapped_description - ][age_band].values[0] + ][age_band].values[0] if pd.isnull(mapped_value) and "Park home" in mapped_description: # We don't know enough in this case so we default to 0 @@ -563,7 +563,7 @@ def get_floor_u_value( insulation_lookup = s11[ s11["Age_band"].str.contains(age_band) & s11["Floor_construction"] == floor_type - ] + ] if insulation_lookup.empty: insulation_thickness = 0 else: From 
6aefd1eb3c72be741cdae08df3146623bd2f3c20 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 2 Nov 2025 09:44:41 +0000 Subject: [PATCH 3/3] add post sap 10 feature --- etl/epc/DataProcessor.py | 11 +++++++++++ etl/epc/Pipeline.py | 5 ++++- etl/epc/Record.py | 35 ++++++++++++++++++++--------------- etl/epc/settings.py | 5 +++++ 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 682e9e78..5e5d0872 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -4,6 +4,7 @@ import pandas as pd from etl.epc.settings import ( DATA_PROCESSOR_SETTINGS, EARLIEST_EPC_DATE, + POST_SAP10_DATE, # IGNORED_TRANSACTION_TYPES, IGNORED_FLOOR_LEVELS, IGNORED_PROPERTY_TYPES, @@ -159,6 +160,9 @@ class EPCDataProcessor: # colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], # ) + # Create post sap10 flag + self.create_post_sap10_flag() + # When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper cleaning_averages = self.cleaning_averages.copy() if self.run_mode == "newdata": @@ -175,6 +179,13 @@ class EPCDataProcessor: self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step) self.cast_data_columns_to_lower() + def create_post_sap10_flag(self): + """ + Create a flag to indicate if the epc is post sap10 + """ + + self.data["is_post_sap10"] = self.data["LODGEMENT_DATE"] >= POST_SAP10_DATE + def cast_data_columns_to_lower(self): """ Convert all columns names to lower diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 9f427c59..fac58cd9 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -23,6 +23,7 @@ from etl.epc.settings import ( POTENTIAL_COLUMNS, ROOM_FEATURES, COST_FEATURES, + POST_SAP10_FEATURE, ) # TODO: change in setting file @@ -325,7 +326,9 @@ class EPCPipeline: # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = 
property_data[VARIABLE_DATA_FEATURES + COST_FEATURES] + variable_data = property_data[ + VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE + ] uprn = str(uprn) epc_records = [ diff --git a/etl/epc/Record.py b/etl/epc/Record.py index d0816034..7552a0c4 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -20,6 +20,7 @@ from etl.epc.settings import ( COMPONENT_FEATURES, EFFICIENCY_FEATURES, ROOM_FEATURES, + POST_SAP10_FEATURE, ) from recommendations.recommendation_utils import estimate_number_of_floors from utils.s3 import read_dataframe_from_s3_parquet @@ -89,6 +90,7 @@ class EPCRecord: co2_emissions_current: float = None number_habitable_rooms: float = None number_heated_rooms: float = None + is_post_sap10: bool = None # u_values_walls = None # u_values_roof = None @@ -277,6 +279,7 @@ class EPCRecord: self.number_heated_rooms: float = float( self.prepared_epc["number_heated_rooms"] ) + self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"]) def _identify_delta_between_prepared_and_original_records(self): """ @@ -385,11 +388,11 @@ class EPCRecord: return df def _clean_floor_height(self): - """ Remaps anomalies in floor height to the average floor height for the property type """ + """Remaps anomalies in floor height to the average floor height for the property type""" floor_height_data = self.cleaning_data[ - (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) & - (self.cleaning_data["built_form"] == self.prepared_epc["built-form"]) - ] + (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) + & (self.cleaning_data["built_form"] == self.prepared_epc["built-form"]) + ] average = floor_height_data["floor_height"].mean() sd = floor_height_data["floor_height"].std() # If we're in the top 0.5 percentile of floor heights, we'll set it to the average @@ -399,14 +402,16 @@ class EPCRecord: self.prepared_epc["floor-height"] = average def _clean_new_build_descriptions(self): - for col in 
['roof-description', 'walls-description', 'floor-description']: + for col in ["roof-description", "walls-description", "floor-description"]: self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K") def _clean_constituency(self): """ We handle the single case of finding a missing constituency by using the local authority """ - if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""): + if pd.isnull(self.prepared_epc["constituency"]) or ( + self.prepared_epc["constituency"] == "" + ): if self.prepared_epc["local-authority"] != "E06000044": raise NotImplementedError( "This function is only implemented for Portsmouth, in the single edgecase seen" @@ -595,12 +600,12 @@ class EPCRecord: # We handle the edge case of floor area being 0. We set it to zero and it is cleaned by # _clean_with_data_processor - if self.prepared_epc['total-floor-area'] == 0: + if self.prepared_epc["total-floor-area"] == 0: print( "Edge case of floor area being zero - will set to none and will be cleaned in " "_clean_with_data_processor" ) - self.prepared_epc['total-floor-area'] = None + self.prepared_epc["total-floor-area"] = None def _clean_mains_gas(self): """ @@ -609,12 +614,7 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - mains_gas_map = { - "Y": True, - "N": False, - True: True, - False: False - } + mains_gas_map = {"Y": True, "N": False, True: True, False: False} self.prepared_epc["mains-gas-flag"] = ( None @@ -1064,7 +1064,12 @@ class EPCDifferenceRecord: CARBON_RESPONSE ) - component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES + component_variables = ( + COMPONENT_FEATURES + + EFFICIENCY_FEATURES + + ROOM_FEATURES + + POST_SAP10_FEATURE + ) ending_record = self.record2.get( component_variables + ["lodgement_date"], return_asdict=True, diff --git a/etl/epc/settings.py b/etl/epc/settings.py index ecc56552..47a75def 100644 --- a/etl/epc/settings.py +++ 
b/etl/epc/settings.py @@ -52,6 +52,9 @@ DATA_ANOMALY_MATCHES = { "Unknown", } +# Add the post_sap10 date to indicate if the epc is post sap10 +POST_SAP10_DATE = "2025-06-22" + DATA_ANOMALY_SUBSTRINGS = { # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for # ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained @@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [ ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"] +POST_SAP10_FEATURE = ["is_post_sap10"] + COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [ "TRANSACTION_TYPE", "ENERGY_TARIFF", # Not sure if this is relevant