From 362e657ab5f4710cf6bd472ccd14f65c9fa354e3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 11:45:57 +0100 Subject: [PATCH 001/255] handling different format of surveyed windows --- etl/customers/aiha/xml_extraction.py | 60 ++++++++++++++++++++++++++ etl/xml_survey_extraction/XmlParser.py | 34 ++++++++++++++- 2 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 etl/customers/aiha/xml_extraction.py diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py new file mode 100644 index 00000000..d235be78 --- /dev/null +++ b/etl/customers/aiha/xml_extraction.py @@ -0,0 +1,60 @@ +import os +from io import BytesIO +from etl.xml_survey_extraction.XmlParser import XmlParser + +SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" + + +def main(): + """ + This script handles the extraction of data from the XML files in the survey folders. + :return: + """ + # Step 1: List all subfolders inside SURVEY_FOLDER_PATH. + subfolders = [f.path for f in os.scandir(SURVEY_FOLDER_PATH) if f.is_dir()] + + # Step 2: Loop through each subfolder and find the XML files. + extracted_surveys = [] + for subfolder in subfolders: + print(f"Searching in subfolder: {subfolder}") + + # Find all XML files in the current subfolder. + xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')] + + if not xml_files: + raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}") + + # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key. + for xml_file in xml_files: + xml_path = os.path.join(subfolder, xml_file) + print(f"Processing XML file: {xml_path}") + + # Read in the XML and parse it using the XmlParser class. + with open(xml_path, 'rb') as file: + xml_data_io = BytesIO(file.read()) + uprn = None # Set the UPRN if available. 
+ + # Create an XmlParser instance + xml_parser = XmlParser( + file=xml_data_io, + filekey=xml_path, + surveyor_company="", + uprn=uprn, + ) + + # Run the parser to extract the data + xml_parser.run() + + # Store the extracted data for further processing + extracted_surveys.append({ + "epc": xml_parser.epc, + "additional_data": xml_parser.additional_data, + "subfolder": subfolder + }) + + print(f"Extracted {len(extracted_surveys)} surveys.") + # Process the extracted_surveys as needed, for example, save to a database or write to a file. + + +if __name__ == "__main__": + main() diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index ffe191a4..ed3d65d2 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -769,8 +769,6 @@ class XmlParser: :return: """ - sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window") - glazing_type_lookup = { "3": "double glazing, unknown install date", "5": "Single glazing", @@ -787,6 +785,38 @@ class XmlParser: "8": "North West" } + sap_windows = self.xml.getElementsByTagName("SAP-Windows") + + if not sap_windows: + # We look for Multi-Glazed-Proportion + multiple_glazing_type = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "Multiple-Glazing-Type" + )[0].firstChild.nodeValue + + pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "PVC-Window-Frames" + )[0].firstChild.nodeValue + + multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "Multiple-Glazed-Proportion" + )[0].firstChild.nodeValue + + self.windows = [ + { + "window_location": None, + "window_area": None, + "window_type": None, + "glazing_type": glazing_type_lookup[multiple_glazing_type], + "pvc_frame": pvc_frame, + "glazing_gap": None, + "orientation": None, + "multple_glazed_proportion": multple_glazed_proportion + } + ] + 
return + + sap_windows = sap_windows[0].getElementsByTagName("SAP-Window") + self.windows = [ self._parse_windows_content( window=window, From 323364e0dff03fe5a02c575cce043568eae783e4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 11:51:00 +0100 Subject: [PATCH 002/255] added additional built form to built form map in XmlParser --- etl/xml_survey_extraction/XmlParser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index ed3d65d2..a0ed02e1 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -107,6 +107,7 @@ class XmlParser: BUILT_FORM_MAP = { "1": "Detached", + "2": "Semi-Detached", "3": "End-Terrace", "4": "Mid-Terrace", } @@ -803,7 +804,7 @@ class XmlParser: self.windows = [ { - "window_location": None, + "window_location": "0", "window_area": None, "window_type": None, "glazing_type": glazing_type_lookup[multiple_glazing_type], From 8f8e85c1e1d1fa202f5ec5c4747a92fcde36b292 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 12:01:05 +0100 Subject: [PATCH 003/255] debuggin xml extraction --- etl/xml_survey_extraction/XmlParser.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index a0ed02e1..a4061b3a 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -113,6 +113,7 @@ class XmlParser: } GLAZED_AREA_MAP = { + "2": "More than Typical", "4": "Much More Than Typical" } @@ -121,7 +122,8 @@ class XmlParser: } TRANSACTION_TYPE_MAP = { - "13": "ECO assessment" + "13": "ECO assessment", + "14": "Stock condition survey", } TENURE_MAP = { @@ -401,8 +403,13 @@ class XmlParser: ] wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors]) - window_areas = 
sum([float(w["window_area"]) for w in main_dwelling_windows]) - return wall_areas - window_areas + window_areas = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None] + if not window_areas: + # We discount 10% of the wall area + insulation_wall_area = wall_areas * 0.9 + else: + insulation_wall_area = wall_areas - window_areas + return insulation_wall_area def extract_additional_data(self): @@ -416,7 +423,8 @@ class XmlParser: main_dwelling_windows = [w for w in self.windows if w["window_location"] == "0"] number_of_windows = len(main_dwelling_windows) - windows_area = sum([float(w["window_area"]) for w in main_dwelling_windows]) + windows_area = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None] + windows_area = sum(windows_area) if windows_area else None boolean_lookup = { "true": True, @@ -462,7 +470,7 @@ class XmlParser: "cylinder_thermostat": cylinder_thermostat, "main_dwelling_ground_floor_area": float(main_dwelling_ground_floor_area), "number_of_windows": int(number_of_windows), - "windows_area": float(windows_area), + "windows_area": float(windows_area) if windows_area is not None else windows_area, } def get_node_value(self, tag_name): From 60490cd4faf100fe3f66754a23effc8211b1793c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 14:23:20 +0100 Subject: [PATCH 004/255] xml extraction --- etl/xml_survey_extraction/XmlParser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index a4061b3a..a2246629 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -804,7 +804,9 @@ class XmlParser: pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( "PVC-Window-Frames" - )[0].firstChild.nodeValue + ) + + pvc_frame = pvc_frame[0].firstChild.nodeValue if pvc_frame else None 
multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( "Multiple-Glazed-Proportion" From 9d4a93ca3efa43a66c5d3f13843f4f62386e978c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 15:18:42 +0100 Subject: [PATCH 005/255] debugging xml extraction --- etl/xml_survey_extraction/XmlParser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index a2246629..f8f2285d 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -9,7 +9,8 @@ from etl.xml_survey_extraction.pcdb import heating_data PROPERTY_TYPE_LOOKUP = { "0": "House", "House": "House", - "2": "Flat" + "2": "Flat", + "3": "Maisonette", } @@ -122,6 +123,7 @@ class XmlParser: } TRANSACTION_TYPE_MAP = { + "5": "Rented (social)", "13": "ECO assessment", "14": "Stock condition survey", } @@ -134,7 +136,8 @@ class XmlParser: TARIFF_MAP = { "1": "Dual", - "2": "Single" + "2": "Single", + "3": "Unknown" } def __init__(self, file, filekey, surveyor_company, uprn=None): @@ -408,7 +411,7 @@ class XmlParser: # We discount 10% of the wall area insulation_wall_area = wall_areas * 0.9 else: - insulation_wall_area = wall_areas - window_areas + insulation_wall_area = wall_areas - sum(window_areas) return insulation_wall_area def extract_additional_data(self): @@ -779,6 +782,7 @@ class XmlParser: """ glazing_type_lookup = { + "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", } From bfded2aaf985b65a5551c7f0f55706d54f36a5f7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 15:25:11 +0100 Subject: [PATCH 006/255] expanding xml extraction --- etl/xml_survey_extraction/XmlParser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py 
index f8f2285d..fa70b6b7 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -439,6 +439,7 @@ class XmlParser: cylinder_insulation_type = { None: "", "1": "Foam", + "2": "Jacket" } cylinder_insulation_thickness = int( @@ -782,6 +783,7 @@ class XmlParser: """ glazing_type_lookup = { + "ND": "Single glazing", "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", From ce9b3e5e2014fdeaba52ecf977618a5b16898a29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 18:13:28 +0100 Subject: [PATCH 007/255] creating aiha output --- etl/customers/aiha/xml_extraction.py | 452 ++++++++++++++++++++++++++- 1 file changed, 448 insertions(+), 4 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index d235be78..416065e7 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -1,5 +1,8 @@ import os from io import BytesIO + +import pandas as pd + from etl.xml_survey_extraction.XmlParser import XmlParser SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" @@ -22,7 +25,8 @@ def main(): xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')] if not xml_files: - raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}") + print(f"No XML files found in subfolder: {subfolder}") + continue # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key. 
for xml_file in xml_files: @@ -44,16 +48,456 @@ def main(): # Run the parser to extract the data xml_parser.run() + if not xml_parser.epc: + # If we don't have a lig xml + continue # Store the extracted data for further processing extracted_surveys.append({ - "epc": xml_parser.epc, - "additional_data": xml_parser.additional_data, - "subfolder": subfolder + "survey_key": subfolder.split("/")[-1], + **xml_parser.epc, + **xml_parser.additional_data }) print(f"Extracted {len(extracted_surveys)} surveys.") # Process the extracted_surveys as needed, for example, save to a database or write to a file. + extracted_surveys = pd.DataFrame(extracted_surveys) + + # THis is the data we need for the AIHA project + measures_data = extracted_surveys[ + ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating"] + ] + measures_data = measures_data.sort_values("survey_key", ascending=True) + + # Note: + # The properties will still have "Very poor" ratings for their hot water + + # TODO + # - AIH001-03 has a basement and so we should discount this area from the ground floor + # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft + # - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the + # best option for this property due to it being extrememly large and the walls being uninsulated. It might not + # be performant enough in the winter, when COP will be more like 1.5. + # - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are + # in the property? Does it make sense to have such a large solar PV system (5.6kWp)? + # - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C + # - Generally, should we consider insulated doors? 
+ # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same + # buulding + # - AIH001-09 - The extension is 1900-1929 but has a cavity wall + # - AIH001-09 - Is it not possible to install a loft hatch? + # - AIH001-09 - Why is there assumed secondary heating? + # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? + # - AIH001-11 - The layout of this unit is confusing, is there roof access? + # - AIH001-12 - Why was there not access to the cylinder? + # + + recommended_measures = [ + { + "survey_key": "AIH001-01", + "starting_sap": 69, + "recommended_measures": [], + "notes": "Is EPC C" + }, + { + "survey_key": "AIH001-03", + "starting_sap": 43, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 44, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with various configurations", + "config": [ + { + "size": "4kWp", + "orientation": "East", + "elavation": 30, + "overshading": "Modest", + }, + { + "size": "1.6kWp", + "orientation": "Horizontal", + "elavation": "Horizontal", + "overshading": "Modest", + } + ], + "sap_points": 7, + "ending_sap": 53 + }, + { + "measure": "Loft Insulation", + "description": "300mm of loft insulation", + "sap_points": 8, + "ending_sap": 61 + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 64 + } + ], + "notes": "There was no access to the loft for this property and so a loft hatch would need to be " + "installed..." 
+ }, + { + "survey_key": "AIH001-04", + "starting_sap": 48, + "recommended_measures": [ + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "sap_points": 4, + "ending_sap": 52 + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 55 + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 4kW capacity, south-facing", + "config": [ + { + "size": "4kW", + "orientation": "South", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 12, + "ending_sap": 67 + } + ], + "notes": "" + }, + { + "survey_key": "AIH001-05", + "starting_sap": 54, + "recommended_measures": [ + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "sap_points": 5, + "ending_sap": 59, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 61, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 4kW capacity, horizontal orientation", + "config": [ + { + "size": "4kW", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 9, + "ending_sap": 70 + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 73 + } + ], + "notes": "" + }, + { + "survey_key": "AIH001-06", + "starting_sap": 62, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 64, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 2kW capacity, south-facing", + "config": [ + { + "size": "2kW", + "orientation": "South", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 6, + "ending_sap": 70 + } + ] + }, + { + "survey_key": "AIH001-07", + "starting_sap": 74, + "recommended_measures": [], + "notes": "Is EPC C" + }, + { + "survey_key": "AIH001-08", + 
"starting_sap": 56, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm of loft insulation", + "sap_points": 2, + "ending_sap": 58, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 4, + "ending_sap": 62, + }, + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "sap_points": 5, + "ending_sap": 69, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 69, + } + ] + }, + { + "survey_key": "AIH001-09", + "starting_sap": 44, + "recommended_measures": [ + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "sap_points": 8, + "ending_sap": 52, + }, + { + "measure": "Cavity Wall Insulation", + "description": "Cavity wall insulation for extensions", + "sap_points": 1, + "ending_sap": 53, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 53, + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 56, + } + ] + }, + { + "survey_key": "AIH001-11", + "starting_sap": 59, + "recommended_measures": [ + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 4, + "ending_sap": 63, + }, + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "sap_points": 5, + "ending_sap": 68, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 69, + } + ] + }, + { + "survey_key": "AIH001-12", + "starting_sap": 46, + "recommended_measures": [ + { + "measure": "Double Glazing", + "description": "Installation of double glazing", + "sap_points": 2, + "ending_sap": 48, + }, + { + "measure": "Draught Proofing", + "description": "Draught proofing improvements", + "sap_points": 1, + 
"ending_sap": 49, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 3.2kW capacity, east-facing", + "config": [ + { + "size": "3.2W", + "orientation": "East", + "elavation": 30, + "overshading": "Little or none", + } + ], + "sap_points": 9, + "ending_sap": 58 + }, + { + "measure": "Air Source Heat Pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump", + "sap_points": 15, + "ending_sap": 73 + }, + { + "measure": "Tariff Review", + "description": "Switch to 24-hour tariff", + "sap_points": 15, + "ending_sap": 88 + } + ] + }, + { + "survey_key": "AIH001-13", + "starting_sap": 53, + "recommended_measures": [ + { + "measure": "Roof Insulation", + "description": "100mm+ insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "sap_points": 6, + "ending_sap": 59, + }, + { + "measure": "Flat Roof Insulation", + "description": "Flat roof insulation", + "sap_points": 2, + "ending_sap": 61, + }, + { + "measure": "Cavity Wall Insulation", + "description": "Cavity wall insulation", + "sap_points": 6, + "ending_sap": 67, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 67, + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 2, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 4kW capacity, flat roof installation", + "config": [ + { + "size": "4kW", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 9, + "ending_sap": 78 + } + ] + }, + { + "survey_key": "AIH001-14", + "starting_sap": 63, + "recommended_measures": [ + { + "measure": "Cavity Wall Insulation", + "description": "Insulation for cavity walls", + "sap_points": 5, + "ending_sap": 68, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 68, + }, + { + "measure": "Loft Insulation", + "description": 
"Installation of loft insulation", + "sap_points": 1, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 10kW capacity", + "sap_points": 10, + "ending_sap": 79, + } + ] + }, + ] + + # Step 1: Normalize the recommended_measures data into a DataFrame. + normalized_measures = [] + + for survey in recommended_measures: + survey_key = survey["survey_key"] + starting_sap = survey["starting_sap"] + for measure in survey.get("recommended_measures", []): + normalized_measures.append({ + "survey_key": survey_key, + "starting_sap": starting_sap, + "measure": measure["measure"], + "description": measure.get("description", "") + }) + + # Convert the normalized list into a DataFrame. + measures_df = pd.DataFrame(normalized_measures) + + # Step 2: Pivot the measures_df to have a column for each measure type, using the description as values. + pivoted_measures = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="description", + aggfunc=lambda x: ' '.join(x), # Concatenate descriptions if there are multiple entries. + fill_value=None + ).reset_index() + + # Step 3: Extract starting SAP for each survey key. + starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] + + # Merge starting SAP back onto pivoted measures. + result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") + + # Step 4: Calculate the ending SAP using the total sap points. + # Note: If you want to use total sap points, you'll need to update the total calculation accordingly. + + # Step 5: Merge the result with the measures_data to get the final DataFrame. 
+ final_measures = measures_data.merge( + result_df, how="left", on="survey_key" + ) if __name__ == "__main__": From 56fb33a64a16261f35f286adffc8268503fac24c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 18:39:55 +0100 Subject: [PATCH 008/255] added placeholder pricing sheet --- etl/customers/aiha/xml_extraction.py | 101 ++++++++++++++++++--------- 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 416065e7..563ed7ca 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -90,7 +90,7 @@ def main(): # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? # - AIH001-11 - The layout of this unit is confusing, is there roof access? # - AIH001-12 - Why was there not access to the cylinder? - # + # - AIH001-12 - Is the need to draught proofing due to the windows? recommended_measures = [ { @@ -111,7 +111,7 @@ def main(): }, { "measure": "Solar PV", - "description": "Solar PV system with various configurations", + "description": "4kWp Solar PV system", "config": [ { "size": "4kWp", @@ -131,13 +131,13 @@ def main(): }, { "measure": "Loft Insulation", - "description": "300mm of loft insulation", + "description": "300mm loft insulation", "sap_points": 8, "ending_sap": 61 }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 64 } @@ -157,16 +157,16 @@ def main(): }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 55 }, { "measure": "Solar PV", - "description": "Solar PV system with 4kW capacity, south-facing", + "description": "4kWp Solar PV system", "config": [ { - "size": "4kW", + "size": "4kWp", "orientation": "South", "elavation": 30, "overshading": "Modest", @@ -196,7 +196,7 @@ 
def main(): }, { "measure": "Solar PV", - "description": "Solar PV system with 4kW capacity, horizontal orientation", + "description": "4kWp Solar PV system", "config": [ { "size": "4kW", @@ -210,7 +210,7 @@ def main(): }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 73 } @@ -229,7 +229,7 @@ def main(): }, { "measure": "Solar PV", - "description": "Solar PV system with 2kW capacity, south-facing", + "description": "2kWp Solar PV system", "config": [ { "size": "2kW", @@ -255,7 +255,7 @@ def main(): "recommended_measures": [ { "measure": "Loft Insulation", - "description": "300mm of loft insulation", + "description": "300mm loft insulation", "sap_points": 2, "ending_sap": 58, }, @@ -273,7 +273,7 @@ def main(): }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 69, } @@ -291,19 +291,19 @@ def main(): }, { "measure": "Cavity Wall Insulation", - "description": "Cavity wall insulation for extensions", + "description": "CWI to rdSAP default standard", "sap_points": 1, "ending_sap": 53, }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 53, }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 56, } @@ -315,7 +315,7 @@ def main(): "recommended_measures": [ { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 4, "ending_sap": 63, }, @@ -345,13 +345,13 @@ def main(): }, { "measure": "Draught Proofing", - "description": "Draught proofing improvements", + "description": "Window draught proofing improvements", "sap_points": 1, "ending_sap": 49, }, { "measure": "Solar PV", - "description": "Solar PV system with 3.2kW capacity, east-facing", + "description": 
"3.2kWp Solar PV system", "config": [ { "size": "3.2W", @@ -383,37 +383,37 @@ def main(): "recommended_measures": [ { "measure": "Roof Insulation", - "description": "100mm+ insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", "sap_points": 6, "ending_sap": 59, }, { "measure": "Flat Roof Insulation", - "description": "Flat roof insulation", + "description": "100mm flat roof insulation", "sap_points": 2, "ending_sap": 61, }, { "measure": "Cavity Wall Insulation", - "description": "Cavity wall insulation", + "description": "CWI to rdSAP default standard", "sap_points": 6, "ending_sap": 67, }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 67, }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 2, "ending_sap": 69, }, { "measure": "Solar PV", - "description": "Solar PV system with 4kW capacity, flat roof installation", + "description": "4kWp Solar PV system", "config": [ { "size": "4kW", @@ -433,25 +433,25 @@ def main(): "recommended_measures": [ { "measure": "Cavity Wall Insulation", - "description": "Insulation for cavity walls", + "description": "CWI to rdSAP default standard", "sap_points": 5, "ending_sap": 68, }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 68, }, { "measure": "Loft Insulation", - "description": "Installation of loft insulation", + "description": "300mm loft insulation", "sap_points": 1, "ending_sap": 69, }, { "measure": "Solar PV", - "description": "Solar PV system with 10kW capacity", + "description": "3.2kWp Solar PV system", "sap_points": 10, "ending_sap": 79, } @@ -459,6 +459,33 @@ def main(): }, ] + descs = [] + for r in recommended_measures: + for m in r["recommended_measures"]: + 
descs.append(m["description"]) + descs = list(set(descs)) + + # TODO - need to add scaffolding + pricing_data = [ + {'item': '80mm cylinder insulation', 'unit_price': None, 'unit': 'unit'}, + {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, + {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, + {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'}, + {'item': '100mm flat roof insulation', 'unit_price': None, 'unit': 'floor_m2'}, + {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, + {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': 'Installation of double glazing', 'unit_price': None, 'unit': 'window'}, + {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, + {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, + 'unit': 'floor_m2'}, + {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, + {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, + {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, + {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'} + ] + pricing_data = pd.DataFrame(pricing_data) + # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] @@ -470,7 +497,8 @@ def main(): "survey_key": survey_key, "starting_sap": starting_sap, "measure": measure["measure"], - "description": measure.get("description", "") + "description": measure.get("description", ""), + "sap_points": measure.get("sap_points", 0) }) # Convert the normalized list into a DataFrame. @@ -485,16 +513,23 @@ def main(): fill_value=None ).reset_index() - # Step 3: Extract starting SAP for each survey key. 
+ # Step 3: Calculate the total sap points for each survey. + total_sap_points = measures_df.groupby("survey_key")["sap_points"].sum().reset_index() + total_sap_points.columns = ["survey_key", "total_sap_points"] + + # Merge total sap points into the pivoted measures. + pivoted_measures = pd.merge(pivoted_measures, total_sap_points, on="survey_key", how="left") + + # Step 4: Extract starting SAP for each survey key. starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] # Merge starting SAP back onto pivoted measures. result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") - # Step 4: Calculate the ending SAP using the total sap points. - # Note: If you want to use total sap points, you'll need to update the total calculation accordingly. + # Step 5: Calculate the ending SAP. + result_df["ending_sap"] = result_df["starting_sap"] + result_df["total_sap_points"] - # Step 5: Merge the result with the measures_data to get the final DataFrame. + # Step 6: Merge the result with the measures_data to get the final DataFrame. 
final_measures = measures_data.merge( result_df, how="left", on="survey_key" ) From 93d375bc7a4f0e845c3bb13c9ff00b4b33fd7ff1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 19:11:40 +0100 Subject: [PATCH 009/255] adding aiha costing --- etl/customers/aiha/xml_extraction.py | 46 +++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 563ed7ca..29ac44c6 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -65,7 +65,7 @@ def main(): # THis is the data we need for the AIHA project measures_data = extracted_surveys[ - ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating"] + ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating", "number_of_floors"] ] measures_data = measures_data.sort_values("survey_key", ascending=True) @@ -459,15 +459,20 @@ def main(): }, ] - descs = [] - for r in recommended_measures: - for m in r["recommended_measures"]: - descs.append(m["description"]) - descs = list(set(descs)) + scaffolding_data = [ + { + "number_of_floors": 2, + "price": 841, + }, + { + "number_of_floors": 3, + "price": 1077, + } + ] - # TODO - need to add scaffolding + # TODO - Need an update cost for cylinder insulation pricing_data = [ - {'item': '80mm cylinder insulation', 'unit_price': None, 'unit': 'unit'}, + {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'}, {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'}, @@ -486,6 +491,31 @@ def main(): ] pricing_data = pd.DataFrame(pricing_data) + for recommendation in recommended_measures: + + property_data = measures_data[measures_data["survey_key"] == 
recommendation["survey_key"]].squeeze() + + for measure in recommendation["recommended_measures"]: + measure_pricing = pricing_data[pricing_data["item"] == measure["description"]] + measure_unit = measure_pricing["unit"].values[0] + if measure_unit is None: + blah + continue + + if measure_unit == "unit": + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + continue + + if measure_unit == "unit_needs_scaffolding": + # We need the number of floors + n_floors = property_data["number_of_floors"] + cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding + + blah + + measure["total"] = pricing_data[pricing_data["item"] == measure["measure"]]["unit_price"].values[0] + # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] From 854c784bd9c4341546ea57d2a0549b40552fbd92 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 19:32:15 +0100 Subject: [PATCH 010/255] working on the costing methodology --- etl/customers/aiha/xml_extraction.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 29ac44c6..4d4705c9 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -3,6 +3,7 @@ from io import BytesIO import pandas as pd +from etl.ownership.config import EXCLUDED_UPRNS from etl.xml_survey_extraction.XmlParser import XmlParser SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" @@ -91,6 +92,7 @@ def main(): # - AIH001-11 - The layout of this unit is confusing, is there roof access? # - AIH001-12 - Why was there not access to the cylinder? # - AIH001-12 - Is the need to draught proofing due to the windows? + # - AIH001-04 - is the flat roof area correct? 
recommended_measures = [ { @@ -132,6 +134,7 @@ def main(): { "measure": "Loft Insulation", "description": "300mm loft insulation", + "floor_area": 80, # Based on area of 1st floor "sap_points": 8, "ending_sap": 61 }, @@ -152,6 +155,7 @@ def main(): { "measure": "Flat Roof Insulation", "description": "100mm flat roof insulation", + "floor_area": 39.1482, # based on area of top floor "sap_points": 4, "ending_sap": 52 }, @@ -185,6 +189,7 @@ def main(): { "measure": "Flat Roof Insulation", "description": "100mm flat roof insulation", + "floor_area": 49.48, # based on area of top floor "sap_points": 5, "ending_sap": 59, }, @@ -256,6 +261,7 @@ def main(): { "measure": "Loft Insulation", "description": "300mm loft insulation", + "floor_area": 54.2864, # Based on area of top floor "sap_points": 2, "ending_sap": 58, }, @@ -390,6 +396,7 @@ def main(): { "measure": "Flat Roof Insulation", "description": "100mm flat roof insulation", + "floor_area": 33.06, # Based on area of the extension "sap_points": 2, "ending_sap": 61, }, @@ -445,7 +452,8 @@ def main(): }, { "measure": "Loft Insulation", - "description": "300mm loft insulation", + "description": "300mm loft insulation", # Based on area of main building + "floor_area": 59.20, "sap_points": 1, "ending_sap": 69, }, @@ -511,10 +519,18 @@ def main(): n_floors = property_data["number_of_floors"] cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding + continue - blah + if measure_unit == "floor_m2": + floor_area = measure["floor_area"] + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * floor_area + continue - measure["total"] = pricing_data[pricing_data["item"] == measure["measure"]]["unit_price"].values[0] + if measure_unit == "hlp_m2": + hlp = measure["hlp"] + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * hlp + + raise Exception("Unknown 
unit type") # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] From 8325f1bf7a7bcf0cb7ebd94f6a83c49684163e17 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 10:18:53 +0000 Subject: [PATCH 011/255] Finished costings WIP --- etl/customers/aiha/xml_extraction.py | 76 ++++++++++++++++------------ 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 4d4705c9..c246105a 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -3,10 +3,10 @@ from io import BytesIO import pandas as pd -from etl.ownership.config import EXCLUDED_UPRNS from etl.xml_survey_extraction.XmlParser import XmlParser SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" +CONTINGENCY_RATE = 0.26 def main(): @@ -274,6 +274,7 @@ def main(): { "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", + "hlp": 24.13 * 2.63, "sap_points": 5, "ending_sap": 69, }, @@ -292,12 +293,14 @@ def main(): { "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", + "hlp": (22.35 * 3.24) + (22.13 * 2.53), "sap_points": 8, "ending_sap": 52, }, { "measure": "Cavity Wall Insulation", "description": "CWI to rdSAP default standard", + "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension "sap_points": 1, "ending_sap": 53, }, @@ -328,6 +331,7 @@ def main(): { "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", + "hlp": (18.50 * 3.12) + (19.00 * 2.75), "sap_points": 5, "ending_sap": 68, }, @@ -346,12 +350,15 @@ def main(): { "measure": "Double Glazing", "description": "Installation of double glazing", + "n_windows": 20, # Counted the bay windows each as 3 + "windows_area": 10.66, "sap_points": 2, "ending_sap": 48, }, { "measure": "Draught Proofing", "description": 
"Window draught proofing improvements", + "n_windows": 20, # Counted the bay windows each as 3 "sap_points": 1, "ending_sap": 49, }, @@ -390,6 +397,7 @@ def main(): { "measure": "Roof Insulation", "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "floor_area": 39.75, # based on the floor area of the RIR "sap_points": 6, "ending_sap": 59, }, @@ -403,6 +411,7 @@ def main(): { "measure": "Cavity Wall Insulation", "description": "CWI to rdSAP default standard", + "hlp": (35.40 * 2.65) + (26.70 * 2.73) + (16.30 * 2.71), # 1st & 2nd extension "sap_points": 6, "ending_sap": 67, }, @@ -441,6 +450,7 @@ def main(): { "measure": "Cavity Wall Insulation", "description": "CWI to rdSAP default standard", + "hlp": (11.00 * 2.6) + (11.00 * 2.65) + (4.60 * 2.7), "sap_points": 5, "ending_sap": 68, }, @@ -483,11 +493,11 @@ def main(): {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'}, {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, - {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'}, - {'item': '100mm flat roof insulation', 'unit_price': None, 'unit': 'floor_m2'}, + {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'}, + {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, - {'item': 'Installation of double glazing', 'unit_price': None, 'unit': 'window'}, + {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, {'item': '100mm+ RIR insulation on all 
surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, @@ -500,51 +510,49 @@ def main(): pricing_data = pd.DataFrame(pricing_data) for recommendation in recommended_measures: - property_data = measures_data[measures_data["survey_key"] == recommendation["survey_key"]].squeeze() + total_cost = 0 for measure in recommendation["recommended_measures"]: measure_pricing = pricing_data[pricing_data["item"] == measure["description"]] measure_unit = measure_pricing["unit"].values[0] - if measure_unit is None: - blah - continue - if measure_unit == "unit": - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) - continue - - if measure_unit == "unit_needs_scaffolding": - # We need the number of floors + if measure_unit in ["unit", None]: + measure_cost = float(measure_pricing["unit_price"].values[0]) + elif measure_unit == "unit_needs_scaffolding": n_floors = property_data["number_of_floors"] - cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding - continue + scaffolding_cost = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] + measure_cost = float(measure_pricing["unit_price"].values[0]) + scaffolding_cost + elif measure_unit == "floor_m2": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["floor_area"] + elif measure_unit == "hlp_m2": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["hlp"] + elif measure_unit == "window": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["n_windows"] + else: + raise Exception("Unknown unit type") - if measure_unit == "floor_m2": - floor_area = measure["floor_area"] - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * floor_area - continue + measure["Total Cost"] = measure_cost + total_cost += measure_cost - if measure_unit == "hlp_m2": - hlp = 
measure["hlp"] - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * hlp - - raise Exception("Unknown unit type") + recommendation["total_cost"] = total_cost # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] - for survey in recommended_measures: survey_key = survey["survey_key"] starting_sap = survey["starting_sap"] + total_cost = survey.get("total_cost", 0) + for measure in survey.get("recommended_measures", []): normalized_measures.append({ "survey_key": survey_key, "starting_sap": starting_sap, "measure": measure["measure"], "description": measure.get("description", ""), - "sap_points": measure.get("sap_points", 0) + "sap_points": measure.get("sap_points", 0), + "measure_cost": measure.get("Total Cost", 0), + "total_cost": total_cost }) # Convert the normalized list into a DataFrame. @@ -559,12 +567,16 @@ def main(): fill_value=None ).reset_index() - # Step 3: Calculate the total sap points for each survey. - total_sap_points = measures_df.groupby("survey_key")["sap_points"].sum().reset_index() - total_sap_points.columns = ["survey_key", "total_sap_points"] + # Step 3: Calculate the total sap points and total cost for each survey. + sap_cost_totals = measures_df.groupby("survey_key").agg( + total_sap_points=("sap_points", "sum"), + total_cost_of_measures=("measure_cost", "sum") + ).reset_index() # Merge total sap points into the pivoted measures. - pivoted_measures = pd.merge(pivoted_measures, total_sap_points, on="survey_key", how="left") + pivoted_measures = pd.merge(pivoted_measures, sap_cost_totals, on="survey_key", how="left") + pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE + pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] # Step 4: Extract starting SAP for each survey key. 
starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] From 7513e475d3cac3a21a95b0096833a43914ee7974 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 10:57:26 +0000 Subject: [PATCH 012/255] adding in the basic structure of the extraction code --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../stonewater/Wave 3 Preparation.py | 92 +++++++++++++++++++ .../requirements/requirements-wave-3-prep.txt | 1 + 4 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 etl/customers/stonewater/Wave 3 Preparation.py create mode 100644 etl/customers/stonewater/requirements/requirements-wave-3-prep.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py new file mode 100644 index 00000000..bd916494 --- /dev/null +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -0,0 +1,92 @@ +import os +import PyPDF2 +import re + +FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" + + +def extract_summary_report(pdf_path): + """ + Extracts specific data from the provided PDF file. 
+ Data includes: + - Current SAP rating + - Fuel Bill + - Emissions (t/year) + """ + data = { + "Current SAP rating": None, + "Fuel Bill": None, + "Emissions (t/year)": None, + } + + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Extract Current SAP rating + sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) + if sap_match: + data["Current SAP rating"] = sap_match.group(1) + + # Extract Fuel Bill + fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) + if fuel_bill_match: + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + # Extract Emissions + emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text) + if emissions_match: + data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes" + + return data + + +def main(): + """ + This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. + """ + # List only directories in the specified FILE_PATH + survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] + + extracted_data = [] + for survey_folder in survey_folders: + # List the folders inside of the survey folder + survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder)) + if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))] + + if not survey_subfolders: + continue + + # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment: + # If it exists, we will use the data from that folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + # List contents of the retrofit folder + retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder)) + + if not retrofit_files: + continue + + # We now look for specific files: + # 1) Check the summary report.- the title will contain the word 
"summary" (lowercase) and the file extension is + # .pdf + summary_report = next( + (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None + ) + if summary_report is not None: + pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report) + summary_data = extract_summary_report(pdf_path) + summary_data = { + "survey_folder": survey_folder, + **summary_data + } + extracted_data.append(summary_data) + continue + + raise NotImplementedError("IMPLEMENT ME!") + + +if __name__ == "__main__": + main() diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt new file mode 100644 index 00000000..e9a5c8ea --- /dev/null +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -0,0 +1 @@ +PyPDF2 From 0332c77098b4b77576422eb6b1cf1898f0ed79c3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 11:21:54 +0000 Subject: [PATCH 013/255] [Crefactoring structure of extraction code --- .../stonewater/Wave 3 Preparation.py | 80 +++++++++++++------ 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bd916494..976a953f 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -43,6 +43,42 @@ def extract_summary_report(pdf_path): return data +def extract_retrofit_assessment_folder(retrofit_folder_path): + """ + Handles extraction from a retrofit assessment folder if it exists and has content. 
+ """ + retrofit_files = os.listdir(retrofit_folder_path) + + # Find the summary report in the retrofit folder + summary_report = next( + (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None + ) + + if summary_report: + pdf_path = os.path.join(retrofit_folder_path, summary_report) + return extract_summary_report(pdf_path) + + return None # If no relevant PDF is found + + +def extract_from_survey_folder_files(survey_folder_path): + """ + Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. + """ + survey_files = os.listdir(survey_folder_path) + + # Look for a summary report directly in the survey folder + summary_report = next( + (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None + ) + + if summary_report: + pdf_path = os.path.join(survey_folder_path, summary_report) + return extract_summary_report(pdf_path) + + return None # If no relevant PDF is found + + def main(): """ This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. 
@@ -52,40 +88,38 @@ def main(): extracted_data = [] for survey_folder in survey_folders: + survey_folder_path = os.path.join(FILE_PATH, survey_folder) + # List the folders inside of the survey folder - survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder)) - if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))] + survey_subfolders = [name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name))] - if not survey_subfolders: - continue - - # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment: - # If it exists, we will use the data from that folder + # Check if there's a "retrofit assessment" folder retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) - # List contents of the retrofit folder - retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder)) + # If retrofit assessment folder exists, check if it has content + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_assessment_folder(retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data + } + extracted_data.append(summary_data) + continue - if not retrofit_files: - continue - - # We now look for specific files: - # 1) Check the summary report.- the title will contain the word "summary" (lowercase) and the file extension is - # .pdf - summary_report = next( - (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) - if summary_report is not None: - pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report) - summary_data = extract_summary_report(pdf_path) + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = 
extract_from_survey_folder_files(survey_folder_path) + if summary_data: summary_data = { "survey_folder": survey_folder, **summary_data } extracted_data.append(summary_data) - continue - raise NotImplementedError("IMPLEMENT ME!") + print("Extracted Data:", extracted_data) if __name__ == "__main__": From cf2a94cb365b3903a733653136ae793b6a8299a4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 12:04:57 +0000 Subject: [PATCH 014/255] extracting epr --- .../stonewater/Wave 3 Preparation.py | 94 +++++++++++++++++-- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 976a953f..53d5bb34 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -43,6 +43,65 @@ def extract_summary_report(pdf_path): return data +def extract_epr(pdf_path): + """ + Extracts specific data from an Energy Report (EPR) PDF file. 
+ """ + data = { + "Address": None, + "Estimated Annual Costs": None, + "Current SAP": None, + "Space Heating": None, + "Water Heating": None, + "Fuel Bill": None, + } + + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Extract Address + address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + data["Address"] = address_match.group(1).strip() + + # Extract Total Floor Area + area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) + data["Total Floor Area"] = area_match.group(1) + + # Extract Estimated Annual Costs + cost_match = re.search(r"TOTAL\s*£(\d+)", text) + data["Estimated Annual Costs"] = f"£{cost_match.group(1)}" + + # Extract Current SAP rating + # Updated Regular Expression to find "GG (1-20)" followed by two numbers + sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + + # Extract and validate the Current and Potential SAP ratings + current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) + # Ensure potential is greater than or equal to current + if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: + data["Current SAP"] = current_sap + data["Potential SAP"] = potential_sap + else: + raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") + + # Extract Space Heating (kWh) + space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text) + data["Space Heating"] = int(space_heating_match.group(1)) + + # Extract Water Heating (kWh) + water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text) + data["Water Heating"] = int(water_heating_match.group(1)) + + # Extract Fuel Bill (total estimated costs) + fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + return data + + def extract_retrofit_assessment_folder(retrofit_folder_path): """ Handles 
extraction from a retrofit assessment folder if it exists and has content. @@ -61,22 +120,38 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): return None # If no relevant PDF is found +def is_energy_report(text): + """ + Determines if the provided text indicates that the PDF is an Energy Report. + Returns True if the text contains 'Energy Report'. + """ + return text.startswith("ENERGY REPORT") + + def extract_from_survey_folder_files(survey_folder_path): """ Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. """ - survey_files = os.listdir(survey_folder_path) + survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")] - # Look for a summary report directly in the survey folder - summary_report = next( - (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) + for pdf_file in survey_files: + pdf_path = os.path.join(survey_folder_path, pdf_file) - if summary_report: - pdf_path = os.path.join(survey_folder_path, summary_report) - return extract_summary_report(pdf_path) + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" - return None # If no relevant PDF is found + if is_energy_report(first_page_text): + # Treat this as an Energy Report + return extract_epr(pdf_path) + elif "summary" in pdf_file.lower(): + # Treat this as a Summary Report + return extract_summary_report(pdf_path) + else: + raise NotImplementedError("Implement me") + + return None def main(): @@ -109,7 +184,6 @@ def main(): } extracted_data.append(summary_data) continue - # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_from_survey_folder_files(survey_folder_path) if summary_data: From 33ea47e71d8b0a226629400dca5b6400b46daf96 Mon Sep 17 00:00:00 2001 From: Khalim 
Conn-Kowlessar Date: Mon, 28 Oct 2024 12:42:28 +0000 Subject: [PATCH 015/255] fixed address extraction --- .../stonewater/Wave 3 Preparation.py | 47 ++++++++++++++----- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 53d5bb34..bc567bd2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,6 +1,7 @@ import os import PyPDF2 import re +import pandas as pd FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -11,12 +12,12 @@ def extract_summary_report(pdf_path): Data includes: - Current SAP rating - Fuel Bill - - Emissions (t/year) + - Address """ data = { - "Current SAP rating": None, + "Address": None, + "Current SAP Rating": None, "Fuel Bill": None, - "Emissions (t/year)": None, } with open(pdf_path, "rb") as file: @@ -28,17 +29,36 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) if sap_match: - data["Current SAP rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) if fuel_bill_match: data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - # Extract Emissions - emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text) - if emissions_match: - data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes" + # Extract individual address components + postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + locality = 
re.search(r"Locality:\s*(.*?)\nTown:", text) + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + + # Clean extracted values and remove any prefixes + address_parts = [ + house_no.group(1).strip() if house_no else "", + house_name.group(1).strip() if house_name else "", + street.group(1).strip() if street else "", + locality.group(1).strip() if locality else "", + town.group(1).strip() if town else "", + county.group(1).strip() if county else "", + postcode.group(1).strip() if postcode else "" + ] + + # Join non-empty parts with a comma + data["Address"] = ", ".join([part for part in address_parts if part]) return data @@ -49,8 +69,7 @@ def extract_epr(pdf_path): """ data = { "Address": None, - "Estimated Annual Costs": None, - "Current SAP": None, + "Current SAP Rating": None, "Space Heating": None, "Water Heating": None, "Fuel Bill": None, @@ -82,8 +101,8 @@ def extract_epr(pdf_path): current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) # Ensure potential is greater than or equal to current if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: - data["Current SAP"] = current_sap - data["Potential SAP"] = potential_sap + data["Current SAP Rating"] = current_sap + data["Potential SAP Rating"] = potential_sap else: raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") @@ -117,6 +136,8 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): pdf_path = os.path.join(retrofit_folder_path, summary_report) return extract_summary_report(pdf_path) + raise Exception("Not Implemented") + return None # If no relevant PDF is found @@ -193,7 +214,7 @@ def main(): } extracted_data.append(summary_data) - print("Extracted Data:", extracted_data) + extracted_data = pd.DataFrame(extracted_data) if __name__ == "__main__": diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt 
b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index e9a5c8ea..2cabb047 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1 +1,2 @@ PyPDF2 +pandas From c68e4f017e48f4cb12639cbd9f69ce40849e68fd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 12:43:59 +0000 Subject: [PATCH 016/255] additional data cleaning --- etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bc567bd2..c6736ba8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -86,12 +86,8 @@ def extract_epr(pdf_path): data["Address"] = address_match.group(1).strip() # Extract Total Floor Area - area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) - data["Total Floor Area"] = area_match.group(1) - - # Extract Estimated Annual Costs - cost_match = re.search(r"TOTAL\s*£(\d+)", text) - data["Estimated Annual Costs"] = f"£{cost_match.group(1)}" + # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) + # data["Total Floor Area"] = area_match.group(1) # Extract Current SAP rating # Updated Regular Expression to find "GG (1-20)" followed by two numbers @@ -216,6 +212,5 @@ def main(): extracted_data = pd.DataFrame(extracted_data) - -if __name__ == "__main__": - main() +# if __name__ == "__main__": +# main() From 70d02075cf1da79ccce4950cb8080a9b05745a6d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:16:33 +0000 Subject: [PATCH 017/255] allowing extract_retrofit_assessment_folder to handle eprs --- .../stonewater/Wave 3 Preparation.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py 
b/etl/customers/stonewater/Wave 3 Preparation.py index c6736ba8..14e50460 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -121,20 +121,25 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): """ Handles extraction from a retrofit assessment folder if it exists and has content. """ - retrofit_files = os.listdir(retrofit_folder_path) + retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")] - # Find the summary report in the retrofit folder - summary_report = next( - (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) + for pdf_file in retrofit_files: + pdf_path = os.path.join(retrofit_folder_path, pdf_file) - if summary_report: - pdf_path = os.path.join(retrofit_folder_path, summary_report) - return extract_summary_report(pdf_path) + # Attempt to read the first page of the PDF to determine the report type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" - raise Exception("Not Implemented") + if is_energy_report(first_page_text): + # Treat this as an Energy Report + return extract_epr(pdf_path) + elif "summary" in pdf_file.lower(): + # Treat this as a Summary Report + return extract_summary_report(pdf_path) - return None # If no relevant PDF is found + # If no relevant PDF is found, raise an exception + raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") def is_energy_report(text): From 371f17f87e986a5d70ae7b0e66f9748f82adac6e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:20:33 +0000 Subject: [PATCH 018/255] adding additional catch for summary report --- etl/customers/stonewater/Wave 3 Preparation.py | 14 +++++++++++++- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git 
a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 14e50460..dc71d449 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2,6 +2,7 @@ import os import PyPDF2 import re import pandas as pd +from tqdm import tqdm FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -137,6 +138,10 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): elif "summary" in pdf_file.lower(): # Treat this as a Summary Report return extract_summary_report(pdf_path) + elif is_summary_report(first_page_text): + # other ways to detect a summary report + # Treat this as a Summary Report + return extract_summary_report(pdf_path) # If no relevant PDF is found, raise an exception raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") @@ -150,6 +155,13 @@ def is_energy_report(text): return text.startswith("ENERGY REPORT") +def is_summary_report(text): + """ + Determines if the provided text indicates that the PDF is a Summary Report. + """ + return text.startswith("Summary Information") + + def extract_from_survey_folder_files(survey_folder_path): """ Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. 
@@ -184,7 +196,7 @@ def main(): survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] extracted_data = [] - for survey_folder in survey_folders: + for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(FILE_PATH, survey_folder) # List the folders inside of the survey folder diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 2cabb047..70bec3cc 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1,2 +1,3 @@ PyPDF2 pandas +tqdm From 4e9acdeb8e2222b7c44c05749667fe258fa87982 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:23:34 +0000 Subject: [PATCH 019/255] refactored --- .../stonewater/Wave 3 Preparation.py | 67 +++++++------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index dc71d449..30a23e86 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -118,30 +118,15 @@ def extract_epr(pdf_path): return data -def extract_retrofit_assessment_folder(retrofit_folder_path): +def extract_retrofit_pdfs(data_folder_path): """ - Handles extraction from a retrofit assessment folder if it exists and has content. + Handles extraction from a retrofit data folder if it exists and has content. 
""" - retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")] + retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")] for pdf_file in retrofit_files: - pdf_path = os.path.join(retrofit_folder_path, pdf_file) - - # Attempt to read the first page of the PDF to determine the report type - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - first_page_text = reader.pages[0].extract_text() if reader.pages else "" - - if is_energy_report(first_page_text): - # Treat this as an Energy Report - return extract_epr(pdf_path) - elif "summary" in pdf_file.lower(): - # Treat this as a Summary Report - return extract_summary_report(pdf_path) - elif is_summary_report(first_page_text): - # other ways to detect a summary report - # Treat this as a Summary Report - return extract_summary_report(pdf_path) + pdf_path = os.path.join(data_folder_path, pdf_file) + return detect_and_parse_report(pdf_path, pdf_file) # If no relevant PDF is found, raise an exception raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") @@ -162,30 +147,26 @@ def is_summary_report(text): return text.startswith("Summary Information") -def extract_from_survey_folder_files(survey_folder_path): +def detect_and_parse_report(pdf_path, pdf_file): """ - Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. + Detects the type of report and extracts the relevant data. 
+ :param pdf_path: String path to the PDF file + :param pdf_file: String name of the PDF file + :return: """ - survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")] + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" - for pdf_file in survey_files: - pdf_path = os.path.join(survey_folder_path, pdf_file) - - # Attempt to read the first page of the PDF to determine type - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - first_page_text = reader.pages[0].extract_text() if reader.pages else "" - - if is_energy_report(first_page_text): - # Treat this as an Energy Report - return extract_epr(pdf_path) - elif "summary" in pdf_file.lower(): - # Treat this as a Summary Report - return extract_summary_report(pdf_path) - else: - raise NotImplementedError("Implement me") - - return None + if is_energy_report(first_page_text): + # Treat this as an Energy Report + return extract_epr(pdf_path) + elif "summary" in pdf_file.lower(): + # Treat this as a Summary Report + return extract_summary_report(pdf_path) + else: + raise NotImplementedError("Implement me") def main(): @@ -210,7 +191,7 @@ def main(): if retrofit_folder: retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) if os.listdir(retrofit_folder_path): # If not empty - summary_data = extract_retrofit_assessment_folder(retrofit_folder_path) + summary_data = extract_retrofit_pdfs(retrofit_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, @@ -219,7 +200,7 @@ def main(): extracted_data.append(summary_data) continue # If no retrofit folder or it was empty, check files in survey_folder - summary_data = extract_from_survey_folder_files(survey_folder_path) + summary_data = extract_retrofit_pdfs(survey_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, 
From 1db4c4319e2b7992405fb977705a90e8b3fb8618 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:28:27 +0000 Subject: [PATCH 020/255] removing raising of exception at end of function --- etl/customers/stonewater/Wave 3 Preparation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 30a23e86..777f96c5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -128,8 +128,8 @@ def extract_retrofit_pdfs(data_folder_path): pdf_path = os.path.join(data_folder_path, pdf_file) return detect_and_parse_report(pdf_path, pdf_file) - # If no relevant PDF is found, raise an exception - raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") + # If no relevant PDF is found, exit + return None def is_energy_report(text): @@ -199,6 +199,10 @@ def main(): } extracted_data.append(summary_data) continue + else: + # Then we have an empty Retrofit Assessment folder + continue + # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_retrofit_pdfs(survey_folder_path) if summary_data: From 2a17831c7223e7614c6413c2f2b4fa09aca3d3a9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 17:16:27 +0000 Subject: [PATCH 021/255] added detection of condition report --- etl/customers/aiha/xml_extraction.py | 26 ++++++++++--------- .../stonewater/Wave 3 Preparation.py | 18 ++++++++++--- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index c246105a..038e8593 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -74,25 +74,26 @@ def main(): # The properties will still have "Very poor" ratings for their hot water # TODO - # - AIH001-03 has a basement and so 
we should discount this area from the ground floor # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft + # [Can't remember, not clear - Chenai will check] # - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the # best option for this property due to it being extrememly large and the walls being uninsulated. It might not # be performant enough in the winter, when COP will be more like 1.5. # - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are # in the property? Does it make sense to have such a large solar PV system (5.6kWp)? # - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C - # - Generally, should we consider insulated doors? + # - Potential measure - search for the cylinder and insulate it # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same - # buulding - # - AIH001-09 - The extension is 1900-1929 but has a cavity wall - # - AIH001-09 - Is it not possible to install a loft hatch? - # - AIH001-09 - Why is there assumed secondary heating? + # buulding [Question for Lewis & Kevin] + # - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from + # the other unit] + # - AIH001-09 - Why is there assumed secondary heating? [Question for Lewis & Kevin] # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? - # - AIH001-11 - The layout of this unit is confusing, is there roof access? - # - AIH001-12 - Why was there not access to the cylinder? - # - AIH001-12 - Is the need to draught proofing due to the windows? - # - AIH001-04 - is the flat roof area correct? + # [Question for Lewis & Kevin] + # - AIH001-11 - The layout of this unit is confusing, is there roof access? [NO!!!! 
- It's a Sun room!!] + # - AIH001-12 - Why was there not access to the cylinder? [Sealed shut] + # - AIH001-12 - Is the need to draught proofing due to the windows? [This would be addressed by deailing with the + # windows] recommended_measures = [ { @@ -113,7 +114,7 @@ def main(): }, { "measure": "Solar PV", - "description": "4kWp Solar PV system", + "description": "5.6kWp Solar PV system", "config": [ { "size": "4kWp", @@ -497,6 +498,7 @@ def main(): {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, @@ -505,7 +507,7 @@ def main(): {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, - {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'} + {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}, ] pricing_data = pd.DataFrame(pricing_data) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 777f96c5..62cec009 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -126,7 +126,10 @@ def extract_retrofit_pdfs(data_folder_path): for pdf_file in retrofit_files: pdf_path = os.path.join(data_folder_path, pdf_file) - return detect_and_parse_report(pdf_path, pdf_file) + extracted = detect_and_parse_report(pdf_path, pdf_file) 
+ if extracted is not None: + return extracted + continue # If no relevant PDF is found, exit return None @@ -165,10 +168,19 @@ def detect_and_parse_report(pdf_path, pdf_file): elif "summary" in pdf_file.lower(): # Treat this as a Summary Report return extract_summary_report(pdf_path) + elif is_condition_report(first_page_text): + return None else: raise NotImplementedError("Implement me") +def is_condition_report(text): + """ + Determines if the provided text indicates that the PDF is a Condition Report. + """ + return text.startswith("OsmosisACDNEWPAS2035ConditionReport") + + def main(): """ This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. @@ -191,7 +203,7 @@ def main(): if retrofit_folder: retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) if os.listdir(retrofit_folder_path): # If not empty - summary_data = extract_retrofit_pdfs(retrofit_folder_path) + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, @@ -204,7 +216,7 @@ def main(): continue # If no retrofit folder or it was empty, check files in survey_folder - summary_data = extract_retrofit_pdfs(survey_folder_path) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, From 54b09e88e15cfd6c824beff23f878525cb9d5d16 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 17:20:05 +0000 Subject: [PATCH 022/255] added usage of is_summary_report --- etl/customers/stonewater/Wave 3 Preparation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 62cec009..988a544a 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -165,7 +165,7 @@ def detect_and_parse_report(pdf_path, 
pdf_file): if is_energy_report(first_page_text): # Treat this as an Energy Report return extract_epr(pdf_path) - elif "summary" in pdf_file.lower(): + elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): # Treat this as a Summary Report return extract_summary_report(pdf_path) elif is_condition_report(first_page_text): From 6e8d9a025cc5b64c1a632bd9c95de140e9e58f82 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 19:26:14 +0000 Subject: [PATCH 023/255] adjusting search epc function to handle pydantic issues for the moment --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/SearchEpc.py | 10 +- .../livewest/route_march_2024_10_28.py | 171 ++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 2 + 5 files changed, 178 insertions(+), 9 deletions(-) create mode 100644 etl/customers/livewest/route_march_2024_10_28.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..850c0cda 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..e4070118 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 367d8c85..f9e978c6 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -256,16 +256,12 @@ class SearchEpc: else: params = {"address": self.address1, "postcode": self.postcode} + url = os.path.join(self.client.domestic.host, "search") + for retry in range(self.max_retries): try: - if "uprn" in params: - # We use the direct call method inside, since we need to implement uprn as a valid - # parameter for the search function - url = os.path.join(self.client.domestic.host, "search") - response = self.client.domestic.call(method="get", url=url, params=params) - else: - response = self.client.domestic.search(params=params, size=size) + response = self.client.domestic.call(method="get", url=url, params=params) if response: self.data = response diff --git 
a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py new file mode 100644 index 00000000..fff1e7e7 --- /dev/null +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -0,0 +1,171 @@ +import os + +import pandas as pd +from tqdm import tqdm + +from dotenv import load_dotenv +from utils.s3 import read_excel_from_s3 +from backend.SearchEpc import SearchEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0 + ) + + epc_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = 
searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "asset_list_address": full_address, + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + + epc_df = pd.DataFrame(epc_data) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "asset_list_address", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description" + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + left_on=["ADDRESS"], + right_on=["asset_list_address"] + ) + + asset_list = asset_list.drop(columns=["asset_list_address"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]), axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + asset_list["Number of Habitable Rooms"] = 
asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"], + axis=1 + ) + + # Store as an excel + filename = "LHP EPC Data pull.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 988a544a..8e1a7fdb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -226,5 +226,7 @@ def main(): extracted_data = pd.DataFrame(extracted_data) + missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()] + # if __name__ == "__main__": # main() From 86ca5b40074015c20dd35fe38eda7ac3799139f4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 19:50:09 +0000 Subject: [PATCH 024/255] addded catch for condition report --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../livewest/route_march_2024_10_28.py | 69 ++++++++++--------- .../stonewater/Wave 3 Preparation.py | 2 +- 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 850c0cda..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 
e4070118..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py index fff1e7e7..47b86e89 100644 --- a/etl/customers/livewest/route_march_2024_10_28.py +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -1,4 +1,5 @@ import os +import time import pandas as pd from tqdm import tqdm @@ -46,42 +47,46 @@ def app(): ) epc_data = [] + errors = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - - postcode = home["Postcode"] - house_number = home["Number"] - full_address = home["Full Address"] - - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] - epc = { - "asset_list_address": full_address, - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"] - } + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None - epc_data.append(epc) + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + 
try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "asset_list_address": full_address, + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(e) + time.sleep(5) epc_df = pd.DataFrame(epc_data) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8e1a7fdb..fc11f1c0 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -178,7 +178,7 @@ def is_condition_report(text): """ Determines if the provided text indicates that the PDF is a Condition Report. """ - return text.startswith("OsmosisACDNEWPAS2035ConditionReport") + return text.startswith("OsmosisACDNEWPAS2035ConditionReport") or text.startswith("OsmosisACDPAS2035ConditionReport") def main(): From 8bf5b23410caccce29ddfaaf30953c1b48db4c7d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 20:29:31 +0000 Subject: [PATCH 025/255] handling extraction of windows data --- .../livewest/route_march_2024_10_28.py | 3 +- .../stonewater/Wave 3 Preparation.py | 58 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py index 47b86e89..c19c78b1 100644 --- a/etl/customers/livewest/route_march_2024_10_28.py +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -61,7 +61,8 @@ def app(): os_api_key="", property_type=None, fast=True, - full_address=full_address + full_address=full_address, + max_retries=3 ) # Force the skipping of estimating the EPC searcher.ordnance_survey_client.property_type = None diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fc11f1c0..a8e06416 100644 --- 
a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,6 +3,7 @@ import PyPDF2 import re import pandas as pd from tqdm import tqdm +from collections import Counter FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -19,6 +20,8 @@ def extract_summary_report(pdf_path): "Address": None, "Current SAP Rating": None, "Fuel Bill": None, + "Window Age Description": None, + "Window Age Description Proportion (%)": None, } with open(pdf_path, "rb") as file: @@ -61,9 +64,56 @@ def extract_summary_report(pdf_path): # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) + return data +def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. + + Parameters: + windows_text (str): The text section containing window data. + + Returns: + dict: A dictionary with the most common window age description and its proportion. 
+ """ + # Clean up windows_text by removing line breaks for better pattern matching + windows_text = windows_text.replace("\n", "") + + # Define possible window age descriptions + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + + # Count occurrences of each description + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + # Determine the most common description and calculate its proportion + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion + } + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. 
@@ -74,6 +124,8 @@ def extract_epr(pdf_path): "Space Heating": None, "Water Heating": None, "Fuel Bill": None, + "Window Age Description": None, + "Window Age Description Proportion (%)": None, } with open(pdf_path, "rb") as file: @@ -115,6 +167,12 @@ def extract_epr(pdf_path): fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + # Extract the windows data + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) + return data From e22baed16fcf6ce86e38266d557aab3cc529953d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 12:29:24 +0000 Subject: [PATCH 026/255] sorted livewest data pull --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../livewest/route_march_2024_10_28.py | 148 ++++++++++++------ .../stonewater/Wave 3 Preparation.py | 2 + 4 files changed, 102 insertions(+), 52 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..850c0cda 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..e4070118 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py index c19c78b1..1b259fba 100644 --- a/etl/customers/livewest/route_march_2024_10_28.py +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -19,6 +19,53 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +def get_data(asset_list): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + 
auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + def app(): """ This app is EPC pulling data for some properties owned by Livewest @@ -45,56 +92,49 @@ def app(): asset_list = pd.read_excel( "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0 ) + asset_list["row_id"] = asset_list.index - epc_data = [] - errors = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - try: - postcode = home["Postcode"] - house_number = home["Number"] - full_address = home["Full Address"] + epc_data, errors = get_data(asset_list) - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=3 - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data(asset_list_failed) - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons - try: - 
property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - epc = { - "asset_list_address": full_address, - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"] - } - - epc_data.append(epc) - except Exception as e: - errors.append(e) - time.sleep(5) + # Append the failed data to the main data + epc_data.extend(epc_data_failed) epc_df = pd.DataFrame(epc_data) + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + # Retrieve just the data we need epc_df = epc_df[ [ - "asset_list_address", + "row_id", "uprn", "property-type", "built-form", @@ -110,7 +150,7 @@ def app(): "construction-age-band", "floor-height", "number-habitable-rooms", - "mainheat-description" + "mainheat-description", # "energy-consumption-current", # kwh/m2 ] @@ -119,11 +159,14 @@ def app(): asset_list = asset_list.merge( epc_df, how="left", - left_on=["ADDRESS"], - right_on=["asset_list_address"] + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" ) 
- asset_list = asset_list.drop(columns=["asset_list_address"]) + asset_list = asset_list.drop(columns=["row_id"]) # Rename the columns asset_list = asset_list.rename(columns={ @@ -140,14 +183,18 @@ def app(): "roof-description": "Roof Construction", "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC" + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" }) asset_list["Estimated Number of Floors"] = asset_list.apply( - lambda x: estimate_number_of_floors(property_type=x["Property Type"]), axis=1 + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 ) asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) asset_list["Estimated Perimeter (m)"] = asset_list.apply( @@ -157,7 +204,7 @@ def app(): ), axis=1 ) - asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply( + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( lambda x: estimate_external_wall_area( num_floors=x["Estimated Number of Floors"], floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, @@ -168,10 +215,11 @@ def app(): ) asset_list["Roof Insulation Thickness"] = asset_list.apply( - lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"], + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, axis=1 ) # Store as an excel - filename = "LHP EPC Data pull.xlsx" + filename = "livewest EPC Data pull - 29 Oct.xlsx" asset_list.to_excel(filename, index=False) diff --git 
a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index a8e06416..d8d01b22 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -283,6 +283,8 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + # Save this as a csv + # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()] From b7f402ba9d699ede3693068f8bec9e2087c0a8aa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 13:55:18 +0000 Subject: [PATCH 027/255] addded # Storeys --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/stonewater/Wave 3 Preparation.py | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 850c0cda..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index e4070118..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d8d01b22..b1b48cec 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -19,6 +19,7 @@ def extract_summary_report(pdf_path): data = { "Address": None, "Current SAP Rating": None, + "Number of Storeys": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -32,13 +33,15 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) - if sap_match: - data["Current SAP Rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1) + + # Number of storeys + storeys_match = re.search(r"Number of 
Storeys:\s*(\d+)", text) + data["Number of Storeys"] = int(storeys_match.group(1)) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) - if fuel_bill_match: - data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" # Extract individual address components postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) From 753bda6cb0bc4c8de266944c04ab99db7d74da3d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:21:01 +0000 Subject: [PATCH 028/255] extracting heating systems from summary report --- .../stonewater/Wave 3 Preparation.py | 86 ++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b1b48cec..863a6a6c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -19,10 +19,26 @@ def extract_summary_report(pdf_path): data = { "Address": None, "Current SAP Rating": None, - "Number of Storeys": None, + "Space Heating": None, + "Water Heating": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, + "Secondary Window Age Description": None, + "Secondary Window Age Description Proportion (%)": None, + "Number of Windows": None, + "Total Number of Doors": None, + "Number of Insulated Doors": None, + "Existing Primary Heating System": None, + "Existing Primary Heating PCDF Reference": None, + "Existing Primary Heating Controls": None, + "Existing Primary Heating % of Heat": None, + "Existing Secondary Heating System": None, + "Existing Secondary Heating PCDF Reference": None, + "Existing Secondary Heating Controls": None, + "Existing Secondary Heating % of Heat": None, + "Secondary Heating Code": None, + "Water Heating Code": None, } with open(pdf_path, "rb") as file: @@ -39,6 +55,10 @@ def extract_summary_report(pdf_path): storeys_match 
= re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) + # Extract Carbon Emissions + carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) + data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) + # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" @@ -66,12 +86,58 @@ def extract_summary_report(pdf_path): # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) + data["Postcode"] = postcode.group(1).strip() windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) windows_text = windows_section.group(1) window_data = extract_window_age_description(windows_text) data.update(window_data) + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Extract heating system + # Extract Primary Heating Data + # Extract Primary Heating Section + primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + primary_text).group(1) + data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) + ) + + # Extract Secondary Heating Section + 
secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( + 1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + return data @@ -111,9 +177,20 @@ def extract_window_age_description(windows_text): most_common_description, window_count = description_counts.most_common(1)[0] window_proportion = window_count / sum(description_counts.values()) * 100 + # Get the second most common and the proportion + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + return { "Window Age Description": most_common_description, - "Window Age Description Proportion (%)": window_proportion + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": 
sum(description_counts.values()) } @@ -129,6 +206,11 @@ def extract_epr(pdf_path): "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, + "Secondary Window Age Description": None, + "Secondary Window Age Description Proportion (%)": None, + "Number of Windows": None, + "Total Number of Doors": None, + "Number of Insulated Doors": None, } with open(pdf_path, "rb") as file: From 364b5b07e8f1ff29b3da3625014e4250fc5954ce Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:46:01 +0000 Subject: [PATCH 029/255] adding to extract eprs --- .../stonewater/Wave 3 Preparation.py | 101 +++++++++++++----- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 863a6a6c..4ab33732 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -18,6 +18,7 @@ def extract_summary_report(pdf_path): """ data = { "Address": None, + "Postcode": None, "Current SAP Rating": None, "Space Heating": None, "Water Heating": None, @@ -200,7 +201,9 @@ def extract_epr(pdf_path): """ data = { "Address": None, + "Postcode": None, "Current SAP Rating": None, + "Potential SAP Rating": None, "Space Heating": None, "Water Heating": None, "Fuel Bill": None, @@ -211,6 +214,16 @@ def extract_epr(pdf_path): "Number of Windows": None, "Total Number of Doors": None, "Number of Insulated Doors": None, + "Existing Primary Heating System": None, + "Existing Primary Heating PCDF Reference": None, + "Existing Primary Heating Controls": None, + "Existing Primary Heating % of Heat": None, + "Existing Secondary Heating System": None, + "Existing Secondary Heating PCDF Reference": None, + "Existing Secondary Heating Controls": None, + "Existing Secondary Heating % of Heat": None, + "Secondary Heating Code": None, + "Water Heating Code": None, } with open(pdf_path, "rb") as file: @@ 
-222,41 +235,73 @@ def extract_epr(pdf_path): # Extract Address address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) data["Address"] = address_match.group(1).strip() + data["Postcode"] = data["Address"].split(",")[-1].strip() - # Extract Total Floor Area - # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) - # data["Total Floor Area"] = area_match.group(1) - - # Extract Current SAP rating - # Updated Regular Expression to find "GG (1-20)" followed by two numbers + # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap - # Extract and validate the Current and Potential SAP ratings - current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) - # Ensure potential is greater than or equal to current - if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: - data["Current SAP Rating"] = current_sap - data["Potential SAP Rating"] = potential_sap - else: - raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") - - # Extract Space Heating (kWh) - space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text) - data["Space Heating"] = int(space_heating_match.group(1)) - - # Extract Water Heating (kWh) - water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text) - data["Water Heating"] = int(water_heating_match.group(1)) - - # Extract Fuel Bill (total estimated costs) + # Extract Fuel Bill fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - # Extract the windows data + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = 
re.search(r"Insulated Doors:\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Extract Primary Heating Section (Main Heating 1) + primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + primary_text).group(1) + data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) + ) + + # Extract Secondary Heating Section (Main Heating 2) + secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( + 1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + + if data["Existing Secondary Heating System"] == "": + data["Existing Secondary Heating Controls"] = "" + else: + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + 
data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + # Extract Windows information windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - data.update(window_data) + if windows_section: + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) return data From 9eb4720c91d22ed2084364d92a0c99cbb3088adc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:54:19 +0000 Subject: [PATCH 030/255] added peui --- etl/customers/stonewater/Wave 3 Preparation.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4ab33732..1b7b1bcd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -20,9 +20,8 @@ def extract_summary_report(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, - "Space Heating": None, - "Water Heating": None, "Fuel Bill": None, + "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, "Secondary Window Age Description": None, @@ -203,9 +202,8 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, - "Potential SAP Rating": None, - "Space Heating": None, - "Water Heating": None, + "Primary Energy Use Intensity (kWh/m2/yr)": None, + "Number of Storeys": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -242,6 +240,14 @@ def extract_epr(pdf_path): current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) data["Current SAP Rating"] = current_sap + # Extract the primary 
energy use intensity + additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + + # Extract Number of Storeys + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + data["Number of Storeys"] = int(storeys_match.group(1)) + # Extract Fuel Bill fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" From b74b8823d18d428888fd832c515cc81cb2c6bdf1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:59:32 +0000 Subject: [PATCH 031/255] fixing bug extracting from epr --- .../stonewater/Wave 3 Preparation.py | 54 ++++++++++++------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 1b7b1bcd..02a5cd83 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -261,36 +261,50 @@ def extract_epr(pdf_path): data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) # Extract Primary Heating Section (Main Heating 1) - primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + # We may not have a secondary heating + primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 primary_text = primary_heating_section.group(1) - data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( - 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - primary_text).group(1) - data["Existing 
Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( - 1).strip() + data["Existing Primary Heating System"] = re.search( + r"Main Heating Code\s*(.*?)\n", primary_text + ).group(1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1) + data["Existing Primary Heating Controls"] = re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip() data["Existing Primary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) ) # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) - secondary_text = secondary_heating_section.group(1) - - data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( - 1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - - if data["Existing Secondary Heating System"] == "": + if secondary_heating_section is None: + data["Existing Secondary Heating System"] = "" + data["Existing Secondary Heating PCDF Reference"] = "" data["Existing Secondary Heating Controls"] = "" + data["Existing Secondary Heating % of Heat"] = 0 + else: - data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", - secondary_text).group(1).strip() - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) - ) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search( + r"Main Heating Code\s*(.*?)\n", secondary_text + ).group(1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + + if data["Existing Secondary Heating System"] 
== "": + data["Existing Secondary Heating Controls"] = "" + else: + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) # Extract Secondary Heating and Water Heating Codes secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) From 9e752fca8db65d829cdac4ff15fc874fd086ad6d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 15:03:23 +0000 Subject: [PATCH 032/255] handling edge case extracting from summary report --- etl/customers/stonewater/Wave 3 Preparation.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 02a5cd83..0af43310 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -109,10 +109,12 @@ def extract_summary_report(pdf_path): data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - primary_text).group(1) - data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( - 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1) + data["Existing Primary Heating Controls"] = re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip() data["Existing Primary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) ) @@ -125,8 +127,10 @@ def extract_summary_report(pdf_path): 1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler 
Reference\s*(\d+)", secondary_text).group(1) - data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", - secondary_text).group(1).strip() + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) data["Existing Secondary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) ) @@ -287,7 +291,7 @@ def extract_epr(pdf_path): data["Existing Secondary Heating PCDF Reference"] = "" data["Existing Secondary Heating Controls"] = "" data["Existing Secondary Heating % of Heat"] = 0 - + else: secondary_text = secondary_heating_section.group(1) From a9ce5b68bb6b506b62179c7abac5f43da2498ad1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 15:11:16 +0000 Subject: [PATCH 033/255] debug extract of main heating code --- etl/customers/stonewater/Wave 3 Preparation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0af43310..bb100ae1 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -123,8 +123,8 @@ def extract_summary_report(pdf_path): secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) secondary_text = secondary_heating_section.group(1) - data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( - 1).strip() + main_heating_code_match = re.search(r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text) + data["Existing Secondary Heating System"] = main_heating_code_match.group(1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", 
secondary_text).group(1) second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) @@ -139,7 +139,11 @@ def extract_summary_report(pdf_path): secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data From 48369ae1505a769339f7adaf713d809e0bfdd208 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 15:18:11 +0000 Subject: [PATCH 034/255] refactor to prioritise epc --- .../stonewater/Wave 3 Preparation.py | 66 +++++++++++++++---- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bb100ae1..7f4f81e9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -123,8 +123,10 @@ def extract_summary_report(pdf_path): secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) secondary_text = secondary_heating_section.group(1) - main_heating_code_match = re.search(r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text) - data["Existing Secondary Heating System"] = main_heating_code_match.group(1).strip() + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) 
second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) @@ -299,11 +301,14 @@ def extract_epr(pdf_path): else: secondary_text = secondary_heating_section.group(1) - data["Existing Secondary Heating System"] = re.search( - r"Main Heating Code\s*(.*?)\n", secondary_text - ).group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() + + data["Existing Secondary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", secondary_text + ).group(1) if data["Existing Secondary Heating System"] == "": data["Existing Secondary Heating Controls"] = "" @@ -334,20 +339,57 @@ def extract_epr(pdf_path): return data +def detect_report_type(pdf_path, pdf_file): + """ + Detects the type of report based on content or filename. + :param pdf_path: String path to the PDF file + :param pdf_file: String name of the PDF file + :return: String type of the report ("epr", "summary", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" + + if is_energy_report(first_page_text): + return "epr" + elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): + return "summary" + elif is_condition_report(first_page_text): + return "condition" + + return None + + def extract_retrofit_pdfs(data_folder_path): """ Handles extraction from a retrofit data folder if it exists and has content. + Prioritizes extracting data from an EPR if both EPR and summary report are present. 
""" retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")] + report_types = {"epr": None, "summary": None} + # First, identify the types of reports available for pdf_file in retrofit_files: pdf_path = os.path.join(data_folder_path, pdf_file) - extracted = detect_and_parse_report(pdf_path, pdf_file) - if extracted is not None: - return extracted - continue + report_type = detect_report_type(pdf_path, pdf_file) - # If no relevant PDF is found, exit + if report_type == "epr": + report_types["epr"] = pdf_path + elif report_type == "summary": + report_types["summary"] = pdf_path + + # Stop checking further if both EPR and summary are found + if report_types["epr"] and report_types["summary"]: + break + + # Extract data based on report availability and priority + if report_types["epr"]: + return extract_epr(report_types["epr"]) + elif report_types["summary"]: + return extract_summary_report(report_types["summary"]) + + # If no relevant PDF is found, return None return None From 5af1836aa7731613ed58437586ca7e592a66150a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 16:32:25 +0000 Subject: [PATCH 035/255] extracting dimensions from epr --- .../stonewater/Wave 3 Preparation.py | 82 ++++++++++++++++++- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7f4f81e9..0b660c76 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -16,6 +16,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ + blah data = { "Address": None, "Postcode": None, @@ -56,8 +57,8 @@ def extract_summary_report(pdf_path): data["Number of Storeys"] = int(storeys_match.group(1)) # Extract Carbon Emissions - carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) - data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) + # carbon_match = 
re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) + # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) @@ -204,6 +205,69 @@ def extract_window_age_description(windows_text): } +def extract_building_parts_epr(text): + """ + Extracts building parts and associated dimensions from the provided PDF file. + Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length. + """ + data = [] + + # Pattern to locate each "Building part" section + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party " + r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)", + re.DOTALL + ) + + # Extract each building part + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + # Clean up building part name to keep only the descriptor (e.g., "Main" or "1st Extension") + cleaned_part_name = re.sub(r" - built in.*", "", part_name) + + floor_data = match.group(2) + + # Pattern to match each floor's measurements + floor_pattern = re.compile( + r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract floor details for each building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + # We now extract out the aggregated data + + main_building = [part for part in data if "Main" in part["Building Part"]] + 
first_extension = [part for part in data if "1st Extension" in part["Building Part"]] + dimensions = { + "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), + "Total Ground Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] + ), + "RIR Floor Area": 0, + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building]), + "First Extension Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension]) if first_extension else 0, + } + + return dimensions + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. @@ -212,6 +276,7 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, "Fuel Bill": None, @@ -232,6 +297,11 @@ def extract_epr(pdf_path): "Existing Secondary Heating % of Heat": None, "Secondary Heating Code": None, "Water Heating Code": None, + 'Total Floor Area (m2)': None, + 'Total Ground Floor Area': None, + 'RIR Floor Area': None, + 'Main Building Wall Area (m2)': None, + 'First Extension Wall Area (m2)': None } with open(pdf_path, "rb") as file: @@ -336,6 +406,9 @@ def extract_epr(pdf_path): window_data = extract_window_age_description(windows_text) data.update(window_data) + building_parts = extract_building_parts_epr(text) + data.update(building_parts) + return data @@ -465,7 +538,7 @@ def main(): if summary_data: summary_data = { "survey_folder": survey_folder, - **summary_data + **summary_data, } extracted_data.append(summary_data) continue @@ -474,11 +547,12 @@ def main(): continue # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, - **summary_data + 
**summary_data, } extracted_data.append(summary_data) From 4e752fb6c48cb163e4350f32eceb14f5a97d2a94 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:00:02 +0000 Subject: [PATCH 036/255] added summary table dimension extraction --- .../stonewater/Wave 3 Preparation.py | 82 ++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0b660c76..b660ab64 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -16,7 +16,6 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ - blah data = { "Address": None, "Postcode": None, @@ -40,6 +39,11 @@ def extract_summary_report(pdf_path): "Existing Secondary Heating % of Heat": None, "Secondary Heating Code": None, "Water Heating Code": None, + 'Total Floor Area (m2)': None, + 'Total Ground Floor Area (m2)': None, + 'RIR Floor Area': None, + 'Main Building Wall Area (m2)': None, + 'First Extension Wall Area (m2)': None } with open(pdf_path, "rb") as file: @@ -149,6 +153,9 @@ def extract_summary_report(pdf_path): data["Water Heating Code"] = water_heating_code_match.group(1).strip() + dimensions = extract_building_parts_summary(text) + data.update(dimensions) + return data @@ -256,7 +263,7 @@ def extract_building_parts_epr(text): first_extension = [part for part in data if "1st Extension" in part["Building Part"]] dimensions = { "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area": sum( + "Total Ground Floor Area (m2)": sum( [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] ), "RIR Floor Area": 0, @@ -268,6 +275,75 @@ def extract_building_parts_epr(text): return dimensions +def extract_building_parts_summary(text): + """ + Extracts building parts and associated dimensions from the summary report PDF. 
+ This includes Main Property and multiple extensions if they exist. + """ + data = [] + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to extract each building part, starting from Main Property and including extensions + building_part_pattern = re.compile( + r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory)", + re.DOTALL + ) + + # Loop through each building part match, including Main Property and extensions + for match in building_part_pattern.finditer(dimensions_text): + part_name = match.group(1) + floor_data = match.group(2) + + # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length + floor_pattern = re.compile( + r"(1st Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract data for each floor within the building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data list + data.append({ + "Building Part": part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + # Calculate aggregated dimensions + main_property = [part for part in data if "Main Property" in part["Building Part"]] + first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] + dimensions = { + "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), + "Total Ground Floor 
Area (m2)": sum( + [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] + ), + "RIR Floor Area": 0, + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property]), + "First Extension Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions] + ), + } + + return dimensions + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. @@ -298,7 +374,7 @@ def extract_epr(pdf_path): "Secondary Heating Code": None, "Water Heating Code": None, 'Total Floor Area (m2)': None, - 'Total Ground Floor Area': None, + 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, 'First Extension Wall Area (m2)': None From a30ad1762a37c81c326412c43cfaa5c91f721ad0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:05:37 +0000 Subject: [PATCH 037/255] handled problem case for summary dimensions --- etl/customers/stonewater/Wave 3 Preparation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b660ab64..1973cbd8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -294,7 +294,7 @@ def extract_building_parts_summary(text): # Pattern to extract each building part, starting from Main Property and including extensions building_part_pattern = re.compile( r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" - r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory)", + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", re.DOTALL ) From 98ae672a6160d84e099125904dac390eda1f6fa2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:24:16 +0000 Subject: [PATCH 038/255] debuggin secondary heating code --- etl/customers/stonewater/Wave 3 Preparation.py | 3 ++- 1 file changed, 
2 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 1973cbd8..84d67f56 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -472,7 +472,8 @@ def extract_epr(pdf_path): if data["Existing Secondary Heating System"] == "": data["Secondary Heating Code"] = "" else: - data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" data["Water Heating Code"] = water_heating_code_match.group(1).strip() # Extract Windows information From d8e8b997a46bf278154cea08444f9b8add3386c5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:31:23 +0000 Subject: [PATCH 039/255] extend to get dimensions from 2nd floor --- etl/customers/stonewater/Wave 3 Preparation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 84d67f56..ad35e2d5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -149,7 +149,8 @@ def extract_summary_report(pdf_path): if data["Existing Secondary Heating System"] == "": data["Secondary Heating Code"] = "" else: - data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" data["Water Heating Code"] = water_heating_code_match.group(1).strip() @@ -236,7 +237,7 @@ def extract_building_parts_epr(text): # Pattern to match each floor's measurements floor_pattern = re.compile( - r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + r"(Lowest floor|First floor|Second 
floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) # Extract floor details for each building part @@ -305,7 +306,7 @@ def extract_building_parts_summary(text): # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length floor_pattern = re.compile( - r"(1st Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) # Extract data for each floor within the building part @@ -634,6 +635,7 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) From c0d896cd59dc3ba003024da9c1caf81737b28d55 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:35:57 +0000 Subject: [PATCH 040/255] Debugging secondary heating extraction --- etl/customers/stonewater/Wave 3 Preparation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ad35e2d5..dc01ef6f 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -460,8 +460,11 @@ def extract_epr(pdf_path): if data["Existing Secondary Heating System"] == "": data["Existing Secondary Heating Controls"] = "" else: - data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", - secondary_text).group(1).strip() + # Might not have heating controls on 2nd system + secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + secondary_controls_match.group(1).strip() if secondary_controls_match else "" + ) data["Existing Secondary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) ) From 
4160ec4dcbae01b438010cc75e0d6eb157d76df2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:52:51 +0000 Subject: [PATCH 041/255] debugging missing secondary heating for summary report, completed extraction for files --- .../stonewater/Wave 3 Preparation.py | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index dc01ef6f..7bedef29 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -109,7 +109,10 @@ def extract_summary_report(pdf_path): # Extract heating system # Extract Primary Heating Data # Extract Primary Heating Section - primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 + primary_text = primary_heating_section.group(1) data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( @@ -126,21 +129,29 @@ def extract_summary_report(pdf_path): # Extract Secondary Heating Section secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - secondary_text = secondary_heating_section.group(1) - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text - ) - data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - second_heating_controls_match = re.search(r"Main Heating 
Controls\s*(.*?)\n", secondary_text) - data["Existing Secondary Heating Controls"] = ( - second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" - ) - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) - ) + if secondary_heating_section is None: + data["Existing Secondary Heating System"] = "" + data["Existing Secondary Heating PCDF Reference"] = "" + data["Existing Secondary Heating Controls"] = "" + data["Existing Secondary Heating % of Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) # Extract Secondary Heating and Water Heating Codes secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) @@ -638,6 +649,9 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + extracted_data["Primary Energy Use (kWh/yr)"] = ( + extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] + ) # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) From dbee05e555d758d464efe2a43c18d6c3b017cef8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 18:37:47 +0000 Subject: 
[PATCH 042/255] working on matching lookup --- .../stonewater/Wave 3 Preparation.py | 48 ++++++++++++++++++- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7bedef29..d90360aa 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -5,7 +5,8 @@ import pandas as pd from tqdm import tqdm from collections import Counter -FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" +CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" +FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys") def extract_summary_report(pdf_path): @@ -653,6 +654,51 @@ def main(): extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + # We now merge on the coordinator data so that against each property, we can map the measures + retrofit_packages_board = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in retrofit_packages_board.iterrows(): + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()] + if filtered.empty: + print("Check this once we have full data") + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Osm. ID": home["Osm. 
ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case wher some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if filtered.empty: + raise Exception("somethign went wrong") + if filtered.shape[0] != 1: + raise Exception("somethign went wrong2") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Osm. ID": home["Osm. ID"], + "Name": home["Name"] + } + ) + + matching_lookup = pd.DataFrame(matching_lookup) + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 70bec3cc..97314b32 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1,3 +1,4 @@ PyPDF2 pandas tqdm +openpyxl From 791262fa866e420cef6a2eced9b4f4ec28897409 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 09:29:11 +0000 Subject: [PATCH 043/255] adding all surveys and updating creation of filepaths --- .../stonewater/Wave 3 Preparation.py | 124 +++++++++++++++++- 1 file changed, 117 insertions(+), 7 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d90360aa..fe1faa9d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2,11 +2,13 @@ import os import PyPDF2 import re import pandas as pd +import numpy as np from tqdm import tqdm from collections import Counter CUSTOMER_FOLDER_PATH = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" -FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys") +SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") +NUM_FOLDERS = 14 def extract_summary_report(pdf_path): @@ -610,11 +612,18 @@ def main(): This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. """ # List only directories in the specified FILE_PATH - survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] + survey_folders = [] + + # Loop over each survey folder and list its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list extracted_data = [] for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(FILE_PATH, survey_folder) + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) # List the folders inside of the survey folder survey_subfolders = [name for name in os.listdir(survey_folder_path) @@ -623,9 +632,17 @@ def main(): # Check if there's a "retrofit assessment" folder retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + # If retrofit assessment folder exists, check if it has content - if retrofit_folder: - retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, 
ra_folder) if os.listdir(retrofit_folder_path): # If not empty summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: @@ -642,6 +659,11 @@ def main(): # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if summary_data: summary_data = { "survey_folder": survey_folder, @@ -650,9 +672,14 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + + # What was missed??? + extracted_data["Primary Energy Use (kWh/yr)"] = ( extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + # TODO: Clean up SAP and extract EPC + # TODO: RIR floor area!!! # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( @@ -663,7 +690,13 @@ def main(): # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in retrofit_packages_board.iterrows(): - filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()] + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", ""), case=False + )] + if filtered.empty: print("Check this once we have full data") continue @@ -684,8 +717,12 @@ def main(): if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + if 
home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + if filtered.empty: - raise Exception("somethign went wrong") + print("Check this once we have full data2!!!") + continue if filtered.shape[0] != 1: raise Exception("somethign went wrong2") @@ -699,6 +736,79 @@ def main(): matching_lookup = pd.DataFrame(matching_lookup) + if matching_lookup["Osm. ID"].duplicated().sum(): + raise Exception("Duplicate Osm. IDs") + + if matching_lookup["survey_folder"].duplicated().sum(): + raise Exception("Duplicate survey folders") + + measure_columns = [ + 'Main Wall Insulation', + 'Secondary Wall Insulation', + 'Loft insulation', + 'Flat Roof', + 'Room in Roof', + 'Window Upgrade', + 'Door Upgrade', + 'Ventilation', + 'Main Heating', + 'Water Heating', + 'Heating Controls', + 'Solar PV', + 'Other measures' + ] + + # We should end up with a 1:1 mapping between the Osm. ID and the survey folder + stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge( + retrofit_packages_board[ + [ + "Name", + "Osm. ID", + "Address ID", + "Archetype ID", + "Arch. Group Rank", "Archetype Representative", + "Actual SAP Band", + "Actual SAP Rating", + "Modelled SAP Band", + "Modelled SAP Rating", + ] + measure_columns + ], + on=["Osm. ID", "Name"], + how="left" + ) + + # We've appended the recommended packages and modelled SAP ratings to the data + # We also want to append the windows data + windows_data = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx" + ), + header=12 + ) + + # We get a lookup id of Osm.ID and when the windows were fitted + windows_data = windows_data[ + ["Osm. 
ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"] + ] + # Convert to string for the moment + windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[ + "Parent Asset Window attributes - Fitted/renewed date" + ].astype(str) + # Create a single date column + windows_data["Fitted/renewed date"] = np.where( + pd.notnull(windows_data["Window attributes - Fitted/renewed date"]), + windows_data["Window attributes - Fitted/renewed date"], + windows_data["Parent Asset Window attributes - Fitted/renewed date"] + ) + # Convert to a date + windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"]) + # Calculate the number of years since something was done on the windows + windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[ + "Fitted/renewed date"]).dt.days / 365 + + # TODO: Flag if a package includes windows + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) From 8983ebec2fd9ea593f19990f5c02847da4adbc45 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 10:03:10 +0000 Subject: [PATCH 044/255] adding epc band --- .../stonewater/Wave 3 Preparation.py | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fe1faa9d..2654fae5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -11,6 +11,32 @@ SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") NUM_FOLDERS = 14 +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. 
+ :param sap_points: numerical value of SAP points, typically between 0 and 100 + :return: + """ + + if sap_points <= 0: + raise ValueError("SAP points should be above 0.") + + if sap_points >= 92: + return "A" + elif sap_points >= 81: + return "B" + elif sap_points >= 69: + return "C" + elif sap_points >= 55: + return "D" + elif sap_points >= 39: + return "E" + elif sap_points >= 21: + return "F" + else: + return "G" + + def extract_summary_report(pdf_path): """ Extracts specific data from the provided PDF file. @@ -23,6 +49,7 @@ def extract_summary_report(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Current EPC Band": None, "Fuel Bill": None, "Number of Storeys": None, "Window Age Description": None, @@ -57,7 +84,7 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) - data["Current SAP Rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -367,6 +394,7 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Current EPC Band": None, "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, @@ -621,6 +649,9 @@ def main(): folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] survey_folders.extend(folder_contents) # Append contents to the master list + # Get rid of .DS_Store files + survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")] + extracted_data = [] for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) @@ -643,6 +674,16 @@ def main(): retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) else: retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # 
Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + if os.listdir(retrofit_folder_path): # If not empty summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: @@ -673,14 +714,24 @@ def main(): extracted_data = pd.DataFrame(extracted_data) - # What was missed??? - extracted_data["Primary Energy Use (kWh/yr)"] = ( extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) + extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) + # TODO: Clean up SAP and extract EPC # TODO: RIR floor area!!! 
+ # Remove some definite duplicates + extracted_data = extracted_data[ + ~extracted_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + ] + ) + ] + # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), @@ -715,9 +766,11 @@ def main(): filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] # We have an edge case wher some properties have two outputs in Sharepoint if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + bl1h2 filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + blah1 filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] if filtered.empty: From cb9399a704bcf2605429bc18704c0ff2b413d406 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 10:22:23 +0000 Subject: [PATCH 045/255] investigating missings' --- .../stonewater/Wave 3 Preparation.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 2654fae5..53279eed 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -720,15 +720,22 @@ def main(): extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) - # TODO: Clean up SAP and extract EPC # TODO: RIR floor area!!! 
# Remove some definite duplicates + dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"] + dupes = extracted_data[extracted_data["Address"].isin(dupes)] + dupes = dupes.sort_values("Address") + # Get all of the folders that end with ROSS + to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + extracted_data = extracted_data[ ~extracted_data["survey_folder"].isin( [ "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", - ] + "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", + "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" + ] + to_drop ) ] @@ -740,8 +747,15 @@ def main(): retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] # We now match this retrofit packages board to the extracted data matching_lookup = [] - for _, home in retrofit_packages_board.iterrows(): - filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] == "Flat 21 Walmer Street": + filtered = extracted_data[ + extracted_data["survey_folder"] == "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD" + ].copy() + else: + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( @@ -749,7 +763,6 @@ def main(): )] if filtered.empty: - print("Check this once we have full data") continue if filtered.shape[0] == 1: @@ -766,18 +779,20 @@ def main(): filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] # We have an edge case wher some properties have two outputs in Sharepoint if home["Name"] == 
"197 Granby Court" and home["Postcode"] == "MK1 1NQ": - bl1h2 - filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + raise Exception("Fix me1") + # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - blah1 - filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + raise Exception("Fix me2") + # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + + if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] if filtered.empty: - print("Check this once we have full data2!!!") continue if filtered.shape[0] != 1: - raise Exception("somethign went wrong2") + raise Exception("something went wrong") matching_lookup.append( { @@ -788,6 +803,9 @@ def main(): ) matching_lookup = pd.DataFrame(matching_lookup) + # Find Osmosis IDs that are in the packages board but not in the matching looking + # missing_osm_ids = set(retrofit_packages_board["Osm. ID"]) - set(matching_lookup["Osm. ID"]) + # missing_osm_ids = list(missing_osm_ids) if matching_lookup["Osm. ID"].duplicated().sum(): raise Exception("Duplicate Osm. 
IDs") From 51c2d04a6d0d919a07edac2d34e868a59c755b2d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 11:42:40 +0000 Subject: [PATCH 046/255] fixing missed matches --- .../stonewater/Wave 3 Preparation.py | 80 ++++++++++++++----- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 53279eed..5e444ca8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -741,26 +741,53 @@ def main(): # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater 3.0 Updated SAP Pre & Modelled 29.10.24.xlsx"), header=4 ) retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # Take just the rows that have been surveyed + retrofit_packages_board = retrofit_packages_board[ + retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) + ] + + # Replace \n with "" + extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "") + + manual_filters = { + "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", + "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", + "1 Cluny Way": "12-1-1 Cluny Way-SG15 6ZB", + "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", + 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", + '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', + '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', + 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, 
Pelican Lane-RG14 1NN', + 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB', + } + # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): - # Handle the case that has the wrong postcode in the asset data - if home["Name"] == "Flat 21 Walmer Street": - filtered = extracted_data[ - extracted_data["survey_folder"] == "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD" - ].copy() + if home["Address ID"] == 6111566: + blah + # 6118117, 6118744, 6117091 + if home["Name"] in manual_filters: + filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() else: filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + filtered["survey_folder"].values - # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces - filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( - home["Name"].replace(r"[^\w\s]", ""), case=False - )] + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", + "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + filtered = filtered[to_filter] if filtered.empty: continue @@ -769,7 +796,7 @@ def main(): matching_lookup.append( { "survey_folder": filtered["survey_folder"].values[0], - "Osm. ID": home["Osm. 
ID"], + "Address ID": home["Address ID"], "Name": home["Name"] } ) @@ -797,15 +824,23 @@ def main(): matching_lookup.append( { "survey_folder": filtered["survey_folder"].values[0], - "Osm. ID": home["Osm. ID"], + "Address ID": home["Address ID"], "Name": home["Name"] } ) matching_lookup = pd.DataFrame(matching_lookup) # Find Osmosis IDs that are in the packages board but not in the matching looking - # missing_osm_ids = set(retrofit_packages_board["Osm. ID"]) - set(matching_lookup["Osm. ID"]) - # missing_osm_ids = list(missing_osm_ids) + missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"]) + missing_ids = list(missing_ids) + print(len(missing_ids)) + if missing_ids: + # We check that the missing ids have no data yet + missing_data = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)][ + ["Name", "Address ID", "Archetype ID"]] + extracted_data[extracted_data["survey_folder"].str.contains("23 Monmouth")]["survey_folder"].values + + matching_lookup[matching_lookup["survey_folder"].str.contains("23 Monmouth")] if matching_lookup["Osm. ID"].duplicated().sum(): raise Exception("Duplicate Osm. IDs") @@ -834,7 +869,6 @@ def main(): retrofit_packages_board[ [ "Name", - "Osm. ID", "Address ID", "Archetype ID", "Arch. 
Group Rank", "Archetype Representative", @@ -848,6 +882,14 @@ def main(): how="left" ) + # Create a section for costs + for measure in measure_columns: + stonewater_data[f"Cost of {measure}"] = None + + stonewater_data["Total Cost of Measures"] = None + stonewater_data["Contingency Cost"] = None + stonewater_data["Total Cost of Measures inc Contingency"] = None + # We've appended the recommended packages and modelled SAP ratings to the data # We also want to append the windows data windows_data = pd.read_excel( @@ -878,12 +920,8 @@ def main(): windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[ "Fitted/renewed date"]).dt.days / 365 - # TODO: Flag if a package includes windows - - # Save this as a csv - # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) - - missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()] + stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) + stonewater_data = stonewater_data.merge(windows_data, on="Osm. 
ID", how="left") # if __name__ == "__main__": # main() From 90c9466421b5cb187c9355d0a8c005f379650ece Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 13:46:43 +0000 Subject: [PATCH 047/255] sorted dupes --- .../stonewater/Wave 3 Preparation.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5e444ca8..67362865 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -756,27 +756,34 @@ def main(): manual_filters = { "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", - "1 Cluny Way": "12-1-1 Cluny Way-SG15 6ZB", "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. 
Leonards Avenue-MK42 0RB', + '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', + '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', + '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', + # '2 Sorrell Place': '', + # '72 St Ives Road': '', + # '1 The Close, Burton Gardens': '', + # '102 Cheaton Close': '', + # 'Flat 16 Spring Gardens': '', + # '4 Apple Close': '', + '25 Folly Lane': '', + } # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): # Handle the case that has the wrong postcode in the asset data - if home["Address ID"] == 6111566: - blah - # 6118117, 6118744, 6117091 if home["Name"] in manual_filters: filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() else: filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() - filtered["survey_folder"].values # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( @@ -836,14 +843,11 @@ def main(): print(len(missing_ids)) if missing_ids: # We check that the missing ids have no data yet - missing_data = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)][ - ["Name", "Address ID", "Archetype ID"]] - extracted_data[extracted_data["survey_folder"].str.contains("23 Monmouth")]["survey_folder"].values + if len(missing_ids) != 8: + raise Exception("Unacceptable number of missings") - matching_lookup[matching_lookup["survey_folder"].str.contains("23 Monmouth")] - - if matching_lookup["Osm. ID"].duplicated().sum(): - raise Exception("Duplicate Osm. 
IDs") + if matching_lookup["Address ID"].duplicated().sum(): + raise Exception("Duplicate Address IDs") if matching_lookup["survey_folder"].duplicated().sum(): raise Exception("Duplicate survey folders") @@ -865,20 +869,21 @@ def main(): ] # We should end up with a 1:1 mapping between the Osm. ID and the survey folder - stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge( + stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="inner").merge( retrofit_packages_board[ [ "Name", + "RA", "Address ID", "Archetype ID", - "Arch. Group Rank", "Archetype Representative", + "Arch. Group Rank", "Actual SAP Band", "Actual SAP Rating", "Modelled SAP Band", "Modelled SAP Rating", ] + measure_columns ], - on=["Osm. ID", "Name"], + on=["Address ID", "Name"], how="left" ) @@ -900,9 +905,13 @@ def main(): header=12 ) + windows_data = windows_data[windows_data["Address ID"] != "Address ID"] + windows_data = windows_data[~pd.isnull(windows_data["Address ID"])] + # We get a lookup id of Osm.ID and when the windows were fitted windows_data = windows_data[ - ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"] + ["Address ID", "Window attributes - Fitted/renewed date", + "Parent Asset Window attributes - Fitted/renewed date"] ] # Convert to string for the moment windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[ @@ -921,7 +930,8 @@ def main(): "Fitted/renewed date"]).dt.days / 365 stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) - stonewater_data = stonewater_data.merge(windows_data, on="Osm. 
ID", how="left") + windows_data["Address ID"] = windows_data["Address ID"].astype(float) + stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") # if __name__ == "__main__": # main() From fba5b2b3cbe786dd7d16b1380fe59f9ff6447206 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 13:58:36 +0000 Subject: [PATCH 048/255] added RIR detection to summary report --- .../stonewater/Wave 3 Preparation.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 67362865..6cf26df8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -320,7 +320,7 @@ def extract_building_parts_epr(text): def extract_building_parts_summary(text): """ Extracts building parts and associated dimensions from the summary report PDF. - This includes Main Property and multiple extensions if they exist. + This includes Main Property, multiple extensions if they exist, and Room in Roof areas. 
""" data = [] @@ -368,6 +368,20 @@ def extract_building_parts_summary(text): "Party Wall Length (m)": party_wall_length }) + # Check specifically for "Room(s) in Roof" entries, which only have Floor Area + room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") + room_in_roof_match = room_in_roof_pattern.search(floor_data) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + data.append({ + "Building Part": part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + }) + # Calculate aggregated dimensions main_property = [part for part in data if "Main Property" in part["Building Part"]] first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] @@ -376,10 +390,14 @@ def extract_building_parts_summary(text): "Total Ground Floor Area (m2)": sum( [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] ), - "RIR Floor Area": 0, - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property]), + "RIR Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] + ), + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if + x["Perimeter (m)"] and x["Room Height (m)"]]), "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions] + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if + x["Perimeter (m)"] and x["Room Height (m)"]] ), } @@ -887,6 +905,9 @@ def main(): how="left" ) + if stonewater_data["Address ID"].duplicated().sum(): + raise Exception("Duplicate Address IDs") + # Create a section for costs for measure in measure_columns: stonewater_data[f"Cost of {measure}"] = None @@ 
-933,5 +954,10 @@ def main(): windows_data["Address ID"] = windows_data["Address ID"].astype(float) stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") + if stonewater_data["Address ID"].duplicated().sum(): + raise Exception("Duplicate Address IDs") + + # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values + # if __name__ == "__main__": # main() From d0cf88af6498d73a1155af320e5d6b899e3f94fa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 14:09:42 +0000 Subject: [PATCH 049/255] added RIR area search for epr --- .../stonewater/Wave 3 Preparation.py | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 6cf26df8..ee5cd1ca 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -256,8 +256,9 @@ def extract_window_age_description(windows_text): def extract_building_parts_epr(text): """ - Extracts building parts and associated dimensions from the provided PDF file. + Extracts building parts and associated dimensions from the provided PDF text. Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length. + Handles cases where 'Room(s) in Roof area' appears within the part_name with only the Floor Area information. 
""" data = [] @@ -271,12 +272,28 @@ def extract_building_parts_epr(text): # Extract each building part for match in building_part_pattern.finditer(text): part_name = match.group(1).strip() - # Clean up building part name to keep only the descriptor (e.g., "Main" or "1st Extension") - cleaned_part_name = re.sub(r" - built in.*", "", part_name) - floor_data = match.group(2) - # Pattern to match each floor's measurements + # Check for "Room(s) in Roof area" within the part_name + room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name) + if room_in_roof_match: + # Extract Room in Roof area and add it as a separate entry + floor_area = float(room_in_roof_match.group(1)) + # Clean up part name to exclude "Room(s) in Roof area" from the building part name + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + }) + else: + # Clean up part name to keep only the descriptor (e.g., "Main" or "1st Extension") + cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip() + + # Pattern to match each floor's measurements in standard cases floor_pattern = re.compile( r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) @@ -299,8 +316,7 @@ def extract_building_parts_epr(text): "Party Wall Length (m)": party_wall_length }) - # We now extract out the aggregated data - + # Aggregated data calculation main_building = [part for part in data if "Main" in part["Building Part"]] first_extension = [part for part in data if "1st Extension" in part["Building Part"]] dimensions = { @@ -308,10 +324,17 @@ def extract_building_parts_epr(text): "Total Ground Floor Area (m2)": sum( [part["Floor 
Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] ), - "RIR Floor Area": 0, - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building]), + "RIR Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] + ), + "Main Building Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building if + x["Perimeter (m)"] and x["Room Height (m)"]] + ), "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension]) if first_extension else 0, + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension if + x["Perimeter (m)"] and x["Room Height (m)"]] + ) if first_extension else 0, } return dimensions From f97bb7f1273b349abd77f75ff09152af87506f4e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 14:14:40 +0000 Subject: [PATCH 050/255] extract lighting fittings from epr --- etl/customers/stonewater/Wave 3 Preparation.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ee5cd1ca..16970803 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -461,7 +461,10 @@ def extract_epr(pdf_path): 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, - 'First Extension Wall Area (m2)': None + 'First Extension Wall Area (m2)': None, + "Number of Light Fittings": None, + "Number of LEL Fittings": None, + "Number of fittings needing LEL": None } with open(pdf_path, "rb") as file: @@ -573,6 +576,13 @@ def extract_epr(pdf_path): building_parts = extract_building_parts_epr(text) data.update(building_parts) + # Get number of lighting outlets and number of fittings needing LEL + lighting_fittings_match = re.search(r"Total number of light 
fittings\s*(\d+)", text) + data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) + lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) + data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + return data From bccf3c621bbec73ac35a18f123ba73b456c695df Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 14:17:20 +0000 Subject: [PATCH 051/255] lighting fitting extraction from summary report --- etl/customers/stonewater/Wave 3 Preparation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 16970803..ccd062e2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -73,7 +73,10 @@ def extract_summary_report(pdf_path): 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, - 'First Extension Wall Area (m2)': None + 'First Extension Wall Area (m2)': None, + "Number of Light Fittings": None, + "Number of LEL Fittings": None, + "Number of fittings needing LEL": None } with open(pdf_path, "rb") as file: @@ -198,6 +201,10 @@ def extract_summary_report(pdf_path): dimensions = extract_building_parts_summary(text) data.update(dimensions) + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + return data @@ -771,8 +778,6 @@ def main(): extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) - # TODO: RIR floor area!!! - # Remove some definite duplicates dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"] dupes = extracted_data[extracted_data["Address"].isin(dupes)] From 7e26fb4b86eee0c5f0ab3bd4e562796d44c5d0a7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 20:30:05 +0000 Subject: [PATCH 052/255] working on proposed sample for stonewater --- .../stonewater/Wave 3 Preparation.py | 203 +++++++++++++++++- 1 file changed, 201 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ccd062e2..bfdc8beb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -486,7 +486,7 @@ def extract_epr(pdf_path): data["Postcode"] = data["Address"].split(",")[-1].strip() # Extract Current and Potential SAP ratings - sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) data["Current SAP Rating"] = current_sap @@ -896,7 +896,6 @@ def main(): # Find Osmosis IDs that are in the packages board but not in the matching looking missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"]) missing_ids = list(missing_ids) - print(len(missing_ids)) if missing_ids: # We check that the missing ids have no data yet if len(missing_ids) != 8: @@ -937,6 +936,7 @@ def main(): "Actual SAP Rating", "Modelled SAP Band", "Modelled SAP Rating", + "Package Ref", ] 
+ measure_columns ], on=["Address ID", "Name"], @@ -995,7 +995,206 @@ def main(): if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") + # Save this data to excel + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages.xlsx", index=False) + + cost_sheet = [ + { + "measure": "EWI 0.30 w.m2.K", "cost": 298.35, "unit": "m2" + }, + { + "measure": "CWI RdSAP Default", "cost": 14.21, "unit": "m2" + }, + { + "measure": "Poss Extract CWI & Refill (issues identified)", "cost": 14.21 + 25, "unit": "m2" + }, + { + "measure": "IWI 0.30 w.m2.K", "cost": 244.80, "unit": "m2" + }, + { + "measure": "EWI/IWI 0.3", "cost": (298.35 + 244.8) / 2, "unit": "m2" + }, + { + "measure": "Loft Insulation 0.11 w.m2.K", "cost": 16.07, "unit": "m2" + }, + { + "measure": "Flat Roof 0.11 w.m2.K", "cost": 195, "unit": "m2" + }, + { + "measure": "DG Window 1.30 w.m2.K", "cost": 1140, "unit": "each" + }, + { + "measure": "Secondary 2.40", "cost": 974, "unit": "each" + }, + { + "measure": "Ins. Door 1.30 w.m2.K", "cost": None, "unit": "each" + }, + { + "measure": "Ins. 
Door 1.40 w.m2.K", "cost": None, "unit": "each" + }, + { + "measure": "DMEV", "cost": 900, "unit": "each" + }, + { + "measure": "ASHP Vaillant 102607 5kw", "cost": None, "unit": "each" + }, + { + "measure": "HHRSH Quantum 150", "cost": None, "unit": "each" + }, + { + "measure": "Dual Stat Tank 210lt 50mm Foam", "cost": None, "unit": "each" + }, + { + "measure": "Dual Stat Tank 160lt 50mm Foam", "cost": None, "unit": "each" + }, + { + "measure": "Dual Stat Tank 110lt 50mm Foam", "cost": None, "unit": "each" + }, + { + "measure": "Smart Thermostat", "cost": 1200, "unit": "each" + }, + { + "measure": "TRV's", "cost": 350, "unit": "each" + }, + { + "measure": "Solar PV - 3.0kwp", "cost": 4365.0, "unit": "each" + }, + { + "measure": "Solar PV - 1.5kwp", "cost": 3881, "unit": "each" + }, + { + "measure": "LEL", "cost": 35, "unit": "per bulb" + }, + { + "measure": "Roof 0.16 - Walls 0.30", "cost": 180, "unit": "floor area m2" + }, + { + "measure": "Roof 0.16 - Walls 0.16", "cost": 180, "unit": "floor area m2" + }, + ] + cost_sheet = pd.DataFrame(cost_sheet) + + # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater + cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False) + + stonewater_data["Room in Roof"].value_counts() + # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values + create_proposed_wave_3_bid( + costed_packages_filepath=os.path.join( + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx" + ), + archetypes_sheet_filepath=os.path.join( + CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" + ) + ) + + +def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): + # We read in the costed packages + costed_packages = pd.read_excel(costed_packages_filepath) + + archetypes_to_cost = costed_packages[ + [ + "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", 
"Modelled SAP Band", + "Modelled SAP Rating", 'Total Cost of Measures', 'Contingency Cost', + 'Total Cost of Measures inc Contingency' + ] + ].copy() + + # We take properties that are EPC D and below (61% of units) + archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] + + archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"]) + + average_cost = archetypes_to_cost[ + archetypes_to_cost["Has been modelled"] + ]['Total Cost of Measures inc Contingency'].mean() + print(average_cost) + + # These are the Arhetypes that will likely be suitable for Wave 3 + archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4) + archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])] + archetypes_sheet = archetypes_sheet[archetypes_sheet["Address ID"] != "Address ID"] + archetypes_sheet["Address ID"] = archetypes_sheet["Address ID"].astype(int) + + # We merge the property details onto the costed archetypes + archetypes_to_cost = archetypes_to_cost.merge( + archetypes_sheet[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + proposed_sample = archetypes_sheet[archetypes_sheet["Archetype ID"].isin(archetypes_to_cost["Archetype ID"])] + + proposed_sample = proposed_sample[ + [ + "Name", "Postcode", "UPRN", "UDPRN", "Address ID", "Osm. 
ID", "Archetype ID", + "Property Type", "Wall Type", "Roof Type", "Heating" + ] + ] + + # We classify into high and low confidence + + match_classification = [] + for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): + surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]] + # We now check if we have a perfect match + surveyed = surveyed[ + (surveyed["Property Type"] == home["Property Type"]) & + (surveyed["Wall Type"] == home["Wall Type"]) & + (surveyed["Roof Type"] == home["Roof Type"]) & + (surveyed["Heating"] == home["Heating"]) + ] + + if surveyed.empty: + match_classification.append( + { + "Address ID": home["Address ID"], + "Match to Surveyed": "Approximate" + } + ) + continue + match_classification.append( + { + "Address ID": home["Address ID"], + "Match to Surveyed": "Exact" + } + ) + + match_classification = pd.DataFrame(match_classification) + + proposed_sample = proposed_sample.merge( + match_classification, + on="Address ID", + how="left", + ) + + # Merge on the cost per archetype + cost_per_archetype = ( + archetypes_to_cost.groupby("Archetype ID")[['Total Cost of Measures inc Contingency']].mean().reset_index() + ) + proposed_sample = proposed_sample.merge( + cost_per_archetype, + on="Archetype ID", + how="left" + ) + + # We add on a boolean to indicate if a property from that archetype has been modelled + proposed_sample = proposed_sample.merge( + archetypes_to_cost.groupby("Archetype ID")[["Has been modelled"]].any().reset_index(), + on="Archetype ID", + how="left" + ) + + proposed_sample["Total Cost of Measures inc Contingency"] = np.where( + ~proposed_sample["Has been modelled"], + None, proposed_sample["Total Cost of Measures inc Contingency"] + ) + + # Save excel + proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False) + # if __name__ == "__main__": # main() From a9ea89d2ae5253453e227c83c067f8a248d3f893 Mon Sep 17 00:00:00 2001 From: 
Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 12:03:17 +0000 Subject: [PATCH 053/255] done with stonewater for now --- .../stonewater/Wave 3 Preparation.py | 144 ++++++++++++++++-- 1 file changed, 133 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bfdc8beb..477a73c8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -76,10 +76,13 @@ def extract_summary_report(pdf_path): 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, - "Number of fittings needing LEL": None + "Number of fittings needing LEL": None, + "Main Roof Type": None, + "Main Roof Insulation": None, + "Main Roof Insulation Thickness": None, } - with open(pdf_path, "rb") as file: + with (open(pdf_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" for page in reader.pages: @@ -205,6 +208,27 @@ def extract_summary_report(pdf_path): data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL) + roof_text = roof_section.group(1).strip() + roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text) + data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None + + # Check if "Insulation" exists between Type and Insulation Thickness + insulation_search = re.search( + r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL + ) + + if insulation_search: + # Insulation match will be present if it exists, otherwise it will be None + insulation_match = insulation_search.group(2) # Optional group for Insulation + insulation_thickness_match = insulation_search.group(4) # Required group for Insulation Thickness + + # Populate insulation fields + data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None + data["Main Roof Insulation Thickness"] = ( + insulation_thickness_match.strip() if insulation_thickness_match else None + ) + return data @@ -434,6 +458,49 @@ def extract_building_parts_summary(text): return dimensions +import re + + +def extract_roof_details_epr(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the provided EPR PDF text. 
+ """ + # Define data structure to hold results + roof_data = [] + + # Locate each building part section + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + + # Extract each building part's data, including roof details + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + + # Clean up the building part name + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + + part_details = match.group(2) + + # Extract Roof Type, Roof Insulation, and Roof Insulation Thickness + roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) + + # Store results for this building part + roof_data.append({ + "Building Part": cleaned_part_name, + "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, + "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, + "Roof Insulation Thickness": roof_insulation_thickness_match.group( + 1).strip() if roof_insulation_thickness_match else None, + }) + + return roof_data + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. 
@@ -471,7 +538,10 @@ def extract_epr(pdf_path): 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, - "Number of fittings needing LEL": None + "Number of fittings needing LEL": None, + "Main Roof Type": None, + "Main Roof Insulation": None, + "Main Roof Insulation Thickness": None, } with open(pdf_path, "rb") as file: @@ -590,6 +660,13 @@ def extract_epr(pdf_path): data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + roof_details = extract_roof_details_epr(text) + # Get from the main building + main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]] + data["Main Roof Type"] = main_roof_details[0]["Roof Type"] + data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"] + return data @@ -1077,13 +1154,11 @@ def main(): # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False) - stonewater_data["Room in Roof"].value_counts() - # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1098,11 +1173,30 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa archetypes_to_cost = costed_packages[ [ "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band", - "Modelled SAP Rating", 'Total Cost of 
Measures', 'Contingency Cost', - 'Total Cost of Measures inc Contingency' + "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost', + 'Total Cost of Measures inc Contingency', 'Main Roof Type', 'Main Roof Insulation', + 'Main Roof Insulation Thickness', 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference' ] ].copy() + # Combine 'Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', separating by colons! + archetypes_to_cost['Surveyed Main Roof'] = ( + archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' + + archetypes_to_cost['Main Roof Insulation Thickness'].astype(str) + ) + + # Combine the heating systems, separating by colons! + archetypes_to_cost['Surveyed Main Heating'] = ( + archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[ + 'Existing Primary Heating PCDF Reference'].astype(str) + ) + + archetypes_to_cost = archetypes_to_cost.drop( + columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference']) + # We take properties that are EPC D and below (61% of units) archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] @@ -1139,7 +1233,19 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa match_classification = [] for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): - surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]] + + surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy() + surveyed["Package Ref"] = surveyed["Package Ref"].astype(str) + + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + + surveyed_roofs = " or ".join(sorted([x for x in 
surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + # We now check if we have a perfect match surveyed = surveyed[ (surveyed["Property Type"] == home["Property Type"]) & @@ -1149,17 +1255,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa ] if surveyed.empty: + if package == "2B2A": + raise Exception("Fix me") match_classification.append( { "Address ID": home["Address ID"], - "Match to Surveyed": "Approximate" + "Match to Surveyed": "Approximate", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating } ) continue + # Re-do + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + match_classification.append( { "Address ID": home["Address ID"], - "Match to Surveyed": "Exact" + "Match to Surveyed": "Exact", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating } ) From 6cf0db87f7a3fc68db02d518f9e57bc28b3fe0c1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 14:35:14 +0000 Subject: [PATCH 054/255] completed packages for first 12 surveys --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/aiha/xml_extraction.py | 139 ++++++++++++++++----------- 3 files changed, 85 insertions(+), 58 deletions(-) diff --git a/.idea/Model.iml 
b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 038e8593..65e0eb1e 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -9,6 +9,32 @@ SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIH CONTINGENCY_RATE = 0.26 +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. + :param sap_points: numerical value of SAP points, typically between 0 and 100 + :return: + """ + + if sap_points <= 0: + raise ValueError("SAP points should be above 0.") + + if sap_points >= 92: + return "A" + elif sap_points >= 81: + return "B" + elif sap_points >= 69: + return "C" + elif sap_points >= 55: + return "D" + elif sap_points >= 39: + return "E" + elif sap_points >= 21: + return "F" + else: + return "G" + + def main(): """ This script handles the extraction of data from the XML files in the survey folders. @@ -76,24 +102,14 @@ def main(): # TODO # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft # [Can't remember, not clear - Chenai will check] - # - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the - # best option for this property due to it being extrememly large and the walls being uninsulated. It might not - # be performant enough in the winter, when COP will be more like 1.5. - # - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are - # in the property? Does it make sense to have such a large solar PV system (5.6kWp)? # - AIH001-04 why couldn't the cylinder be accessed? 
- treating this could get to the EPC C # - Potential measure - search for the cylinder and insulate it # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same # buulding [Question for Lewis & Kevin] # - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from # the other unit] - # - AIH001-09 - Why is there assumed secondary heating? [Question for Lewis & Kevin] # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? - # [Question for Lewis & Kevin] - # - AIH001-11 - The layout of this unit is confusing, is there roof access? [NO!!!! - It's a Sun room!!] - # - AIH001-12 - Why was there not access to the cylinder? [Sealed shut] - # - AIH001-12 - Is the need to draught proofing due to the windows? [This would be addressed by deailing with the - # windows] + # [Question for Lewis & Kevin] - [YES - ASHP!!!!] recommended_measures = [ { @@ -114,40 +130,32 @@ def main(): }, { "measure": "Solar PV", - "description": "5.6kWp Solar PV system", + "description": "4kWp Solar PV system", "config": [ { "size": "4kWp", "orientation": "East", "elavation": 30, - "overshading": "Modest", + "overshading": "None or little", }, - { - "size": "1.6kWp", - "orientation": "Horizontal", - "elavation": "Horizontal", - "overshading": "Modest", - } ], - "sap_points": 7, - "ending_sap": 53 + "sap_points": 10, + "ending_sap": 54 }, { - "measure": "Loft Insulation", - "description": "300mm loft insulation", - "floor_area": 80, # Based on area of 1st floor - "sap_points": 8, - "ending_sap": 61 + "measure": "Air Source Heat Pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", + "sap_points": 20, + "ending_sap": 74 }, { - "measure": "TTZC", - "description": "Smart Thermostat", - "sap_points": 3, - "ending_sap": 64 + "measure": "Tariff Review", + "description": "Switch to 24-hour tariff", + "sap_points": 
15, + "ending_sap": 89 } ], - "notes": "There was no access to the loft for this property and so a loft hatch would need to be " - "installed..." + "notes": "Unclear if the loft is accessible" }, { "survey_key": "AIH001-04", @@ -174,14 +182,14 @@ def main(): "size": "4kWp", "orientation": "South", "elavation": 30, - "overshading": "Modest", + "overshading": "None or little", } ], - "sap_points": 12, - "ending_sap": 67 + "sap_points": 15, + "ending_sap": 70 } ], - "notes": "" + "notes": "Roof is flat, PV array should be installed south facing with elevation" }, { "survey_key": "AIH001-05", @@ -276,7 +284,7 @@ def main(): "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", "hlp": 24.13 * 2.63, - "sap_points": 5, + "sap_points": 7, "ending_sap": 69, }, { @@ -316,8 +324,32 @@ def main(): "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 56, + }, + { + "measure": "Solar PV", + "description": "1.6kWp Solar PV system", + "config": [ + { + "size": "1.6W", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 6, + "ending_sap": 62 + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension + "sap_points": 8, + "ending_sap": 70, } - ] + ], + "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array" + "per unit). Area on south-east part of roof is ~22m2 with no overshadowing. Flat roof area is 8m2" + "with modest overshadowing. 
We suggest a 3.2kWp system, across two units" }, { "survey_key": "AIH001-11", @@ -353,14 +385,7 @@ def main(): "description": "Installation of double glazing", "n_windows": 20, # Counted the bay windows each as 3 "windows_area": 10.66, - "sap_points": 2, - "ending_sap": 48, - }, - { - "measure": "Draught Proofing", - "description": "Window draught proofing improvements", - "n_windows": 20, # Counted the bay windows each as 3 - "sap_points": 1, + "sap_points": 3, "ending_sap": 49, }, { @@ -379,7 +404,7 @@ def main(): }, { "measure": "Air Source Heat Pump", - "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", "sap_points": 15, "ending_sap": 73 }, @@ -497,17 +522,19 @@ def main(): {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'}, {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, - {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, - {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, - {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, - {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)', 'unit_price': 21189 + 1200, + 'unit': 'unit'}, {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, 'unit': 'floor_m2'}, - {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, {'item': '2x DMEV fans', 'unit_price': 
1070, 'unit': 'unit'}, + {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 'unit_needs_scaffolding'}, + {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, + {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, ] pricing_data = pd.DataFrame(pricing_data) @@ -587,13 +614,13 @@ def main(): result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") # Step 5: Calculate the ending SAP. - result_df["ending_sap"] = result_df["starting_sap"] + result_df["total_sap_points"] + result_df["Ending SAP"] = result_df["starting_sap"] + result_df["total_sap_points"] + result_df["Ending EPC Rating"] = result_df["Ending SAP"].apply(sap_to_epc) # Step 6: Merge the result with the measures_data to get the final DataFrame. final_measures = measures_data.merge( result_df, how="left", on="survey_key" ) - -if __name__ == "__main__": - main() +# if __name__ == "__main__": +# main() From 8f8993ab6480f30cbefe0ec8d6295005ba12dc6f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 15:31:09 +0000 Subject: [PATCH 055/255] added some additional aiha packages --- etl/customers/aiha/xml_extraction.py | 78 ++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 65e0eb1e..25917f1e 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -111,6 +111,9 @@ def main(): # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? # [Question for Lewis & Kevin] - [YES - ASHP!!!!] 
+ # TODO: Need AIH001-02 9C Clapton Common + # TODO: Check which properties are in a conservation area + recommended_measures = [ { "survey_key": "AIH001-01", @@ -501,6 +504,81 @@ def main(): } ] }, + { + "survey_key": "AIH001-15", + "starting_sap": 60, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 73.81, # Based on area of main building + "sap_points": 1, + "ending_sap": 61, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 64, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-West", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 71, + "notes": "The array is North-west facing and therefore will be slightly less efficient than south" + "facing, however the impact is not so severe as to make the installation not worthwhile." + "Ground mounted" + } + ] + }, + { + "survey_key": "AIH001-16", + "starting_sap": 60, + "recommended_measures": [ + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (21.56 * 2.60) + (26.79 * 2.8) + (6.74 * 2.60), + "sap_points": 4, + "ending_sap": 64, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 64, + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "sap_points": 1, + "ending_sap": 65, + }, + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "2.4W", + "orientation": "South-East", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 5, + "ending_sap": 70, + } + ] + } ] scaffolding_data = [ From b6cf10287b5867aa20a00123ee8c4de3e590e4a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Nov 2024 07:20:55 +0000 Subject: [PATCH 056/255] added AIH001-17 --- 
etl/customers/aiha/xml_extraction.py | 38 +++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 8 ++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 25917f1e..8c5c9008 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -578,6 +578,44 @@ def main(): "ending_sap": 70, } ] + }, + { + "survey_key": "AIH001-17", + "starting_sap": 62, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 63, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 66, + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "3.2kW", + "orientation": "East", + "elavation": 30, + "overshading": "None or little", + }, + { + "size": "0.8kW", + "orientation": "West", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 12, + "ending_sap": 78, + } + ] } ] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 477a73c8..9f929db1 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -458,9 +458,6 @@ def extract_building_parts_summary(text): return dimensions -import re - - def extract_roof_details_epr(text): """ Extracts roof type, insulation, and insulation thickness for each building part @@ -1158,7 +1155,7 @@ def main(): create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1168,7 +1165,8 @@ 
def main(): def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): # We read in the costed packages - costed_packages = pd.read_excel(costed_packages_filepath) + # Note: Header as 12 is for Matt Ratcliff's reviewed version + costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages") archetypes_to_cost = costed_packages[ [ From 9ad7d3e46f30ee6a24e5d8c81dbd7f1035c04bee Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Nov 2024 11:24:02 +0000 Subject: [PATCH 057/255] added missing windows age extraction --- etl/customers/aiha/xml_extraction.py | 67 ++++++++++++++++++++++++-- etl/xml_survey_extraction/XmlParser.py | 1 + 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 8c5c9008..7dc516a6 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -113,6 +113,7 @@ def main(): # TODO: Need AIH001-02 9C Clapton Common # TODO: Check which properties are in a conservation area + # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR) recommended_measures = [ { @@ -560,6 +561,7 @@ def main(): { "measure": "Loft Insulation", "description": "300mm loft insulation", + "floor_area": 20.92, # Based on floor area of RIR "sap_points": 1, "ending_sap": 65, }, @@ -616,6 +618,27 @@ def main(): "ending_sap": 78, } ] + }, + { + "survey_key": "AIH001-18", + "starting_sap": 58, + "recommended_measures": [], + + }, + { + "survey_key": "AIH001-19", + "starting_sap": 76, + "recommended_measures": [] + }, + { + "survey_key": "AIH001-20", + "starting_sap": 82, + "recommended_measures": [] + }, + { + "survey_key": "AIH001-21", + "starting_sap": 53, + "recommended_measures": [] } ] @@ -648,6 +671,7 @@ def main(): {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}, {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 
'unit_needs_scaffolding'}, {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '2.4kWp Solar PV system', 'unit_price': 3363, 'unit': 'unit_needs_scaffolding'}, {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, @@ -690,8 +714,14 @@ def main(): total_cost = survey.get("total_cost", 0) for measure in survey.get("recommended_measures", []): + # Include hlp and floor_area for each measure if available + hlp = measure.get("hlp", None) + floor_area = measure.get("floor_area", None) + normalized_measures.append({ "survey_key": survey_key, + "hlp": hlp, + "floor_area": floor_area, "starting_sap": starting_sap, "measure": measure["measure"], "description": measure.get("description", ""), @@ -712,16 +742,38 @@ def main(): fill_value=None ).reset_index() + measures_columns = [x for x in pivoted_measures.columns if x not in ["survey_key"]] + # We add a "Cost of" column for each measure + for measure in measures_columns: + pivoted_measures[f"Cost of {measure}"] = None + + pivoted_floor_area = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="floor_area", + aggfunc="first" # Use 'first' since each measure should only appear once per survey_key + ).add_prefix("floor_area - ").reset_index() + + pivoted_hlp = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="hlp", + aggfunc="first" + ).add_prefix("hlp - ").reset_index() + + # Merge hlp and floor_area data + pivoted_measures = pivoted_measures.merge(pivoted_hlp, on="survey_key", how="left") + pivoted_measures = pivoted_measures.merge(pivoted_floor_area, on="survey_key", how="left") + # Step 3: Calculate the total sap points and total cost for each survey. 
- sap_cost_totals = measures_df.groupby("survey_key").agg( + totals = measures_df.groupby("survey_key").agg( total_sap_points=("sap_points", "sum"), - total_cost_of_measures=("measure_cost", "sum") ).reset_index() # Merge total sap points into the pivoted measures. - pivoted_measures = pd.merge(pivoted_measures, sap_cost_totals, on="survey_key", how="left") - pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE - pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] + pivoted_measures = pd.merge(pivoted_measures, totals, on="survey_key", how="left") + # pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE + # pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] # Step 4: Extract starting SAP for each survey key. starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] @@ -738,5 +790,10 @@ def main(): result_df, how="left", on="survey_key" ) + final_measures.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Measures packages.csv") + + # Store costs + pricing_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Pricing data.csv") + # if __name__ == "__main__": # main() diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index fa70b6b7..ef8daf51 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -784,6 +784,7 @@ class XmlParser: glazing_type_lookup = { "ND": "Single glazing", + "1": "double glazing installed before 2002", "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", From 5dc78d6bb9c6b14029488bb27d769967bb4ba658 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Nov 2024 12:14:52 +0000 Subject: 
[PATCH 058/255] added measures for more properties --- etl/customers/aiha/xml_extraction.py | 105 ++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 7dc516a6..d193c91e 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -122,6 +122,33 @@ def main(): "recommended_measures": [], "notes": "Is EPC C" }, + { + "survey_key": "AIH001-02", + "starting_sap": 65, + "recommended_measures": [ + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "2.4W", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 72, + "notes": "The array can be mounted on the flat roof, so that panels are south facing" + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 4, + "ending_sap": 76 + } + ], + }, { "survey_key": "AIH001-03", "starting_sap": 43, @@ -622,7 +649,41 @@ def main(): { "survey_key": "AIH001-18", "starting_sap": 58, - "recommended_measures": [], + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 37.52, # Based on area of main building and 1st extension + "sap_points": 7, + "ending_sap": 65, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 66, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 2, + "ending_sap": 68, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 75, + } + ], }, { @@ -638,7 +699,47 @@ def main(): { "survey_key": "AIH001-21", "starting_sap": 53, - "recommended_measures": [] + 
"recommended_measures": [ + { + "measure": "Cyliner Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 55, + }, + { + "measure": "Roof Insulation", + "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "floor_area": 22.80, # Based on floor area of RIR + "sap_points": 7, + "ending_sap": 62, + }, + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "1.6kWp", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + }, + { + "size": "0.8kWp", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 9, + "ending_sap": 71, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 74, + } + ] } ] From b75ae5f6b8de5855fd5278079de009e9a99ceb0e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 11:34:15 +0000 Subject: [PATCH 059/255] minor --- etl/customers/aiha/xml_extraction.py | 122 ++++++++++++++++++++++----- 1 file changed, 103 insertions(+), 19 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index d193c91e..531b6752 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -102,8 +102,6 @@ def main(): # TODO # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft # [Can't remember, not clear - Chenai will check] - # - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C - # - Potential measure - search for the cylinder and insulate it # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same # buulding [Question for Lewis & Kevin] # - AIH001-09 - Is it not possible to install a loft hatch? 
[IT IS NOT, NO ACCESS - would need to accessed from @@ -111,9 +109,9 @@ def main(): # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? # [Question for Lewis & Kevin] - [YES - ASHP!!!!] - # TODO: Need AIH001-02 9C Clapton Common # TODO: Check which properties are in a conservation area # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR) + # TODO: Adjust Archetype 14 homes to exclude double glazing? Or should we exclude entirely recommended_measures = [ { @@ -376,6 +374,8 @@ def main(): "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension "sap_points": 8, "ending_sap": 70, + "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, " + "which is also owned by AIHA" } ], "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array" @@ -419,31 +419,31 @@ def main(): "sap_points": 3, "ending_sap": 49, }, - { - "measure": "Solar PV", - "description": "3.2kWp Solar PV system", - "config": [ - { - "size": "3.2W", - "orientation": "East", - "elavation": 30, - "overshading": "Little or none", - } - ], - "sap_points": 9, - "ending_sap": 58 - }, + # { + # "measure": "Solar PV", + # "description": "3.2kWp Solar PV system", + # "config": [ + # { + # "size": "3.2W", + # "orientation": "East", + # "elavation": 30, + # "overshading": "Little or none", + # } + # ], + # "sap_points": 9, + # "ending_sap": 58 + # }, { "measure": "Air Source Heat Pump", "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", "sap_points": 15, - "ending_sap": 73 + "ending_sap": 65 }, { "measure": "Tariff Review", "description": "Switch to 24-hour tariff", "sap_points": 15, - "ending_sap": 88 + "ending_sap": 80 } ] }, @@ -740,6 +740,90 @@ def main(): "ending_sap": 74, } ] + }, + { + "survey_key": "AIH001-SIMULATED-01", + "elmhurst_reference": "000020", + "starting_sap": None, + 
"recommended_measures": [ + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "hlp": (22.35 * 3.24) + (22.13 * 2.53), + "sap_points": 8, + "ending_sap": 52, + }, + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension + "sap_points": 1, + "ending_sap": 53, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 53, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 56, + }, + { + "measure": "Solar PV", + "description": "1.6kWp Solar PV system", + "config": [ + { + "size": "1.6W", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 6, + "ending_sap": 62 + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension + "sap_points": 8, + "ending_sap": 70, + "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, " + "which is also owned by AIHA" + } + ], + "notes": "This was cloned from 80A. 
There is no existing data for 80B" + }, + { + "survey_key": "AIH001-SIMULATED-05", + "starting_sap": 68, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 42.5, + "sap_points": 1, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 8, + "ending_sap": 77, + } + ] } ] From cb4b59727202b5ae10726f94c7e97bbe414cf9ab Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 13:54:14 +0000 Subject: [PATCH 060/255] setting up route march script --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/SearchEpc.py | 3 + .../oo_prs_additional_units.py | 122 ++++++++++++++++++ .../oo_prs_additional_units/requirements.txt | 9 ++ 5 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py create mode 100644 etl/route_march/oo_prs_additional_units/requirements.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..0e963140 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..35513387 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index f9e978c6..2d658c04 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -2,6 +2,7 @@ import os import time import re +from urllib.parse import urlencode import usaddress import pandas as pd import numpy as np @@ -257,6 +258,8 @@ class SearchEpc: params = {"address": self.address1, "postcode": self.postcode} url = os.path.join(self.client.domestic.host, "search") + if size: + url += "?" 
+ urlencode({k: v for k, v in {"size": size}.items() if v}) for retry in range(self.max_retries): try: diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py new file mode 100644 index 00000000..345f0afe --- /dev/null +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -0,0 +1,122 @@ +import os +import pandas as pd +import numpy as np +from dotenv import load_dotenv +from urllib.parse import urlencode +from epc_api.client import EpcClient + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +CONFIG = [ + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "SETTLE GBIS x 242 ", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "ACIS GBIS x 76", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "SOUTHERN GBIS x 150", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "COMMUNITY HOUSING GBIS x 199", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "EASTLIGHT GBIS x 42", + "postcode_column": "Postcode", + }, +] + +CAVITY_WALL_DESCRIPTIONS = [ + "Cavity wall, as built, no insulation (assumed)", + "Cavity wall, as built, partial insulation (assumed)", + "Cavity wall, as built, insulated (assumed)", + "Cavity wall, with internal insulation", + "Cavity wall, with external insulation", +] + 
+ROOF_DESCRIPTIONS = [ + "Pitched, no insulation", + "Pitched, no insulation (assumed)", + "Pitched, 25 mm loft insulation", + "Pitched, 50 mm loft insulation", + "Pitched, 75 mm loft insulation", + "Pitched, 100 mm loft insulation", + "Pitched, 150 mm loft insulation", + "Pitched, limited insulation (assumed)", + "Pitched, insulated (assumed)", +] + +SOCIAL_TENURES = ["Rented (social)", "rental (social)"] + + +def main(): + """ + This application is used to identify additional units that are private rentals or owner occupies that can be + included in the route marches + + Required inputs are the following: + - An excel file that contains one or many tabs that include the addresses to be visited + """ + + for config in CONFIG: + # Read in the data + route_march_addresses = pd.read_excel( + config["filepath"], + sheet_name=config["tab"], + engine="openpyxl" + ) + + postcodes = route_march_addresses[config["postcode_column"]].unique() + + epcs = [] + for postcode in postcodes: + # Get the EPCs in this postcode + + params = {"postcode": postcode} + client = EpcClient(auth_token=EPC_AUTH_TOKEN) + url = os.path.join(client.domestic.host, "search") + url += "?" 
+ urlencode({k: v for k, v in {"size": 1000}.items() if v}) + response = client.domestic.call(method="get", url=url, params=params) + + postcode_epcs = pd.DataFrame(response["rows"]) + # Get the newest EPC, per UPRN + postcode_epcs["uprn"] = np.where( + pd.isnull(postcode_epcs["uprn"]), + postcode_epcs["address"], + postcode_epcs["uprn"] + ) + postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False) + postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first") + + postcode_epcs["Is Cavity Property"] = postcode_epcs["walls-description"].isin( + CAVITY_WALL_DESCRIPTIONS + ) & (postcode_epcs["current-energy-efficiency"].astype(int) <= 72) + + postcode_epcs["Solar and Loft"] = (postcode_epcs["roof-description"].isin(ROOF_DESCRIPTIONS)) & ( + postcode_epcs["photo-supply"].isin(["0", "", "0.0"])) & ( + postcode_epcs["current-energy-efficiency"].astype(int) <= 68 + ) + + postcode_epcs = postcode_epcs[postcode_epcs["Is Cavity Property"] | postcode_epcs["Solar and Loft"]] + + # Remove any social properties + postcode_epcs = postcode_epcs[~postcode_epcs["tenure"].isin(SOCIAL_TENURES)] + + epcs.append(postcode_epcs) + + epcs = pd.concat(epcs) diff --git a/etl/route_march/oo_prs_additional_units/requirements.txt b/etl/route_march/oo_prs_additional_units/requirements.txt new file mode 100644 index 00000000..fd763a3b --- /dev/null +++ b/etl/route_march/oo_prs_additional_units/requirements.txt @@ -0,0 +1,9 @@ +openpyxl +epc-api-python==1.0.2 +numpy==2.1.2 +pandas==2.2.3 +usaddress==0.5.11 +fuzzywuzzy==0.18.0 +boto3==1.35.44 +python-dotenv +tqdm \ No newline at end of file From 2f930e3fa278127c8d964f92761209d4ec4b23f4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:19:17 +0000 Subject: [PATCH 061/255] refactoring prs and oo data puls --- .../oo_prs_additional_units.py | 144 ++++++++++++++---- .../oo_prs_additional_units/requirements.txt | 3 +- 2 files changed, 113 insertions(+), 34 deletions(-) diff --git 
a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 345f0afe..c1b562ea 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -1,10 +1,20 @@ import os import pandas as pd import numpy as np +from tqdm import tqdm from dotenv import load_dotenv from urllib.parse import urlencode from epc_api.client import EpcClient +from utils.logger import setup_logger +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +logger = setup_logger() load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -64,6 +74,89 @@ ROOF_DESCRIPTIONS = [ SOCIAL_TENURES = ["Rented (social)", "rental (social)"] +def process_postcode_epcs(postcode, client): + params = {"postcode": postcode} + url = os.path.join(client.domestic.host, "search") + "?" 
+ urlencode({"size": 1000}) + response = client.domestic.call(method="get", url=url, params=params) + postcode_epcs = pd.DataFrame(response["rows"]) + + # Processing code here + postcode_epcs["uprn"] = np.where( + pd.isnull(postcode_epcs["uprn"]), + postcode_epcs["address"], + postcode_epcs["uprn"] + ) + postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False) + postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first") + return postcode_epcs + + +def filter_and_prepare_epcs(epcs): + epcs["Is Cavity Property"] = epcs["walls-description"].isin(CAVITY_WALL_DESCRIPTIONS) & ( + epcs["current-energy-efficiency"].astype(int) <= 72 + ) + epcs["Solar and Loft"] = ( + epcs["roof-description"].isin(ROOF_DESCRIPTIONS) + ) & ( + epcs["photo-supply"].isin(["0", "", "0.0"]) + ) & ( + epcs["current-energy-efficiency"].astype(int) <= 68 + ) + epcs = epcs[epcs["Is Cavity Property"] | epcs["Solar and Loft"]] + epcs = epcs[~epcs["tenure"].isin(SOCIAL_TENURES)] + return epcs + + +def rename_and_add_columns(epcs): + epcs = epcs.rename( + columns={ + "address": "Address", + "postcode": "Postcode", + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)", + "tenure": "Tenure" + } + ) + + # Add additional columns as in your original code + epcs["Estimated Number of Floors"] = epcs.apply( + lambda x: estimate_number_of_floors(x["Property 
Type"]) if pd.notnull(x["Property Type"]) else None, axis=1 + ) + epcs["Estimated Perimeter (m)"] = epcs.apply( + lambda x: estimate_perimeter( + x["Property Floor Area"] / x["Estimated Number of Floors"], + x["Number of Habitable Rooms"] / x["Estimated Number of Floors"] + ), axis=1 + ) + epcs["Estimated Heat Loss Perimeter (m2)"] = epcs.apply( + lambda x: estimate_external_wall_area( + x["Estimated Number of Floors"], + float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + x["Estimated Perimeter (m)"], + x["Archetype"] + ), axis=1 + ) + epcs["Roof Insulation Thickness"] = epcs.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()[ + "insulation_thickness"] if pd.notnull(x["Roof Construction"]) else None, + axis=1 + ) + return epcs + + def main(): """ This application is used to identify additional units that are private rentals or owner occupies that can be @@ -73,7 +166,13 @@ def main(): - An excel file that contains one or many tabs that include the addresses to be visited """ + # This should be set: + output_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024" + client = EpcClient(auth_token=EPC_AUTH_TOKEN) + writer = pd.ExcelWriter(output_filepath, engine="xlsxwriter") + for config in CONFIG: + logger.info("Processing %s", config["tab"]) # Read in the data route_march_addresses = pd.read_excel( config["filepath"], @@ -84,39 +183,18 @@ def main(): postcodes = route_march_addresses[config["postcode_column"]].unique() epcs = [] - for postcode in postcodes: - # Get the EPCs in this postcode - - params = {"postcode": postcode} - client = EpcClient(auth_token=EPC_AUTH_TOKEN) - url = os.path.join(client.domestic.host, "search") - url += "?" 
+ urlencode({k: v for k, v in {"size": 1000}.items() if v}) - response = client.domestic.call(method="get", url=url, params=params) - - postcode_epcs = pd.DataFrame(response["rows"]) - # Get the newest EPC, per UPRN - postcode_epcs["uprn"] = np.where( - pd.isnull(postcode_epcs["uprn"]), - postcode_epcs["address"], - postcode_epcs["uprn"] - ) - postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False) - postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first") - - postcode_epcs["Is Cavity Property"] = postcode_epcs["walls-description"].isin( - CAVITY_WALL_DESCRIPTIONS - ) & (postcode_epcs["current-energy-efficiency"].astype(int) <= 72) - - postcode_epcs["Solar and Loft"] = (postcode_epcs["roof-description"].isin(ROOF_DESCRIPTIONS)) & ( - postcode_epcs["photo-supply"].isin(["0", "", "0.0"])) & ( - postcode_epcs["current-energy-efficiency"].astype(int) <= 68 - ) - - postcode_epcs = postcode_epcs[postcode_epcs["Is Cavity Property"] | postcode_epcs["Solar and Loft"]] - - # Remove any social properties - postcode_epcs = postcode_epcs[~postcode_epcs["tenure"].isin(SOCIAL_TENURES)] - + for postcode in tqdm(postcodes): + postcode_epcs = process_postcode_epcs(postcode, client) epcs.append(postcode_epcs) + # Concatenate all postcodes' data and filter it epcs = pd.concat(epcs) + epcs = filter_and_prepare_epcs(epcs) + epcs = rename_and_add_columns(epcs) + + sheet_name = config["tab"][:31] # Excel sheet names max length of 31 characters + epcs.to_excel(writer, sheet_name=sheet_name, index=False) + + # Save and close the writer outside the loop + writer.close() + logger.info("Data successfully written to %s", output_filepath) diff --git a/etl/route_march/oo_prs_additional_units/requirements.txt b/etl/route_march/oo_prs_additional_units/requirements.txt index fd763a3b..e2f4832c 100644 --- a/etl/route_march/oo_prs_additional_units/requirements.txt +++ b/etl/route_march/oo_prs_additional_units/requirements.txt @@ -6,4 +6,5 @@ usaddress==0.5.11 
fuzzywuzzy==0.18.0 boto3==1.35.44 python-dotenv -tqdm \ No newline at end of file +tqdm +xlsxwriter \ No newline at end of file From 557c0b589862eb1391e96f1a447b3323ebace9db Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:24:50 +0000 Subject: [PATCH 062/255] debugging string data --- .../oo_prs_additional_units/oo_prs_additional_units.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index c1b562ea..69e08f9a 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -131,10 +131,14 @@ def rename_and_add_columns(epcs): } ) + epcs["Number of Habitable Rooms"] = epcs["Number of Habitable Rooms"].astype(int) + epcs["Property Floor Area"] = epcs["Property Floor Area"].astype(float) + # Add additional columns as in your original code epcs["Estimated Number of Floors"] = epcs.apply( lambda x: estimate_number_of_floors(x["Property Type"]) if pd.notnull(x["Property Type"]) else None, axis=1 ) + epcs["Estimated Perimeter (m)"] = epcs.apply( lambda x: estimate_perimeter( x["Property Floor Area"] / x["Estimated Number of Floors"], @@ -167,7 +171,9 @@ def main(): """ # This should be set: - output_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024" + output_filepath = ( + "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024.xlsx" + ) client = EpcClient(auth_token=EPC_AUTH_TOKEN) writer = pd.ExcelWriter(output_filepath, engine="xlsxwriter") From a7aecb24629519c028d0f2d144610a2cf8dc0e7a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:29:54 +0000 Subject: [PATCH 063/255] debugging data pull --- .../oo_prs_additional_units/oo_prs_additional_units.py | 7 ++++++- 1 file 
changed, 6 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 69e08f9a..2c63a788 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -75,9 +75,12 @@ SOCIAL_TENURES = ["Rented (social)", "rental (social)"] def process_postcode_epcs(postcode, client): - params = {"postcode": postcode} + params = {"postcode": postcode.rstrip().lstrip()} url = os.path.join(client.domestic.host, "search") + "?" + urlencode({"size": 1000}) response = client.domestic.call(method="get", url=url, params=params) + if "rows" not in response: + logger.warning("No EPCs found for postcode %s", postcode) + return pd.DataFrame() postcode_epcs = pd.DataFrame(response["rows"]) # Processing code here @@ -191,6 +194,8 @@ def main(): epcs = [] for postcode in tqdm(postcodes): postcode_epcs = process_postcode_epcs(postcode, client) + if postcode_epcs.empty: + continue epcs.append(postcode_epcs) # Concatenate all postcodes' data and filter it From 4443f1aa4b3b9216f3643de376d8838e6bd89a5b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:33:01 +0000 Subject: [PATCH 064/255] re-added dropping of columns and changed default floor height to 2.4 --- .../oo_prs_additional_units.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 2c63a788..93757051 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -111,6 +111,33 @@ def filter_and_prepare_epcs(epcs): def rename_and_add_columns(epcs): + # Retrieve just the data we need + epcs = epcs[ + [ + "uprn", + "address", + "postcode", 
+ "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + "tenure" + ] + ] + epcs = epcs.rename( columns={ "address": "Address", @@ -151,7 +178,7 @@ def rename_and_add_columns(epcs): epcs["Estimated Heat Loss Perimeter (m2)"] = epcs.apply( lambda x: estimate_external_wall_area( x["Estimated Number of Floors"], - float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.4, x["Estimated Perimeter (m)"], x["Archetype"] ), axis=1 From 00bd1e0ce6ee788090baa98781979b06a313c812 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:56:28 +0000 Subject: [PATCH 065/255] prs and oo data pulled for now --- .../oo_prs_additional_units/oo_prs_additional_units.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 93757051..3bd87a8c 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -134,7 +134,9 @@ def rename_and_add_columns(epcs): "mainheat-description", # "energy-consumption-current", # kwh/m2 - "tenure" + "tenure", + "Is Cavity Property", + "Solar and Loft", ] ] From 42dc635aa3e4f15400998da180d12a03dad0cbea Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 17:37:04 +0000 Subject: [PATCH 066/255] ammended RIR conditions --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/app/plan/router.py | 6 +- backend/ml_models/Valuation.py | 22 +++- 
etl/customers/warwick/remote_assessments.py | 123 ++++++++++++++++++++ recommendations/Recommendations.py | 8 +- recommendations/RoofRecommendations.py | 13 ++- 7 files changed, 166 insertions(+), 10 deletions(-) create mode 100644 etl/customers/warwick/remote_assessments.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 0e963140..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 35513387..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 119c2061..65a6c32c 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -759,7 +759,11 @@ async def trigger_plan(body: PlanTriggerRequest): new_epc = sap_to_epc(new_sap_points) new_epc_bands[p.id] = new_epc - valuations = PropertyValuation.estimate(property_instance=p, target_epc=new_epc) + total_cost = sum([r["total"] for r in default_recommendations]) + + valuations = PropertyValuation.estimate( + property_instance=p, target_epc=new_epc, total_cost=total_cost + ) property_value_increase_ranges[p.id] = valuations if p.is_new: diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 92c55641..720005d3 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -203,7 +203,14 @@ class PropertyValuation: return msm_increase, lloyds_increase @classmethod - def estimate(cls, property_instance, target_epc): + def estimate(cls, property_instance, target_epc, total_cost=None): + """ + This function estimates the value of a property based on the current EPC rating and the target EPC rating + :param property_instance: An instance of the Property class + :param target_epc: The target EPC rating + :param total_cost: The total cost of the retrofit + :return: + """ current_value = ( property_instance.valuation if property_instance.valuation else 
cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn) @@ -242,6 +249,19 @@ class PropertyValuation: avg_increase = np.mean(all_increases) + if total_cost is not None: + # We CAP the retrofit ROI at 2 + avg_increase_value = current_value * avg_increase + if avg_increase_value / total_cost > 2: + # We re-scale the % so that the average value increase is no more than 2 times the total cost + double_cost = 2 * total_cost + new_avg_increase = double_cost / current_value + scalar = new_avg_increase / avg_increase + # We scale the min and max increases by the same scalar + min_increase *= scalar + max_increase *= scalar + avg_increase = new_avg_increase + return { "current_value": current_value, "lower_bound_increased_value": float(current_value * (1 + min_increase)), diff --git a/etl/customers/warwick/remote_assessments.py b/etl/customers/warwick/remote_assessments.py new file mode 100644 index 00000000..a9b654b7 --- /dev/null +++ b/etl/customers/warwick/remote_assessments.py @@ -0,0 +1,123 @@ +import pandas as pd +from utils.s3 import save_csv_to_s3 + +PORTFOLIO_ID = 115 +USER_ID = 8 + + +def app(): + """ + Used to set up the remote assessments for Warwick + """ + + asset_list = [ + { + "uprn": 10033604792, + "address": "Flat 2, 3 Green Street", + "postcode": "W1K 6RN" + }, + { + "uprn": 10033604794, + "address": "Flat 4, 3 Green Street", + "postcode": "W1K 6RN" + }, + { + "uprn": 10033615515, + "address": "Apartment 4, 52 Green Street", + "postcode": "W1K 6RS" + } + ] + asset_list = pd.DataFrame(asset_list) + + # Store the asset list in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=asset_list, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + non_invasive_recommendations = [ + { + "uprn": 10033604792, + "recommendations": [ + { + "type": "internal_wall_insulation", + "sap_points": 16, + "survey": True + } + ] + }, + { + "uprn": 10033604794, + "recommendations": [ + { + "type": "internal_wall_insulation", + 
"sap_points": 14, + "survey": True + } + ] + }, + { + "uprn": 10033615515, + "recommendations": [ + { + "type": "room_roof_insulation", + "sap_points": 12, + "survey": True + }, + { + "type": "internal_wall_insulation", + "sap_points": 2, + "survey": True + } + ] + } + ] + + # Store non-invasive recommendations in S3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + valuation_data = [ + { + "uprn": 10033604792, + "value": 3_692_000 + }, + { + "uprn": 10033604794, + "value": 3_789_000 + }, + { + "uprn": 10033615515, + "value": 3_499_000 + } + ] + + # Store valuation data to s3 + valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(valuation_data), + bucket_name="retrofit-plan-inputs-dev", + file_name=valuation_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": valuation_filename, + "scenario_name": "Full package remote assessment", + "multi_plan": True, + "budget": None, + } + print(body) diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index dd51b47d..a1183d33 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -519,6 +519,7 @@ class Recommendations: # heating_cost_starting and heating_cost_ending are just the values in the EPC. 
However, with # heating_cost_ending, we expect that the EPC will predict a heating cost based on what would happen # if we implemented the recommendation today, so our starting value is the EPC + previous_phase_values = { "sap": float(property_instance.data["current-energy-efficiency"]), "carbon": float(property_instance.data["co2-emissions-current"]), @@ -541,8 +542,13 @@ class Recommendations: previous_phase_values = previous_phase_values_multiple[0] # We extract the values for the current phase + if rec.get("survey", False): + current_phase_sap = rec["sap_points"] + previous_phase_values["sap"] + else: + current_phase_sap = phase_energy_efficiency_metrics["sap_change"] + current_phase_values = { - "sap": phase_energy_efficiency_metrics["sap_change"], + "sap": current_phase_sap, "carbon": phase_energy_efficiency_metrics["carbon_change"], "heat_demand": phase_energy_efficiency_metrics["heat_demand"], } diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index c0fa4eb2..acc78359 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -123,7 +123,11 @@ class RoofRecommendations: self.property.roof["insulation_thickness"] in ["average", "above_average"] ) - return full_insulated_room_roof or room_roof_insulated_at_rafters + has_non_invasive_recommendation = any( + x["type"] == "room_roof_insulation" for x in self.property.non_invasive_recommendations + ) + + return (full_insulated_room_roof or room_roof_insulated_at_rafters) and not has_non_invasive_recommendation def recommend(self, phase, measures=None, default_u_values=False): @@ -181,7 +185,8 @@ class RoofRecommendations: # We firstly handle non-intrusive recommendations, which may override the normal roof insulation recommendations if ("loft_insulation" in [x["type"] for x in non_invasive_recommendations]) or ( - self.property.roof["is_pitched"] and "loft_insulation" in measures + self.property.roof["is_pitched"] and 
"loft_insulation" in measures and + not self.property.roof["is_at_rafters"] ): self.recommend_roof_insulation( u_value=u_value, @@ -512,8 +517,6 @@ class RoofRecommendations: rir_non_invasive_recommendation.get("cost") ) - sap_points = rir_non_invasive_recommendation.get("sap_points", None) - # Could also be Roof room(s), ceiling insulated new_descriptin = "Roof room(s), insulated" roof_ending_config = RoofAttributes(new_descriptin).process() @@ -562,7 +565,7 @@ class RoofRecommendations: "description": "Insulate room in roof at rafters and re-decorate", "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": sap_points, + "sap_points": rir_non_invasive_recommendation.get("sap_points", None), "simulation_config": simulation_config, "description_simulation": { "roof-description": new_descriptin, From b59c3163bef5bcd54572967828cabb65d69d4ae4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Nov 2024 10:22:14 +0000 Subject: [PATCH 067/255] setting up stonewater potential eco properties --- etl/customers/stonewater/potential_eco_properties.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 etl/customers/stonewater/potential_eco_properties.py diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py new file mode 100644 index 00000000..6ea6962b --- /dev/null +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -0,0 +1,4 @@ +def app(): + """ + This code creates a list of cavity properties, for review + """ From 7c4e32abc9430d509809d69534c4630685dcf09d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Nov 2024 13:41:41 +0000 Subject: [PATCH 068/255] checking additional list --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../stonewater/potential_eco_properties.py | 274 ++++++++++++++++++ .../requirements/requirements-wave-3-prep.txt | 4 + 4 files changed, 280 insertions(+), 2 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 
df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index 6ea6962b..26321a41 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -1,4 +1,278 @@ +import os +import time +import json +import pandas as pd +import numpy as np +from tqdm import tqdm +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from utils.s3 import read_from_s3, read_pickle_from_s3 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + def app(): """ This code 
creates a list of cavity properties, for review """ + + archetyped_properties = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 - " + "Archetyped V3.1.xlsx", + header=4 + ) + + cavity_descriptions = [ + "Cavity: AsBuilt (1983-1995)", + "Cavity: AsBuilt (Post 1995)", + "Cavity: AsBuilt (Pre 1976)", + "Cavity: AsBuilt (1976-1982)", + ] + + archetyped_properties["Is Cavity Property"] = archetyped_properties["Wall Type"].isin(cavity_descriptions) + # We also identify any properties where properties were found to need cavity wall insulation + + costed_packages = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages " + "20241030 (WIP) Single Model V2.xlsx", + sheet_name="Modelled Packages", + header=13 + ) + + needs_cwi = costed_packages[ + costed_packages["Main Wall Insulation"].isin( + [ + "Poss Extract CWI & Refill (issues identified)", + "CWI RdSAP Default" + ] + ) + ][["Address ID", "Address", "Current SAP Rating", "Current EPC Band", "Postcode", "Archetype ID", + "Main Wall Insulation", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]] + + # We flag these properties + archetyped_properties["Survey shows CWI needed for Archetype"] = archetyped_properties["Archetype ID"].isin( + needs_cwi["Archetype ID"] + ) + + archetyped_properties = archetyped_properties[~pd.isnull(archetyped_properties["Address ID"])] + archetyped_properties = archetyped_properties[archetyped_properties["Address ID"] != "Address ID"] + + # this is the big list!!! 
+ features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + features["Address ID"] = features["Address ID"].astype(str) + + features_to_merge = features[ + [ + "Address ID", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating", "Main Fuel", "Hot Water", + "Renewables", "Total Floor Area" + ] + ] + + stonewater_cavity_properties = archetyped_properties[ + ["Name", "Postcode", "Osm. ID", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no", "Street name", + "Address line 2", "City/Town", "Is Cavity Property", "Survey shows CWI needed for Archetype"] + ].merge( + features_to_merge, how="left", on="Address ID" + ) + + # We filter this down to the properties that are cavity properties + stonewater_cavity_properties = stonewater_cavity_properties[ + stonewater_cavity_properties["Is Cavity Property"] | + stonewater_cavity_properties["Survey shows CWI needed for Archetype"] + ] + + stonewater_cavity_properties["Reason Included"] = "As Built Cavity Property" + stonewater_cavity_properties["Reason Included"] = np.where( + stonewater_cavity_properties["Survey shows CWI needed for Archetype"] & + ~stonewater_cavity_properties["Is Cavity Property"], + "Survey revealed potential need for CWI or extract and re-fill", + stonewater_cavity_properties["Reason Included"] + ) + stonewater_cavity_properties["Reason Included"] = np.where( + stonewater_cavity_properties["Survey shows CWI needed for Archetype"] & + stonewater_cavity_properties["Is Cavity Property"], + "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property", + stonewater_cavity_properties["Reason Included"] + ) + # We indicate the exact properties that need CWI, based on survey findings + stonewater_cavity_properties["Reason Included"] = np.where( + stonewater_cavity_properties["Address ID"].isin( + needs_cwi[needs_cwi["Main Wall 
Insulation"] == "CWI RdSAP Default"]["Address ID"].astype(int).astype( + str).values + ), + "Survey showed this property needs CWI", + stonewater_cavity_properties["Reason Included"] + ) + + stonewater_cavity_properties["Reason Included"] = np.where( + stonewater_cavity_properties["Address ID"].isin( + needs_cwi[needs_cwi["Main Wall Insulation"] == "Poss Extract CWI & Refill (issues identified)"][ + "Address ID"].astype(int).astype(str).values + ), + "Survey showed this property could need extract and re-fill", + stonewater_cavity_properties["Reason Included"] + ) + + # We get the EPC data + epc_data = json.loads( + read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/epc_data.json" + ) + ) + epc_data = pd.DataFrame(epc_data) + + epc_data["uprn"] = np.where( + epc_data["internal_id"] == 1091, + 83143766, + epc_data["uprn"] + ) + + epc_data_batch_2 = read_pickle_from_s3( + s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + bucket_name="retrofit-data-dev" + ) + epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) + + complete_epcs = pd.concat([epc_data, epc_data_batch_2]) + + epcs_to_merge = complete_epcs[ + [ + "uprn", + "address", + "postcode", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + "energy-consumption-current" + ] + ].rename( + columns={ + "address": "Address", + "postcode": "Postcode", + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property 
Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)", + } + ) + # We de-dupe, taking the newest on the date the EPC was lod + epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"]) + epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) + epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") + + # Merge the EPCs on, with the data we need + stonewater_cavity_properties = stonewater_cavity_properties.rename( + columns={ + "Age": "Parity - Build Age", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Wall Construction", + "Roofs": "Parity - Roof Construction", + "Glazing": "Parity - Glazing Type", + "Heating": "Parity - Heating Type", + "Main Fuel": "Parity - Main Fuel", + "Hot Water": "Parity - Hot Water", + "Renewables": "Parity - Renewables", + "Total Floor Area": "Parity - Total Floor Area" + } + ).merge( + epcs_to_merge, + how="left", + left_on="UPRN", + right_on="uprn" + ) + + # We now flag the additional properties in the as built list + + additional_properties = features[ + ~features["Address ID"].isin(archetyped_properties["Address ID"].values) + ] + + # Filter on as built cavity properties + additional_properties = additional_properties[ + additional_properties["Walls"].isin( + cavity_descriptions + + ["Cavity: FilledCavity", "Cavity: External", "Cavity: Internal"] + ) + ] + + # Pull the EPCs for these properties + for _, home in tqdm(additional_properties.iterrows()): + full_address = home["Address"] + postcode = home["Postcode"] + address1 = full_address.split(",")[0] diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt 
b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 97314b32..102f5930 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -2,3 +2,7 @@ PyPDF2 pandas tqdm openpyxl +boto3 +epc-api-python==1.0.2 +usaddress==0.5.11 +fuzzywuzzy==0.18.0 From b40f72216f97d644bdf48663a9f395589d2b124b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Nov 2024 12:46:46 +0000 Subject: [PATCH 069/255] debugging retrieve_newest_find_my_epc_data --- etl/customers/ksquared/Wave3 Modelling.py | 47 ++++ .../stonewater/potential_eco_properties.py | 115 ++++++++- .../requirements/requirements-wave-3-prep.txt | 2 + etl/find_my_epc/RetrieveFindMyEpc.py | 238 ++++++++++++++++++ etl/find_my_epc/requirements.txt | 2 + 5 files changed, 396 insertions(+), 8 deletions(-) create mode 100644 etl/customers/ksquared/Wave3 Modelling.py create mode 100644 etl/find_my_epc/RetrieveFindMyEpc.py create mode 100644 etl/find_my_epc/requirements.txt diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py new file mode 100644 index 00000000..bf9eb1e8 --- /dev/null +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -0,0 +1,47 @@ +import time + +from tqdm import tqdm +import pandas as pd +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc + + +def app(): + """ + This script prepares the asset lists for the additional housing associations, CAHA and Hornsey Housing Trust, + that are forming a consortium led by AIHA + :return: + """ + + hornsey_asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing " + "Trust.xlsx", + sheet_name="Ksquared-All units information", + header=3 + ) + + # We don't need the first row + hornsey_asset_list = hornsey_asset_list.iloc[1:] + # Fill NA values with empty strings + hornsey_asset_list = hornsey_asset_list.fillna("") + 
hornsey_asset_list["Address letter or number"] = hornsey_asset_list["Address letter or number"].astype( + str + ).str.strip() + hornsey_asset_list["Postcode"] = hornsey_asset_list["Postcode"].astype(str).str.strip() + hornsey_asset_list["Street address"] = hornsey_asset_list["Street address"].astype(str).str.strip() + # Replace double spaces + for col in ["Address letter or number", "Street address", "Postcode"]: + hornsey_asset_list[col] = hornsey_asset_list[col].str.replace(" ", " ") + + extracted_data = [] + for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)): + time.sleep(0.5) + # Some properties do not have an epc + if not home["Energy starting band (EPC)"]: + continue + unit_number = home["Address letter or number"] + street = home["Street address"] + postcode = home["Postcode"] + address = ", ".join([x for x in [unit_number, street] if x]) + searcher = RetrieveFindMyEpc(address=address, postcode=postcode) + epc_data = searcher.retrieve_newest_find_my_epc_data() + extracted_data.append(epc_data) diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index 26321a41..4fb89113 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -236,6 +236,8 @@ def app(): epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") + stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str) + # Merge the EPCs on, with the data we need stonewater_cavity_properties = stonewater_cavity_properties.rename( columns={ @@ -265,14 +267,111 @@ def app(): # Filter on as built cavity properties additional_properties = additional_properties[ - additional_properties["Walls"].isin( - cavity_descriptions + - ["Cavity: FilledCavity", "Cavity: External", "Cavity: Internal"] - ) + 
additional_properties["Walls"].isin(cavity_descriptions) ] + additional_properties["Full Address"] = additional_properties["Address"].copy() + house_numbers = [] + for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)): + house_no = SearchEpc.get_house_number(x["Address"].split(",")[0], x["Postcode"]) + if house_no is None: + house_no = x["Address"].split(",")[0] + # If we end up with a number like "01" we need to remove the leading zero + house_no = house_no.lstrip("0") + house_numbers.append( + { + "Address ID": x["Address ID"], + "Number": house_no + } + ) + + house_numbers = pd.DataFrame(house_numbers) + additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID") + additional_properties["row_id"] = additional_properties["Address ID"].copy() # Pull the EPCs for these properties - for _, home in tqdm(additional_properties.iterrows()): - full_address = home["Address"] - postcode = home["Postcode"] - address1 = full_address.split(",")[0] + additional_properties_epcs, errors = get_data(additional_properties) + + # Save this data as a pickle + # import pickle + # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl", + # "wb") as f: + # pickle.dump(additional_properties_epcs, f) + + # We drop Full Address + additional_properties = additional_properties.drop(columns=["Full Address"]) + additional_properties2 = additional_properties[[ + "row_id", "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", + "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", + + ]].rename( + columns={ + "SAP": "Parity - Predicted SAP", + "SAP Band": "Parity - Predicted SAP Band", + "Age": "Parity - Build Age", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Wall Construction", + "Roofs": "Parity - Roof Construction", + "Glazing": "Parity - Glazing Type", + "Heating": "Parity - Heating 
Type", + "Main Fuel": "Parity - Main Fuel", + "Hot Water": "Parity - Hot Water", + "Renewables": "Parity - Renewables", + "Total Floor Area": "Parity - Total Floor Area" + } + ).merge( + pd.DataFrame(additional_properties_epcs)[ + [ + "row_id", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + "energy-consumption-current" + ] + ].rename( + columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)", + } + ), + how="left", + on="row_id" + ) + + # We save the data locally + stonewater_cavity_properties.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties.csv", + index=False + ) + additional_properties2.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties.csv", + index=False + ) + # Save the survey findings + needs_cwi.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", + index=False + ) diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt 
b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 102f5930..3ad5d2c1 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -6,3 +6,5 @@ boto3 epc-api-python==1.0.2 usaddress==0.5.11 fuzzywuzzy==0.18.0 +python-dotenv + diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py new file mode 100644 index 00000000..a6696021 --- /dev/null +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -0,0 +1,238 @@ +import requests +from bs4 import BeautifulSoup +from datetime import datetime + + +class RetrieveFindMyEpc: + SEARCH_POSTCODE_URL = ( + "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}" + ) + BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk" + + HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/111.0.0.0 Safari/537.36' + } + + def __init__(self, address: str, postcode: str): + """ + This class is tasked with retrieving the latest EPC data from the find my epc website + :param address: The address of the property + :param postcode: The postcode of the property + """ + self.address = address + self.postcode = postcode + + self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + + def retrieve_newest_find_my_epc_data(self): + """ + For a post code and address, we pull out all the required data from the find my epc website + """ + + postcode_input = self.postcode.replace(" ", "+") + postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input) + postcode_response = requests.get(postcode_search, headers=self.HEADERS) + + postcode_res = BeautifulSoup(postcode_response.text, features="html.parser") + rows = postcode_res.find_all('tr', class_='govuk-table__row') + + extracted_table = [] + for row in rows: + # Extract the address and URL 
+ address_tag = row.find('a', class_='govuk-link') + if address_tag is None: + continue + extracted_address = None + extracted_address_url = None + if address_tag: + extracted_address = address_tag.text.strip() + extracted_address_url = address_tag['href'] + + extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower() + if not extracted_address_cleaned.startswith(self.address_cleaned): + continue + + # If the address is a match, we can extract the data + + # Extract the expiry date + expiry_date_tag = row.find('td', class_='govuk-table__cell date') + expiry_date = None + if expiry_date_tag is not None: + expiry_date = expiry_date_tag.parent.find('span').text.strip() + + extracted_table.append( + { + "extracted_address": extracted_address, + "extracted_address_url": extracted_address_url, + "expiry_date": datetime.strptime(expiry_date, '%d %B %Y'), + } + ) + + if not extracted_table: + raise ValueError("No EPC found") + + if len(extracted_table) > 1: + # We take the one with the most recent expiry date + extracted_table = sorted(extracted_table, key=lambda x: x['expiry_date'], reverse=True) + + chosen_epc = self.BASE_ENERGY_URL + extracted_table[0]['extracted_address_url'] + epc_certificate = chosen_epc.split('/')[-1] + + address_response = requests.get(chosen_epc, headers=self.HEADERS) + address_res = BeautifulSoup(address_response.text, features="html.parser") + + # Key data we want to retrieve: + # 1) Rating + # 2) Bills estimates + # 3) Recommendations and SAP points + # 4) Low and zero carbon energy sources + + ratings = address_res.find('desc', {'id': 'svg-desc'}).text + current_rating = ratings.split(".")[0] + potential_rating = ratings.split(".")[1] + current_sap = int(current_rating.split(' ')[-1]) + + # Retrieve the energy consumption + bills = address_res.find('div', {'id': 'bills-affected'}) + bills_list = bills.find_all('li') + if not bills_list: + # If this is the case, it's usually becaue the EPC was very old. 
Early EPCs did not have this information + heating_text = None + hot_water_text = None + else: + heating_text = bills_list[0].text + hot_water_text = bills_list[1].text + + # Retrieve the recommendations and SAP points + recommendations = [] + recommendations_div = address_res.find('div', class_='epb-recommended-improvements') + if recommendations_div: + # Find all h3 headers for each step and extract their related information + step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m') + previous_sap_score = current_sap + for step_num, step_header in enumerate(step_headers, start=1): + # Extract the step title (the measure) + measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "") + + # Find the div containing the potential rating within the same section + potential_rating_div = step_header.find_next( + 'div', class_='epb-recommended-improvements__potential-rating' + ) + + # Check if the potential rating div is found + if potential_rating_div: + # Extract the rating text within the SVG text element + rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip() + # Parse the rating text to separate the numeric rating and EPC letter + new_rating = int(rating_text.split()[0]) + new_epc = rating_text.split()[1] + + # Append the information as a dictionary to the recommendations list + recommendations.append({ + "step": step_num, + "measure": measure_title, + "new_rating": new_rating, + "new_epc": new_epc, + "sap_points": new_rating - previous_sap_score + }) + previous_sap_score = new_rating + + # Search for the assessment informaton + assessment_information = address_res.find('div', {'id': 'information'}) + # Parse this information + rows = assessment_information.find_all('div', class_='govuk-summary-list__row') + # Create a dictionary to hold the parsed information + assessment_data = {} + for row in rows: + key = row.find('dt').text.strip() + if key == "Type of assessment": + # We dont reliably 
def format_recommendations(self, recommendations):
    """
    Translate the recommendations scraped from an EPC page into the structure used for non-intrusive survey
    recommendations.

    Every scraped measure title is looked up in a fixed table that maps it to one or more internal measure
    types. One output entry is produced per internal type, each carrying the SAP points of the scraped
    recommendation and flagged with "survey": True.

    :param recommendations: iterable of dicts, each with a "measure" (the title shown on the EPC page) and a
        "sap_points" entry
    :return: list of dicts with the keys "type", "sap_points" and "survey"
    :raises KeyError: if a measure title is not present in the lookup table
    """

    # EPC page title -> internal measure type(s). A single title may expand to several types.
    measure_types_by_title = {
        "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
        "Hot water cylinder insulation": ["hot_water_tank_insulation"],
        "Hot water cylinder thermostat": ["cylinder_thermostat"],
        "High performance external doors": ["insulated_doors"],
        "Floor insulation (solid floor)": ["solid_floor_insulation"],
        "Double glazed windows": ["double_glazing"],
        "Cavity wall insulation": ["cavity_wall_insulation"],
        "Replace boiler with new condensing boiler": ["boiler_upgrade"],
    }

    formatted = []
    for entry in recommendations:
        formatted.extend(
            {"type": measure_type, "sap_points": entry["sap_points"], "survey": True}
            for measure_type in measure_types_by_title[entry["measure"]]
        )

    return formatted
b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -220,6 +220,11 @@ class RetrieveFindMyEpc: "Double glazed windows": ["double_glazing"], "Cavity wall insulation": ["cavity_wall_insulation"], "Replace boiler with new condensing boiler": ["boiler_upgrade"], + "Floor insulation": ["floor_insulation"], # Recommendation typically associated to older EPCs + "Heating controls (programmer, room thermostat and TRVs)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Low energy lighting": ["low_energy_lighting"], } formatted_recommendations = [] From 9d668d4d8338dc6cedc10d40921505ddeee2ea81 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 8 Nov 2024 07:59:28 +0000 Subject: [PATCH 071/255] working on aiha project --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/SearchEpc.py | 24 ++- backend/app/plan/router.py | 8 + etl/customers/aiha/xml_extraction.py | 2 +- etl/customers/ksquared/Wave3 Modelling.py | 117 +++++++++++++- etl/epc/Record.py | 15 ++ recommendations/HotwaterRecommendations.py | 179 ++++++++++++++++----- recommendations/Recommendations.py | 18 ++- recommendations/RoofRecommendations.py | 3 + recommendations/WallRecommendations.py | 33 ++-- recommendations/rdsap_tables.py | 2 +- 12 files changed, 325 insertions(+), 80 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 2d658c04..8ec4fdbe 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -96,7 +96,7 @@ vartypes = { 'walls-env-eff': 'str', 'transaction-type': 'str', # 'uprn': "Int64", - 'current-energy-efficiency': 'float', + 'current-energy-efficiency': 'Int64', 'energy-consumption-current': 'float', 'mainheat-description': 'str', 'lighting-cost-current': 'float', 
@@ -342,8 +342,12 @@ class SearchEpc: rows_filtered = [r for r in rows if ", ".join([r["address"], r["posttown"]]) == best_match[0]] else: best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0) + # Get the UPRN for the best match + best_match_uprn = {r["uprn"] for r in rows if r["address"] == best_match[0]}.pop() # Get all of the scores - rows_filtered = [r for r in rows if r["address"] == best_match[0]] + rows_filtered = [ + r for r in rows if (r["address"] == best_match[0]) or (r["uprn"] == best_match_uprn) + ] if rows_filtered: return rows_filtered @@ -642,6 +646,7 @@ class SearchEpc: estimation_data = epc_data[[key, "weight", "lodgement-datetime"]].copy() estimation_data = estimation_data[~pd.isnull(estimation_data[key])] estimation_data = estimation_data[~estimation_data[key].isin(Definitions.DATA_ANOMALY_MATCHES)] + if vartype == "Int64": # We have some edge cases where we get the error "invalid literal for int() with base 10: '1.0'" # so this handles this @@ -653,6 +658,13 @@ class SearchEpc: estimated_epc[key] = None continue + if key == "floor-height": + # We speficially handle this, to avoid extreme values + # We check if we have any rows less than 3.5m + if estimation_data[estimation_data["floor-height"].astype(float) <= 3.5].shape[0] > 0: + # Perform the filter + estimation_data = estimation_data[estimation_data["floor-height"].astype(float) <= 3.5] + if vartype == "Int64": estimated_value = self._estimate_int(estimation_data, key) elif vartype == "float": @@ -675,6 +687,14 @@ class SearchEpc: estimated_epc["current-energy-rating"] = sap_to_epc(estimated_epc["current-energy-efficiency"]) + # Convert the cost current and potential variables - to string integers + for variable in ["heating-cost-current", "hot-water-cost-current", "lighting-cost-current", + "heating-cost-potential", "hot-water-cost-potential", "lighting-cost-potential"]: + estimated_epc[variable] = str(int(estimated_epc[variable])) + + # This is a string + 
estimated_epc["low-energy-fixed-light-count"] = str(estimated_epc["low-energy-fixed-light-count"]) + estimated_epc["postcode"] = self.postcode estimated_epc["uprn"] = self.uprn estimated_epc["address"] = self.full_address diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 65a6c32c..3b6f3985 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -393,6 +393,13 @@ async def trigger_plan(body: PlanTriggerRequest): session.begin() logger.info("Getting the inputs") plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path) + # Check for duplicate UPRNS + input_uprns = [x.get("uprn") for x in plan_input if "uprn" in x] + if input_uprns: + # Check for dupes + if len(input_uprns) != len(set(input_uprns)): + raise ValueError("Duplicate UPRNs in the input data") + # If we have patches or overrides, we should read them in here patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body) @@ -848,6 +855,7 @@ async def trigger_plan(body: PlanTriggerRequest): # Commit final changes session.commit() + except IntegrityError: logger.error("Database integrity error occurred", exc_info=True) session.rollback() diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 531b6752..f96744ec 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -701,7 +701,7 @@ def main(): "starting_sap": 53, "recommended_measures": [ { - "measure": "Cyliner Insulation", + "measure": "Cylinder Insulation", "description": "80mm cylinder insulation", "sap_points": 2, "ending_sap": 55, diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 023ae25c..b96b261f 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -1,8 +1,17 @@ +import os import time +from dotenv import load_dotenv from tqdm 
import tqdm import pandas as pd from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.s3 import save_csv_to_s3 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +USER_ID = 8 +PORTFOLIO_ID = 117 def app(): @@ -32,18 +41,118 @@ def app(): for col in ["Address letter or number", "Street address", "Postcode"]: hornsey_asset_list[col] = hornsey_asset_list[col].str.replace(" ", " ") + hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""] + + missed_uprns = { + "Flat 13A Stowell House": 100021213098, + "Flat 24 Stowell House": 100021213110, + "Flat 1 36 Haringey Park": None + } extracted_data = [] + asset_list = [] for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)): - time.sleep(0.5) + + if home["Address letter or number"] == "Flat 1 36 Haringey Park": + continue + # Some properties do not have an epc if not home["Energy starting band (EPC)"]: + asset_list.append( + { + "uprn": missed_uprns[home["Address letter or number"]], + "address": home["Address letter or number"], + "postcode": home["Postcode"], + "property_type": "Flat", # They're all flats + } + ) continue + unit_number = home["Address letter or number"] street = home["Street address"] postcode = home["Postcode"] address = ", ".join([x for x in [unit_number, street] if x]) - searcher = RetrieveFindMyEpc(address=address, postcode=postcode) - epc_data = searcher.retrieve_newest_find_my_epc_data() - extracted_data.append(epc_data) + find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + searcher = SearchEpc( + address1=address, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + full_address=address, + ) + searcher.find_property(skip_os=True) + newest_epc = searcher.newest_epc + if 
newest_epc["current-energy-efficiency"] != home["Energy starting band (EPC)"].split("-")[1]: + raise Exception("Something went wrong with the EPC data") + + extracted_data.append( + { + "uprn": newest_epc["uprn"], + **find_epc_data, + "hotwater-description": newest_epc["hotwater-description"], + } + ) + + asset_list.append( + { + "uprn": newest_epc["uprn"], + "address": home["Address letter or number"], + "postcode": home["Postcode"], + "property_type": "Flat", # They're all flats + } + ) # We format the extracted data so that is has the same structure as non-intrusive recommendations + # We then get the UPRNs and create the asset list + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + for r in non_invasive_recommendations: + new_recommendations = [] + extracted = [r for r in extracted_data if r["uprn"] == r["uprn"]][0] + for rec in r["recommendations"]: + if extracted["hotwater-description"] == "Gas boiler/circulator, no cylinder thermostat": + if rec["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]: + continue + rec["survey"] = False + new_recommendations.append(rec) + r["recommendations"] = new_recommendations + + # Store the asset list in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + 
"non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body) diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 4c1a912b..558dbacb 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -359,6 +359,7 @@ class EPCRecord: self._clean_property_dimensions() self._clean_number_lighting_outlets() self._clean_floor_level() + self._clean_floor_height() # self._clean_potential_energy_efficiency() # self._clean_environment_impact_potential() @@ -387,6 +388,20 @@ class EPCRecord: return df + def _clean_floor_height(self): + """ Remaps anomalies in floor height to the average floor height for the property type """ + floor_height_data = self.cleaning_data[ + (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) & + (self.cleaning_data["built_form"] == self.prepared_epc["built-form"]) + ] + average = floor_height_data["floor_height"].mean() + sd = floor_height_data["floor_height"].std() + # If we're in the top 0.5 percentile of floor heights, we'll set it to the average + if self.prepared_epc["floor-height"] > average + 10 * sd: + self.prepared_epc["floor-height"] = average + if self.prepared_epc["floor-height"] <= 1.665: + self.prepared_epc["floor-height"] = average + def _clean_floor_level(self): """ This method will clean the floor level, if empty or invalid diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index 636a7be0..5ff7ae4f 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -21,11 +21,44 @@ class HotwaterRecommendations: """ # Reset the recommendations self.recommendations = [] + non_invasive_recommendations = self.property.non_invasive_recommendations + if non_invasive_recommendations: + measures = [ + r["type"] for r in non_invasive_recommendations if + 
r["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"] + ] + + recommendations_phase = phase + for m in measures: + non_invasive_rec = [ + r for r in non_invasive_recommendations if r["type"] == m + ][0] + if m == "hot_water_tank_insulation": + # We need to be able to stack these recommendations + self.recommend_tank_insulation( + phase=recommendations_phase, + sap_points=non_invasive_rec["sap_points"], + survey=non_invasive_rec["survey"], + ) + + recommendations_phase += 1 + elif m == "cylinder_thermostat": + self.recommend_cylinder_thermostat( + phase=recommendations_phase, + sap_points=non_invasive_rec["sap_points"], + survey=non_invasive_rec["survey"], + ) + recommendations_phase += 1 # This first iteration of the recommender will provide very basic recommendation # We recommend heating controls based on the main heating system - # If there is no system present, but access to the mains, we + if self.property.hotwater["clean_description"] == "Gas boiler/circulator, no cylinder thermostat": + # Handle this case specifically: + self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=phase) + return + + # If there is no system present, but access to the mains, we if ( (self.property.hotwater["heater_type"] in ["electric immersion"]) & @@ -39,7 +72,7 @@ class HotwaterRecommendations: self.recommend_cylinder_thermostat(phase=phase) return - def recommend_tank_insulation(self, phase): + def recommend_tank_insulation(self, phase, sap_points=None, survey=False, _return=False): """ If the home has a very poor hot water system, this is often indicative of a lack of insulation on the hot water tank. This is a very simple and cost effective improvement that can be made to the home. 
It will likely @@ -55,27 +88,30 @@ class HotwaterRecommendations: else: description = "Insulate hot water tank" - self.recommendations.append( - { - "phase": phase, - "parts": [], - "type": "hot_water_tank_insulation", - "measure_type": "hot_water_tank_insulation", - "description": description, - "starting_u_value": None, - "new_u_value": None, - "sap_points": None, - "already_installed": already_installed, - **recommendation_cost, - "simulation_config": {"hot_water_energy_eff_ending": "Poor"}, - "description_simulation": { - "hot-water-energy-eff": "Poor" - } - } - ) + to_append = { + "phase": phase, + "parts": [], + "type": "hot_water_tank_insulation", + "measure_type": "hot_water_tank_insulation", + "description": description, + "starting_u_value": None, + "new_u_value": None, + "sap_points": sap_points, + "already_installed": already_installed, + **recommendation_cost, + "simulation_config": {"hot_water_energy_eff_ending": "Poor"}, + "description_simulation": { + "hot-water-energy-eff": "Poor" + }, + "survey": survey + } + if _return: + return to_append + + self.recommendations.append(to_append) return - def recommend_cylinder_thermostat(self, phase): + def recommend_cylinder_thermostat(self, phase, sap_points=None, survey=False, _return=False): """ If the home has a very poor hot water system, this is often indicative of a lack of insulation on the hot water tank. This is a very simple and cost effective improvement that can be made to the home. 
def recommend_cylinder_thermostat_gas_boiler_circulator(self, phase):
    """
    Append a single combined recommendation covering a cylinder thermostat and hot water tank insulation.

    The cost is the key-wise sum of the cylinder thermostat cost and the hot water tank insulation cost.
    The recommendation is recorded with type "cylinder_thermostat", simulates the hot water description
    "From main system", and lifts a hot water efficiency of "Very Poor", "Poor" or "Average" to "Good"
    (any other efficiency is kept as it is).

    :param phase: phase assigned to the recommendation
    :return: None. The recommendation is appended to ``self.recommendations``.
    """

    thermostat_cost = self.costs.cylinder_thermostat()
    insulation_cost = self.costs.hot_water_tank_insulation()
    # Combine the two cost breakdowns key by key
    combined_cost = {
        cost_key: thermostat_cost[cost_key] + insulation_cost[cost_key]
        for cost_key in thermostat_cost.keys()
    }

    already_installed = "cylinder_thermostat" in self.property.already_installed
    if not already_installed:
        description = "Install a smart cylinder thermostat and insulate the hot water tank with 80mm insulation"
    else:
        combined_cost = override_costs(combined_cost)
        description = "Cylinder thermostat & insulation has already been installed, no further action required"

    new_epc_description = "From main system"
    hotwater_simulation_config = check_simulation_difference(
        new_config=HotWaterAttributes(new_epc_description).process(),
        old_config=self.property.hotwater,
    )

    current_efficiency = self.property.data["hot-water-energy-eff"]
    improvable_ratings = ["Very Poor", "Poor", "Average"]
    new_efficiency = "Good" if current_efficiency in improvable_ratings else current_efficiency

    simulation_config = {"hot_water_energy_eff_ending": new_efficiency}
    simulation_config.update(hotwater_simulation_config)

    recommendation = {
        "phase": phase,
        "parts": [],
        "type": "cylinder_thermostat",
        "measure_type": "cylinder_thermostat",
        "description": description,
        "starting_u_value": None,
        "new_u_value": None,
        "sap_points": None,
        "already_installed": already_installed,
        **combined_cost,
        "simulation_config": simulation_config,
        "description_simulation": {
            "hot-water-energy-eff": simulation_config["hot_water_energy_eff_ending"],
            "hotwater-description": new_epc_description,
        },
        "survey": False,
    }

    self.recommendations.append(recommendation)
    return
class Recommendations: # Ventilation recommendations # We only produce a ventilation recommendation if the property is recommended to have wall or roof - # insulation - # We will not attribute a SAP impact to the ventilation recommendation, since we've seen that this - # has no - # real impact on the SAP score. Therefore, we don't need to include phasing for ventilation. If we - # have any - # wall or roof recommendations, we will ensure that ventilation is included in the simulation + # insulation We will not attribute a SAP impact to the ventilation recommendation, since we've seen that this + # has no real impact on the SAP score. Therefore, we don't need to include phasing for ventilation. If we + # have any wall or roof recommendations, we will ensure that ventilation is included in the simulation if ( (self.wall_recomender.recommendations or self.roof_recommender.recommendations) and ("ventilation" in measures) @@ -253,8 +250,13 @@ class Recommendations: if "hot_water" in measures: self.hotwater_recommender.recommend(phase=phase) if self.hotwater_recommender.recommendations: - property_recommendations.append(self.hotwater_recommender.recommendations) - phase += 1 + if len(self.hotwater_recommender.recommendations) > 1: + for r in self.hotwater_recommender.recommendations: + property_recommendations.append([r]) + phase += 1 + else: + property_recommendations.append(self.hotwater_recommender.recommendations) + phase += 1 if "secondary_heating" in measures: self.secondary_heating_recommender.recommend(phase=phase) diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index acc78359..51264b75 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -152,6 +152,9 @@ class RoofRecommendations: if self.is_room_roof_insulated_or_unsuitable(measures): return + if self.property.roof["is_thatched"]: + return + # If we have a u-value already, need to implement this if u_value: if u_value <= 
self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index c7917911..f77ae5a0 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -540,15 +540,10 @@ class WallRecommendations(Definitions): lowest_selected_u_value = None recommendations = [] - - iwi_non_invasive_recommendations = next( - (r for r in self.property.non_invasive_recommendations if r["type"] == "internal_wall_insulation"), {} + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} ) - ewi_non_invasive_recommendations = next( - (r for r in self.property.non_invasive_recommendations if r["type"] == "external_wall_insulation"), {} - ) - if ewi_non_invasive_recommendations: - raise NotImplementedError("Implement ewi non-invasive recommendations") for _, insulation_material_group in insulation_materials.groupby("description"): @@ -590,31 +585,25 @@ class WallRecommendations(Definitions): if already_installed: cost_result = override_costs(cost_result) + if non_invasive_recommendations.get("cost") is not None: + raise NotImplementedError( + "Not handled passing costs from non-invasive recommendations for iwi" + ) + if material["type"] == "internal_wall_insulation": - - if iwi_non_invasive_recommendations.get("cost") is not None: - raise NotImplementedError( - "Not handled passing costs from non-invasive recommendations for iwi" - ) - - sap_points = iwi_non_invasive_recommendations.get("sap_points", None) - survey = iwi_non_invasive_recommendations.get("survey", False) - new_description = self.get_internal_external_wall_description( self.INTERNALLY_INSULATED_WALL_DESCRIPTIONS, new_u_value ) - elif material["type"] == "external_wall_insulation": - - sap_points = ewi_non_invasive_recommendations.get("sap_points", None) - survey = ewi_non_invasive_recommendations.get("survey", 
False) - new_description = self.get_internal_external_wall_description( self.EXTERNALLY_INSULATED_WALL_DESCRIPTIONS, new_u_value ) else: raise ValueError("Invalid material type") + sap_points = non_invasive_recommendations.get("sap_points", None) + survey = non_invasive_recommendations.get("survey", False) + wall_ending_config = WallAttributes(new_description).process() walls_simulation_config = check_simulation_difference( diff --git a/recommendations/rdsap_tables.py b/recommendations/rdsap_tables.py index 16c7d26e..e56faf7c 100644 --- a/recommendations/rdsap_tables.py +++ b/recommendations/rdsap_tables.py @@ -257,7 +257,7 @@ epc_wall_description_map = { "Timber frame, as built, partial insulation": "Timber frame as built", "Timber frame, as built, no insulation": "Timber frame as built", "Timber frame, with external insulation": "Timber frame with internal insulation", - + "Timber frame, with internal insulation": "Timber frame with internal insulation", ############################ # Sandstone/limestones wall mappings ############################ From fe6e83314f836f8268839b9d45b809bb8c4d83e2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 11 Nov 2024 16:35:12 +0000 Subject: [PATCH 072/255] working on stonewater matches --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../settle/route_march_2024_11_08.py | 226 ++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 58 ++++- 4 files changed, 282 insertions(+), 6 deletions(-) create mode 100644 etl/customers/settle/route_march_2024_11_08.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/settle/route_march_2024_11_08.py b/etl/customers/settle/route_march_2024_11_08.py new file mode 100644 index 00000000..21b6f2df --- /dev/null +++ 
def get_data(asset_list):
    """
    Look up the newest EPC, and the recommendations lodged against it, for every row of the asset list.

    Rows for which no EPC is found are skipped. Rows whose lookup raises are not retried here; their
    row_id is collected so that the caller can run them again.

    :param asset_list: DataFrame with one row per property. The columns read are "Postcode", "AddressLine1",
        "AddressLine4", "AddressLine5" and "row_id".
    :return: tuple ``(epc_data, errors)``. ``epc_data`` is a list of dicts holding the row_id, the fields of
        the newest EPC and a "recommendations" list. ``errors`` is the list of row_ids whose lookup failed.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            postcode = home["Postcode"]
            house_number = home["AddressLine1"]
            full_address = ", ".join([home["AddressLine1"], home["AddressLine4"], home["AddressLine5"]])

            searcher = SearchEpc(
                address1=str(house_number),
                postcode=postcode,
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                continue

            # Look for EPC recommendations. This is best effort: a failed lookup is treated as "no
            # recommendations". Only Exception is caught so that Ctrl-C / SystemExit still stop the run.
            try:
                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
            except Exception:
                property_recommendations = {"rows": []}

            epc = {
                "row_id": home["row_id"],
                **searcher.newest_epc.copy(),
                "recommendations": property_recommendations["rows"]
            }

            epc_data.append(epc)
        except Exception:
            # Remember the row so the caller can retry it
            errors.append(home["row_id"])
            # NOTE(review): the pause is assumed to belong to the error path (back-off after a failed
            # lookup) - confirm it was not meant to run on every iteration
            time.sleep(5)

    return epc_data, errors
(HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/SETTLE FULL PROPOSED PROGRAMME.xlsx", + header=0 + ) + asset_list["row_id"] = asset_list.index + + epc_data, errors = get_data(asset_list) + + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data(asset_list_failed) + + # Append the failed data to the main data + epc_data.extend(epc_data_failed) + + epc_df = pd.DataFrame(epc_data) + + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "row_id", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + 
"secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" + ) + + asset_list = asset_list.drop(columns=["row_id"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + 
lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 9f929db1..0036a0a4 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -8,7 +8,7 @@ from collections import Counter CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") -NUM_FOLDERS = 14 +NUM_FOLDERS = 15 def sap_to_epc(sap_points: int | float): @@ -871,7 +871,10 @@ def main(): # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater 3.0 Updated SAP Pre & Modelled 29.10.24.xlsx"), + os.path.join( + CUSTOMER_FOLDER_PATH, + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx" + ), header=4 ) retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] @@ -902,13 +905,24 @@ def main(): # '102 Cheaton Close': '', # 'Flat 16 Spring Gardens': '', # '4 Apple Close': '', - '25 Folly Lane': '', - + # '25 Folly Lane': '', + '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', + '21 Constitution Hill': 
'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', + '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', + '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', + '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", + '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', + "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', + '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', + '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX' } # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + # Handle the case that has the wrong postcode in the asset data if home["Name"] in manual_filters: filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() @@ -972,6 +986,10 @@ def main(): missing_ids = list(missing_ids) if missing_ids: # We check that the missing ids have no data yet + missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] + missed[["Name", "Postcode", "Archetype ID", "Arch. 
Group Rank"]].to_csv( + CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") + if len(missing_ids) != 8: raise Exception("Unacceptable number of missings") @@ -1316,5 +1334,37 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa # Save excel proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False) + +def find_remaining_surveys(): + """ + This compares a list of properties that have been surveyed against a list of properties that I have produced + costed retrofit packages for, so I know what needs to be downloaded from Sharepoint + :return: + """ + + surveyed = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" + "/Stonewater_SHDF_3_0_Board_work_in_progress_- 07.11.24.xlsx", + header=4 + ) + + costed = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages " + "20241030 (WIP) MR Review v1.xlsx", + header=13, + sheet_name="Modelled Packages" + ) + costed = costed[~pd.isnull(costed["Address ID"])] + + needed = surveyed[~surveyed["Address ID"].isin(costed["Address ID"])] + + needed["id"] = needed["Archetype ID"].astype(str) + "-" + needed["Arch. 
Group Rank"].astype(str) + needed = needed.sort_values("id", ascending=True) + needed[["id", "Name", "Postcode"]].to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/needed_surveys.csv" + ) + + assert needed.shape[0] + costed.shape[0] == surveyed.shape[0] + # if __name__ == "__main__": # main() From dfa37f86d469d4ee926ee0dc2438629fb35e17cc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Nov 2024 15:49:28 +0000 Subject: [PATCH 073/255] Adding postcode summary to stonewater --- .../stonewater/Wave 3 Preparation.py | 79 +++++++++++++++---- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0036a0a4..889d8f88 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -916,13 +916,14 @@ def main(): "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', - '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX' + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', + '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' } # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): - + # Handle the case that has the wrong postcode in the asset data if home["Name"] in manual_filters: filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() @@ -986,11 +987,11 @@ def main(): missing_ids = list(missing_ids) if missing_ids: # We check that the missing ids have no data yet - missed = 
retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] - missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( - CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") + # missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)] + # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( + # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") - if len(missing_ids) != 8: + if len(missing_ids) != 6: raise Exception("Unacceptable number of missings") if matching_lookup["Address ID"].duplicated().sum(): @@ -1083,12 +1084,20 @@ def main(): stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) windows_data["Address ID"] = windows_data["Address ID"].astype(float) stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") + stonewater_data = stonewater_data.sort_values("Archetype ID", ascending=True) if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") + for c in [ + 'Window attributes - Fitted/renewed date', + 'Parent Asset Window attributes - Fitted/renewed date', + 'Fitted/renewed date' + ]: + stonewater_data[c] = stonewater_data[c].astype(str) + # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V2.xlsx", index=False) cost_sheet = [ { @@ -1173,7 +1182,7 @@ def main(): create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) Single Model V3.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1183,8 +1192,8 @@ def main(): def 
create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): # We read in the costed packages - # Note: Header as 12 is for Matt Ratcliff's reviewed version costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages") + costed_packages = costed_packages[~pd.isnull(costed_packages["Address"])] archetypes_to_cost = costed_packages[ [ @@ -1213,16 +1222,11 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa 'Existing Primary Heating System', 'Existing Primary Heating PCDF Reference']) - # We take properties that are EPC D and below (61% of units) + # We take properties that are EPC D and below (59% of units) archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"]) - average_cost = archetypes_to_cost[ - archetypes_to_cost["Has been modelled"] - ]['Total Cost of Measures inc Contingency'].mean() - print(average_cost) - # These are the Arhetypes that will likely be suitable for Wave 3 archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4) archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])] @@ -1236,7 +1240,21 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa how="left" ) - proposed_sample = archetypes_sheet[archetypes_sheet["Archetype ID"].isin(archetypes_to_cost["Archetype ID"])] + proposed_sample = archetypes_sheet[ + archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) + ] + + not_proposed = archetypes_sheet[ + ~archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) + ] + + # archetypes_without_survey = [] + # for p in list(set(not_proposed)): + # filtered = costed_packages[costed_packages["Archetype ID"].astype(int).astype(str) == p] + # if 
filtered.empty: + # archetypes_without_survey.append(p) + + # Can we propose anything about archetypes that were not surveyed? proposed_sample = proposed_sample[ [ @@ -1247,6 +1265,8 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa # We classify into high and low confidence + archetypes_to_cost["Surveyed Main Roof"] = archetypes_to_cost["Surveyed Main Roof"].fillna("") + match_classification = [] for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): @@ -1331,8 +1351,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa None, proposed_sample["Total Cost of Measures inc Contingency"] ) + proposed_sample = proposed_sample.sort_values("Archetype ID", ascending=True) + # Save excel - proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False) + proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid V2 (WIP).xlsx", index=False) + + # For each postcode that's in the bid, we also summarise the number of units in the bid and number left out + proposed_sample_postcodes = proposed_sample["Postcode"].unique() + + postcode_summary = [] + for postcode in proposed_sample_postcodes: + in_proposal = proposed_sample[proposed_sample["Postcode"] == postcode] + not_in_proposal = not_proposed[not_proposed["Postcode"] == postcode] + postcode_summary.append( + { + "Postcode": postcode, + "Number of properties in Proposal": len(in_proposal), + "Number of properties not in Proposal": len(not_in_proposal) + } + ) + postcode_summary = pd.DataFrame(postcode_summary) + postcode_summary = postcode_summary.sort_values( + "Number of properties not in Proposal", + ascending=False).reset_index(drop=True) + + postcode_summary.to_excel( + CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid Postcode Summary.xlsx", index=False + ) def find_remaining_surveys(): From 76f9b22ca22f97612ccf986273f3b99a7fa10b39 Mon Sep 17 00:00:00 2001 
From: Khalim Conn-Kowlessar Date: Tue, 12 Nov 2024 17:42:31 +0000 Subject: [PATCH 074/255] adding maps to format_recommendations --- etl/customers/ksquared/Wave3 Modelling.py | 71 ++++++++++++++++++++++- etl/find_my_epc/RetrieveFindMyEpc.py | 6 ++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index b96b261f..c4858b5c 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -12,9 +12,10 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") USER_ID = 8 PORTFOLIO_ID = 117 +CAHA_PORTFOLIO_ID = 118 -def app(): +def hornsey(): """ This script prepares the asset lists for the additional housing associations, CAHA and Hornsey Housing Trust, that are forming a consortium led by AIHA @@ -156,3 +157,71 @@ def app(): "exclusions": ["boiler_upgrade"] } print(body) + + +def caha(): + caha_asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Copy of AIHA - WHSHF Wave 3 bid - Consortium " + "member properties - CAHA.xlsx", + sheet_name="Ksquared-All units information", + header=3 + ) + + caha_asset_list = caha_asset_list.iloc[1:] + # Fill NA values with empty strings + caha_asset_list = caha_asset_list.fillna("") + caha_asset_list["Address letter or number"] = caha_asset_list["Address letter or number"].astype( + str + ).str.strip() + + # We Add POstcode as it wasn't populated - split on space and take the last two entries and re-concatenate on space + caha_asset_list["Postcode"] = caha_asset_list["Street address"].str.split(" ").str[-2:].str.join(" ") + # Take just the columns we need + caha_asset_list = caha_asset_list[["Address letter or number", "Street address", "Postcode"]] + + for col in ["Address letter or number", "Street address", "Postcode"]: + caha_asset_list[col] = caha_asset_list[col].str.replace(" ", " ") + + # Pull the data from find my epc + 
remap = { + "Flat A, 50 Talbot Road N6 4QP": "50a Talbot Road" + } + extracted_data = [] + asset_list = [] + for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)): + unit_number = home["Address letter or number"] + street = home["Street address"] + postcode = home["Postcode"] + address = ", ".join([x for x in [unit_number, street] if x]) + address = remap.get(address, address) + address = address.replace(postcode, "").strip() + + find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + searcher = SearchEpc( + address1=address, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + full_address=address, + ) + searcher.find_property(skip_os=True) + newest_epc = searcher.newest_epc + + extracted_data.append( + { + "uprn": newest_epc["uprn"], + **find_epc_data, + } + ) + + asset_list.append( + { + "uprn": newest_epc["uprn"], + "address": home["Address letter or number"], + "postcode": home["Postcode"], + "property_type": newest_epc["property-type"], + } + ) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index dad32bf6..b2296a72 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -225,6 +225,12 @@ class RetrieveFindMyEpc: "roomstat_programmer_trvs", "time_temperature_zone_control" ], "Low energy lighting": ["low_energy_lighting"], + "Increase loft insulation to 270 mm": ["loft_insulation"], + "Heating controls (thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Solar water heating": ["solar_water_heating"], + "Solar photovoltaic panels, 2.5 kWp": ["solar_pv"], } formatted_recommendations = [] From 2e78ba6d5d3cc1736d84be6b0e6ac371d4d5781f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Nov 2024 17:50:08 +0000 Subject: [PATCH 075/255] adding for 
format_recomendations --- etl/customers/ksquared/Wave3 Modelling.py | 4 +++- etl/find_my_epc/RetrieveFindMyEpc.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index c4858b5c..6a507728 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -175,6 +175,7 @@ def caha(): ).str.strip() # We Add POstcode as it wasn't populated - split on space and take the last two entries and re-concatenate on space + caha_asset_list["Street address"] = caha_asset_list["Street address"].str.strip() caha_asset_list["Postcode"] = caha_asset_list["Street address"].str.split(" ").str[-2:].str.join(" ") # Take just the columns we need caha_asset_list = caha_asset_list[["Address letter or number", "Street address", "Postcode"]] @@ -184,7 +185,8 @@ def caha(): # Pull the data from find my epc remap = { - "Flat A, 50 Talbot Road N6 4QP": "50a Talbot Road" + "Flat A, 50 Talbot Road N6 4QP": "50a Talbot Road", + "Flat A, 51 First Avenue EN1 1BN": "51a, First Avenue", } extracted_data = [] asset_list = [] diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index b2296a72..bcd3c356 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -231,6 +231,10 @@ class RetrieveFindMyEpc: ], "Solar water heating": ["solar_water_heating"], "Solar photovoltaic panels, 2.5 kWp": ["solar_pv"], + "Heating controls (room thermostat and TRVs)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Change heating to gas condensing boiler": ["boiler_upgrade"], } formatted_recommendations = [] From 6d01490962ebdc6e59d2ccbcf1dfad3280a12b49 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Nov 2024 18:21:19 +0000 Subject: [PATCH 076/255] preparing caha --- etl/customers/ksquared/Wave3 Modelling.py | 37 +++++++++++++++++++++++ 
etl/find_my_epc/RetrieveFindMyEpc.py | 11 ++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 6a507728..159fb20b 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -1,5 +1,6 @@ import os import time +import re from dotenv import load_dotenv from tqdm import tqdm @@ -187,16 +188,52 @@ def caha(): remap = { "Flat A, 50 Talbot Road N6 4QP": "50a Talbot Road", "Flat A, 51 First Avenue EN1 1BN": "51a, First Avenue", + "Flat B, 51 First Avenue EN1 1BN": "51b, First Avenue" } + + def remap_address(address): + # Match patterns like 'Flat A, 30 Grove Park Road' + match = re.match(r'Flat (\w), (\d+) (.+)', address) + if match: + flat_letter = match.group(1) # e.g., 'A' + number = match.group(2) # e.g., '30' + rest_of_address = match.group(3) # e.g., 'Grove Park Road' + + # Format the new address as '30A Grove Park Road' + return f"{number}{flat_letter} {rest_of_address}" + + # If pattern doesn't match, return original address + return address + extracted_data = [] asset_list = [] for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)): + if home["Street address"] == "35 Stanford road N11 3HY" and home["Address letter or number"] == "": + continue + + if home["Street address"] == "29 Victoria Avenue N3 1BD" and home["Address letter or number"] == "": + continue + + if home["Street address"] == "11 Victoria Avenue N3 1BD" and home["Address letter or number"] == "Flat A": + continue + + if home["Street address"] == "11 Victoria Avenue N3 1BD" and home["Address letter or number"] == "Flat C": + continue + + if home["Street address"] == "10 Forest Gardens N17 6XA" and home["Address letter or number"] == "Flat C": + continue + + if home["Street address"] == "219 Cann Hall Road E11 3NJ" and home["Address letter or number"] == "Flat B": + continue + unit_number = home["Address letter or number"] 
street = home["Street address"] postcode = home["Postcode"] address = ", ".join([x for x in [unit_number, street] if x]) address = remap.get(address, address) address = address.replace(postcode, "").strip() + if "Victoria Avenue" not in address: + address = remap_address(address) find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode) find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index bcd3c356..e8e1ff1d 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -204,7 +204,8 @@ class RetrieveFindMyEpc: return resulting_data - def format_recommendations(self, recommendations): + @staticmethod + def format_recommendations(recommendations): """ This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey :param recommendations: @@ -217,6 +218,7 @@ class RetrieveFindMyEpc: "Hot water cylinder thermostat": ["cylinder_thermostat"], "High performance external doors": ["insulated_doors"], "Floor insulation (solid floor)": ["solid_floor_insulation"], + "Floor insulation (suspended floor)": ["suspended_floor_insulation"], "Double glazed windows": ["double_glazing"], "Cavity wall insulation": ["cavity_wall_insulation"], "Replace boiler with new condensing boiler": ["boiler_upgrade"], @@ -235,6 +237,13 @@ class RetrieveFindMyEpc: "roomstat_programmer_trvs", "time_temperature_zone_control" ], "Change heating to gas condensing boiler": ["boiler_upgrade"], + "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heaters"], + "Flat roof or sloping ceiling insulation": ["flat_roof_insulation"], + "Heating controls (room thermostat)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Band A condensing boiler": ["boiler_upgrade"], + "Double glazing": ["double_glazing"], } formatted_recommendations = [] From 
b01635ddd61d549dedeb8fd8eabf585afde180cc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 13 Nov 2024 12:13:37 +0000 Subject: [PATCH 077/255] added additional secondary heating recommendation --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/apis/GoogleSolarApi.py | 7 +- backend/app/plan/router.py | 2 +- etl/customers/ksquared/Wave3 Modelling.py | 93 ++++++++++++++++++- .../stonewater/Wave 3 Preparation.py | 40 ++++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 34 ++++--- recommendations/HotwaterRecommendations.py | 10 +- recommendations/SecondaryHeating.py | 4 +- 9 files changed, 169 insertions(+), 25 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index 75f28ceb..e2b7d933 100644 --- a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -792,9 +792,14 @@ class GoogleSolarApi: property_instance = [p for p in input_properties if p.id == unit["property_id"]][0] # At this level, we check if the property is suitable for solar and if now, skip # Or if we have a solar non-invasive recommendation + + non_invasive_rec = next( + (r for r in property_instance.non_invasive_recommendations if r["type"] == "solar_pv"), {} + ).get("array_wattage") + if ( (not property_instance.is_solar_pv_valid()) or - [r for r in property_instance.non_invasive_recommendations if r["type"] == "solar_pv"] + non_invasive_rec is not None ): continue diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 3b6f3985..4a5b3bd4 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -394,7 +394,7 @@ async def trigger_plan(body: PlanTriggerRequest): logger.info("Getting the inputs") plan_input = 
read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path) # Check for duplicate UPRNS - input_uprns = [x.get("uprn") for x in plan_input if "uprn" in x] + input_uprns = [x.get("uprn") for x in plan_input if "uprn" in x and x.get("uprn")] if input_uprns: # Check for dupes if len(input_uprns) != len(set(input_uprns)): diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 159fb20b..c861edfc 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -2,6 +2,7 @@ import os import time import re +from etl.epc.settings import EARLIEST_EPC_DATE from dotenv import load_dotenv from tqdm import tqdm import pandas as pd @@ -236,7 +237,7 @@ def caha(): address = remap_address(address) find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data(sap_2012_date=EARLIEST_EPC_DATE) time.sleep(0.5) # We need uprn searcher = SearchEpc( @@ -249,18 +250,102 @@ def caha(): searcher.find_property(skip_os=True) newest_epc = searcher.newest_epc + uprn = newest_epc["uprn"] + if address in ["Flat D, 11 Victoria Avenue", "Flat B, 11 Victoria Avenue"]: + uprn = None + extracted_data.append( { - "uprn": newest_epc["uprn"], + "uprn": uprn, **find_epc_data, } ) asset_list.append( { - "uprn": newest_epc["uprn"], - "address": home["Address letter or number"], + "uprn": uprn, + "address": address, "postcode": home["Postcode"], "property_type": newest_epc["property-type"], } ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + # for r in non_invasive_recommendations: + # new_recommendations = [] + # extracted = [r for r in extracted_data if r["uprn"] == r["uprn"]][0] + # for rec in r["recommendations"]: + # if 
extracted["hotwater-description"] == "Gas boiler/circulator, no cylinder thermostat": + # if rec["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]: + # continue + # rec["survey"] = False + # new_recommendations.append(rec) + # r["recommendations"] = new_recommendations + + # We model the two properties separately + asset_list = pd.DataFrame(asset_list) + # Drop Flat D, 11 Victoria Avenue + asset_list1 = asset_list[asset_list["address"] != "Flat D, 11 Victoria Avenue"] + asset_list2 = asset_list[asset_list["address"] == "Flat D, 11 Victoria Avenue"] + + # Store the asset list in s3 + filename = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list1.csv" + save_csv_to_s3( + dataframe=asset_list1, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + filename2 = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list2.csv" + save_csv_to_s3( + dataframe=asset_list2, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename2 + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + body = { + "portfolio_id": str(CAHA_PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body) + + body2 = { + "portfolio_id": str(CAHA_PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename2, + "already_installed_file_path": "", + "patches_file_path": "", + 
"non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body2) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 889d8f88..b6fed4db 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1411,5 +1411,45 @@ def find_remaining_surveys(): assert needed.shape[0] + costed.shape[0] == surveyed.shape[0] + +def append_stonewater_id(): + """ + This completes an adhoc request from Stonewater to add in their organisation Reference onto the model + :return: + """ + + model_proposed_sample = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Bid Packages WIP 13.11.24.xlsx", + sheet_name="Modelled Packages", + header=13 + ) + model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])] + model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int) + + original_archetypes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] + original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] + original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + + matched = model_proposed_sample.merge( + original_archetypes[["Address ID", 'Org. ref.']], + on="Address ID", + how="left" + ) + + if pd.isnull(matched["Org. 
ref."]).sum(): + raise ValueError("Something went wrong") + + # Save as CSV + matched.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater IDs.xlsx", + sheet_name="Proposed Wave 3 Sample", + index=False + ) + # if __name__ == "__main__": # main() diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index e8e1ff1d..cd76dae4 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -1,3 +1,4 @@ +import pandas as pd import requests from bs4 import BeautifulSoup from datetime import datetime @@ -25,7 +26,7 @@ class RetrieveFindMyEpc: self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() - def retrieve_newest_find_my_epc_data(self): + def retrieve_newest_find_my_epc_data(self, sap_2012_date=None): """ For a post code and address, we pull out all the required data from the find my epc website """ @@ -188,7 +189,7 @@ class RetrieveFindMyEpc: raise ValueError(f"Missing key: {key}") # Finally, we format the recommendations - recommendations = self.format_recommendations(recommendations) + recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) resulting_data = { 'epc_certificate': epc_certificate, @@ -205,11 +206,12 @@ class RetrieveFindMyEpc: return resulting_data @staticmethod - def format_recommendations(recommendations): + def format_recommendations(recommendations, assessment_data, sap_2012_date=None): """ This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey - :param recommendations: - :return: + :param recommendations: The recommendations from the EPC + :param assessment_data: The assessment data from the EPC + :param sap_2012_date: The date of the SAP 2012 update """ measure_map = { @@ -246,17 +248,23 @@ class RetrieveFindMyEpc: "Double glazing": ["double_glazing"], } + survey = True + if sap_2012_date is not None: + certificate_date = 
datetime.strptime(assessment_data["Date of certificate"], "%d %B %Y") + if certificate_date < pd.to_datetime(sap_2012_date): + survey = False + formatted_recommendations = [] for rec in recommendations: - mapped = measure_map[rec["measure"]] for measure in mapped: - formatted_recommendations.append( - { - "type": measure, - "sap_points": rec["sap_points"], - "survey": True - } - ) + to_append = { + "type": measure, + "sap_points": rec["sap_points"], + "survey": survey, + } + if measure == "solar_pv": + to_append["suitable"] = True + formatted_recommendations.append(to_append) return formatted_recommendations diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index 5ff7ae4f..aed1a5e5 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -60,15 +60,21 @@ class HotwaterRecommendations: # If there is no system present, but access to the mains, we + has_tank_recommendation = [r for r in self.recommendations if r["type"] == "hot_water_tank_insulation"] + if ( (self.property.hotwater["heater_type"] in ["electric immersion"]) & (self.property.data["hot-water-energy-eff"] == "Very Poor") & - (self.property.hotwater["no_system_present"] is None) + (self.property.hotwater["no_system_present"] is None) & + len(has_tank_recommendation) == 0 ): self.recommend_tank_insulation(phase=phase) return - if self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat": + has_cylinder_recommendation = [r for r in self.recommendations if r["type"] == "cylinder_thermostat"] + + if ((self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat") & + (len(has_cylinder_recommendation) == 0)): self.recommend_cylinder_thermostat(phase=phase) return diff --git a/recommendations/SecondaryHeating.py b/recommendations/SecondaryHeating.py index 7c20bcdd..931dbff0 100644 --- a/recommendations/SecondaryHeating.py +++ 
b/recommendations/SecondaryHeating.py @@ -13,7 +13,7 @@ class SecondaryHeating: ACCEPTED_MAINHEAT_DESCRIPTIONS = ["Boiler and radiators, mains gas"] ACCEPTED_SECONDHEAT_DESCRIPTIONS = ["Room heaters, electric"] # These are the heaters where works are required to remove them - FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric"] + FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric", 'Portable electric heaters (assumed)'] def __init__(self, property_instance: Property): self.property = property_instance @@ -34,7 +34,7 @@ class SecondaryHeating: if self.property.data['secondheat-description'] in self.FIXED_HEATER_DESCRIPTIONS: # We have an associated cost otherwise, there is no cost - n_rooms = self.property.data['number-heated-rooms'] + n_rooms = self.property.data['number-habitable-rooms'] - self.property.data['number-heated-rooms'] else: n_rooms = 0 From 8da1aecb556d0134fe06d5f1ab409068c7843e6a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 13 Nov 2024 23:26:04 +0000 Subject: [PATCH 078/255] updating stonewater to extract wall --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/ksquared/Wave3 Modelling.py | 29 ++++ .../stonewater/Wave 3 Preparation.py | 156 ++++++++++++++++++ recommendations/SecondaryHeating.py | 6 +- 5 files changed, 190 insertions(+), 5 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index c861edfc..845ab634 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -349,3 +349,32 @@ def caha(): "exclusions": ["boiler_upgrade"] } print(body2) + + # + asset_list3 = [ + { + "address": "10b Forest Gardens", "postcode": "N17 6XA", 
"uprn": 100021180197 + } + ] + filename3 = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list3.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list3), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename3 + ) + body3 = { + "portfolio_id": str(119), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename3, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": "", + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body3) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b6fed4db..0af3ffbb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -37,6 +37,68 @@ def sap_to_epc(sap_points: int | float): return "G" +def extract_wall_details_summary(text): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part, + including any alternative wall details within the 7.0 Walls section of the summary PDF text. + """ + # Define data structure to hold all building part wall entries + wall_data = [] + + # Locate the entire 7.0 Walls section + wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1) + + # Define pattern to match each building part's wall entry within the section + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part + r"Type\s+(.*?)\n" # Matches main wall Type + r"Insulation\s+(.*?)\n" # Matches main wall Insulation + r"(Dry-lining\s+(.*?)\n)?" 
# Optional Dry-lining + r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown + r"Wall Thickness \[mm\]\s+(\d+)" # Matches main wall Thickness + r"(?:\nAlternative Wall Area.*?\n" # Starts matching alternative wall section if present + r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type + r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation + r"(Alternative Dry-lining\s+(.*?)\n)?" # Optional Alternative Dry-lining + r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown + r"Alternative Wall Thickness\s+(\d+))?", # Matches alternative wall Thickness + re.DOTALL + ) + + # Find all building part entries within the 7.0 Walls section + for match in building_part_pattern.finditer(wall_section): + wall_label = match.group(1).strip() + main_wall_type = match.group(2).strip() + main_wall_insulation = match.group(3).strip() + main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A" + main_wall_thickness_unknown = match.group(6).strip() + main_wall_thickness = int(match.group(7)) + + # Optional alternative wall fields + alt_wall_type = match.group(8).strip() if match.group(8) else None + alt_wall_insulation = match.group(9).strip() if match.group(9) else None + alt_wall_dry_lining = match.group(10).strip() if match.group(10) else None + alt_wall_thickness_unknown = match.group(11).strip() if match.group(11) else None + alt_wall_thickness = int(match.group(12)) if match.group(12) else None + + # Append each building part as a dictionary in the wall_data list + wall_data.append({ + "Building Part": wall_label, + "Wall Type": main_wall_type, + "Wall Insulation": main_wall_insulation, + "Wall Dry-lining": main_wall_dry_lining, + "Wall Thickness Unknown": main_wall_thickness_unknown, + "Wall Thickness (mm)": main_wall_thickness, + "Alternative Wall Type": alt_wall_type, + "Alternative Wall Insulation": alt_wall_insulation, + "Alternative Wall Dry-lining": alt_wall_dry_lining, 
+ "Alternative Wall Thickness Unknown": alt_wall_thickness_unknown, + "Alternative Wall Thickness (mm)": alt_wall_thickness, + }) + + return wall_data + + def extract_summary_report(pdf_path): """ Extracts specific data from the provided PDF file. @@ -80,6 +142,14 @@ def extract_summary_report(pdf_path): "Main Roof Type": None, "Main Roof Insulation": None, "Main Roof Insulation Thickness": None, + "Main Wall Type": None, + "Main Wall Insulation": None, + "Main Wall Dry-lining": None, + "Main Wall Thickness": None, + "Main Building Alternative Wall Type": None, + "Main Building Alternative Wall Insulation": None, + "Main Building Alternative Wall Dry-lining": None, + "Main Building Alternative Wall Thickness": None, } with (open(pdf_path, "rb") as file): @@ -229,6 +299,18 @@ def extract_summary_report(pdf_path): insulation_thickness_match.strip() if insulation_thickness_match else None ) + walls_data = extract_wall_details_summary(text) + # Get the main building wall data + main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] + data["Main Wall Type"] = main_building_walls["Wall Type"] + data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] + data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] + data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] + data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] + data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] + data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] + data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] + return data @@ -498,10 +580,64 @@ def extract_roof_details_epr(text): return roof_data +def extract_wall_details_epr(text): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part + in the 
provided EPR PDF text. + """ + # Define data structure to hold results + wall_data = [] + + # Locate each building part section + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + + # Extract each building part's data, including wall details + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + + # Clean up the building part name + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + + part_details = match.group(2) + + # Extract Wall Type, Wall Insulation, Wall Dry-lining, and Wall Thickness + wall_type_match = re.search(r"Wall Type:\s*(.*?)(?=\n|$)", part_details) + wall_insulation_match = re.search(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details) + wall_drylining_match = re.search(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) + wall_thickness_match = re.search(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details) + + # Extract Alternative Wall information if available + alt_wall_type_match = re.search(r"Alternative Wall Type:\s*(.*?)(?=\n|$)", part_details) + alt_wall_insulation_match = re.search(r"Alternative Wall Insulation:\s*(.*?)(?=\n|$)", part_details) + alt_wall_drylining_match = re.search(r"Alternative Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) + alt_wall_thickness_match = re.search(r"Alternative Wall Thickness:\s*(\d+)(?=\n|$)", part_details) + + # Store results for this building part + wall_data.append({ + "Building Part": cleaned_part_name, + "Wall Type": wall_type_match.group(1).strip() if wall_type_match else None, + "Wall Insulation": wall_insulation_match.group(1).strip() if wall_insulation_match else None, + "Wall Dry-lining": wall_drylining_match.group(1).strip() if wall_drylining_match else None, + "Wall Thickness": int(wall_thickness_match.group(1)) if wall_thickness_match else None, + "Alternative Wall Type": alt_wall_type_match.group(1).strip() if 
alt_wall_type_match else None, + "Alternative Wall Insulation": alt_wall_insulation_match.group( + 1).strip() if alt_wall_insulation_match else None, + "Alternative Wall Dry-lining": alt_wall_drylining_match.group( + 1).strip() if alt_wall_drylining_match else None, + "Alternative Wall Thickness": int(alt_wall_thickness_match.group(1)) if alt_wall_thickness_match else None, + }) + + return wall_data + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. """ + data = { "Address": None, "Postcode": None, @@ -539,6 +675,14 @@ def extract_epr(pdf_path): "Main Roof Type": None, "Main Roof Insulation": None, "Main Roof Insulation Thickness": None, + "Main Wall Type": None, + "Main Wall Insulation": None, + "Main Wall Dry-lining": None, + "Main Wall Thickness": None, + "Main Building Alternative Wall Type": None, + "Main Building Alternative Wall Insulation": None, + "Main Building Alternative Wall Dry-lining": None, + "Main Building Alternative Wall Thickness": None, } with open(pdf_path, "rb") as file: @@ -664,6 +808,17 @@ def extract_epr(pdf_path): data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"] data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"] + wall_details = extract_wall_details_epr(text) + main_wall_details = [w for w in wall_details if "Main" in w["Building Part"]][0] + data["Main Wall Type"] = main_wall_details["Wall Type"] + data["Main Wall Insulation"] = main_wall_details["Wall Insulation"] + data["Main Wall Dry-lining"] = main_wall_details["Wall Dry-lining"] + data["Main Wall Thickness"] = main_wall_details["Wall Thickness"] + data["Main Building Alternative Wall Type"] = main_wall_details["Alternative Wall Type"] + data["Main Building Alternative Wall Insulation"] = main_wall_details["Alternative Wall Insulation"] + data["Main Building Alternative Wall Dry-lining"] = main_wall_details["Alternative Wall Dry-lining"] + data["Main Building Alternative Wall 
Thickness"] = main_wall_details["Alternative Wall Thickness"] + return data @@ -1425,6 +1580,7 @@ def append_stonewater_id(): ) model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])] model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int) + z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values() original_archetypes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " diff --git a/recommendations/SecondaryHeating.py b/recommendations/SecondaryHeating.py index 931dbff0..a9d5de04 100644 --- a/recommendations/SecondaryHeating.py +++ b/recommendations/SecondaryHeating.py @@ -10,10 +10,10 @@ class SecondaryHeating: """ # The list of existing heating systems that are accepted - ACCEPTED_MAINHEAT_DESCRIPTIONS = ["Boiler and radiators, mains gas"] - ACCEPTED_SECONDHEAT_DESCRIPTIONS = ["Room heaters, electric"] + ACCEPTED_MAINHEAT_DESCRIPTIONS = ["Boiler and radiators, mains gas", "Electric storage heaters"] + ACCEPTED_SECONDHEAT_DESCRIPTIONS = ["Room heaters, electric", 'Portable electric heaters (assumed)'] # These are the heaters where works are required to remove them - FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric", 'Portable electric heaters (assumed)'] + FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric"] def __init__(self, property_instance: Property): self.property = property_instance From 69b3ec7961eda55e7b2fe36d17353f215a1bf068 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 13 Nov 2024 23:35:02 +0000 Subject: [PATCH 079/255] fixing alternative wall extracction for summary report --- .../stonewater/Wave 3 Preparation.py | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0af3ffbb..8791912a 100644 --- a/etl/customers/stonewater/Wave 3 
Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -50,18 +50,23 @@ def extract_wall_details_summary(text): # Define pattern to match each building part's wall entry within the section building_part_pattern = re.compile( - r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label r"Type\s+(.*?)\n" # Matches main wall Type r"Insulation\s+(.*?)\n" # Matches main wall Insulation - r"(Dry-lining\s+(.*?)\n)?" # Optional Dry-lining + r"(Dry-lining\s+(.*?)\n)?" # Optional main wall Dry-lining r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown - r"Wall Thickness \[mm\]\s+(\d+)" # Matches main wall Thickness - r"(?:\nAlternative Wall Area.*?\n" # Starts matching alternative wall section if present + r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness + re.DOTALL + ) + + # Define pattern to capture alternative wall details, if present + alternative_wall_pattern = re.compile( + r"Alternative Wall Area.*?\n" # Matches start of alternative wall section r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation r"(Alternative Dry-lining\s+(.*?)\n)?" 
# Optional Alternative Dry-lining r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown - r"Alternative Wall Thickness\s+(\d+))?", # Matches alternative wall Thickness + r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness re.DOTALL ) @@ -74,27 +79,32 @@ def extract_wall_details_summary(text): main_wall_thickness_unknown = match.group(6).strip() main_wall_thickness = int(match.group(7)) - # Optional alternative wall fields - alt_wall_type = match.group(8).strip() if match.group(8) else None - alt_wall_insulation = match.group(9).strip() if match.group(9) else None - alt_wall_dry_lining = match.group(10).strip() if match.group(10) else None - alt_wall_thickness_unknown = match.group(11).strip() if match.group(11) else None - alt_wall_thickness = int(match.group(12)) if match.group(12) else None - - # Append each building part as a dictionary in the wall_data list - wall_data.append({ + # Initialize dictionary for this wall entry + wall_entry = { "Building Part": wall_label, "Wall Type": main_wall_type, "Wall Insulation": main_wall_insulation, "Wall Dry-lining": main_wall_dry_lining, "Wall Thickness Unknown": main_wall_thickness_unknown, "Wall Thickness (mm)": main_wall_thickness, - "Alternative Wall Type": alt_wall_type, - "Alternative Wall Insulation": alt_wall_insulation, - "Alternative Wall Dry-lining": alt_wall_dry_lining, - "Alternative Wall Thickness Unknown": alt_wall_thickness_unknown, - "Alternative Wall Thickness (mm)": alt_wall_thickness, - }) + "Alternative Wall Type": None, + "Alternative Wall Insulation": None, + "Alternative Wall Dry-lining": "N/A", + "Alternative Wall Thickness Unknown": None, + "Alternative Wall Thickness (mm)": None, + } + + # Check if there's an alternative wall section following this wall entry + alt_match = alternative_wall_pattern.search(wall_section, match.end()) + if alt_match: + wall_entry["Alternative Wall Type"] = alt_match.group(1).strip() + 
wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip() + wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A" + wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip() + wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6)) + + # Append each building part as a dictionary in the wall_data list + wall_data.append(wall_entry) return wall_data From 2eaf19c2bb21d3b2be6ab223502e3d8a50081b26 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 14:56:00 +0000 Subject: [PATCH 080/255] minor --- etl/customers/aiha/bid_numbers.py | 92 +++++++ etl/customers/aiha/xml_extraction.py | 6 +- etl/customers/ksquared/Wave3 Modelling.py | 10 + .../southend/epc_data_pull_2024_11_14.py | 235 ++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 1 + 5 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 etl/customers/aiha/bid_numbers.py create mode 100644 etl/customers/southend/epc_data_pull_2024_11_14.py diff --git a/etl/customers/aiha/bid_numbers.py b/etl/customers/aiha/bid_numbers.py new file mode 100644 index 00000000..96859f99 --- /dev/null +++ b/etl/customers/aiha/bid_numbers.py @@ -0,0 +1,92 @@ +""" +This is an adhoc script, used to pull together some of the figures that are being included in the +Warm Homes: Social Housing Wave 3 funding application +""" + +import pandas as pd +import numpy as np + +aiha_all_units = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/AIHA Measures Packages 2024_11_13.xlsx", + sheet_name="All Properties - AIHA", + header=2 +) +modelled_units = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/AIHA Measures Packages 2024_11_13.xlsx", + sheet_name="Modelled Properties - Measures", + header=5 +) +aiha_all_units = aiha_all_units.drop(columns=['Unnamed: 0', 'Unnamed: 1']) +aiha_extracted_property_data = pd.read_csv( + 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/extracted_property_data.csv" +) +aiha_wave_3_units = aiha_all_units[aiha_all_units["Expected Package Cost"].astype(float) > 0] +# TODO: The EPC C property isn't a C! +aiha_epc_breakdown = aiha_wave_3_units["Expected EPC Rating"].replace({"D or E": "E"}).value_counts() +# For CAHA +caha_epc_breakdown = modelled_units[ + modelled_units['Survey Key'].str.contains("CAHA") +]['Current EPC Rating'].value_counts() +# For Hornsey +hornsey_epc_breakdown = modelled_units[ + modelled_units['Survey Key'].str.contains("HORNSEY") +]['Current EPC Rating'].value_counts() + +aiha_original_asset_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/240924- KSQ & Domna Info Merge - AIHA - SHDF Wave 3 " + "bid - Supplementary information.xlsx", + sheet_name="Archetyping Data", + header=2 +) + +# Get the units in the bid: +aiha_wave_3_features = aiha_original_asset_data[ + ['Address letter or number', 'Street address', 'Postcode', "Wall type", + "Property type", "built-form", "floor"] +].merge( + aiha_wave_3_units[['Address letter or number', 'Street address', 'Postcode']], + how="inner", + on=["Address letter or number", "Street address", "Postcode"] +) + +wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts() +property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index() + +# Hornsey data - contained in original asset list +hornsey_asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing " + "Trust.xlsx", + sheet_name="Ksquared-All units information", + header=3 +) + +# We don't need the first row +hornsey_asset_list = hornsey_asset_list.iloc[1:] +# Fill NA values with empty strings +hornsey_asset_list = hornsey_asset_list.fillna("") +hornsey_asset_list["Address letter or number"] = hornsey_asset_list["Address letter or number"].astype( + str +).str.strip() 
+hornsey_asset_list["Postcode"] = hornsey_asset_list["Postcode"].astype(str).str.strip() +hornsey_asset_list["Street address"] = hornsey_asset_list["Street address"].astype(str).str.strip() +# Replace double spaces +for col in ["Address letter or number", "Street address", "Postcode"]: + hornsey_asset_list[col] = hornsey_asset_list[col].str.replace(" ", " ") + +hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""] + +hornsey_asset_list["Wall Type Cleaned"] = np.where( + hornsey_asset_list["Wall type"].str.contains("Cavity"), + "Cavity", + "Solid" +) + +hornsey_asset_list["Property type"].value_counts() + +# CAHA +caha_epc_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_extracted_property_data.xlsx" +) + +caha_epc_data["property_type"].value_counts() +caha_epc_data["wall_type"].value_counts() diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index f96744ec..44baef80 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -92,9 +92,13 @@ def main(): # THis is the data we need for the AIHA project measures_data = extracted_surveys[ - ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating", "number_of_floors"] + ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating", + "number_of_floors", "walls-description", "property-type", "built-form"] ] measures_data = measures_data.sort_values("survey_key", ascending=True) + measures_data.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/extracted_property_data.csv", + ) # Note: # The properties will still have "Very poor" ratings for their hot water diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 845ab634..96ea2b03 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -6,6 
+6,7 @@ from etl.epc.settings import EARLIEST_EPC_DATE from dotenv import load_dotenv from tqdm import tqdm import pandas as pd +import numpy as np from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc from backend.SearchEpc import SearchEpc from utils.s3 import save_csv_to_s3 @@ -46,6 +47,12 @@ def hornsey(): hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""] + hornsey_asset_list["Wall Type Cleaned"] = np.where( + "Cavity" in hornsey_asset_list["Wall type"], + "Cavity", + "Solid" + ) + missed_uprns = { "Flat 13A Stowell House": 100021213098, "Flat 24 Stowell House": 100021213110, @@ -267,6 +274,9 @@ def caha(): "address": address, "postcode": home["Postcode"], "property_type": newest_epc["property-type"], + "wall_type": newest_epc["walls-description"], + "built_form": newest_epc["built-form"], + "flat_storey_count": newest_epc['flat-storey-count'], } ) diff --git a/etl/customers/southend/epc_data_pull_2024_11_14.py b/etl/customers/southend/epc_data_pull_2024_11_14.py new file mode 100644 index 00000000..14cd73be --- /dev/null +++ b/etl/customers/southend/epc_data_pull_2024_11_14.py @@ -0,0 +1,235 @@ +import os +import time + +import pandas as pd +from tqdm import tqdm + +from dotenv import load_dotenv +from utils.s3 import read_excel_from_s3 +from backend.SearchEpc import SearchEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home["Postcode"] + address1 = home["address1"].split(",")[0] + full_address = home["Address"] + + searcher = SearchEpc( + address1=str(address1), + postcode=postcode, + 
auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/Southend Planned programme.xlsx", + header=0, + sheet_name="Planned RM" + ) + asset_list["row_id"] = asset_list.index + asset_list["address1"] = asset_list["Address"].str.split(",").str[0] + + epc_data, errors = get_data(asset_list) + + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data(asset_list_failed) + + # Append the failed data to the main data + epc_data.extend(epc_data_failed) + + epc_df = pd.DataFrame(epc_data) + + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = 
set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "row_id", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + "photo-supply", + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" + ) + + asset_list = asset_list.drop(columns=["row_id"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of 
Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)", + "photo-supply": "% of the Roof with PV" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov " + "2024.xlsx") + asset_list.to_excel(filename, index=False) + + asset_list["% of the Roof with PV"].value_counts() + + asset_list[asset_list["% of the Roof with PV"] == 
"50.0"][["Address", "Postcode"]] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8791912a..a5bbff7b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -117,6 +117,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ + data = { "Address": None, "Postcode": None, From 0796b384fb3aa8bf6cb3689c21cd5c5ac5acfc87 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 18:42:08 +0000 Subject: [PATCH 081/255] added non-invasive rec --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 27 +++++++++++++++++-------- recommendations/FloorRecommendations.py | 11 +++++++++- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index a0d01f7d..33015d87 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -1,7 +1,7 @@ import pandas as pd from utils.s3 import save_csv_to_s3 -PORTFOLIO_ID = 111 +PORTFOLIO_ID = 120 USER_ID = 8 @@ -13,9 +13,9 @@ def app(): asset_list = [ { - "uprn": 100050770761, - "address": "12 Sheardown Street", - "postcode": "DN4 0BH" + "uprn": 100030334057, + "address": "5, Lynton Street", + "postcode": "DE22 3RW" } ] asset_list = pd.DataFrame(asset_list) @@ -30,11 +30,22 @@ def app(): non_invasive_recommendations = [ { - "uprn": 100050770761, + "uprn": 100030334057, "recommendations": [ { - "type": "extension_cavity_wall_insulation", + "type": "internal_wall_insulation", + "sap_points": 9, + "survey": True + }, + { + "type": 
"external_wall_insulation", + "sap_points": 9, + "survey": True + }, + { + "type": "suspended_floor_insulation", "sap_points": 2, + "survey": True } ] } @@ -49,8 +60,8 @@ def app(): valuation_data = [ { - "uprn": 100050770761, - "value": 67_000 + "uprn": 100030334057, + "value": 133_000 } ] # Store valuation data to s3 diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index 25741e7a..ed00bbe9 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -172,6 +172,11 @@ class FloorRecommendations(Definitions): insulation_materials = pd.DataFrame(insulation_materials) + non_invasive_recs = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + lowest_selected_u_value = None for _, insulation_material_group in insulation_materials.groupby("description"): @@ -217,6 +222,9 @@ class FloorRecommendations(Definitions): else: raise NotImplementedError("Implement me!") + sap_points = non_invasive_recs.get("sap_points", None) + survey = non_invasive_recs.get("survey", False) + floor_ending_config = FloorAttributes(new_description).process() floor_simulation_config = check_simulation_difference( new_config=floor_ending_config, old_config=self.property.floor, prefix="floor_" @@ -245,7 +253,8 @@ class FloorRecommendations(Definitions): "description": self._make_floor_description(material), "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": sap_points, + "survey": survey, "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { From 2b22a6012fc11b9e94cd430d0b4ae8426293ef9e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 21:17:37 +0000 Subject: [PATCH 082/255] remote assessment complete --- recommendations/HotwaterRecommendations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index aed1a5e5..b86329e4 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -66,7 +66,7 @@ class HotwaterRecommendations: (self.property.hotwater["heater_type"] in ["electric immersion"]) & (self.property.data["hot-water-energy-eff"] == "Very Poor") & (self.property.hotwater["no_system_present"] is None) & - len(has_tank_recommendation) == 0 + (len(has_tank_recommendation) == 0) ): self.recommend_tank_insulation(phase=phase) return From 31c5935577d6723360841f3ddb2803f82a6b6123 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Nov 2024 21:58:51 +0000 Subject: [PATCH 083/255] creating route march planning app --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/find_my_epc/RetrieveFindMyEpc.py | 25 +- etl/route_march_data_pull/app.py | 300 +++++++++++++++++++++ etl/route_march_data_pull/requirements.txt | 0 5 files changed, 326 insertions(+), 3 deletions(-) create mode 100644 etl/route_march_data_pull/app.py create mode 100644 etl/route_march_data_pull/requirements.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index cd76dae4..913a04b8 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -26,6 +26,20 @@ class RetrieveFindMyEpc: self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + @staticmethod + def extract_low_carbon_sources(soup): + # Find the section header + section_header = soup.find("h3", string="Low and zero carbon energy sources") + if not section_header: + return {} + + # Locate the list following the 
header + energy_list = section_header.find_next("ul") + + # Extract the list items + sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")} + return sources + def retrieve_newest_find_my_epc_data(self, sap_2012_date=None): """ For a post code and address, we pull out all the required data from the find my epc website @@ -191,6 +205,9 @@ class RetrieveFindMyEpc: # Finally, we format the recommendations recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) + # 4) Low and zero carbon energy sources + low_carbon_energy_sources = self.extract_low_carbon_sources(address_res) + resulting_data = { 'epc_certificate': epc_certificate, 'current_epc_rating': current_rating.split(' ')[-6], @@ -200,7 +217,8 @@ class RetrieveFindMyEpc: "heating_text": heating_text, "hot_water_text": hot_water_text, "recommendations": recommendations, - **assessment_data + **assessment_data, + **low_carbon_energy_sources } return resulting_data @@ -246,6 +264,11 @@ class RetrieveFindMyEpc: ], "Band A condensing boiler": ["boiler_upgrade"], "Double glazing": ["double_glazing"], + "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"], + "Wind turbine": ["wind_turbine"], + "Loft insulation": ["loft_insulation"], + "Solar photovoltaic (PV) panels": ["solar_pv"], + "Party wall insulation": ["party_wall_insulation"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py new file mode 100644 index 00000000..060897f8 --- /dev/null +++ b/etl/route_march_data_pull/app.py @@ -0,0 +1,300 @@ +import os +import time + +import pandas as pd +import numpy as np +from tqdm import tqdm + +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + 
estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list, fulladdress_column, address1_column, postcode_column): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + postcode = home[postcode_column] + house_number = home[address1_column] + full_address = home[fulladdress_column] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(np.random.uniform(0.1, 1)) + try: + postcode = home[postcode_column] + house_number = home[address1_column] + full_address = home[fulladdress_column] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations 
= searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(np.random.uniform(0.1, 1)) + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"], + "find_my_epc_data": find_epc_data, + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + +def extract_address1(asset_list, full_address_col, method="first_two_words"): + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + raise ValueError(f"Method {method} not recognized") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/" + DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_two_words" + + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0) + asset_list["row_id"] = asset_list.index + + # We clean up portential non-breaking spaces, and double spaces + for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c 
is not None]: + asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) + asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) + + if ADDRESS1_COLUMN is None: + ADDRESS1_COLUMN = "address1_extracted" + asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD) + + epc_data, errors = get_data( + asset_list=asset_list, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN + ) + + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data( + asset_list=asset_list_failed, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN + ) + + # Append the failed data to the main data + epc_data.extend(epc_data_failed) + + epc_df = pd.DataFrame(epc_data) + + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "row_id", + "uprn", + "property-type", + "built-form", + 
"inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" + ) + + asset_list = asset_list.drop(columns=["row_id"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], 
+ num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt new file mode 100644 index 00000000..e69de29b From dc1cf6d6045c5f94e2826f6ff20010e05043d1ff Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 15:49:08 +0000 Subject: [PATCH 084/255] working on stonewater matching algorithm --- .../southend/epc_data_pull_2024_11_14.py | 4 - .../stonewater/Wave 3 Preparation.py | 133 +++++++++++++++++- etl/route_march_data_pull/app.py | 43 +++++- 3 files changed, 171 insertions(+), 9 deletions(-) diff --git a/etl/customers/southend/epc_data_pull_2024_11_14.py b/etl/customers/southend/epc_data_pull_2024_11_14.py index 14cd73be..11ddcc6f 100644 --- a/etl/customers/southend/epc_data_pull_2024_11_14.py +++ b/etl/customers/southend/epc_data_pull_2024_11_14.py @@ -229,7 +229,3 @@ def app(): filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov " "2024.xlsx") asset_list.to_excel(filename, index=False) - - asset_list["% of the Roof with PV"].value_counts() - - asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", 
"Postcode"]] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index a5bbff7b..019c51c9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -117,7 +117,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ - + data = { "Address": None, "Postcode": None, @@ -1618,5 +1618,136 @@ def append_stonewater_id(): index=False ) + +def propsed_wave_3_sample(): + """ + Stonewater want to ensure that the properties that when selecting properties for wave 3, they choose properties + such that most of the properties within a geographical area are treatable within the bid. + Name, if we take a geographical area (which could be postal region) they want the most, and ideally all, of the + properties within that geographical area to be included within the bid + :return: + """ + + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + # Clean address ids + asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] + asset_list = asset_list[asset_list["Address ID"] != "Address ID"] + asset_list["Address ID"] = asset_list["Address ID"].astype(int) + + # Create the postal region, taking the first part of the postcode + asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] + unique_postal_regions = asset_list["Postal Region"].unique() + + # Keep just the columns we need + asset_list = asset_list[ + ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", + "Heating"] + ] + + survey_results = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + # TOOD: We probably want the actual surveyed wall, roof, heating type + survey_results = survey_results[ + 
["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"] + ] + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] + + survey_results_with_original_features = survey_results.merge( + asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + if survey_results_with_original_features.shape[0] != survey_results.shape[0]: + raise ValueError("Something went wrong") + + # Tier definitions + # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D + # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D + # + + for region in unique_postal_regions: + # Take all of the properties in that region + region_assets = asset_list[asset_list["Postal Region"] == region].copy() + archetypes = region_assets["Archetype ID"].unique() + # We get the properties that have been surveyed + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if region_surveyed["Archetype ID"].duplicated().sum(): + raise NotImplementedError("Fix me") + + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left" + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = None + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), + "1", region_assets["Confidence Tier"] + ) + # TODO: Turn into a function + missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if region_surveyed["Archetype ID"].duplicated().sum(): + raise NotImplementedError("Fix me 2") + + region_assets = 
region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method2") + ) + + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]), + "2 - same archetype", region_assets["Confidence Tier"] + ) + + region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( + region_assets["Current EPC Band_method2"]) + + region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() + + # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + for a_id in missed_addressids: + property = asset_list[asset_list["Address ID"] == a_id].squeeze() + + surveyed_same_postcode = survey_results_with_original_features[ + (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) + ] + + surveyed_same_region = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) + ] + + same_postcode = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + pd.isnull(region_assets["Current EPC Band"]).sum() + # if __name__ == "__main__": # main() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 060897f8..f24c5bb2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -206,6 +206,14 @@ def app(): # Drop the column that is "" transformed_df = transformed_df.drop(columns=[""]) + # Get the find my epc data + find_my_epc_data = epc_df[["row_id", 
"find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( + pd.json_normalize(epc_df["find_my_epc_data"]) + ) + # We check if we get the solar pv column: + if "Solar photovoltaics" not in find_my_epc_data.columns: + find_my_epc_data["Solar photovoltaics"] = False + # Retrieve just the data we need epc_df = epc_df[ [ @@ -228,6 +236,7 @@ def app(): "mainheat-description", # "energy-consumption-current", # kwh/m2 + "photo-supply", ] ] @@ -236,12 +245,25 @@ def app(): how="left", on="row_id" ).merge( - transformed_df, + find_my_epc_data[ + [ + "row_id", "heating_text", "hot_water_text", 'Assessor’s name', + "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", + "Assessor’s ID", "Solar photovoltaics" + ] + ].rename( + columns={ + "Solar photovoltaics": "Has Solar PV", + "heating_text": "Heating Estimated kWh", + "hot_water_text": "Hot Water Estimated kWh", + } + ), how="left", on="row_id" ) - asset_list = asset_list.drop(columns=["row_id"]) + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) + asset_list = asset_list.drop(columns=["photo-supply"]) # Rename the columns asset_list = asset_list.rename(columns={ @@ -259,7 +281,7 @@ def app(): "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)" + "energy-consumption-current": "Heat Demand (kWh/m2)", }) asset_list["Estimated Number of Floors"] = asset_list.apply( @@ -295,6 +317,19 @@ def app(): axis=1 ) + # For all of the columns in transformed_df, prefix with "Recommendation: " + for col in transformed_df.columns: + if col == "row_id": + continue + transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"}) + + asset_list = asset_list.merge( + transformed_df, + how="left", + on="row_id" + ) + asset_list = asset_list.drop(columns=["row_id"]) + # Store as an excel - filename = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" asset_list.to_excel(filename, index=False) From c13c84b98cbab169300306adeba534145496251c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 15:55:19 +0000 Subject: [PATCH 085/255] First region implemented --- .../stonewater/Wave 3 Preparation.py | 58 +++++++++++++++---- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 019c51c9..7c104f97 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1729,25 +1729,61 @@ def propsed_wave_3_sample(): missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + final_missed_matches = [] for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - surveyed_same_postcode = survey_results_with_original_features[ + # TODO: This is quite strict for the moment - we might want to relax this by creating reduced versions + # of the wall, roof and heating features, splitting them on the colons and taking the first part + surveyed_similar = survey_results_with_original_features[ (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) + (survey_results_with_original_features["Property Type"] == property["Property Type"]) & + (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & + (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & + (survey_results_with_original_features["Heating"] 
== property["Heating"]) ] + if surveyed_similar.empty: + surveyed_similar = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"] == property["Property Type"]) & + (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & + (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & + (survey_results_with_original_features["Heating"] == property["Heating"]) + ] - surveyed_same_region = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) - ] + if surveyed_similar.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "5 - no similar property, needs survey to confirm" + } + ) + continue - same_postcode = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + raise NotImplementedError("Implement me") - pd.isnull(region_assets["Current EPC Band"]).sum() + final_missed_matches = pd.DataFrame(final_missed_matches) + + region_assets = region_assets.merge( + final_missed_matches, + on="Address ID", + how="left", + suffixes=("", "_method3") + ) + + region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( + region_assets["Confidence Tier_method3"] + ) + + region_assets = region_assets.drop(columns=["Confidence Tier_method3"]) + + region_assets["Current EPC Band"] = np.where( + region_assets["Confidence Tier"] == "5 - no similar property, needs survey to confirm", + "Unknown", region_assets["Current EPC Band"] + ) + + if pd.isnull(region_assets["Current EPC Band"]).sum(): + raise Exception("Something went wrong") # if __name__ == "__main__": # main() From 
8f9b8f08862cbadcbd0daaa29219cd0980606b3f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 16:30:23 +0000 Subject: [PATCH 086/255] working on algorithm --- etl/customers/stonewater/Wave 3 Preparation.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7c104f97..008fd3bc 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1633,6 +1633,9 @@ def propsed_wave_3_sample(): "- Archetyped V3.1.xlsx", header=4 ) + + # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater + asset_list = asset_list[asset_list["Archetype ID"] == "NOT PRIORITY POSTCODE"] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1674,6 +1677,7 @@ def propsed_wave_3_sample(): # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D # + results = [] for region in unique_postal_regions: # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() @@ -1722,10 +1726,17 @@ def propsed_wave_3_sample(): ) region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( - region_assets["Current EPC Band_method2"]) + region_assets["Current EPC Band_method2"].astype(str), + ) region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + # We label EPC C properties + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]), + "6 - EPC C or above", region_assets["Confidence Tier"] + ) + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() # This means that this archetype was never surveyed and so we need to find a sufficiently similar property @@ 
-1785,5 +1796,7 @@ def propsed_wave_3_sample(): if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") + results.append(region_assets) + # if __name__ == "__main__": # main() From 2158ab2cd50df7edcfc7e119b56237145f4f1dd1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 16:33:43 +0000 Subject: [PATCH 087/255] debugging stoneater alg --- etl/customers/stonewater/Wave 3 Preparation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 008fd3bc..ef7dd414 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1635,7 +1635,7 @@ def propsed_wave_3_sample(): ) # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater - asset_list = asset_list[asset_list["Archetype ID"] == "NOT PRIORITY POSTCODE"] + asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1678,7 +1678,7 @@ def propsed_wave_3_sample(): # results = [] - for region in unique_postal_regions: + for region in tqdm(unique_postal_regions): # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() archetypes = region_assets["Archetype ID"].unique() @@ -1739,7 +1739,11 @@ def propsed_wave_3_sample(): missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() - # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + if not missed_addressids: + results.append(region_assets) + continue + + # This means that this archetype was never surveyed and so we need to find a sufficiently similar property 
final_missed_matches = [] for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() From 4d021f0ba6a5894659275d8090e1f65be6ca68f6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 17:12:55 +0000 Subject: [PATCH 088/255] working on stonewater alg --- .../stonewater/Wave 3 Preparation.py | 102 +++++++++++++++--- 1 file changed, 86 insertions(+), 16 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ef7dd414..40dfd38e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,6 +3,7 @@ import PyPDF2 import re import pandas as pd import numpy as np +from docutils.utils.math.tex2mathml_extern import blahtexml from tqdm import tqdm from collections import Counter @@ -1681,19 +1682,15 @@ def propsed_wave_3_sample(): for region in tqdm(unique_postal_regions): # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() - archetypes = region_assets["Archetype ID"].unique() - # We get the properties that have been surveyed - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - if region_surveyed["Archetype ID"].duplicated().sum(): - raise NotImplementedError("Fix me") + # We have a tier 1 match if the property itself was surveyed + exact_surveyed = survey_results[ + survey_results["Address ID"].isin(region_assets["Address ID"]) + ] region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", + exact_surveyed[["Address ID", "Current EPC Band"]], + on="Address ID", how="left" ) @@ -1701,22 +1698,95 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = None region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["D", "E", "F", 
"G"]), - "1", region_assets["Confidence Tier"] + "1 - property was surveyed", region_assets["Confidence Tier"] ) - # TODO: Turn into a function - missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]), + "6 - property was surveyed", region_assets["Confidence Tier"] + ) + + archetypes = region_assets[ + pd.isnull(region_assets["Confidence Tier"]) + ]["Archetype ID"].unique() + # We get the properties that have been surveyed region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - raise NotImplementedError("Fix me 2") + # Take the duplicated archetypes + duplicated_archetypes = region_surveyed[ + region_surveyed["Archetype ID"].duplicated() + ]["Archetype ID"].unique() + duplicated_archetypes = region_surveyed[ + region_surveyed["Archetype ID"].isin(duplicated_archetypes) + ] + + # We need to select which one is the most relevant to these properties + survey_data = survey_results_with_original_features[ + survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values) + ] + + raise NotImplementedError("Fix me") region_assets = region_assets.merge( region_surveyed, on="Archetype ID", how="left", + suffixes=("", "_method1") + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), + "1 - Archetype surveyed", region_assets["Confidence Tier"] + ) + region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) + # TODO: Turn into a function + 
missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + + archetype_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if archetype_surveyed["Archetype ID"].duplicated().sum(): + # We need to select which one is the most relevant to these properties + duplicated_archetypes = archetype_surveyed[ + archetype_surveyed["Archetype ID"].duplicated() + ]["Archetype ID"].unique() + + survey_data = survey_results_with_original_features[ + survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes) + ] + + homes_with_these_archetypes = region_assets[ + region_assets["Archetype ID"].isin(duplicated_archetypes) + ] + + for _, home in homes_with_these_archetypes.iterrows(): + first_filter = survey_data[ + (survey_data["Postal Region"] == home["Postal Region"]) & + (survey_data["Property Type"] == home["Property Type"]) & + (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) + ] + + if not first_filter.empty: + NotImplementedError("Fix me 0") + + second_filter = survey_data[ + (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) & + (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) + ] + + raise NotImplementedError("Fix me 2") + + region_assets = region_assets.merge( + archetype_surveyed, + on="Archetype ID", + how="left", suffixes=("", "_method2") ) From d00c291c17dacb545eef4b708047ec5c699baf18 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 15:16:54 +0000 Subject: [PATCH 089/255] debugging stonewater algorithm --- .../stonewater/Wave 3 Preparation.py | 68 +++++++------------ 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 40dfd38e..5b1e2f91 100644 --- a/etl/customers/stonewater/Wave 3 
Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1716,20 +1716,11 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - # Take the duplicated archetypes - duplicated_archetypes = region_surveyed[ - region_surveyed["Archetype ID"].duplicated() - ]["Archetype ID"].unique() - duplicated_archetypes = region_surveyed[ - region_surveyed["Archetype ID"].isin(duplicated_archetypes) - ] - - # We need to select which one is the most relevant to these properties - survey_data = survey_results_with_original_features[ - survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values) - ] - - raise NotImplementedError("Fix me") + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() + region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) region_assets = region_assets.merge( region_surveyed, @@ -1744,6 +1735,17 @@ def propsed_wave_3_sample(): pd.isnull(region_assets["Confidence Tier"]), "1 - Archetype surveyed", region_assets["Confidence Tier"] ) + + region_assets["Current EPC Band"] = np.where( + pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]), + region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"] + ) + # Handle EPC C + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + "6 - EPC C or above", region_assets["Confidence Tier"] + ) + region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) # TODO: Turn into a function missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) @@ -1752,36 +1754,16 @@ def propsed_wave_3_sample(): 
survey_results["Archetype ID"].isin(missed_archetypes) ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + # TODO - We could average the property?? And call it borderline, call out it was averaged!!! + # We could also find the nearest property to it, with similar wall, roof, heating? + # Can use long/lag to distance calc. We have this data from previous + if archetype_surveyed["Archetype ID"].duplicated().sum(): - # We need to select which one is the most relevant to these properties - duplicated_archetypes = archetype_surveyed[ - archetype_surveyed["Archetype ID"].duplicated() - ]["Archetype ID"].unique() - - survey_data = survey_results_with_original_features[ - survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes) - ] - - homes_with_these_archetypes = region_assets[ - region_assets["Archetype ID"].isin(duplicated_archetypes) - ] - - for _, home in homes_with_these_archetypes.iterrows(): - first_filter = survey_data[ - (survey_data["Postal Region"] == home["Postal Region"]) & - (survey_data["Property Type"] == home["Property Type"]) & - (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) - ] - - if not first_filter.empty: - NotImplementedError("Fix me 0") - - second_filter = survey_data[ - (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) & - (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) - ] - - raise NotImplementedError("Fix me 2") + archetype_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() + archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc) + archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"]) region_assets = region_assets.merge( archetype_surveyed, From 05cf7514783786261f7efe70eda5486712f8fb4c Mon Sep 17 00:00:00 2001 From: Khalim 
Conn-Kowlessar Date: Sun, 17 Nov 2024 16:00:59 +0000 Subject: [PATCH 090/255] debuggin --- .../stonewater/Wave 3 Preparation.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5b1e2f91..d2110de8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1777,8 +1777,9 @@ def propsed_wave_3_sample(): "2 - same archetype", region_assets["Confidence Tier"] ) - region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna( - region_assets["Current EPC Band_method2"].astype(str), + region_assets["Current EPC Band"] = np.where( + pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]), + region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"] ) region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) @@ -1822,12 +1823,26 @@ def propsed_wave_3_sample(): final_missed_matches.append( { "Address ID": a_id, - "Confidence Tier": "5 - no similar property, needs survey to confirm" + "Confidence Tier": "5 - no similar property, needs survey to confirm", + "Current EPC Band": "Unknown" } ) continue + # We take an average + expected_sap = surveyed_similar["Current SAP Rating"].mean() + expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: + tier = "6 - EPC C or above" + else: + tier = "3 - similar property" - raise NotImplementedError("Implement me") + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": tier, + "Current EPC Band": "Unknown" + } + ) final_missed_matches = pd.DataFrame(final_missed_matches) @@ -1841,14 +1856,13 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( region_assets["Confidence Tier_method3"] ) + region_assets["Current EPC Band"] = np.where( + 
pd.isnull(region_assets["Current EPC Band"]), + region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] + ) region_assets = region_assets.drop(columns=["Confidence Tier_method3"]) - region_assets["Current EPC Band"] = np.where( - region_assets["Confidence Tier"] == "5 - no similar property, needs survey to confirm", - "Unknown", region_assets["Current EPC Band"] - ) - if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") From 7d209d5d8e07b4112bffcdcfc748d04cc299abe6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 16:28:43 +0000 Subject: [PATCH 091/255] creating loss and gain columns --- .../stonewater/Wave 3 Preparation.py | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d2110de8..b36ae756 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1703,7 +1703,7 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["C", "B", "A"]), - "6 - property was surveyed", region_assets["Confidence Tier"] + "5 - property was surveyed", region_assets["Confidence Tier"] ) archetypes = region_assets[ @@ -1721,6 +1721,7 @@ def propsed_wave_3_sample(): (survey_results["Postal Region"] == region) ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) + region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"]) region_assets = region_assets.merge( region_surveyed, @@ -1743,7 +1744,7 @@ def propsed_wave_3_sample(): # Handle EPC C region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - "6 - EPC C or above", 
region_assets["Confidence Tier"] + "5 - EPC C or above", region_assets["Confidence Tier"] ) region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) @@ -1773,7 +1774,8 @@ def propsed_wave_3_sample(): ) region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]), + region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( + region_assets["Confidence Tier"]), "2 - same archetype", region_assets["Confidence Tier"] ) @@ -1786,8 +1788,8 @@ def propsed_wave_3_sample(): # We label EPC C properties region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]), - "6 - EPC C or above", region_assets["Confidence Tier"] + region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + "5 - EPC C or above", region_assets["Confidence Tier"] ) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() @@ -1823,7 +1825,7 @@ def propsed_wave_3_sample(): final_missed_matches.append( { "Address ID": a_id, - "Confidence Tier": "5 - no similar property, needs survey to confirm", + "Confidence Tier": "4 - no similar property, needs survey to confirm", "Current EPC Band": "Unknown" } ) @@ -1832,7 +1834,7 @@ def propsed_wave_3_sample(): expected_sap = surveyed_similar["Current SAP Rating"].mean() expected_epc = sap_to_epc(expected_sap) if expected_epc in ["C", "B", "A"]: - tier = "6 - EPC C or above" + tier = "5 - EPC C or above" else: tier = "3 - similar property" @@ -1861,12 +1863,42 @@ def propsed_wave_3_sample(): region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] ) - region_assets = region_assets.drop(columns=["Confidence Tier_method3"]) + region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"]) if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") 
results.append(region_assets) + results = pd.concat(results) + + # Create a pivot table for counts of Confidence Tier by Postal Region + geographic_summary = results.pivot_table( + index='Postal Region', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + + # We create the gain and loss columns + # Gain is the sum of these columns: + # '1 - Archetype surveyed', '1 - property was surveyed', + # '2 - same archetype', '3 - similar property', + # Loss is the sum of these columns: + # '4 - no similar property, needs survey to confirm', + # '5 - EPC C or above', '5 - property was surveyed' + geographic_summary["Gain"] = geographic_summary[ + ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property'] + ].sum(axis=1) + + geographic_summary["Loss"] = geographic_summary[ + ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed'] + ].sum(axis=1) + + geographic_summary.sum() + + geographic_summary = geographic_summary.sort_values("Loss", ascending=True) + geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() + # if __name__ == "__main__": # main() From a01ff1d8dedaaf78e8ce95b21305a6f1a430ae3e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 16:45:10 +0000 Subject: [PATCH 092/255] tweaking postal region algorithm - may need to swap to postcode or street --- .../stonewater/Wave 3 Preparation.py | 44 ++++++++++++++----- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b36ae756..20f771ec 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1803,22 +1803,43 @@ def propsed_wave_3_sample(): for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - # TODO: This is quite strict for the moment - we 
might want to relax this by creating reduced versions - # of the wall, roof and heating features, splitting them on the colons and taking the first part + if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: + filter_property_types = ["House", "Bungalow"] + else: + filter_property_types = ["Flat"] + surveyed_similar = survey_results_with_original_features[ (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) & - (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & - (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & - (survey_results_with_original_features["Heating"] == property["Heating"]) + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + ) + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) ] if surveyed_similar.empty: surveyed_similar = survey_results_with_original_features[ (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"] == property["Property Type"]) & - (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) & - (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) & - (survey_results_with_original_features["Heating"] == property["Heating"]) + (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + )) & + (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall 
Type"].split(":")[0]) & + (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0]) & + (survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0]) ] if surveyed_similar.empty: @@ -1842,7 +1863,7 @@ def propsed_wave_3_sample(): { "Address ID": a_id, "Confidence Tier": tier, - "Current EPC Band": "Unknown" + "Current EPC Band": expected_epc } ) @@ -1899,6 +1920,7 @@ def propsed_wave_3_sample(): geographic_summary = geographic_summary.sort_values("Loss", ascending=True) geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() + geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() # if __name__ == "__main__": # main() From 7d63c164045c6855ea6cb13091788a2ed7db2afb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 18:05:05 +0000 Subject: [PATCH 093/255] implemented linear programming to find maximal bid size --- .../stonewater/Wave 3 Preparation.py | 71 ++++++++++++++++--- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 20f771ec..c397f962 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,9 +3,9 @@ import PyPDF2 import re import pandas as pd import numpy as np -from docutils.utils.math.tex2mathml_extern import blahtexml from tqdm import tqdm from collections import Counter +from scipy.optimize import linprog CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") @@ -1843,13 +1843,38 @@ def propsed_wave_3_sample(): ] if surveyed_similar.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar 
property, needs survey to confirm", - "Current EPC Band": "Unknown" - } - ) + + # We get an average based on the postcode + surveyed_similar = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + )) + ] + if surveyed_similar.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Unknown" + } + + ) + else: + expected_sap = surveyed_similar["Current SAP Rating"].mean() + expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: + tier = "5 - EPC C or above" + else: + tier = "3 - similar property, relaxed conditions" + + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": tier, + "Current EPC Band": expected_epc + } + ) continue # We take an average expected_sap = surveyed_similar["Current SAP Rating"].mean() @@ -1922,5 +1947,35 @@ def propsed_wave_3_sample(): geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() + geographic_summary[["Loss", "Gain"]].head() + + loss = geographic_summary["Loss"].values + gain = geographic_summary["Gain"].values + + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain + + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [250] # Maximum total Loss allowed + + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] + + # Solve the problem using linprog with HiGHS solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + 
optimal_gain = -result.fun + print(optimal_gain) + + # Select the rows that are selected + geographic_summary["Selected"] = selected_rows == 1 + geographic_summary[geographic_summary["Selected"]].sum() + bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum() + print("Bid Size:", bid_size) + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 3ad5d2c1..09ba20bd 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -7,4 +7,5 @@ epc-api-python==1.0.2 usaddress==0.5.11 fuzzywuzzy==0.18.0 python-dotenv +scipy From eff80e637f73490c3f45d2ef0ffcc71a188e95cb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 19:10:23 +0000 Subject: [PATCH 094/255] implementing distance weighting --- .../stonewater/Wave 3 Preparation.py | 332 +++++++++++++----- 1 file changed, 248 insertions(+), 84 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c397f962..3b44d560 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1635,8 +1635,9 @@ def propsed_wave_3_sample(): header=4 ) - # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater - asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"] + # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing + # UPRN + asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1648,7 +1649,7 @@ def 
propsed_wave_3_sample(): # Keep just the columns we need asset_list = asset_list[ - ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", + ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", "Heating"] ] @@ -1665,7 +1666,7 @@ def propsed_wave_3_sample(): survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] survey_results_with_original_features = survey_results.merge( - asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], on="Address ID", how="left" ) @@ -1673,6 +1674,45 @@ def propsed_wave_3_sample(): if survey_results_with_original_features.shape[0] != survey_results.shape[0]: raise ValueError("Something went wrong") + # We get longitude & Latitude + from utils.s3 import read_pickle_from_s3 + archetyping_spatial_features = read_pickle_from_s3( + bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", + ) + archetyping_spatial_features = pd.concat(archetyping_spatial_features) + archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename( + columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"} + ) + # Merge them onto both datasets + asset_list = asset_list.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(asset_list["longitude"]).sum(): + raise ValueError("Something went wrong") + + survey_results_with_original_features = survey_results_with_original_features.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(survey_results_with_original_features["longitude"]).sum(): + raise ValueError("Something went wrong") + + def haversine(lat1, lon1, lat2, lon2): + # Radius of Earth in meters + R = 6371000 + + # Convert degrees to radians + lat1, lon1, lat2, lon2 = 
map(np.radians, [lat1, lon1, lat2, lon2]) + + # Differences + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + # Haversine formula + a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2 + c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) + distance = R * c + return distance + # Tier definitions # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D @@ -1716,6 +1756,7 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): + blah1 region_surveyed = survey_results[ survey_results["Archetype ID"].isin(archetypes) & (survey_results["Postal Region"] == region) @@ -1755,23 +1796,46 @@ def propsed_wave_3_sample(): survey_results["Archetype ID"].isin(missed_archetypes) ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - # TODO - We could average the property?? And call it borderline, call out it was averaged!!! - # We could also find the nearest property to it, with similar wall, roof, heating? - # Can use long/lag to distance calc. 
We have this data from previous - if archetype_surveyed["Archetype ID"].duplicated().sum(): - archetype_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() - archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc) - archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"]) - region_assets = region_assets.merge( - archetype_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method2") - ) + archetype_surveyed = [] + for arch_id in missed_archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + archetype_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc + } + ) + archetype_surveyed = pd.DataFrame(archetype_surveyed) + region_assets = region_assets.merge( + archetype_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method2") + ) + else: + + region_assets = region_assets.merge( + archetype_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method2") + ) region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( @@ -1792,6 +1856,16 @@ def propsed_wave_3_sample(): "5 - EPC C or above", region_assets["Confidence Tier"] ) + region_assets["Confidence Tier"] = np.where( + 
region_assets["Archetype ID"] == "EPC C OR ABOVE", + "5 - EPC C or above", region_assets["Confidence Tier"] + ) + + region_assets["Current EPC Band"] = np.where( + region_assets["Archetype ID"] == "EPC C OR ABOVE", + "C", region_assets["Current EPC Band"] + ) + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() if not missed_addressids: @@ -1803,17 +1877,10 @@ def propsed_wave_3_sample(): for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: - filter_property_types = ["House", "Bungalow"] - else: - filter_property_types = ["Flat"] - - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - ) + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] ) & ( survey_results_with_original_features["Wall Type"].str.split(":").str[0] == @@ -1827,62 +1894,38 @@ def propsed_wave_3_sample(): survey_results_with_original_features["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0] ) - ] - if surveyed_similar.empty: - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - )) & - (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0]) & - (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0]) & - 
(survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0]) - ] + ].copy() - if surveyed_similar.empty: + if surveyed.empty: + blah3 - # We get an average based on the postcode - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - )) - ] - if surveyed_similar.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Unknown" - } + # Calculate distance + surveyed["distance_meters"] = haversine( + lat1=property["latitude"], lon1=property["longitude"], + lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values + ) + surveyed = surveyed.sort_values("distance_meters", ascending=True) - ) - else: - expected_sap = surveyed_similar["Current SAP Rating"].mean() - expected_epc = sap_to_epc(expected_sap) - if expected_epc in ["C", "B", "A"]: - tier = "5 - EPC C or above" - else: - tier = "3 - similar property, relaxed conditions" + # Check if we have a postcode match check if surveyed postcode is the same as the property postcode + if any(surveyed["Postcode"] == property["Postcode"]): + surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]] - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": tier, - "Current EPC Band": expected_epc - } - ) - continue - # We take an average - expected_sap = surveyed_similar["Current SAP Rating"].mean() + if any(surveyed["Postal Region"] == property["Postal Region"]): + surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + + # Take the 5 nearest + surveyed_similar = surveyed_similar.head(5) + + # perform a weighted mean of SAP rating - the closer the better + expected_sap = np.average( + 
surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1) + ) expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: tier = "5 - EPC C or above" else: - tier = "3 - similar property" + tier = "3 - similar property, weighted on distance" final_missed_matches.append( { @@ -1891,6 +1934,121 @@ def propsed_wave_3_sample(): "Current EPC Band": expected_epc } ) + continue + + # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: + # filter_property_types = ["House", "Bungalow"] + # else: + # filter_property_types = ["Flat"] + # + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + # ( + # survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # ) + # ) & + # ( + # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0] + # ) + # ] + # if surveyed_similar.empty: + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # )) & + # (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0]) & + # (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0]) & + # (survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0]) + # ] + # + # if surveyed_similar.empty: + 
# + # # We get an average based on the postcode + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # )) + # ] + # if surveyed_similar.empty: + # surveyed_similar_entire_population = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[ + # "Property Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0] + # ) + # ] + # + # # We order them by distance on postcode + # + # # Average + # expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": "3 - similar property, all areas searched", + # "Current EPC Band": expected_epc + # } + # + # ) + # else: + # expected_sap = surveyed_similar["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # if expected_epc in ["C", "B", "A"]: + # tier = "5 - EPC C or above" + # else: + # tier = "3 - similar property, relaxed conditions" + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": tier, + # "Current EPC Band": expected_epc + # } + # ) + # continue + # # We take an average + # expected_sap = surveyed_similar["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # if expected_epc in ["C", "B", "A"]: + # tier = "5 - EPC C or above" + # else: + # 
tier = "3 - similar property" + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": tier, + # "Current EPC Band": expected_epc + # } + # ) final_missed_matches = pd.DataFrame(final_missed_matches) @@ -1928,27 +2086,33 @@ def propsed_wave_3_sample(): # We create the gain and loss columns # Gain is the sum of these columns: - # '1 - Archetype surveyed', '1 - property was surveyed', - # '2 - same archetype', '3 - similar property', + # '1 - Archetype surveyed', + # '1 - property was surveyed', + # '2 - same archetype', + # '3 - similar property', + # '3 - similar property, all areas searched', + # '3 - similar property, relaxed conditions' + # # Loss is the sum of these columns: # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' geographic_summary["Gain"] = geographic_summary[ - ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property'] + [ + '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property', + '3 - similar property, all areas searched', '3 - similar property, relaxed conditions' + ] ].sum(axis=1) geographic_summary["Loss"] = geographic_summary[ - ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed'] + ['5 - EPC C or above', '5 - property was surveyed'] ].sum(axis=1) - geographic_summary.sum() + print(geographic_summary.sum()) geographic_summary = geographic_summary.sort_values("Loss", ascending=True) geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() - geographic_summary[["Loss", "Gain"]].head() - loss = geographic_summary["Loss"].values gain = geographic_summary["Gain"].values From a630fe05c485aca2c5509748eecb5544ddc78dbe Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 19:46:17 +0000 Subject: [PATCH 
095/255] fixing unhandled cases in matching algorithm --- .../stonewater/Wave 3 Preparation.py | 92 ++++++++++++++++--- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 3b44d560..460aa8ee 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1756,20 +1756,44 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - blah1 - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() - region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) - region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"]) + region_surveyed = [] + for arch_id in archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc + } + ) - region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method1") - ) + region_surveyed = pd.DataFrame(region_surveyed) + region_assets = region_assets.merge( + region_surveyed, + 
on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + else: + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method1") + ) # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( @@ -1897,7 +1921,47 @@ def propsed_wave_3_sample(): ].copy() if surveyed.empty: - blah3 + # In this case, we do one additional check where we filter on everything the same apart from heating, + # where we do a slightly more rough match + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + + if "Electric" in property["Heating"]: + # Take other electric heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] + elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)": + # Take other community heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Community")] + elif property["Heating"] == 'Heat Pump: (from database)': + # Take other heat pumps + surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")] + elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": + # Take other properties with room heaters + surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + else: + raise Exception("Fix me") + + if surveyed.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Needs Survey" + } + ) + continue # Calculate distance surveyed["distance_meters"] = haversine( From 
1b38832e27abcbebe575f4be867a41e4ae772949 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 20:13:19 +0000 Subject: [PATCH 096/255] 2044 properties added --- .../stonewater/Wave 3 Preparation.py | 148 ++++++++++++++---- 1 file changed, 117 insertions(+), 31 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 460aa8ee..6f98c9fd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1938,6 +1938,27 @@ def propsed_wave_3_sample(): ) ].copy() + if surveyed.empty: + if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]: + filter_property_types = ["House", "Bungalow", ] + else: + filter_property_types = ["Flat"] + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + ) + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + if "Electric" in property["Heating"]: # Take other electric heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] @@ -1950,6 +1971,9 @@ def propsed_wave_3_sample(): elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": # Take other properties with room heaters surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + elif "Boiler" in property["Heating"]: + # Take other properties with boilers + surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")] else: raise Exception("Fix me") @@ -1972,17 +1996,29 @@ def propsed_wave_3_sample(): # Check if we have a postcode match check if surveyed postcode is the same as the property postcode if any(surveyed["Postcode"] == 
property["Postcode"]): - surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]] + surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]] if any(surveyed["Postal Region"] == property["Postal Region"]): - surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] # Take the 5 nearest - surveyed_similar = surveyed_similar.head(5) + surveyed = surveyed.head(5) + + # # We allow a max distance of 10km + # surveyed = surveyed[surveyed["distance_meters"] < 10000] + # if surveyed.empty: + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": "4 - no similar property, needs survey to confirm", + # "Current EPC Band": "Needs Survey" + # } + # ) + # continue # perform a weighted mean of SAP rating - the closer the better expected_sap = np.average( - surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1) + surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) ) expected_epc = sap_to_epc(expected_sap) @@ -2153,23 +2189,21 @@ def propsed_wave_3_sample(): # '1 - Archetype surveyed', # '1 - property was surveyed', # '2 - same archetype', - # '3 - similar property', - # '3 - similar property, all areas searched', - # '3 - similar property, relaxed conditions' + # '3 - similar property, weighted on distance' + + gain_columns = [ + '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', + '3 - similar property, weighted on distance' + ] # # Loss is the sum of these columns: # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' - geographic_summary["Gain"] = geographic_summary[ - [ - '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property', - '3 - similar property, all areas searched', '3 - similar property, relaxed conditions' - ] - ].sum(axis=1) - 
geographic_summary["Loss"] = geographic_summary[ - ['5 - EPC C or above', '5 - property was surveyed'] - ].sum(axis=1) + loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', + '5 - property was surveyed'] + geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) + geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) print(geographic_summary.sum()) @@ -2180,30 +2214,82 @@ def propsed_wave_3_sample(): loss = geographic_summary["Loss"].values gain = geographic_summary["Gain"].values - # Define the coefficients for the objective function (negative because we maximize Gain) - c = -gain + def optimise(gain, loss, max_loss=250): - # Define constraints - A = [loss] # Only 1 constraint for now, total Loss - b = [250] # Maximum total Loss allowed + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain - # Bounds for each variable (select or not select each row, 0 <= x <= 1) - bounds = [(0, 1) for _ in gain] + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [max_loss] # Maximum total Loss allowed - # Solve the problem using linprog with HiGHS solver - result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') - if not result.success: - raise Exception("Optimization failed") + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] - selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 - optimal_gain = -result.fun - print(optimal_gain) + # Solve the problem using linprog with HiGHS solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + optimal_gain = -result.fun + + return selected_rows, optimal_gain + + selected_rows, _ = optimise(gain, loss, 250) # Select the rows that are selected 
geographic_summary["Selected"] = selected_rows == 1 geographic_summary[geographic_summary["Selected"]].sum() - bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum() + + region_totals = geographic_summary[ + geographic_summary["Selected"] + ][["Gain", "Loss"]].sum() + + # We now see if there are any postcodes that have no loss that can be added + unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values + + postcode_summary = results.pivot_table( + index='Postcode', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + postcode_summary = postcode_summary.merge( + results[["Postcode", "Postal Region"]].drop_duplicates(), + how="left", on="Postcode" + ) + + postcode_summary_unselected_regions = postcode_summary[ + postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) + ].copy() + + postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) + postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) + + # Remaining loss allowed + remaining_loss_constraint = 250 - region_totals["Loss"] + postcode_selected_rows, _ = optimise( + gain=postcode_summary_unselected_regions["Gain"].values, + loss=postcode_summary_unselected_regions["Loss"].values, + max_loss=int(remaining_loss_constraint) + ) + + postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1 + postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum() + + postcode_optimised_additional_properties = postcode_summary_unselected_regions[ + postcode_summary_unselected_regions["Selected"] + ] + + postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() + + bid_size = region_totals.sum() + postcode_totals.sum() print("Bid Size:", bid_size) + total_epc_d_or_below = region_totals["Gain"] + 
postcode_totals["Gain"] + print("Total EPC D or below:", total_epc_d_or_below) + total_epc_c = region_totals["Loss"] + postcode_totals["Loss"] + print("Total EPC C or above:", total_epc_c) # if __name__ == "__main__": # main() From 67f97feb18829a4a2d327335a4a6ed8c8c06e495 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 22:33:42 +0000 Subject: [PATCH 097/255] messing around with street match --- .../stonewater/Wave 3 Preparation.py | 105 ++++++++++++------ 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 6f98c9fd..5ebb06e2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1637,7 +1637,7 @@ def propsed_wave_3_sample(): # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing # UPRN - asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])] + asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1645,12 +1645,13 @@ def propsed_wave_3_sample(): # Create the postal region, taking the first part of the postcode asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] + asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] unique_postal_regions = asset_list["Postal Region"].unique() # Keep just the columns we need asset_list = asset_list[ - ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", - "Heating"] + ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region", + "Property Type", "Wall Type", "Roof Type", "Heating"] ] survey_results = 
pd.read_excel( @@ -1853,7 +1854,6 @@ def propsed_wave_3_sample(): suffixes=("", "_method2") ) else: - region_assets = region_assets.merge( archetype_surveyed, on="Archetype ID", @@ -1903,20 +1903,20 @@ def propsed_wave_3_sample(): surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"].str.split(":").str[0] == - property["Property Type"].split(":")[0] + survey_results_with_original_features["Property Type"] == + property["Property Type"] ) & ( - survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0] + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] ) & ( - survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0] + survey_results_with_original_features["Roof Type"] == + property["Roof Type"] ) & ( - survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0] + survey_results_with_original_features["Heating"] == + property["Heating"] ) ].copy() @@ -1962,7 +1962,10 @@ def propsed_wave_3_sample(): if "Electric" in property["Heating"]: # Take other electric heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] - elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)": + elif property["Heating"] in [ + "Community Heating Systems: Community boilers only (RdSAP)", + "Community Heating Systems: Community CHP and boilers (RdSAP)" + ]: # Take other community heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Community")] elif property["Heating"] == 'Heat Pump: (from database)': @@ -2001,8 +2004,8 @@ def propsed_wave_3_sample(): if any(surveyed["Postal Region"] == property["Postal Region"]): surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] - # Take the 5 nearest - surveyed = surveyed.head(5) + # Take the 3 nearest + surveyed = 
surveyed.head(3) # # We allow a max distance of 10km # surveyed = surveyed[surveyed["distance_meters"] < 10000] @@ -2176,6 +2179,9 @@ def propsed_wave_3_sample(): results = pd.concat(results) + # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1) + # region = home["Postal Region"].values[0] + # Create a pivot table for counts of Confidence Tier by Postal Region geographic_summary = results.pivot_table( index='Postal Region', @@ -2192,7 +2198,9 @@ def propsed_wave_3_sample(): # '3 - similar property, weighted on distance' gain_columns = [ - '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', + '1 - Archetype surveyed', + '1 - property was surveyed', + '2 - same archetype', '3 - similar property, weighted on distance' ] # @@ -2200,8 +2208,11 @@ def propsed_wave_3_sample(): # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' - loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', - '5 - property was surveyed'] + loss_columns = [ + '4 - no similar property, needs survey to confirm', + '5 - EPC C or above', + '5 - property was surveyed' + ] geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) @@ -2249,26 +2260,30 @@ def propsed_wave_3_sample(): # We now see if there are any postcodes that have no loss that can be added unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values + # TODO: Try on street + postcode_summary = results.pivot_table( - index='Postcode', + index='Street and Region', columns='Confidence Tier', aggfunc='size', fill_value=0 ).reset_index() - postcode_summary = postcode_summary.merge( - results[["Postcode", "Postal Region"]].drop_duplicates(), - how="left", on="Postcode" - ) - - postcode_summary_unselected_regions = postcode_summary[ - postcode_summary["Postcode"].str.split(" 
").str[0].isin(unselected_regions) - ].copy() + # postcode_summary = postcode_summary.merge( + # results[["Postcode", "Postal Region"]].drop_duplicates(), + # how="left", on="Postcode" + # ) + # + postcode_summary_unselected_regions = postcode_summary.copy() + # postcode_summary_unselected_regions = postcode_summary[ + # postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) + # ].copy() postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) # Remaining loss allowed - remaining_loss_constraint = 250 - region_totals["Loss"] + # remaining_loss_constraint = 230 - region_totals["Loss"] + remaining_loss_constraint = 250 postcode_selected_rows, _ = optimise( gain=postcode_summary_unselected_regions["Gain"].values, loss=postcode_summary_unselected_regions["Loss"].values, @@ -2284,12 +2299,40 @@ def propsed_wave_3_sample(): postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() - bid_size = region_totals.sum() + postcode_totals.sum() + bid_size = postcode_totals.sum() print("Bid Size:", bid_size) - total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"] + total_epc_d_or_below = postcode_totals["Gain"] print("Total EPC D or below:", total_epc_d_or_below) - total_epc_c = region_totals["Loss"] + postcode_totals["Loss"] + total_epc_c = postcode_totals["Loss"] print("Total EPC C or above:", total_epc_c) + # Total needing a survey + total_needing_survey = postcode_optimised_additional_properties[ + "4 - no similar property, needs survey to confirm" + ].sum() + print("Total needing survey:", total_needing_survey) + + # Look for postcodes that have no loss + unselected_streets = postcode_summary_unselected_regions[ + ~postcode_summary_unselected_regions["Selected"] + ]["Street and Region"].values + + postcode_summary2 = results[ + results["Street and 
Region"].isin(unselected_streets) + ].pivot_table( + index='Postcode', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + + postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1) + postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1) + + no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False) + total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() + print(total_bid_size) + + z = results[results["Confidence Tier"] == "5 - EPC C or above"] # if __name__ == "__main__": # main() From efba61c6ac52740d70c51864ea49c0d5623b353d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 22:51:24 +0000 Subject: [PATCH 098/255] tweaking --- .../stonewater/Wave 3 Preparation.py | 121 ++++++++++++------ 1 file changed, 83 insertions(+), 38 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5ebb06e2..974cd908 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1719,6 +1719,72 @@ def propsed_wave_3_sample(): # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D # + def match_property_to_surveyed(property, survey_results_with_original_features): + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"] == + property["Roof Type"] + ) & + ( + survey_results_with_original_features["Heating"] == + property["Heating"] + ) + ].copy() + + if not surveyed.empty: + return surveyed + + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + 
survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"] == + property["Heating"] + ) + ].copy() + + if not surveyed.empty: + return surveyed + + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) + ].copy() + + return surveyed + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1757,6 +1823,7 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): + region_surveyed = [] for arch_id in archetypes: for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): @@ -1765,6 +1832,12 @@ def propsed_wave_3_sample(): ].copy() if archetype_data.empty: continue + if archetype_data.shape[0] > 1: + # Look for an exact match, or as close as possible + archetype_data_filtered = match_property_to_surveyed(property, archetype_data) + if not archetype_data_filtered.empty: + archetype_data = archetype_data_filtered + archetype_data["distance_meters"] = haversine( lat1=property.latitude, lon1=property.longitude, lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values @@ -1899,28 +1972,15 @@ def propsed_wave_3_sample(): # This means that this archetype was never surveyed and so we need to find a sufficiently similar property final_missed_matches = [] for a_id in 
missed_addressids: + + match_type = "3 - compared to similar properties" + property = asset_list[asset_list["Address ID"] == a_id].squeeze() - surveyed = survey_results_with_original_features[ - ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] - ) & - ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] - ) & - ( - survey_results_with_original_features["Roof Type"] == - property["Roof Type"] - ) & - ( - survey_results_with_original_features["Heating"] == - property["Heating"] - ) - ].copy() + surveyed = match_property_to_surveyed(property, survey_results_with_original_features) if surveyed.empty: + match_type = "3 - compared to similar properties, relaxed" # In this case, we do one additional check where we filter on everything the same apart from heating, # where we do a slightly more rough match surveyed = survey_results_with_original_features[ @@ -2026,14 +2086,12 @@ def propsed_wave_3_sample(): expected_epc = sap_to_epc(expected_sap) if expected_epc in ["C", "B", "A"]: - tier = "5 - EPC C or above" - else: - tier = "3 - similar property, weighted on distance" + match_type = "5 - EPC C or above" final_missed_matches.append( { "Address ID": a_id, - "Confidence Tier": tier, + "Confidence Tier": match_type, "Current EPC Band": expected_epc } ) @@ -2197,22 +2255,9 @@ def propsed_wave_3_sample(): # '2 - same archetype', # '3 - similar property, weighted on distance' - gain_columns = [ - '1 - Archetype surveyed', - '1 - property was surveyed', - '2 - same archetype', - '3 - similar property, weighted on distance' - ] - # - # Loss is the sum of these columns: - # '4 - no similar property, needs survey to confirm', - # '5 - EPC C or above', '5 - property was surveyed' + gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x]) + loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x]) - loss_columns 
= [ - '4 - no similar property, needs survey to confirm', - '5 - EPC C or above', - '5 - property was surveyed' - ] geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) @@ -2283,7 +2328,7 @@ def propsed_wave_3_sample(): # Remaining loss allowed # remaining_loss_constraint = 230 - region_totals["Loss"] - remaining_loss_constraint = 250 + remaining_loss_constraint = 220 postcode_selected_rows, _ = optimise( gain=postcode_summary_unselected_regions["Gain"].values, loss=postcode_summary_unselected_regions["Loss"].values, From 294506853dd32fb9aa21ce6500d6eebed7e41be6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 18:24:26 +0000 Subject: [PATCH 099/255] adding in new features --- etl/customers/aiha/bid_numbers.py | 18 +++++- etl/customers/remote_assessments/app.py | 1 + .../stonewater/Wave 3 Preparation.py | 59 +++++++++++++++++-- 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/etl/customers/aiha/bid_numbers.py b/etl/customers/aiha/bid_numbers.py index 96859f99..b371e2e5 100644 --- a/etl/customers/aiha/bid_numbers.py +++ b/etl/customers/aiha/bid_numbers.py @@ -52,6 +52,20 @@ aiha_wave_3_features = aiha_original_asset_data[ wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts() property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index() +aiha_wave_3_features[aiha_wave_3_features["Property type"] == "Flat"][["Street address", "Postcode"]] + +# 4 Yetev Lev Court  ... Semi-Detached mid - Medium +# B 86 Bethune Road ... Mid-Terrace top. - Low +# A 80 Bethune Road ... Mid-Terrace ground. - Low +# B 80 Bethune Road ... \n \n - Low +# A 9 Clapton Common ... Semi-Detached ground. - Low +# C 9 Clapton Common ... End-Terrace \n. - Low +# B 89 Manor Road ... \n \n. - Low +# A 6 Northfield Road ... Detached top. - Low +# 13 Northfield Rd ... Semi-Detached \n - Low +# A 73 Manor Road ... 
End-Terrace \n - Low +# B 73 Manor Road ... Detached top - Low + # Hornsey data - contained in original asset list hornsey_asset_list = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing " @@ -88,5 +102,5 @@ caha_epc_data = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_extracted_property_data.xlsx" ) -caha_epc_data["property_type"].value_counts() -caha_epc_data["wall_type"].value_counts() +caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["property_type"].value_counts() +caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["wall_type"].value_counts() diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 33015d87..59e0e868 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -17,6 +17,7 @@ def app(): "address": "5, Lynton Street", "postcode": "DE22 3RW" } + ] asset_list = pd.DataFrame(asset_list) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 974cd908..81b5915f 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -6,6 +6,7 @@ import numpy as np from tqdm import tqdm from collections import Counter from scipy.optimize import linprog +from utils.s3 import read_pickle_from_s3 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") @@ -1264,7 +1265,7 @@ def main(): stonewater_data[c] = stonewater_data[c].astype(str) # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V2.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) cost_sheet = [ { @@ -1654,17 +1655,66 @@ def 
propsed_wave_3_sample(): "Property Type", "Wall Type", "Roof Type", "Heating"] ] + # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) survey_results = pd.read_excel( os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), header=13, sheet_name="Modelled Packages" ) + additional_survey_data = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), + header=0 + ) + survey_results = survey_results.merge( + additional_survey_data[ + [ + "Address ID", + "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", + "Main Building Alternative Wall Thickness" + ] + ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}), + how="left", + on="Address ID" + ) + # TOOD: We probably want the actual surveyed wall, roof, heating type survey_results = survey_results[ - ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"] - ] - survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] + [ + "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", + "Existing Primary Heating System", + "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", + "Main Building Alternative Wall Thickness" + ] + ].rename( + columns={ + "Existing Primary Heating System": "Surveyed Primary Heating System" + } + ) + + # Concatenate from the wall information + survey_results["Surveyed: Wall Type"] = survey_results["Main Wall Type"] + ": " + survey_results[ + "Main Wall Insulation Type"] + # Alternative wall + survey_results["Survey: Main Alternative Wall"] = ( + survey_results["Main Building Alternative Wall 
Type"] + ": " + survey_results[ + "Main Building Alternative Wall Insulation"] + ) + # Roof information + survey_results["Survey: Type"] = survey_results["Main Roof Type"] + ": " + survey_results[ + "Main Roof Insulation"] + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) + + # Drop the individual columns: + survey_results = survey_results.drop( + columns=[ + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", + "Main Wall Type", "Main Wall Insulation Type", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation" + ] + ) survey_results_with_original_features = survey_results.merge( asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], @@ -1676,7 +1726,6 @@ def propsed_wave_3_sample(): raise ValueError("Something went wrong") # We get longitude & Latitude - from utils.s3 import read_pickle_from_s3 archetyping_spatial_features = read_pickle_from_s3( bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", ) From 377d9929e418073567b6af8f589eb5fe58e92a1e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 19:21:35 +0000 Subject: [PATCH 100/255] cleaning roof extraction --- .../stonewater/Wave 3 Preparation.py | 100 +++++++++++++----- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 81b5915f..aa9e4488 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -291,26 +291,11 @@ def extract_summary_report(pdf_path): data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] - roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL) - roof_text = roof_section.group(1).strip() - roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text) - data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None - - # Check if "Insulation" exists between Type and Insulation Thickness - insulation_search = re.search( - r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL - ) - - if insulation_search: - # Insulation match will be present if it exists, otherwise it will be None - insulation_match = insulation_search.group(2) # Optional group for Insulation - insulation_thickness_match = insulation_search.group(4) # Required group for Insulation Thickness - - # Populate insulation fields - data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None - data["Main Roof Insulation Thickness"] = ( - insulation_thickness_match.strip() if insulation_thickness_match else None - ) + extracted_roof_data = extract_roof_details_summary(text) + main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] + data["Main Roof Type"] = main_roof_data["Roof Type"] + data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] walls_data = extract_wall_details_summary(text) # Get the main building wall data @@ -593,6 +578,54 @@ def extract_roof_details_epr(text): return roof_data +def extract_roof_details_summary(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the 8.0 Roofs section of the summary report. 
+ """ + # Define data structure to hold results + roof_data = [] + + # Locate the entire 8.0 Roofs section + roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) + if not roof_section_match: + return roof_data # Return empty if no roof section is found + + # Extract the roof section and append "9.0 Floors:" as the boundary + roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" + + # Define pattern to match each building part's roof entry + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, or end + r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation + r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness + re.DOTALL + ) + + # Extract each building part's data + for match in building_part_pattern.finditer(roof_section): + part_name = match.group(1).strip() # Building part label + roof_type = match.group(2).strip() # Roof Type + roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation + roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness + + # Cleaning to handle annoying cases when it comes out like this: + # 'A Another dwelling above\n1st Extension' + if roof_type.startswith("A Another dwelling above"): + roof_type = "A Another dwelling above" + + # Store results for this building part + roof_data.append({ + "Building Part": part_name, + "Roof Type": roof_type, + "Roof Insulation": roof_insulation, + "Roof Insulation Thickness": roof_insulation_thickness, + }) + + return roof_data + + def extract_wall_details_epr(text): """ Extracts wall type, insulation, dry-lining, and thickness for each building part @@ -1691,21 +1724,21 @@ def propsed_wave_3_sample(): ] 
].rename( columns={ - "Existing Primary Heating System": "Surveyed Primary Heating System" + "Existing Primary Heating System": "Survey: Primary Heating System" } ) # Concatenate from the wall information - survey_results["Surveyed: Wall Type"] = survey_results["Main Wall Type"] + ": " + survey_results[ - "Main Wall Insulation Type"] + survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ + "Main Wall Insulation Type"].astype(str) # Alternative wall survey_results["Survey: Main Alternative Wall"] = ( - survey_results["Main Building Alternative Wall Type"] + ": " + survey_results[ - "Main Building Alternative Wall Insulation"] + survey_results["Main Building Alternative Wall Type"].astype(str) + ": " + survey_results[ + "Main Building Alternative Wall Insulation"].astype(str) ) # Roof information - survey_results["Survey: Type"] = survey_results["Main Roof Type"] + ": " + survey_results[ - "Main Roof Insulation"] + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) + survey_results["Survey: Main Roof Type"] = survey_results["Main Roof Type"].astype(str) + ": " + survey_results[ + "Main Roof Insulation"].astype(str) + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) # Drop the individual columns: survey_results = survey_results.drop( @@ -1834,6 +1867,11 @@ def propsed_wave_3_sample(): return surveyed + survey_attribute_columns = [ + "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System' + ] + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1845,7 +1883,8 @@ def propsed_wave_3_sample(): ] region_assets = region_assets.merge( - exact_surveyed[["Address ID", "Current EPC Band"]], + exact_surveyed[ + ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns], on="Address ID", how="left" ) @@ -2286,6 +2325,11 @@ def 
propsed_wave_3_sample(): results = pd.concat(results) + # Check if there are missings in current epc band, current sap rating or any of the survey attributes + for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns: + if pd.isnull(results[c]).sum(): + raise Exception("Something went wrong") + # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1) # region = home["Postal Region"].values[0] From a7857c0375949f5d45d47afe41f59e07de883e71 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 20:30:57 +0000 Subject: [PATCH 101/255] pulling out data from best match --- .../stonewater/Wave 3 Preparation.py | 111 ++++++++++-------- etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 65 ++++------ 3 files changed, 83 insertions(+), 94 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index aa9e4488..08236d5b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1727,7 +1727,7 @@ def propsed_wave_3_sample(): "Existing Primary Heating System": "Survey: Primary Heating System" } ) - + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] # Concatenate from the wall information survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ "Main Wall Insulation Type"].astype(str) @@ -1872,6 +1872,8 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System' ] + survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy() + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1884,10 +1886,14 @@ def propsed_wave_3_sample(): region_assets = region_assets.merge( exact_surveyed[ - ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns], + ["Address ID", "Current EPC Band", 
"Current SAP Rating"] + survey_attribute_columns + [ + "Survey: Matching Address ID" + ] + ], on="Address ID", how="left" ) + region_assets['Distance to Closest Match (m)'] = 0 # Label the tier 1 properties region_assets["Confidence Tier"] = None @@ -1901,61 +1907,62 @@ def propsed_wave_3_sample(): "5 - property was surveyed", region_assets["Confidence Tier"] ) - archetypes = region_assets[ + archetype_ids = region_assets[ pd.isnull(region_assets["Confidence Tier"]) ]["Archetype ID"].unique() # We get the properties that have been surveyed - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - if region_surveyed["Archetype ID"].duplicated().sum(): + region_surveyed = [] + for arch_id in archetype_ids: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + if archetype_data.shape[0] > 1: + # Look for an exact match, or as close as possible + archetype_data_filtered = match_property_to_surveyed(property, archetype_data) + if not archetype_data_filtered.empty: + archetype_data = archetype_data_filtered - region_surveyed = [] - for arch_id in archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - if archetype_data.shape[0] > 1: - # Look for an exact match, or as close as possible - archetype_data_filtered = match_property_to_surveyed(property, archetype_data) - if not archetype_data_filtered.empty: - archetype_data = archetype_data_filtered + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + 
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - region_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc - } - ) + # We take the features of the closest matching property + closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0] - region_surveyed = pd.DataFrame(region_surveyed) - region_assets = region_assets.merge( - region_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method1") - ) - else: - region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method1") - ) + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + 'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"], + 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], + 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"] + } + ) + + region_surveyed = pd.DataFrame(region_surveyed) + starting_shape = region_assets.shape[0] + region_assets = region_assets.merge( + region_surveyed, + on=["Archetype 
ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + if region_assets.shape[0] != starting_shape: + raise ValueError("Something went wrong") # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( @@ -2326,7 +2333,9 @@ def propsed_wave_3_sample(): results = pd.concat(results) # Check if there are missings in current epc band, current sap rating or any of the survey attributes - for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns: + for c in ( + ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 913a04b8..d5a5134f 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -269,6 +269,7 @@ class RetrieveFindMyEpc: "Loft insulation": ["loft_insulation"], "Solar photovoltaic (PV) panels": ["solar_pv"], "Party wall insulation": ["party_wall_insulation"], + 'Draught proofing': ["draught_proofing"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index f24c5bb2..1e478b0c 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): epc_data = [] errors = [] + no_epc = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - postcode = home[postcode_column] - house_number = home[address1_column] - full_address = home[fulladdress_column] - - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - # Force the skipping of estimating the EPC - 
searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - # Retrieve data from FindMyEPC - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - time.sleep(np.random.uniform(0.1, 1)) try: postcode = home[postcode_column] house_number = home[address1_column] @@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.find_property(skip_os=True) if searcher.newest_epc is None: + no_epc.append(home["row_id"]) continue # Look for EPC recommendatons @@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): errors.append(home["row_id"]) time.sleep(5) - return epc_data, errors + return epc_data, errors, no_epc def extract_address1(asset_list, full_address_col, method="first_two_words"): @@ -140,26 +108,37 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/" - DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/" + DATA_FILENAME = "Bromford programme review.xlsx" + SHEET_NAME = "Bromford" POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = None + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "No." 
ADDRESS1_METHOD = "first_two_words" + ADDRESS_COLS_TO_CONCAT = ["No.", "Address"] - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0) + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) + asset_list = asset_list[~pd.isnull(asset_list["Postcode"])] asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: + asset_list[col] = asset_list[col].astype(str) asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) if ADDRESS1_COLUMN is None: ADDRESS1_COLUMN = "address1_extracted" - asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD) + asset_list = extract_address1( + asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD + ) - epc_data, errors = get_data( + if FULLADDRESS_COLUMN is None: + FULLADDRESS_COLUMN = "fulladdress_extracted" + # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas + asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + + epc_data, errors, no_epc = get_data( asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, @@ -168,7 +147,7 @@ def app(): # We now retrieve any failed properties asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] - epc_data_failed, _ = get_data( + epc_data_failed, _, _ = get_data( asset_list=asset_list_failed, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, From 7accbded137918ba4e38c5b6ed79703b0e727e3d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 21:38:00 +0000 Subject: [PATCH 102/255] debugging find epc pull --- etl/find_my_epc/RetrieveFindMyEpc.py | 21 ++++++++++++++++++++- 
etl/route_march_data_pull/app.py | 22 ++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index d5a5134f..ac0e8235 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -126,6 +126,7 @@ class RetrieveFindMyEpc: # Find all h3 headers for each step and extract their related information step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m') previous_sap_score = current_sap + previous_epc = current_rating.split(' ')[-6] for step_num, step_header in enumerate(step_headers, start=1): # Extract the step title (the measure) measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "") @@ -138,7 +139,11 @@ class RetrieveFindMyEpc: # Check if the potential rating div is found if potential_rating_div: # Extract the rating text within the SVG text element - rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip() + extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold') + if extracted_rating_text is not None: + rating_text = extracted_rating_text.text.strip() + else: + rating_text = " ".join([str(previous_sap_score), previous_epc]) # Parse the rating text to separate the numeric rating and EPC letter new_rating = int(rating_text.split()[0]) new_epc = rating_text.split()[1] @@ -152,6 +157,7 @@ class RetrieveFindMyEpc: "sap_points": new_rating - previous_sap_score }) previous_sap_score = new_rating + previous_epc = new_epc # Search for the assessment informaton assessment_information = address_res.find('div', {'id': 'information'}) @@ -270,6 +276,19 @@ class RetrieveFindMyEpc: "Solar photovoltaic (PV) panels": ["solar_pv"], "Party wall insulation": ["party_wall_insulation"], 'Draught proofing': ["draught_proofing"], + "Roof insulation recommendation": [], + "Cavity wall insulation recommendation": [], + "Windows draught 
proofing": [], + "Low energy lighting for all fixed outlets": ["low_energy_lighting"], + "Cylinder thermostat recommendation": [], + "Heating controls recommendation": [], + "Replace boiler with Band A condensing boiler": [], + "Solar panel recommendation": [], + "Double glazing recommendation": [], + "Solid wall insulation recommendation": [], + "Fuel change recommendation": [], + "PV Cells recommendation": [], + "Replacement glazing units": ["double_glazing"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1e478b0c..80caefc9 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,5 +1,6 @@ import os import time +from idlelib.iomenu import errors import pandas as pd import numpy as np @@ -21,6 +22,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): + home = asset_list[asset_list["row_id"].isin(errors)].head(1).tail(1).squeeze() + epc_data = [] errors = [] no_epc = [] @@ -56,10 +59,21 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): property_recommendations = {"rows": []} # Retrieve data from FindMyEPC - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + else: + find_epc_data = {} + except Exception as e: + raise Exception(f"Error retrieving FindMyEPC data: {e}") 
time.sleep(np.random.uniform(0.1, 1)) epc = { From 6eb52a509ebb8a110ca09533e4cba85b66edacf2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 21:38:40 +0000 Subject: [PATCH 103/255] removing error line --- etl/route_march_data_pull/app.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 80caefc9..d9f6bf43 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -22,8 +22,6 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): - home = asset_list[asset_list["row_id"].isin(errors)].head(1).tail(1).squeeze() - epc_data = [] errors = [] no_epc = [] From ac9b7b37300204c83f862871ebd511208625978b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 22:08:10 +0000 Subject: [PATCH 104/255] updating methdology for matching --- .../stonewater/Wave 3 Preparation.py | 193 +++++++++++------- 1 file changed, 114 insertions(+), 79 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 08236d5b..f74dc19d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1867,6 +1867,19 @@ def propsed_wave_3_sample(): return surveyed + def fill_survey_columns(region_assets, suffix): + for col in [ + 'Current EPC Band', 'Current SAP Rating', + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + 'Survey: Main Roof Type', 'Survey: Primary Heating System', + 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + ]: + region_assets[col] = np.where( + pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), + region_assets[col + suffix], region_assets[col] + ) + return region_assets + survey_attribute_columns = [ "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: 
Primary Heating System' @@ -1920,6 +1933,14 @@ def propsed_wave_3_sample(): ].copy() if archetype_data.empty: continue + + match_type = "2 - same archetype" + if any(archetype_data["Postal Region"] == property["Postal Region"]): + match_type = "1 - same archetype, same postal region" + archetype_data = archetype_data[ + archetype_data["Postal Region"] == property["Postal Region"] + ] + if archetype_data.shape[0] > 1: # Look for an exact match, or as close as possible archetype_data_filtered = match_property_to_surveyed(property, archetype_data) @@ -1949,11 +1970,21 @@ def propsed_wave_3_sample(): 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], - 'Distance to Closest Match (m)': closest_match["distance_meters"] + 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Match Type": match_type } ) - region_surveyed = pd.DataFrame(region_surveyed) + + if region_surveyed.empty: + region_surveyed = pd.DataFrame( + columns=[ + "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + ] + ) + starting_shape = region_assets.shape[0] region_assets = region_assets.merge( region_surveyed, @@ -1968,95 +1999,99 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & pd.isnull(region_assets["Confidence Tier"]), - "1 - Archetype surveyed", region_assets["Confidence Tier"] + "1 - Archetype surveyed in region", region_assets["Confidence Tier"] ) - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]), - region_assets["Current 
EPC Band_method1"], region_assets["Current EPC Band"] - ) # Handle EPC C region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), "5 - EPC C or above", region_assets["Confidence Tier"] ) - region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) - # TODO: Turn into a function - missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + region_assets = fill_survey_columns(region_assets, suffix="_method1") - archetype_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] + region_assets = region_assets.drop(columns=method_1_columns) - if archetype_surveyed["Archetype ID"].duplicated().sum(): + missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - archetype_surveyed = [] - for arch_id in missed_archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - archetype_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc - } - ) - archetype_surveyed = pd.DataFrame(archetype_surveyed) - region_assets = region_assets.merge( - 
archetype_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method2") - ) - else: - region_assets = region_assets.merge( - archetype_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method2") - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( - region_assets["Confidence Tier"]), - "2 - same archetype", region_assets["Confidence Tier"] - ) - - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]), - region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"] - ) - - region_assets = region_assets.drop(columns=["Current EPC Band_method2"]) + archetype_surveyed = [] + for arch_id in missed_archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + raise Exception("IMPLEMENT ME") + # archetype_data["distance_meters"] = haversine( + # lat1=property.latitude, lon1=property.longitude, + # lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + # ) + # expected_sap = np.average( + # archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + # ) + # expected_epc = sap_to_epc(expected_sap) + # archetype_surveyed.append( + # { + # "Archetype ID": arch_id, + # "Address ID": property["Address ID"], + # "Current EPC Band": expected_epc + # } + # ) + # archetype_surveyed = pd.DataFrame(archetype_surveyed) + # if archetype_surveyed.empty: + # archetype_surveyed = pd.DataFrame( + # columns=[ + # "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + # 'Survey: Primary Heating System', 
"Survey: Matching Address ID", 'Distance to Closest Match (m)' + # ] + # ) + # + # region_assets = region_assets.merge( + # archetype_surveyed, + # on=["Archetype ID", "Address ID"], + # how="left", + # suffixes=("", "_method2") + # ) + # + # region_assets["Confidence Tier"] = np.where( + # region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( + # region_assets["Confidence Tier"]), + # "2 - same archetype", region_assets["Confidence Tier"] + # ) + # + # for col in [ + # 'Current EPC Band', 'Current SAP Rating', + # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + # 'Survey: Main Roof Type', 'Survey: Primary Heating System', + # 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + # ]: + # region_assets[col] = np.where( + # pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]), + # region_assets[col + "_method2"], region_assets[col] + # ) + # + # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")] + # region_assets = region_assets.drop(columns=method_2_columns) # We label EPC C properties - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Archetype ID"] == "EPC C OR ABOVE", - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets["Current EPC Band"] = np.where( - region_assets["Archetype ID"] == "EPC C OR ABOVE", - "C", region_assets["Current EPC Band"] - ) + # region_assets["Confidence Tier"] = np.where( + # region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + # "5 - EPC C or above", region_assets["Confidence Tier"] + # ) + # + # region_assets["Confidence Tier"] = np.where( + # region_assets["Archetype ID"] == "EPC C OR ABOVE", + # "5 - EPC C or above", 
region_assets["Confidence Tier"] + # ) + # + # region_assets["Current EPC Band"] = np.where( + # region_assets["Archetype ID"] == "EPC C OR ABOVE", + # "C", region_assets["Current EPC Band"] + # ) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() From 5d5001fec3114eab4ba84e7fc0e40270ec017d35 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 22:47:39 +0000 Subject: [PATCH 105/255] added de-duping --- .../stonewater/Wave 3 Preparation.py | 221 ++++++------------ etl/find_my_epc/RetrieveFindMyEpc.py | 6 + etl/route_march_data_pull/app.py | 7 + 3 files changed, 85 insertions(+), 149 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index f74dc19d..744b3400 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1803,21 +1803,26 @@ def propsed_wave_3_sample(): def match_property_to_surveyed(property, survey_results_with_original_features): surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] + ) & ( survey_results_with_original_features["Property Type"] == property["Property Type"] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] ) & ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] ) & ( - survey_results_with_original_features["Roof Type"] == - property["Roof Type"] - ) & - ( - survey_results_with_original_features["Heating"] == - property["Heating"] + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] ) ].copy() @@ -1826,23 +1831,47 @@ def propsed_wave_3_sample(): surveyed = 
survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] ) & ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] ) & ( survey_results_with_original_features["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0] ) & ( - survey_results_with_original_features["Heating"] == - property["Heating"] + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] ) ].copy() + # surveyed = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"] == + # property["Property Type"] + # ) & + # ( + # survey_results_with_original_features["Wall Type"] == + # property["Wall Type"] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"] == + # property["Heating"] + # ) + # ].copy() + if not surveyed.empty: return surveyed @@ -1906,7 +1935,12 @@ def propsed_wave_3_sample(): on="Address ID", how="left" ) - region_assets['Distance to Closest Match (m)'] = 0 + region_assets['Distance to Closest Match (m)'] = None + region_assets["Distance to Closest Match (m)"] = np.where( + ~pd.isnull(region_assets["Current EPC Band"]), + 0, + region_assets["Distance to Closest Match (m)"] + ) # Label the tier 1 properties region_assets["Confidence Tier"] = None @@ -2016,7 +2050,7 @@ def propsed_wave_3_sample(): missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - archetype_surveyed = [] 
+ # archetype_surveyed = [] for arch_id in missed_archetypes: for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): archetype_data = survey_results_with_original_features[ @@ -2175,7 +2209,14 @@ def propsed_wave_3_sample(): { "Address ID": a_id, "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Needs Survey" + "Current EPC Band": "Needs Survey", + "Current SAP Rating": "Needs Survey", + 'Survey: Main Wall Type': "Not Surveyed", + "Survey: Main Alternative Wall": "Not Surveyed", + "Survey: Main Roof Type": "Not Surveyed", + "Survey: Primary Heating System": "Not Surveyed", + "Survey: Matching Address ID": "Not Surveyed", + 'Distance to Closest Match (m)': 9999999, } ) continue @@ -2197,18 +2238,6 @@ def propsed_wave_3_sample(): # Take the 3 nearest surveyed = surveyed.head(3) - # # We allow a max distance of 10km - # surveyed = surveyed[surveyed["distance_meters"] < 10000] - # if surveyed.empty: - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": "4 - no similar property, needs survey to confirm", - # "Current EPC Band": "Needs Survey" - # } - # ) - # continue - # perform a weighted mean of SAP rating - the closer the better expected_sap = np.average( surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) @@ -2218,129 +2247,24 @@ def propsed_wave_3_sample(): if expected_epc in ["C", "B", "A"]: match_type = "5 - EPC C or above" + closest_match = surveyed.iloc[0] + final_missed_matches.append( { "Address ID": a_id, "Confidence Tier": match_type, - "Current EPC Band": expected_epc + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"], + "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"], + "Survey: Primary Heating System": closest_match["Survey: Primary 
Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"], } ) continue - # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: - # filter_property_types = ["House", "Bungalow"] - # else: - # filter_property_types = ["Flat"] - # - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - # ( - # survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # ) - # ) & - # ( - # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0] - # ) - # ] - # if surveyed_similar.empty: - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # )) & - # (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0]) & - # (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0]) & - # (survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0]) - # ] - # - # if surveyed_similar.empty: - # - # # We get an average based on the postcode - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - # (survey_results_with_original_features["Property 
Type"].str.split(":").str[0].isin( - # filter_property_types - # )) - # ] - # if surveyed_similar.empty: - # surveyed_similar_entire_population = survey_results_with_original_features[ - # ( - # survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[ - # "Property Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0] - # ) - # ] - # - # # We order them by distance on postcode - # - # # Average - # expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": "3 - similar property, all areas searched", - # "Current EPC Band": expected_epc - # } - # - # ) - # else: - # expected_sap = surveyed_similar["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # if expected_epc in ["C", "B", "A"]: - # tier = "5 - EPC C or above" - # else: - # tier = "3 - similar property, relaxed conditions" - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": tier, - # "Current EPC Band": expected_epc - # } - # ) - # continue - # # We take an average - # expected_sap = surveyed_similar["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # if expected_epc in ["C", "B", "A"]: - # tier = "5 - EPC C or above" - # else: - # tier = "3 - similar property" - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": tier, - # "Current EPC Band": expected_epc - # } - # ) - final_missed_matches = pd.DataFrame(final_missed_matches) region_assets = 
region_assets.merge( @@ -2353,12 +2277,11 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( region_assets["Confidence Tier_method3"] ) - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]), - region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] - ) - region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"]) + region_assets = fill_survey_columns(region_assets, suffix="_method3") + + method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")] + region_assets = region_assets.drop(columns=method_3_columns) if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index ac0e8235..b6394275 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -289,6 +289,12 @@ class RetrieveFindMyEpc: "Fuel change recommendation": [], "PV Cells recommendation": [], "Replacement glazing units": ["double_glazing"], + "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"], + "High heat retention storage heaters": ["high_heat_retention_storage_heaters"], + "Gas condensing boiler": ["boiler_upgrade"], + "Change room heaters to condensing boiler": ["boiler_upgrade"], + "Cylinder thermostat": ["cylinder_thermostat"], + "Heat recovery system for mixer showers": ["heat_recovery_shower"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index d9f6bf43..6f9dd135 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -150,6 +150,13 @@ def app(): # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + # We check for 
duplicated addresses + asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] + if asset_list["deduper"].duplicated().sum(): + # Drop the dupes + print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") + asset_list = asset_list[~asset_list["deduper"].duplicated()] + epc_data, errors, no_epc = get_data( asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, From d65c99f62a0fd7cb6e1c58a5816db0e4e4477fb5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 08:41:44 +0000 Subject: [PATCH 106/255] tidying up optimisation process --- .../stonewater/Wave 3 Preparation.py | 105 ++++-------------- 1 file changed, 24 insertions(+), 81 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 744b3400..c8e61a0e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2297,39 +2297,9 @@ def propsed_wave_3_sample(): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") - # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1) - # region = home["Postal Region"].values[0] - - # Create a pivot table for counts of Confidence Tier by Postal Region - geographic_summary = results.pivot_table( - index='Postal Region', - columns='Confidence Tier', - aggfunc='size', - fill_value=0 - ).reset_index() - - # We create the gain and loss columns - # Gain is the sum of these columns: - # '1 - Archetype surveyed', - # '1 - property was surveyed', - # '2 - same archetype', - # '3 - similar property, weighted on distance' - gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x]) loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x]) - geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) - 
geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) - - print(geographic_summary.sum()) - - geographic_summary = geographic_summary.sort_values("Loss", ascending=True) - geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() - geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() - - loss = geographic_summary["Loss"].values - gain = geographic_summary["Gain"].values - def optimise(gain, loss, max_loss=250): # Define the coefficients for the objective function (negative because we maximize Gain) @@ -2352,76 +2322,51 @@ def propsed_wave_3_sample(): return selected_rows, optimal_gain - selected_rows, _ = optimise(gain, loss, 250) - - # Select the rows that are selected - geographic_summary["Selected"] = selected_rows == 1 - geographic_summary[geographic_summary["Selected"]].sum() - - region_totals = geographic_summary[ - geographic_summary["Selected"] - ][["Gain", "Loss"]].sum() - - # We now see if there are any postcodes that have no loss that can be added - unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values - - # TODO: Try on street - - postcode_summary = results.pivot_table( + street_summary = results.pivot_table( index='Street and Region', columns='Confidence Tier', aggfunc='size', fill_value=0 ).reset_index() - # postcode_summary = postcode_summary.merge( - # results[["Postcode", "Postal Region"]].drop_duplicates(), - # how="left", on="Postcode" - # ) - # - postcode_summary_unselected_regions = postcode_summary.copy() - # postcode_summary_unselected_regions = postcode_summary[ - # postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) - # ].copy() - postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) - postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) + street_summary["Gain"] = 
street_summary[gain_columns].sum(axis=1) + street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) - # Remaining loss allowed - # remaining_loss_constraint = 230 - region_totals["Loss"] - remaining_loss_constraint = 220 - postcode_selected_rows, _ = optimise( - gain=postcode_summary_unselected_regions["Gain"].values, - loss=postcode_summary_unselected_regions["Loss"].values, - max_loss=int(remaining_loss_constraint) + print(street_summary.sum()) + + selected_rows, _ = optimise( + gain=street_summary["Gain"].values, + loss=street_summary["Loss"].values, + max_loss=250 ) - postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1 - postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum() + street_summary["Selected"] = selected_rows == 1 + print(street_summary[street_summary["Selected"]][["Gain", "Loss"]].sum()) - postcode_optimised_additional_properties = postcode_summary_unselected_regions[ - postcode_summary_unselected_regions["Selected"] + selected_streets = street_summary[ + street_summary["Selected"] ] - postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() + totals = selected_streets[["Gain", "Loss"]].sum() - bid_size = postcode_totals.sum() + bid_size = totals.sum() print("Bid Size:", bid_size) - total_epc_d_or_below = postcode_totals["Gain"] + total_epc_d_or_below = totals["Gain"] print("Total EPC D or below:", total_epc_d_or_below) - total_epc_c = postcode_totals["Loss"] + total_epc_c = totals["Loss"] print("Total EPC C or above:", total_epc_c) # Total needing a survey - total_needing_survey = postcode_optimised_additional_properties[ + total_needing_survey = selected_streets[ "4 - no similar property, needs survey to confirm" ].sum() print("Total needing survey:", total_needing_survey) # Look for postcodes that have no loss - unselected_streets = postcode_summary_unselected_regions[ - ~postcode_summary_unselected_regions["Selected"] + 
unselected_streets = street_summary[ + ~street_summary["Selected"] ]["Street and Region"].values - postcode_summary2 = results[ + postcode_summary = results[ results["Street and Region"].isin(unselected_streets) ].pivot_table( index='Postcode', @@ -2430,14 +2375,12 @@ def propsed_wave_3_sample(): fill_value=0 ).reset_index() - postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1) - postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1) + postcode_summary["Gain"] = postcode_summary[gain_columns].sum(axis=1) + postcode_summary["Loss"] = postcode_summary[loss_columns].sum(axis=1) - no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False) + no_loss_postcodes = postcode_summary[postcode_summary["Loss"] == 0].sort_values("Gain", ascending=False) total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() print(total_bid_size) - z = results[results["Confidence Tier"] == "5 - EPC C or above"] - # if __name__ == "__main__": # main() From d163ca99315b2e2c82b95ab629041351374fb081 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 13:54:46 +0000 Subject: [PATCH 107/255] fixing filling of property --- .../stonewater/Wave 3 Preparation.py | 188 +++++++++--------- 1 file changed, 98 insertions(+), 90 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c8e61a0e..426097e8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1669,7 +1669,7 @@ def propsed_wave_3_sample(): header=4 ) - # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing + # TODO: We drop 7 properties missing # UPRN asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] # Clean address ids @@ -1699,15 +1699,23 @@ def propsed_wave_3_sample(): os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - 
costed retrofit packages V3.xlsx"), header=0 ) - survey_results = survey_results.merge( + + survey_results = survey_results.drop( + columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"] + ).merge( additional_survey_data[ [ "Address ID", "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness" + "Main Building Alternative Wall Thickness", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness" ] - ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}), + ].rename( + columns={ + "Main Wall Insulation_x": "Main Wall Insulation Type", + } + ), how="left", on="Address ID" ) @@ -1718,6 +1726,7 @@ def propsed_wave_3_sample(): "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", "Existing Primary Heating System", + "Package Ref", "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", "Main Building Alternative Wall Thickness" @@ -1727,6 +1736,7 @@ def propsed_wave_3_sample(): "Existing Primary Heating System": "Survey: Primary Heating System" } ) + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] # Concatenate from the wall information survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ @@ -1929,7 +1939,7 @@ def propsed_wave_3_sample(): region_assets = region_assets.merge( exact_surveyed[ ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ - "Survey: Matching Address ID" + "Survey: Matching Address ID", "Package Ref" ] ], on="Address ID", @@ -2005,6 +2015,7 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System': 
closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": closest_match["Package Ref"], "Match Type": match_type } ) @@ -2015,7 +2026,8 @@ def propsed_wave_3_sample(): columns=[ "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', + "Match Type" ] ) @@ -2032,8 +2044,8 @@ def propsed_wave_3_sample(): # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & - pd.isnull(region_assets["Confidence Tier"]), - "1 - Archetype surveyed in region", region_assets["Confidence Tier"] + pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]), + region_assets["Match Type"], region_assets["Confidence Tier"] ) # Handle EPC C @@ -2046,86 +2058,7 @@ def propsed_wave_3_sample(): region_assets = fill_survey_columns(region_assets, suffix="_method1") method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] - region_assets = region_assets.drop(columns=method_1_columns) - - missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - - # archetype_surveyed = [] - for arch_id in missed_archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - raise Exception("IMPLEMENT ME") - # archetype_data["distance_meters"] = haversine( - # lat1=property.latitude, lon1=property.longitude, - # 
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - # ) - # expected_sap = np.average( - # archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - # ) - # expected_epc = sap_to_epc(expected_sap) - # archetype_surveyed.append( - # { - # "Archetype ID": arch_id, - # "Address ID": property["Address ID"], - # "Current EPC Band": expected_epc - # } - # ) - # archetype_surveyed = pd.DataFrame(archetype_surveyed) - # if archetype_surveyed.empty: - # archetype_surveyed = pd.DataFrame( - # columns=[ - # "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", - # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', - # 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' - # ] - # ) - # - # region_assets = region_assets.merge( - # archetype_surveyed, - # on=["Archetype ID", "Address ID"], - # how="left", - # suffixes=("", "_method2") - # ) - # - # region_assets["Confidence Tier"] = np.where( - # region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( - # region_assets["Confidence Tier"]), - # "2 - same archetype", region_assets["Confidence Tier"] - # ) - # - # for col in [ - # 'Current EPC Band', 'Current SAP Rating', - # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', - # 'Survey: Main Roof Type', 'Survey: Primary Heating System', - # 'Survey: Matching Address ID', 'Distance to Closest Match (m)' - # ]: - # region_assets[col] = np.where( - # pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]), - # region_assets[col + "_method2"], region_assets[col] - # ) - # - # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")] - # region_assets = region_assets.drop(columns=method_2_columns) - - # We label EPC C properties - # region_assets["Confidence Tier"] = np.where( - # region_assets["Current EPC Band"].isin(["C", "B", "A"]) & 
pd.isnull(region_assets["Confidence Tier"]), - # "5 - EPC C or above", region_assets["Confidence Tier"] - # ) - # - # region_assets["Confidence Tier"] = np.where( - # region_assets["Archetype ID"] == "EPC C OR ABOVE", - # "5 - EPC C or above", region_assets["Confidence Tier"] - # ) - # - # region_assets["Current EPC Band"] = np.where( - # region_assets["Archetype ID"] == "EPC C OR ABOVE", - # "C", region_assets["Current EPC Band"] - # ) + region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"]) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() @@ -2217,6 +2150,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": "Not Surveyed", "Survey: Matching Address ID": "Not Surveyed", 'Distance to Closest Match (m)': 9999999, + "Package Ref": "Not Surveyed", } ) continue @@ -2261,6 +2195,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": closest_match["Package Ref"] } ) continue @@ -2292,8 +2227,10 @@ def propsed_wave_3_sample(): # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( - ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + - survey_attribute_columns): + [ + "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns + ): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") @@ -2382,5 +2319,76 @@ def propsed_wave_3_sample(): total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() print(total_bid_size) + # Label final outputs + # We create a summary of packages by street + results["Package Ref"] = results["Package Ref"].fillna("Incomplete") + 
results["Package Ref"] = results["Package Ref"].astype(str) + package_summary = results.pivot_table( + index='Street and Region', + columns='Package Ref', + aggfunc='size', + fill_value=0 + ).reset_index() + + street_bid_structure = street_summary.merge( + package_summary, how="left", on="Street and Region" + ) + street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + ) + + individual_units_programme = results.copy() + individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( + street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values + ) + + # Merge on Stonewaters ID + asset_list_ids = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + )[["Address ID", "Org. ref."]] + # Clean address ids + asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] + asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] + asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) + individual_units_programme = individual_units_programme.merge( + asset_list_ids, + how="left", + on="Address ID", + ) + + individual_units_programme = individual_units_programme.merge( + asset_list_ids.rename( + columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"} + ), + how="left", + on="Survey: Matching Address ID" + ) + + individual_units_programme["Survey: Org. ref."] = np.where( + (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"), + "Not Surveyed", + individual_units_programme["Survey: Org. ref."] + ) + + if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull( + individual_units_programme["Org. 
ref."]).sum(): + raise ValueError("something went wrong") + + for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]: + individual_units_programme[col] = ( + individual_units_programme[col] + .str.replace(r': nan(?=$|:)', '', regex=True) # Remove ': nan' at the end or before another ':' + .str.replace(r':\s+:', ': ', regex=True) # Replace occurrences of ': :' with ': ' + .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space + .str.strip() # Strip leading/trailing spaces + ) + + individual_units_programme.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False + ) + # if __name__ == "__main__": # main() From 1645f9ab9ed84bdb90fa2a732d697111b36bd17b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:00:00 +0000 Subject: [PATCH 108/255] updating stonewater modelling code to use new data --- .../stonewater/Wave 3 Preparation.py | 288 +++++++++++++++--- 1 file changed, 247 insertions(+), 41 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 426097e8..f4195592 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1071,10 +1071,13 @@ def main(): ] # We now merge on the coordinator data so that against each property, we can map the measures + # TODO: Get the pre & post primary energy numbers + # TODO: Make sure the numbers are going down + retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, - "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx" + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" ), header=4 ) @@ -1084,6 +1087,18 @@ def main(): retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] + # populated_primary_energy = retrofit_packages_board[ + # ~pd.isnull(retrofit_packages_board['BASE 
Primary energy (13a-272)']) + # ] + # + # z = populated_primary_energy[ + # populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[ + # 'BASE Primary energy (13a-272)'] + # ] + # + # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[ + # 'BASE Primary energy (13a-272)']) + # Replace \n with "" extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "") @@ -1192,7 +1207,7 @@ def main(): # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv( # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv") - if len(missing_ids) != 6: + if len(missing_ids) != 1: raise Exception("Unacceptable number of missings") if matching_lookup["Address ID"].duplicated().sum(): @@ -1239,7 +1254,6 @@ def main(): if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") - # Create a section for costs for measure in measure_columns: stonewater_data[f"Cost of {measure}"] = None @@ -1297,8 +1311,41 @@ def main(): ]: stonewater_data[c] = stonewater_data[c].astype(str) + # FIll the primary energy numbers from the excel + stonewater_data = stonewater_data.merge( + retrofit_packages_board[ + [ + "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)" + ] + ], + on=["Address ID", "Name"], + how="left" + ) + stonewater_data["Primary Energy Use (kWh/yr)"] = np.where( + pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]), + stonewater_data["BASE Primary energy (13a-272)"], + stonewater_data["Primary Energy Use (kWh/yr)"] + ) + stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"]) + + # Add on organisation reference + original_archetypes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] + 
original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] + original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + + stonewater_data = stonewater_data.merge( + original_archetypes[["Address ID", 'Org. ref.']], + on="Address ID", + how="left" + ) + # Save this data to excel - stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False) cost_sheet = [ { @@ -1677,6 +1724,12 @@ def propsed_wave_3_sample(): asset_list = asset_list[asset_list["Address ID"] != "Address ID"] asset_list["Address ID"] = asset_list["Address ID"].astype(int) + asset_list["Street name"] = np.where( + pd.isnull(asset_list["Street name"]), + asset_list["Postcode"], + asset_list["Street name"] + ) + # Create the postal region, taking the first part of the postcode asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] @@ -1684,43 +1737,16 @@ def propsed_wave_3_sample(): # Keep just the columns we need asset_list = asset_list[ - ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region", + ["UPRN", "Address ID", 'Org. 
ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region", "Property Type", "Wall Type", "Roof Type", "Heating"] ] - # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False) survey_results = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"), + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), header=13, sheet_name="Modelled Packages" ) - additional_survey_data = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"), - header=0 - ) - - survey_results = survey_results.drop( - columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"] - ).merge( - additional_survey_data[ - [ - "Address ID", - "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness", - "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", - "Main Building Alternative Wall Thickness", - "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness" - ] - ].rename( - columns={ - "Main Wall Insulation_x": "Main Wall Insulation Type", - } - ), - how="left", - on="Address ID" - ) - - # TOOD: We probably want the actual surveyed wall, roof, heating type survey_results = survey_results[ [ "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", @@ -1768,6 +1794,105 @@ def propsed_wave_3_sample(): if survey_results_with_original_features.shape[0] != survey_results.shape[0]: raise ValueError("Something went wrong") + # Against properties that have NO package ref, we assign a package ref + properties_with_packages = survey_results_with_original_features[ + ~pd.isnull(survey_results_with_original_features["Package Ref"]) + ] + + properties_without_packages = survey_results_with_original_features[ + (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull( + 
survey_results_with_original_features["Package Ref"] + ) + ] + + # Change this to a lookup + package_ratings = pd.DataFrame([ + { + "1A": 1, + "1B": 2, + "2A": 3, + "2B": 4, + "3A": 5, + "3B": 6, + 4: 7 + } + ]) + package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank") + + mapped_package_refs = [] + for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)): + # Same archetype? + matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]] + + if matches.empty: + # Similar property + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"] == property["Wall Type"]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + raise Exception("Implement me") + if matches.shape[0] > 1: + # Take the package with the highest rank + matches = matches.merge( + package_ratings, + on="Package Ref", + how="left" + ).sort_values("Rank", ascending=False).head(1) + + mapped_package_refs.append( + { + "Address ID": property["Address ID"], + "Matched Package Ref": matches["Package Ref"].values[0] + } + ) + + mapped_package_refs = pd.DataFrame(mapped_package_refs) + + survey_results = 
survey_results.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results["Package Ref"] = np.where( + pd.notnull(survey_results["Matched Package Ref"]), + survey_results["Matched Package Ref"], + survey_results["Package Ref"] + ) + survey_results = survey_results.drop(columns=["Matched Package Ref"]) + + # Do the same with survey_results_with_original_features + survey_results_with_original_features = survey_results_with_original_features.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results_with_original_features["Package Ref"] = np.where( + pd.notnull(survey_results_with_original_features["Matched Package Ref"]), + survey_results_with_original_features["Matched Package Ref"], + survey_results_with_original_features["Package Ref"] + ) + survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"]) + + # Save the data for reference + # mapped_package_refs = mapped_package_refs.merge( + # asset_list[["Name", "Postcode", "Address ID", "Org. 
ref."]], + # on="Address ID", + # how="left" + # ) + # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False) + # We get longitude & Latitude archetyping_spatial_features = read_pickle_from_s3( bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", @@ -1911,7 +2036,8 @@ def propsed_wave_3_sample(): 'Current EPC Band', 'Current SAP Rating', 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System', - 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + 'Survey: Matching Address ID', 'Distance to Closest Match (m)', + "Package Ref" ]: region_assets[col] = np.where( pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), @@ -2027,7 +2153,7 @@ def propsed_wave_3_sample(): "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', - "Match Type" + "Match Type", "Package Ref" ] ) @@ -2183,6 +2309,13 @@ def propsed_wave_3_sample(): closest_match = surveyed.iloc[0] + # The closest property may be an EPC C, we we take the package ref from the property that's the nearest + # with non-NA package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = surveyed["Package Ref"].dropna().values[0] + final_missed_matches.append( { "Address ID": a_id, @@ -2195,7 +2328,7 @@ def propsed_wave_3_sample(): "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": closest_match["Package Ref"] + "Package Ref": package_ref } ) continue @@ -2225,6 +2358,11 @@ def propsed_wave_3_sample(): results = pd.concat(results) + 
results[ + pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D") + ]["Postal Region"] + results[resul] + # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( [ @@ -2269,8 +2407,6 @@ def propsed_wave_3_sample(): street_summary["Gain"] = street_summary[gain_columns].sum(axis=1) street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) - print(street_summary.sum()) - selected_rows, _ = optimise( gain=street_summary["Gain"].values, loss=street_summary["Loss"].values, @@ -2334,9 +2470,6 @@ def propsed_wave_3_sample(): package_summary, how="left", on="Street and Region" ) street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) - street_bid_structure.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False - ) individual_units_programme = results.copy() individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( @@ -2386,6 +2519,79 @@ def propsed_wave_3_sample(): .str.strip() # Strip leading/trailing spaces ) + # Any EPC C properties that have been included should be flagged as potential low carbon heating + selected_epc_c = individual_units_programme[ + (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) & + (individual_units_programme["Unit in Programme"]) + ] + + flat_wall_map = { + "CA Cavity: F Filled Cavity": False, + "CA Cavity: A As Built": True, + "SO Solid Brick: A As Built": True, + "Not Surveyed": False + } + + heating_map = { + "BGW Post 98 Combi condens. with auto ign.": False, + "BGB Post 98 Regular condens. 
with auto ign.": False, + "SEK High heat retention storage heaters": False, + "SEB Modern slimline storage heaters": True, + "Not Surveyed": False + } + + infill_data = [] + for _, epc_c_property in selected_epc_c.iterrows(): + if epc_c_property["Property Type"].split(":")[0] == "Flat": + # Look for a wall insulation measure + infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Possible Flat Infill?": infill + } + ) + continue + + infill = heating_map[epc_c_property["Survey: Primary Heating System"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Low Carbon Heating Infill?": infill + } + ) + infill_data = pd.DataFrame(infill_data) + + individual_units_programme = individual_units_programme.merge( + infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']], + how="left", on="Address ID" + ) + + for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']: + individual_units_programme[c] = individual_units_programme[c].fillna(False) + + infill_by_street = infill_data.pivot_table( + index='Street and Region', + values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'], + aggfunc='sum', + fill_value=0 + ).reset_index() + + street_bid_structure = street_bid_structure.merge( + infill_by_street, how="left", on="Street and Region" + ) + + for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: + street_bid_structure[c] = street_bid_structure[c].fillna(0) + + street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + ) + + # TODO: Add the full Address!!! 
+ individual_units_programme.to_csv( os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False ) From 9057b3d4da71f3dd63a8ae2924a073f6cc168dc8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:04:19 +0000 Subject: [PATCH 109/255] fixing assignment of package ref --- etl/customers/stonewater/Wave 3 Preparation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index f4195592..4a841f61 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2126,8 +2126,16 @@ def propsed_wave_3_sample(): ) expected_epc = sap_to_epc(expected_sap) + archetype_data = archetype_data.sort_values("distance_meters", ascending=True) + # We take the features of the closest matching property - closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0] + closest_match = archetype_data.iloc[0] + + # Set the package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = archetype_data["Package Ref"].dropna().values[0] region_surveyed.append( { @@ -2141,7 +2149,7 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: Matching Address ID": closest_match["Address ID"], 'Distance to Closest Match (m)': closest_match["distance_meters"], - "Package Ref": closest_match["Package Ref"], + "Package Ref": package_ref, "Match Type": match_type } ) From 0fafb03deebca4833680594b989b8362386257be Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 Nov 2024 22:06:51 +0000 Subject: [PATCH 110/255] tidying up code --- .../stonewater/Wave 3 Preparation.py | 27 ++----------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py 
index 4a841f61..34ab778a 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2366,10 +2366,8 @@ def propsed_wave_3_sample(): results = pd.concat(results) - results[ - pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D") - ]["Postal Region"] - results[resul] + if (pd.isnull(results["Package Ref"]) & (~results["Current EPC Band"].isin(["A", "B", "C"]))).sum(): + raise ValueError("Missing Package Refs") # Check if there are missings in current epc band, current sap rating or any of the survey attributes for c in ( @@ -2442,27 +2440,6 @@ def propsed_wave_3_sample(): ].sum() print("Total needing survey:", total_needing_survey) - # Look for postcodes that have no loss - unselected_streets = street_summary[ - ~street_summary["Selected"] - ]["Street and Region"].values - - postcode_summary = results[ - results["Street and Region"].isin(unselected_streets) - ].pivot_table( - index='Postcode', - columns='Confidence Tier', - aggfunc='size', - fill_value=0 - ).reset_index() - - postcode_summary["Gain"] = postcode_summary[gain_columns].sum(axis=1) - postcode_summary["Loss"] = postcode_summary[loss_columns].sum(axis=1) - - no_loss_postcodes = postcode_summary[postcode_summary["Loss"] == 0].sort_values("Gain", ascending=False) - total_bid_size = bid_size + no_loss_postcodes["Gain"].sum() - print(total_bid_size) - # Label final outputs # We create a summary of packages by street results["Package Ref"] = results["Package Ref"].fillna("Incomplete") From 631a76cb99d213d857c732ea1a58dd9d4291a716 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 21 Nov 2024 11:41:16 +0000 Subject: [PATCH 111/255] stonewater model completed --- etl/customers/ksquared/Wave3 Modelling.py | 35 +++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 32 +++++++++++------ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 
Modelling.py index 96ea2b03..7bfa33b3 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -8,6 +8,7 @@ from tqdm import tqdm import pandas as pd import numpy as np from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from etl.spatial.OpenUprnClient import OpenUprnClient from backend.SearchEpc import SearchEpc from utils.s3 import save_csv_to_s3 @@ -60,6 +61,7 @@ def hornsey(): } extracted_data = [] asset_list = [] + hornsey_asset_list["row_id"] = hornsey_asset_list.index for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)): if home["Address letter or number"] == "Flat 1 36 Haringey Park": @@ -108,12 +110,24 @@ def hornsey(): asset_list.append( { "uprn": newest_epc["uprn"], + "row_id": home["row_id"], "address": home["Address letter or number"], "postcode": home["Postcode"], "property_type": "Flat", # They're all flats } ) + # Get conservation area data + # uprns = [x["uprn"] for x in extracted_data] + # conservation_area_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") + # + # addresses = pd.DataFrame(asset_list) + # addresses["uprn"] = addresses["uprn"].astype(int) + # conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN") + # conservation_area_df.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/hornsey_conservation_area_data.csv" + # ) + # We format the extracted data so that is has the same structure as non-intrusive recommendations # We then get the UPRNs and create the asset list @@ -213,6 +227,8 @@ def caha(): # If pattern doesn't match, return original address return address + caha_asset_list["row_id"] = caha_asset_list.index + extracted_data = [] asset_list = [] for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)): @@ -270,6 +286,7 @@ def caha(): asset_list.append( { + "row_id": home["row_id"], "uprn": uprn, "address": address, "postcode": 
home["Postcode"], @@ -280,6 +297,24 @@ def caha(): } ) + # Missing row ids + missed = [r for r in caha_asset_list["row_id"].tolist() if r not in [x["row_id"] for x in asset_list]] + + no_data = [x for x in asset_list if x["uprn"] in [None, ""]] + no_data = pd.DataFrame(no_data) + + # Get conservation area data + uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]] + conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev") + + addresses = pd.DataFrame(asset_list) + addresses["uprn"] = addresses["uprn"].astype(str) + conservation_area_data["UPRN"] = conservation_area_data["UPRN"].astype(str) + conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN") + conservation_area_df.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_conservation_area_data.csv" + ) + non_invasive_recommendations = [ { "uprn": r["uprn"], diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 34ab778a..b6c29863 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -729,6 +729,7 @@ def extract_epr(pdf_path): "Main Building Alternative Wall Insulation": None, "Main Building Alternative Wall Dry-lining": None, "Main Building Alternative Wall Thickness": None, + "Main Fuel": None } with open(pdf_path, "rb") as file: @@ -1086,7 +1087,6 @@ def main(): retrofit_packages_board = retrofit_packages_board[ retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] - # populated_primary_energy = retrofit_packages_board[ # ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)']) # ] @@ -2442,8 +2442,11 @@ def propsed_wave_3_sample(): # Label final outputs # We create a summary of packages by street - results["Package Ref"] = results["Package Ref"].fillna("Incomplete") + results["Package Ref"] = results["Package Ref"].fillna("EPC C - No 
Package") results["Package Ref"] = results["Package Ref"].astype(str) + results["Package Ref"] = np.where( + results["Package Ref"] == "4.0", "4", results["Package Ref"] + ) package_summary = results.pivot_table( index='Street and Region', columns='Package Ref', @@ -2451,6 +2454,8 @@ def propsed_wave_3_sample(): fill_value=0 ).reset_index() + assert sum([v for k, v in package_summary.sum().items() if k != "Street and Region"]) == results.shape[0] + street_bid_structure = street_summary.merge( package_summary, how="left", on="Street and Region" ) @@ -2471,11 +2476,6 @@ def propsed_wave_3_sample(): asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) - individual_units_programme = individual_units_programme.merge( - asset_list_ids, - how="left", - on="Address ID", - ) individual_units_programme = individual_units_programme.merge( asset_list_ids.rename( @@ -2571,14 +2571,24 @@ def propsed_wave_3_sample(): for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: street_bid_structure[c] = street_bid_structure[c].fillna(0) - street_bid_structure.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False + master_sheet = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master " + "sheet.csv", + encoding='latin1' + ) + master_sheet = master_sheet[["Address ID", "Main Fuel"]] + + individual_units_programme = individual_units_programme.merge( + master_sheet, how="left", on="Address ID" ) - # TODO: Add the full Address!!! 
+ street_bid_structure.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure V2.csv"), index=False + ) individual_units_programme.to_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False ) # if __name__ == "__main__": From fff8f50f69cad56ffe353bdf2ab0aa6f2d12573e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 10:16:06 +0000 Subject: [PATCH 112/255] wave 3 applications closed --- etl/customers/cottons/parse_pdf_asset_list.py | 64 +++++++++++++++++++ etl/customers/cottons/prep_asset_list.py | 15 +++++ etl/customers/gla/hug_postcodes.py | 46 +++++++++++++ etl/customers/ksquared/Wave3 Modelling.py | 2 +- .../stonewater/Wave 3 Preparation.py | 16 +++++ .../stonewater/potential_eco_properties.py | 38 +++++++++++ etl/route_march_data_pull/app.py | 21 +++--- 7 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 etl/customers/cottons/parse_pdf_asset_list.py create mode 100644 etl/customers/cottons/prep_asset_list.py create mode 100644 etl/customers/gla/hug_postcodes.py diff --git a/etl/customers/cottons/parse_pdf_asset_list.py b/etl/customers/cottons/parse_pdf_asset_list.py new file mode 100644 index 00000000..7d442e97 --- /dev/null +++ b/etl/customers/cottons/parse_pdf_asset_list.py @@ -0,0 +1,64 @@ +import re +import pandas as pd +from PyPDF2 import PdfReader + +# Paths to the uploaded files +file_paths = [ + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 
6.pdf" +] + + +# Function to extract text from PDFs +def extract_text_from_pdf_with_pypdf2(file_path): + text = "" + reader = PdfReader(file_path) + for page in reader.pages: + text += page.extract_text() + return text + + +# Initialize a list to hold all parsed data +all_parsed_data = [] + +# Process each PDF individually +for i, path in enumerate(file_paths): + # Extract text from the PDF + extracted_text = extract_text_from_pdf_with_pypdf2(path) + + # Step 1: Remove titles and repeated headers + cleaned_text = re.sub(r"Managed Property Report as at \d+ \w+ \d+", "", extracted_text) + cleaned_text = re.sub(r"Code Property Address Management Type", "", cleaned_text) + + # Step 2: Extract rows ending with "Managed" + rows = re.findall(r".*?Managed", cleaned_text) + + # Step 3: Parse rows into structured data + parsed_data = [] + for row in rows: + match = re.match(r"(\S+)\s+(.+?)\s+Managed", row.strip()) + if match: + code = match.group(1).strip() + address = match.group(2).strip() + parsed_data.append((code, address, "Managed")) + + # Append parsed data to the global list + all_parsed_data.extend(parsed_data) + + # Provide feedback for debugging + print(f"File {i + 1} processed: {len(parsed_data)} rows") + +# Step 4: Create a unified DataFrame +final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"]) + +# Step 5: Save the unified DataFrame to an Excel file +final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx" +final_df.to_excel(final_output_file_path, index=False) + +# Provide feedback +print(f"All files processed and combined. 
Total rows: {len(final_df)}") +print(f"Unified file saved to: {final_output_file_path}") diff --git a/etl/customers/cottons/prep_asset_list.py b/etl/customers/cottons/prep_asset_list.py new file mode 100644 index 00000000..db7c6583 --- /dev/null +++ b/etl/customers/cottons/prep_asset_list.py @@ -0,0 +1,15 @@ +import pandas as pd + +df = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx" +) + +# split up the address on commas. First section is address1, last seciton is postcode +df["address1"] = df["Property Address"].apply(lambda x: x.split(",")[0].strip()) +df["postcode"] = df["Property Address"].apply(lambda x: x.split(",")[-1].strip()) + +# Re-save +df.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx", + index=False, +) diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py new file mode 100644 index 00000000..85783d62 --- /dev/null +++ b/etl/customers/gla/hug_postcodes.py @@ -0,0 +1,46 @@ +import inspect +import pandas as pd +from pathlib import Path +from tqdm import tqdm +from etl.epc.settings import EARLIEST_EPC_DATE + +src_file_path = inspect.getfile(lambda: None) + +EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates") +epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] + +aggregation = [] +for directory in tqdm(epc_directories): + data = pd.read_csv(directory / "certificates.csv", low_memory=False) + # Rename the columns to the same format as the api returns + data.columns = [c.replace("_", "-").lower() for c in data.columns] + + data = data[data["posttown"].str.contains("London", case=False, na=False)] + if data.empty: + continue + # Take just date before the date threshold + data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] + + data = data[~pd.isnull(data["uprn"])] + # Take just the newest EPC per uprn, based on lodgement-date + data = 
data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") + # Take EPC D and below + data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])] + data["postal_region"] = data["postcode"].str.split(" ").str[0] + + # Take homes that don't have a gas boiler + off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)] + + region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") + + aggregation.append(region_summary) + +postal_region_aggregation = pd.concat(aggregation) +postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False) +postal_region_aggregation = postal_region_aggregation.rename( + columns={"postal_region": "Postcode Region", "count": "Number of Homes"} +) +postal_region_aggregation.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx", + index=False +) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 7bfa33b3..0bf6eb18 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -305,7 +305,7 @@ def caha(): # Get conservation area data uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]] - conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev") + conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev") addresses = pd.DataFrame(asset_list) addresses["uprn"] = addresses["uprn"].astype(str) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b6c29863..77200e69 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2591,5 +2591,21 @@ def propsed_wave_3_sample(): os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False ) + survey_results = pd.read_excel( + 
os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + indivual_units = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv") + ) + + u_aids = survey_results["Archetype ID"].astype(str).unique() + units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values + + len({v for v in units_in_bid if str(v) in u_aids}) + len(list(set(units_in_bid))) + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index 4fb89113..c0301e9a 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -375,3 +375,41 @@ def app(): "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", index=False ) + + +def cross_reference_epc_programme(): + eco3_fallout = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE " + "SURVEYED - ECO3 NOT COMPLETED.xlsx" + ) + + eco3_fallout["house_number"] = eco3_fallout.apply( + lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1 + ) + + # for _, x in eco3_fallout.ite + + stonewater_modelled_above_c = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + + stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply( + lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1 + ) + + eco3_fallout_matched_to_above_c = [] + for _, property in eco3_fallout.iterrows(): + # Match on house number + match = stonewater_modelled_above_c[ + stonewater_modelled_above_c["house_number"] == property["house_number"] + ] + + # We do a fuzzy match on the address, 
with levenstein distance + + from fuzzywuzzy import fuzz + match = stonewater_modelled_above_c[ + stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90) + ] + match.head() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 6f9dd135..b53b36c2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -120,17 +120,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/" - DATA_FILENAME = "Bromford programme review.xlsx" - SHEET_NAME = "Bromford" - POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "No." - ADDRESS1_METHOD = "first_two_words" - ADDRESS_COLS_TO_CONCAT = ["No.", "Address"] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/" + DATA_FILENAME = "Cottons Asset List.xlsx" + SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = "postcode" + FULLADDRESS_COLUMN = "Property Address" + ADDRESS1_COLUMN = "address1" + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = [] asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - asset_list = asset_list[~pd.isnull(asset_list["Postcode"])] + # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces @@ -202,7 +202,8 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # Drop the column that is "" - transformed_df = transformed_df.drop(columns=[""]) + if "" in transformed_df.columns: + transformed_df = transformed_df.drop(columns=[""]) # Get the find my epc data find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( From 965cf975e289b11bd1387a55c251e1c50e0327e0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 13:08:03 +0000 Subject: [PATCH 113/255] setting up data 
extraction pilot --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/lodgement/app.py | 47 +++++ etl/lodgement/requirements.txt | 8 + utils/file_data_extraction.py | 343 +++++++++++++++++++++++++++++++++ 5 files changed, 400 insertions(+), 2 deletions(-) create mode 100644 etl/lodgement/app.py create mode 100644 etl/lodgement/requirements.txt create mode 100644 utils/file_data_extraction.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..9b63b142 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..acd935c1 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py new file mode 100644 index 00000000..ede644b8 --- /dev/null +++ b/etl/lodgement/app.py @@ -0,0 +1,47 @@ +import os +import utils.file_data_extraction as file_extraction_tools + + +def handler(): + """ + This is a simple application that will extract the data from documents that have been uploaded to Sharepoint + to populate the lodgement spreadsheet with + :return: + """ + + # Ths source data will eventually come from Sharepoint + source_data_path = "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot" + output_template = "Trustmark Details - Template REV.25.11.24.xlsx" + + # List the folders in the source data path + folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))] + + extractors = { + "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor, + "elmhurst summary report": None, + "osmosis condition report": None, + "elmhurst evidence report": None + } + + for property_folder in folders: + coordinator_folder = os.path.join(source_data_path, property_folder, "2. 
RA Coordinator Info") + + # Get the contents of the folder + coordinator_folder_contents = [ + file for file in os.listdir(coordinator_folder) if os.path.isfile(os.path.join(coordinator_folder, file)) + ] + + # We detect the various file types + extracted_contents = {} + for filename in coordinator_folder_contents: + filepath = os.path.join(coordinator_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type is None: + raise ValueError(f"Unknown report type for {filename}") + + file_extractor = extractors.get(report_type) + if file_extractor is None: + continue + + extracted_contents[report_type] = file_extractor(filepath).extract() diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt new file mode 100644 index 00000000..601907ed --- /dev/null +++ b/etl/lodgement/requirements.txt @@ -0,0 +1,8 @@ +PyPDF2 +pandas +tqdm +openpyxl +boto3 +usaddress==0.5.11 +fuzzywuzzy==0.18.0 +python-dotenv \ No newline at end of file diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py new file mode 100644 index 00000000..cdd25f8a --- /dev/null +++ b/utils/file_data_extraction.py @@ -0,0 +1,343 @@ +import PyPDF2 +import re +from collections import Counter + +""" +This script contains functions used to extract data from retrofit survey files, including EPRs, +summary reports, etc +""" + + +def is_elmhurst_energy_report(text): + """ + Determines if the provided text indicates that the PDF is an Energy Report. + Returns True if the text contains 'Energy Report'. + """ + return text.startswith("ENERGY REPORT") + + +def is_elmhurst_summary_report(text): + """ + Determines if the provided text indicates that the PDF is a Summary Report. + """ + return text.startswith("Summary Information") + + +def is_osmosis_condition_report(text): + """ + Determines if the provided text indicates that the PDF is a Condition Report. 
+ """ + return text.startswith("OsmosisACDNEWPAS2035ConditionReport") or text.startswith("OsmosisACDPAS2035ConditionReport") + + +def is_elmhurst_evidence_report(text): + """ + Determines if the provided text indicates that the PDF is an Elmhurst Evidence Report. + """ + return text.startswith("RdSAP Evidence Report") + + +def detect_pdf_report_type(pdf_path): + """ + Detects the type of report based on content or filename. + :param pdf_path: String path to the PDF file + :param pdf_file: String name of the PDF file + :return: String type of the report ("epr", "summary", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" + + if is_elmhurst_energy_report(first_page_text): + return "elmhurst epr" + elif is_elmhurst_summary_report(first_page_text): + return "elmhurst summary report" + elif is_osmosis_condition_report(first_page_text): + return "osmosis condition report" + elif is_elmhurst_evidence_report(first_page_text): + return "elmhurst evidence report" + + return None + + +def is_pdf(filename): + """ + Determines if the provided filename is a PDF file. + """ + return filename.endswith(".pdf") + + +class ElmhurstEprExtractor: + def __init__(self, file_path): + self.file_path = file_path + + @staticmethod + def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. 
+ """ + windows_text = windows_text.replace("\n", "") + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) + } + + @staticmethod + def extract_building_parts(text): + """ + Extracts building parts and associated dimensions from the provided text. 
+ """ + data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party " + r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + floor_data = match.group(2) + room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, + "Perimeter (m)": None, + "Party Wall Length (m)": None + }) + else: + cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip() + + floor_pattern = re.compile( + r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + return data + + @staticmethod + def extract_roof_details(text): + """ + Extracts roof details for each building part in the provided text. 
+ """ + roof_data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + part_details = match.group(2) + roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) + + roof_data.append({ + "Building Part": cleaned_part_name, + "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, + "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, + "Roof Insulation Thickness": roof_insulation_thickness_match.group( + 1).strip() if roof_insulation_thickness_match else None, + }) + + return roof_data + + @staticmethod + def extract_wall_details(text): + """ + Extracts wall details for each building part in the provided text. 
+ """ + wall_data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + part_details = match.group(2) + wall_type_match = re.search(r"Wall Type:\s*(.*?)(?=\n|$)", part_details) + wall_insulation_match = re.search(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details) + wall_drylining_match = re.search(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) + wall_thickness_match = re.search(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details) + + wall_data.append({ + "Building Part": cleaned_part_name, + "Wall Type": wall_type_match.group(1).strip() if wall_type_match else None, + "Wall Insulation": wall_insulation_match.group(1).strip() if wall_insulation_match else None, + "Wall Dry-lining": wall_drylining_match.group(1).strip() if wall_drylining_match else None, + "Wall Thickness": int(wall_thickness_match.group(1)) if wall_thickness_match else None, + }) + + return wall_data + + @staticmethod + def extract_primary_heating(text): + + # Extract Primary Heating Section (Main Heating 1) + primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + # We may not have a secondary heating + primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 + primary_text = primary_heating_section.group(1) + + primary_heating_output = { + "Existing Primary Heating System": re.search( + r"Main Heating Code\s*(.*?)\n", primary_text + ).group(1).strip(), + "Existing Primary Heating PCDF Reference": re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1), + "Existing Primary Heating 
Controls": re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip(), + "Existing Primary Heating % of Heat": int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) + ) + } + + return primary_heating_output + + @staticmethod + def extract_secondary_heating(text): + # Extract Secondary Heating Section (Main Heating 2) + secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + output = {} + if secondary_heating_section is None: + output["Existing Heating System"] = "" + output["Existing Heating PCDF Reference"] = "" + output["Existing Heating Controls"] = "" + output["Existing Heating % of Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + output["Existing Heating System"] = main_heating_code_match_secondary.group(1).strip() + + output["Existing Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", secondary_text + ).group(1) + + if output["Existing Heating System"] == "": + output["Existing Heating Controls"] = "" + else: + # Might not have heating controls on 2nd system + secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + output["Existing Heating Controls"] = ( + secondary_controls_match.group(1).strip() if secondary_controls_match else "" + ) + output["Existing Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) + + return output + + def extract(self): + data = {} + + with open(self.file_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "".join(page.extract_text() for page in reader.pages) + + # Extracting individual components + address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + data["Address"] = address_match.group(1).strip() + 
data["Postcode"] = data["Address"].split(",")[-1].strip() + + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) + data["Current SAP Rating"] = int(sap_match.group(1)) + + energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) + + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + data["Number of Storeys"] = int(storeys_match.group(1)) + + fuel_match = re.search(r"TOTAL\s*£(\d+)", text) + data["Fuel Bill"] = f"£{fuel_match.group(1)}" + + total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Get number of lighting outlets and number of fittings needing LEL + lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) + data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) + lel_fittings_match = re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text) + data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + data["Windows"] = self.extract_window_age_description(windows_section.group(1)) + + data["Primary Heating"] = self.extract_primary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating(text) + data["Building Parts"] = self.extract_building_parts(text) + data["Roof Details"] = self.extract_roof_details(text) + data["Wall Details"] = self.extract_wall_details(text) + + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + if data["Secondary Heating"]["Existing Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" + + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + return data From 0efd0163ee9ad103b97ef6ebdf4419c580f305b3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 13:30:36 +0000 Subject: [PATCH 114/255] refactoring epr extraction --- etl/lodgement/app.py | 1 - utils/file_data_extraction.py | 126 ++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 51 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index ede644b8..a395508c 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -43,5 +43,4 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue - extracted_contents[report_type] = file_extractor(filepath).extract() diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index cdd25f8a..15b183dc 100644 --- a/utils/file_data_extraction.py +++ 
b/utils/file_data_extraction.py @@ -69,6 +69,10 @@ def is_pdf(filename): class ElmhurstEprExtractor: + """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ + def __init__(self, file_path): self.file_path = file_path @@ -219,7 +223,30 @@ class ElmhurstEprExtractor: return wall_data @staticmethod - def extract_primary_heating(text): + def _extract_heating_details(section_text): + """ + Extracts heating details from a given section of text. + + Args: + section_text (str): The section of text containing heating details. + + Returns: + dict: A dictionary containing heating system details. + """ + + system_search = re.search(r"Main Heating Code\s*(.*?)\n", section_text) + pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text) + controls_search = re.search(r"Main Heating Controls\s*(.*?)\n", section_text) + heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text) + + return { + "System": system_search.group(1).strip() if system_search else "", + "PCDF Reference": pcdf_search.group(1) if pcdf_search else "", + "Controls": controls_search.group(1).strip() if controls_search else "", + "% of Heat": int(heat_search.group(1)) if heat_search else 0, + } + + def extract_primary_heating(self, text): # Extract Primary Heating Section (Main Heating 1) primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) @@ -228,61 +255,46 @@ class ElmhurstEprExtractor: primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 primary_text = primary_heating_section.group(1) - primary_heating_output = { - "Existing Primary Heating System": re.search( - r"Main Heating Code\s*(.*?)\n", primary_text - ).group(1).strip(), - "Existing Primary Heating PCDF Reference": re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1), - "Existing Primary Heating Controls": re.search( - r"Main Heating 
Controls\s*(.*?)\n", primary_text - ).group(1).strip(), - "Existing Primary Heating % of Heat": int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) - ) - } + return self._extract_heating_details(primary_text) - return primary_heating_output - - @staticmethod - def extract_secondary_heating(text): + def extract_secondary_heating(self, text): # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + output = {} if secondary_heating_section is None: - output["Existing Heating System"] = "" - output["Existing Heating PCDF Reference"] = "" - output["Existing Heating Controls"] = "" - output["Existing Heating % of Heat"] = 0 + + output["System"] = "" + output[" PCDF Reference"] = "" + output["Controls"] = "" + output["% of Heat"] = 0 else: secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + output.update( + **self._extract_heating_details(secondary_text) ) - output["Existing Heating System"] = main_heating_code_match_secondary.group(1).strip() - output["Existing Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", secondary_text - ).group(1) - - if output["Existing Heating System"] == "": - output["Existing Heating Controls"] = "" - else: - # Might not have heating controls on 2nd system - secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - output["Existing Heating Controls"] = ( - secondary_controls_match.group(1).strip() if secondary_controls_match else "" - ) - output["Existing Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) - ) + output["Heating Code"] = ( + re.search(r"Secondary Heating Code\s*(.*?)\n", text).group(1).strip() + if output["System"] and re.search(r"Secondary Heating Code\s*(.*?)\n", text) + 
else "" + ) return output def extract(self): + """ + Extracts all relevant data from the EPR PDF. + + Returns: + dict: A dictionary containing extracted data, including: + - Address and Postcode + - SAP Rating and Primary Energy Use + - Lighting, Doors, Windows, Roof, and Wall Details + - Heating systems (Primary and Secondary) + - Building Parts + """ data = {} with open(self.file_path, "rb") as file: @@ -291,36 +303,56 @@ class ElmhurstEprExtractor: # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + if not address_match: + raise ValueError("Failed to extract address.") data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) + if not sap_match: + raise ValueError("Failed to extract SAP rating.") data["Current SAP Rating"] = int(sap_match.group(1)) energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + if not energy_match: + raise ValueError("Failed to extract primary energy use.") data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + raise ValueError("Failed to extract number of storeys.") data["Number of Storeys"] = int(storeys_match.group(1)) fuel_match = re.search(r"TOTAL\s*£(\d+)", text) + if not fuel_match: + raise ValueError("Failed to extract fuel bill.") data["Fuel Bill"] = f"£{fuel_match.group(1)}" total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + if not total_doors_match: + raise ValueError("Failed to extract total doors.") data["Total Number of Doors"] = int(total_doors_match.group(1)) # Extract Number of Insulated Doors insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + if not insulated_doors_match: + raise ValueError("Failed to extract insulated doors.") data["Number of 
Insulated Doors"] = int(insulated_doors_match.group(1)) # Get number of lighting outlets and number of fittings needing LEL lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) + if not lighting_fittings_match: + raise ValueError("Failed to extract lighting") data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) + if not lel_fittings_match: + raise ValueError("Failed to extract LEL fittings.") data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + if not windows_section: + raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) data["Primary Heating"] = self.extract_primary_heating(text) @@ -329,15 +361,9 @@ class ElmhurstEprExtractor: data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - - if data["Secondary Heating"]["Existing Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" - + if not water_heating_code_match: + raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data From 749faaebca22c8353ab09965ddc9c087a6c5d0d0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 17:07:56 +0000 Subject: [PATCH 115/255] extending extraction --- etl/lodgement/app.py | 151 ++++++++++++++++- 
utils/file_data_extraction.py | 116 ++++++++++++- utils/fullSapParser.py | 306 ++++++++++++++++++++++++++++++++++ 3 files changed, 562 insertions(+), 11 deletions(-) create mode 100644 utils/fullSapParser.py diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index a395508c..b8b7e393 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -1,5 +1,83 @@ import os import utils.file_data_extraction as file_extraction_tools +from utils.fullSapParser import FullSapParser + +output_template = { + "Property Address": None, + "Osm. ID": None, + "Postcode": None, + "City/County": None, + "District/Town": None, + "Funding Stream": None, + "Local Authority": None, + "Trustmark Lodgement ID": None, + "Certificate Number": None, + "EWI UMR": None, + "Loft UMR": None, + "Windows UMR": None, + "Doors UMR": None, + "Measure Lodgement Date": None, + "Full Lodgement Date": None, + "Name": None, + "Phone": None, + "Email": None, + "Secondary Contact Name": None, + "Secondary Contact Phone": None, + "Trustmark Licence Number": None, + "Retrofit Assessment Date": None, + "Company Name": None, + "Retrofit Designer Name": None, + "Property Type": None, + "Property Detachment": None, + "No. of Bedrooms": None, + "Property Age": None, + "SAP Rating Pre (from IMA)": None, + "Pre Heat Transfer": None, + "Pre Total Floor Area": None, + "Pre Heat Demand": None, + "Pre Air Tightness": None, + "SAP Rating Post (from EPC)": None, + "Post Heat Transfer": None, + "Post Total Floor Area": None, + "Post Heat Demand": None, + "Post Air Tightness": None, + "Number of Eligible Measures Installed": None, + "Total Cost of Works": None, + "Annual Fuel Saving (MTP)": None, + "Work Type ID": None, + "Measure Category": None, + "Installer": None, + "Operative Name": None, + "Operative Certif. 
Reference": None, + "Manufacturer": None, + "Model": None, + "Financial Protection Body (IBG)": None, + "Policy Start Date": None, + "IBG Policy Reference": None, + "Warranty Duration": None, + "Total Invoiced (Including VAT)": None, + "Installation Date": None, + "Handover Date": None, + "Percentage": None, + "Reference Number": None, +} + + +def update_dictionary_with_check(dictionary, updates): + """ + Updates a dictionary with key-value pairs, raising an error if the key does not exist. + + Args: + dictionary (dict): The dictionary to update. + updates (dict): The updates to apply. + + Raises: + KeyError: If a key in updates does not exist in the dictionary. + """ + for key, value in updates.items(): + if key not in dictionary: + raise KeyError(f"Key '{key}' does not exist in the dictionary.") + dictionary[key] = value def handler(): @@ -11,7 +89,11 @@ def handler(): # Ths source data will eventually come from Sharepoint source_data_path = "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot" - output_template = "Trustmark Details - Template REV.25.11.24.xlsx" + output_template_file = "Trustmark Details - Template REV.25.11.24.xlsx" + funding_stream = "HUG2" + customer_name = "Shropshire Council" + customer_phone = "0345 678 9000" + customer_email = "affordablewarmth@shropshire.gov.uk" # List the folders in the source data path folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))] @@ -20,7 +102,8 @@ def handler(): "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor, "elmhurst summary report": None, "osmosis condition report": None, - "elmhurst evidence report": None + "elmhurst evidence report": None, + "full sap xml": FullSapParser, } for property_folder in folders: @@ -43,4 +126,68 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue + extracted_contents[report_type] = file_extractor(filepath).extract() + + if file_extraction_tools.is_xml(filepath): 
+ xml_type = file_extraction_tools.detect_xml_report_type(xml_path=filepath) + if xml_type is None: + raise ValueError(f"Unknown report type for {filename}") + file_extractor = extractors.get(xml_type) + if file_extractor is None: + continue + extracted_contents[xml_type] = file_extractor(filepath).extract() + + output_row_data = output_template.copy() + + # dict_keys([, , , 'City/County', 'District/Town', + # 'Local Authority', + # 'Trustmark Lodgement ID', + # 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', + # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Name', 'Phone', 'Email', 'Secondary Contact + # Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name', + # 'Retrofit Designer Name', , 'No. of Bedrooms', + # , 'Pre Heat Transfer', 'Pre Total Floor Area', 'Pre Heat Demand', + # 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area', + # 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works', + # 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative + # Certif. Reference', 'Manufacturer', 'Model', 'Financial Protection Body (IBG)', 'Policy Start Date', + # 'IBG Policy Reference', 'Warranty Duration', 'Total Invoiced (Including VAT)', 'Installation Date', + # 'Handover Date', 'Percentage', 'Reference Number']) + # Populate the output row data + if extracted_contents["elmhurst epr"]: + total_floor_area = sum( + [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] + + # Get the conservatory floor area + extracted_contents["elmhurst epr"]["Conservatory"]["Floor Area (m2)"] + ) + + to_insert = { + "Property Address": property_folder.split(")")[1].strip(), + "Osm. 
ID": property_folder.split(")")[0].strip().lstrip("(").strip(), + "Postcode": extracted_contents["elmhurst epr"]["Postcode"], + "City/County": None, + "District/Town": None, + "Funding Stream": funding_stream, + "Local Authority": None, + 'Property Age': extracted_contents["elmhurst epr"]["Property Age"], + 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], + 'Pre Heat Transfer': extracted_contents["elmhurst epr"][ + "Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area, + } + + output_row_data["Property Address"] = property_folder.split(")")[1].strip() + output_row_data["Osm. ID"] = property_folder.split(")")[0].strip().lstrip("(").strip() + output_row_data["Postcode"] = extracted_contents["elmhurst epr"]["Postcode"] + output_row_data["City/County"] = () + output_row_data["Batch"] = () + output_row_data["Funding Stream"] = funding_stream + output_row_data["Risk Path"] = () + + if extracted_contents["full sap xml"]: + to_insert = { + "Property Type": extracted_contents["full sap xml"]["Property Type"], + "Property Detachment": extracted_contents["full sap xml"]["Built Form"], + "Property Age": extracted_contents["full sap xml"]["Age Band"], + + } diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index 15b183dc..f0d341c6 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -1,6 +1,10 @@ import PyPDF2 import re from collections import Counter +from utils.logger import setup_logger +from xml.dom.minidom import parseString + +logger = setup_logger() """ This script contains functions used to extract data from retrofit survey files, including EPRs, @@ -61,6 +65,25 @@ def detect_pdf_report_type(pdf_path): return None +def detect_xml_report_type(xml_path): + """ + Detects the type of XML report based on content or filename. 
+ :param xml_path: String path to the XML file + :return: String type of the report ("full sap xml", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(xml_path, "r") as file: + contents = file.read() + + contents = parseString(contents) + product_tag_search = contents.getElementsByTagName("Product") + if product_tag_search: + if product_tag_search[0].firstChild.nodeValue == "Sap 2012 Desktop": + return "full sap xml" + + raise Exception("Not implemented") + + def is_pdf(filename): """ Determines if the provided filename is a PDF file. @@ -68,6 +91,13 @@ def is_pdf(filename): return filename.endswith(".pdf") +def is_xml(filename): + """ + Determines if the provided filename is an XML file. + """ + return filename.endswith(".xml") + + class ElmhurstEprExtractor: """ A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). @@ -223,26 +253,82 @@ class ElmhurstEprExtractor: return wall_data @staticmethod - def _extract_heating_details(section_text): + def extract_conservatory(text): + """ + Extracts conservatory data from the provided text. + The section is located between "Conservatory" and "Doors". + + Args: + text (str): The full text of the EPR PDF. 
+ + Returns: + dict: A dictionary with conservatory details: + - "Conservatory Present" + - "Conservatory Separated" + - "Conservatory Floor Area" + - "Conservatory Double Glazed" + - "Conservatory Glazed Perimeter" + - "Heated Conservatory Height" + """ + + conservatory_match = re.search(r"Conservatory\s*(.*?)\s*Doors", text, re.DOTALL) + if not conservatory_match: + logger.error("Failed to extract conservatory data.") + raise ValueError("Could not extract conservatory data.") + + conservatory_text = conservatory_match.group(1) + + # Check if conservatory is present + present_match = re.search(r"Conservatory Present:\s*(Yes|No)", conservatory_text) + + if not present_match or present_match.group(1).strip() == "No": + logger.info("Conservatory not present.") + return { + "Conservatory Present": "No", + "Conservatory Separated": "", + "Conservatory Floor Area": 0, + "Conservatory Double Glazed": "", + "Conservatory Glazed Perimeter": 0, + "Heated Conservatory Height": "", + } + + # Extract conservatory details + separated_match = re.search(r"Conservatory Separated:\s*(Yes|No)", conservatory_text) + floor_area_match = re.search(r"Conservatory Floor Area:\s*([\d.]+)", conservatory_text) + double_glazed_match = re.search(r"Conservatory Double Glazed:\s*(Yes|No)", conservatory_text) + glazed_perimeter_match = re.search(r"Conservatory Glazed Perimeter:\s*([\d.]+)", conservatory_text) + height_match = re.search(r"Heated Conservatory Height:\s*(.*?)(?=\n|$)", conservatory_text) + + return { + "Conservatory Present": "Yes", + "Conservatory Separated": separated_match.group(1).strip() if separated_match else "", + "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0, + "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "", + "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0, + "Heated Conservatory Height": height_match.group(1).strip() if 
height_match else "", + } + + @staticmethod + def _extract_heating_details(section_text, default_value=""): """ Extracts heating details from a given section of text. Args: section_text (str): The section of text containing heating details. + default_value (str, optional): The default value to return for missing fields. Defaults to "". Returns: dict: A dictionary containing heating system details. """ - system_search = re.search(r"Main Heating Code\s*(.*?)\n", section_text) pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text) controls_search = re.search(r"Main Heating Controls\s*(.*?)\n", section_text) heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text) return { - "System": system_search.group(1).strip() if system_search else "", - "PCDF Reference": pcdf_search.group(1) if pcdf_search else "", - "Controls": controls_search.group(1).strip() if controls_search else "", + "System": system_search.group(1).strip() if system_search else default_value, + "PCDF Reference": pcdf_search.group(1) if pcdf_search else default_value, + "Controls": controls_search.group(1).strip() if controls_search else default_value, "% of Heat": int(heat_search.group(1)) if heat_search else 0, } @@ -257,7 +343,7 @@ class ElmhurstEprExtractor: return self._extract_heating_details(primary_text) - def extract_secondary_heating(self, text): + def extract_secondary_heating_details(self, text): # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) @@ -265,7 +351,7 @@ class ElmhurstEprExtractor: if secondary_heating_section is None: output["System"] = "" - output[" PCDF Reference"] = "" + output["PCDF Reference"] = "" output["Controls"] = "" output["% of Heat"] = 0 @@ -304,65 +390,77 @@ class ElmhurstEprExtractor: # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) if not 
address_match: + logger.error("Failed to extract address.") raise ValueError("Failed to extract address.") data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) if not sap_match: + logger.error("Failed to extract SAP rating.") raise ValueError("Failed to extract SAP rating.") data["Current SAP Rating"] = int(sap_match.group(1)) energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) if not energy_match: + logger.error("Failed to extract primary energy use.") raise ValueError("Failed to extract primary energy use.") data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) if not storeys_match: - raise ValueError("Failed to extract number of storeys.") + logger.error("Failed to extract the number of storeys.") + raise ValueError("Failed to extract the number of storeys.") data["Number of Storeys"] = int(storeys_match.group(1)) fuel_match = re.search(r"TOTAL\s*£(\d+)", text) if not fuel_match: + logger.error("Failed to extract fuel bill.") raise ValueError("Failed to extract fuel bill.") data["Fuel Bill"] = f"£{fuel_match.group(1)}" total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) if not total_doors_match: + logger.error("Failed to extract total doors.") raise ValueError("Failed to extract total doors.") data["Total Number of Doors"] = int(total_doors_match.group(1)) # Extract Number of Insulated Doors insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) if not insulated_doors_match: + logger.error("Failed to extract insulated doors.") raise ValueError("Failed to extract insulated doors.") data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) # Get number of lighting outlets and number of fittings needing LEL lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) if 
not lighting_fittings_match: + logger.error("Failed to extract lighting.") raise ValueError("Failed to extract lighting") data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) if not lel_fittings_match: + logger.error("Failed to extract LEL fittings.") raise ValueError("Failed to extract LEL fittings.") data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) if not windows_section: + logger.error("Failed to extract window data.") raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) data["Primary Heating"] = self.extract_primary_heating(text) - data["Secondary Heating"] = self.extract_secondary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating_details(text) data["Building Parts"] = self.extract_building_parts(text) data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) + data["Conservatory"] = self.extract_conservatory(text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) if not water_heating_code_match: + logger.error("Failed to extract water heating code.") raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() diff --git a/utils/fullSapParser.py b/utils/fullSapParser.py new file mode 100644 index 00000000..540eff6f --- /dev/null +++ b/utils/fullSapParser.py @@ -0,0 +1,306 @@ +import boto3 +from xml.dom.minidom import parseString + +PROPERTY_AGE_BAND = { + "A": "before 1900", + "B": "1900-1929", + "C": "1930-1949", + "D": "1950-1966", + "E": "1967-1975", + "F": "1976-1982", + "G": "1983-1990", + "H": "1991-1995", 
+ "I": "1996-2002", + "J": "2003-2006", + "K": "2007-2011", + "L": "2012 onwards" +} + +POSITION_OF_FLAT = { + "TopFloorFlat": "(top floor)" +} + +MAINHEATING_LOOKUP = { + "SEB": "Electric (SEB modern slimline storage heaters)" +} + +WINDOWS_YEAR_LOOKUP = { + "unknown install date": "unknown year", + "unknown install": "unknown year", + "post or during 2002": "2002 onwards", +} + + +class FullSapParser: + full_address = None + archetype = None + age_band = None + unheated_corridor = None + property_type = None + built_form = None + + # ventilation + mechanical_ventilation = None + cross_ventilation = None + night_ventilation = None + + # dimensions + number_of_storeys = None + property_dimensions = None + + # fabric + low_energy_lighting = None + + # Heating + heating1 = None + cylinder = None + cylinder_stat = None + + def __init__(self, filekey, bucket_name=None): + self.s3_client = boto3.client('s3') + self.bucket_name = bucket_name + self.filekey = filekey + self.full_sap = None + + self._read_file() + + def _read_file(self): + """ + Reads the XML file either locally or from S3 and parses it using minidom. + + Raises: + ValueError: If the file cannot be found, read, or parsed. 
+ """ + try: + if self.bucket_name: + # Read from S3 + response = self.s3_client.get_object(Bucket=self.bucket_name, Key=self.filekey) + xml_content = response['Body'].read() + else: + # Read locally + with open(self.filekey, "r") as f: + xml_content = f.read() + + # Parse the XML content using minidom + self.full_sap = parseString(xml_content) + except FileNotFoundError: + raise ValueError(f"Local file not found: {self.filekey}") + except Exception as e: + raise ValueError(f"An error occurred while reading or parsing the XML: {e}") + + def extract(self, _return=True): + self.get_address() + self.get_archetype() + self.get_age_band() + self.get_unheated_corridor() + self.get_heating_1() + self.get_ventilation() + self.get_floor_area() + self.get_low_energy_lighting() + self.get_cylinder() + + if _return: + return { + "Property Type": self.property_type, + "Built Form": self.built_form, + "Age Band": self.age_band, + } + + def get_address(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + address = self.full_sap.getElementsByTagName("AddressAsDesigned") + if len(address) != 1: + raise ValueError("Non-unique address tag found - investigate me") + + address = address[0] + data = {} + for node in address.childNodes: + if node.nodeType == node.ELEMENT_NODE: + data[node.nodeName] = node.firstChild.nodeValue if node.firstChild else None + + self.full_address = " ".join( + [ + x.title() for x in [data["AddressLine1"], data["AddressLine2"], data["AddressLine3"], data["Town"]] + if x is not None + ] + ) + " " + data["Postcode"] + + def get_archetype(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + property_type1 = self.full_sap.getElementsByTagName('PropertyType1') + property_type2 = self.full_sap.getElementsByTagName('PropertyType2') + position_of_flat = self.full_sap.getElementsByTagName('PositionOfFlat') + + if len(property_type1) != 1 or len(property_type2) != 1: + raise ValueError("Non-unique 
property tag found - investigate me") + + property_type1 = property_type1[0].firstChild.nodeValue + property_type2 = property_type2[0].firstChild.nodeValue + if position_of_flat[0].firstChild: + position_of_flat = POSITION_OF_FLAT[position_of_flat[0].firstChild.nodeValue] + else: + position_of_flat = None + + self.property_type = property_type1 + self.built_form = property_type2 + self.archetype = property_type1 + " - " + property_type2 + + if position_of_flat: + self.archetype = self.archetype + " " + position_of_flat + + def get_age_band(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + property_age_band = self.full_sap.getElementsByTagName('PropertyAgeBand') + + if len(property_age_band) != 1: + raise ValueError("Non-unique property age band tag found - investigate me") + + property_age_band = property_age_band[0].firstChild.nodeValue + self.age_band = PROPERTY_AGE_BAND[property_age_band] + + def get_wall_area_for_description(self, description): + wall_recs = self.full_sap.getElementsByTagName("WallRec") + for wall_rec in wall_recs: + desc_elements = wall_rec.getElementsByTagName("Description") + if desc_elements and desc_elements[0].firstChild.data == description: + area_elements = wall_rec.getElementsByTagName("Area") + if area_elements: + area = float(area_elements[0].firstChild.data) + # Placeholder for wall_description which you'll populate later + return f"Unheated corridor - {area} area" + return None + + def get_unheated_corridor(self): + """ + Unheated corridors don't always exist so we'll need to search for it + :return: + """ + + if not self.full_sap: + raise ValueError("You need to read the file first") + + self.unheated_corridor = self.get_wall_area_for_description("Flat corridor Main") + + def get_heating_1(self): + + if not self.full_sap: + raise ValueError("You need to read the file first") + + main_heating_system = self.full_sap.getElementsByTagName('MainHeatingSystem1') + + if len(main_heating_system) != 
1: + raise ValueError("Non-unique main heating system tag found - investigate me") + + main_heating_system = main_heating_system[0] + + mhs = main_heating_system.getElementsByTagName('MHS')[0].firstChild.nodeValue + mhs = MAINHEATING_LOOKUP.get(mhs, mhs) + + fraction = main_heating_system.getElementsByTagName('Fraction')[0].firstChild.nodeValue + + self.heating1 = f"{mhs} : {fraction}% of heating" + + def get_ventilation(self): + + bool_lookup = { + "true": True, + "false": False + } + + # Extract MechanicalVentilationDecentralised + mech_vent = self.full_sap.getElementsByTagName("MechanicalVentilationDecentralised") + if mech_vent and mech_vent[0].childNodes: + mech_vent_value = mech_vent[0].firstChild.nodeValue + else: + mech_vent_value = None + + # Extract CrossVentilation + cross_vent = self.full_sap.getElementsByTagName("CrossVentilation") + if cross_vent and cross_vent[0].childNodes: + cross_vent_value = cross_vent[0].firstChild.nodeValue + cross_vent_value = bool_lookup.get(cross_vent_value, cross_vent_value) + else: + cross_vent_value = None + + # Extract NightVentilation + night_vent = self.full_sap.getElementsByTagName("NightVentilation") + if night_vent and night_vent[0].childNodes: + night_vent_value = night_vent[0].firstChild.nodeValue + night_vent_value = bool_lookup.get(night_vent_value, night_vent_value) + else: + night_vent_value = None + + # Create the outputs + self.mechanical_ventilation = "Mechanical ventilation present" if mech_vent_value else "No mechanical " \ + "ventilation" + self.cross_ventilation = "Cross ventilation present" if cross_vent_value else "No cross ventilation" + self.night_ventilation = "Night ventilation present" if night_vent_value else "No night ventilation" + + def get_floor_area(self): + + self.number_of_storeys = int(self.full_sap.getElementsByTagName('NumberOfStoreys')[0].firstChild.nodeValue) + storeys = self.full_sap.getElementsByTagName('StoreyMeasurementRec') + + # TODO: The first StoreyMeasurementRec tag looks 
like this in the examples we've seen: + # + # Indicating that the tag is explicitly indicated as empty + + storey_data = [] + storey_index = -1 + for storey in storeys: + storey_index += 1 + + if storey.getAttribute("xsi:nil") == "true": + continue + + if storey_index == -1: + raise NotImplementedError( + "Investigated me - potentially basement found but need to confirm with Basement tag" + ) + + floor_area = storey.getElementsByTagName('InternalFloorArea') + if not floor_area: + continue + + floor_area = float(floor_area[0].firstChild.nodeValue) + # If floor area is 0, skip this storey + if not floor_area: + continue + + perimeter = float(storey.getElementsByTagName('InternalPerimeter')[0].firstChild.nodeValue) + height = float(storey.getElementsByTagName('StoreyHeight')[0].firstChild.nodeValue) + + storey_data.append({ + "storey_index": storey_index, + "Floor Area": floor_area, + "Perimeter": perimeter, + "Height": height + }) + + # We will convert this into a table in the markdown + self.property_dimensions = storey_data + + def get_low_energy_lighting(self): + # Extract the values of the LightFittings and LELFittings tags + light_fittings = self.full_sap.getElementsByTagName('LightFittings')[0].firstChild.data + lel_fittings = self.full_sap.getElementsByTagName('LELFittings')[0].firstChild.data + + # Construct the string message + self.low_energy_lighting = f"{lel_fittings} out of {light_fittings} lighting fittings are low energy." + + def get_cylinder(self): + insulation_type = self.full_sap.getElementsByTagName('InsulationType')[0].firstChild.data + insulation_thickness = self.full_sap.getElementsByTagName('InsulationThickness')[0].firstChild.data + + if insulation_type and insulation_thickness: + self.cylinder = f"Insulated, {insulation_type}: {insulation_thickness}mm." + else: + self.cylinder = "Not insulated." 
+ + self.cylinder_stat = self.full_sap.getElementsByTagName('CylinderStat')[0].firstChild.data From 63521dd1e303cae0daa1fdf8e405d7e8c953a1da Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 17:18:17 +0000 Subject: [PATCH 116/255] extending extraction --- etl/lodgement/app.py | 21 +++++++++++++++------ utils/file_data_extraction.py | 2 ++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index b8b7e393..4ff8bdf1 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -126,7 +126,7 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue - + extracted_contents[report_type] = file_extractor(filepath).extract() if file_extraction_tools.is_xml(filepath): @@ -136,6 +136,7 @@ def handler(): file_extractor = extractors.get(xml_type) if file_extractor is None: continue + extracted_contents[xml_type] = file_extractor(filepath).extract() output_row_data = output_template.copy() @@ -144,10 +145,12 @@ def handler(): # 'Local Authority', # 'Trustmark Lodgement ID', # 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', - # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Name', 'Phone', 'Email', 'Secondary Contact + # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', + # 'Name', 'Phone', 'Email', (owner) + # 'Secondary Contact # Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name', # 'Retrofit Designer Name', , 'No. 
of Bedrooms', - # , 'Pre Heat Transfer', 'Pre Total Floor Area', 'Pre Heat Demand', + # , # 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area', # 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works', # 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative @@ -159,7 +162,12 @@ def handler(): total_floor_area = sum( [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] + # Get the conservatory floor area - extracted_contents["elmhurst epr"]["Conservatory"]["Floor Area (m2)"] + [extracted_contents["elmhurst epr"]["Conservatory"]["Conservatory Floor Area"]] + ) + + pre_heat_transfer = extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] + pre_heat_demand = ( + extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area ) to_insert = { @@ -172,8 +180,9 @@ def handler(): "Local Authority": None, 'Property Age': extracted_contents["elmhurst epr"]["Property Age"], 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], - 'Pre Heat Transfer': extracted_contents["elmhurst epr"][ - "Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area, + 'Pre Heat Transfer': pre_heat_transfer, + 'Pre Total Floor Area': total_floor_area, + 'Pre Heat Demand': pre_heat_demand, } output_row_data["Property Address"] = property_folder.split(")")[1].strip() diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index f0d341c6..ae75735b 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -387,6 +387,8 @@ class ElmhurstEprExtractor: reader = PyPDF2.PdfReader(file) text = "".join(page.extract_text() for page in reader.pages) + data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip() + # Extracting individual components address_match = 
re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) if not address_match: From bcbb43ed8f045e53607cd17e1b4cff4709208cf9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 08:38:38 +0000 Subject: [PATCH 117/255] adding the summary report extraction class --- etl/lodgement/app.py | 128 +++++++++-------- etl/lodgement/requirements.txt | 4 +- utils/OsmosisCondtionReportParser.py | 49 +++++++ utils/file_data_extraction.py | 196 ++++++++++++++++++++++++++- 4 files changed, 315 insertions(+), 62 deletions(-) create mode 100644 utils/OsmosisCondtionReportParser.py diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 4ff8bdf1..3688ca19 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -1,6 +1,10 @@ import os + +import pandas as pd + import utils.file_data_extraction as file_extraction_tools from utils.fullSapParser import FullSapParser +from utils.OsmosisCondtionReportParser import OsmosisConditionReportParser output_template = { "Property Address": None, @@ -9,6 +13,7 @@ output_template = { "City/County": None, "District/Town": None, "Funding Stream": None, + # "Risk Path": None, "Local Authority": None, "Trustmark Lodgement ID": None, "Certificate Number": None, @@ -18,11 +23,12 @@ output_template = { "Doors UMR": None, "Measure Lodgement Date": None, "Full Lodgement Date": None, - "Name": None, - "Phone": None, - "Email": None, - "Secondary Contact Name": None, - "Secondary Contact Phone": None, + "Owner - Name": None, + "Owner - Phone": None, + "Owner - Email": None, + "Tenant - Name": None, + "Tenant - Phone": None, + "R. Assessor - Name": None, "Trustmark Licence Number": None, "Retrofit Assessment Date": None, "Company Name": None, @@ -30,7 +36,7 @@ output_template = { "Property Type": None, "Property Detachment": None, "No. 
of Bedrooms": None, - "Property Age": None, + "Property age": None, "SAP Rating Pre (from IMA)": None, "Pre Heat Transfer": None, "Pre Total Floor Area": None, @@ -44,22 +50,6 @@ output_template = { "Number of Eligible Measures Installed": None, "Total Cost of Works": None, "Annual Fuel Saving (MTP)": None, - "Work Type ID": None, - "Measure Category": None, - "Installer": None, - "Operative Name": None, - "Operative Certif. Reference": None, - "Manufacturer": None, - "Model": None, - "Financial Protection Body (IBG)": None, - "Policy Start Date": None, - "IBG Policy Reference": None, - "Warranty Duration": None, - "Total Invoiced (Including VAT)": None, - "Installation Date": None, - "Handover Date": None, - "Percentage": None, - "Reference Number": None, } @@ -100,14 +90,19 @@ def handler(): extractors = { "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor, - "elmhurst summary report": None, - "osmosis condition report": None, + "elmhurst summary report": file_extraction_tools.ElmhurstSummaryReportExtractor, + "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, } + extracted = [] for property_folder in folders: + coordinator_folder = os.path.join(source_data_path, property_folder, "2. RA Coordinator Info") + # Check if this folder exists + if not os.path.exists(coordinator_folder): + coordinator_folder = os.path.join(source_data_path, property_folder, "1. 
RA Coordinator Info") # Get the contents of the folder coordinator_folder_contents = [ @@ -123,10 +118,10 @@ def handler(): if report_type is None: raise ValueError(f"Unknown report type for {filename}") - file_extractor = extractors.get(report_type) + file_extractor = extractors[report_type] if file_extractor is None: continue - + extracted_contents[report_type] = file_extractor(filepath).extract() if file_extraction_tools.is_xml(filepath): @@ -141,24 +136,27 @@ def handler(): output_row_data = output_template.copy() - # dict_keys([, , , 'City/County', 'District/Town', - # 'Local Authority', - # 'Trustmark Lodgement ID', - # 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', - # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', - # 'Name', 'Phone', 'Email', (owner) - # 'Secondary Contact - # Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name', - # 'Retrofit Designer Name', , 'No. of Bedrooms', - # , - # 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area', - # 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works', - # 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative - # Certif. 
Reference', 'Manufacturer', 'Model', 'Financial Protection Body (IBG)', 'Policy Start Date', - # 'IBG Policy Reference', 'Warranty Duration', 'Total Invoiced (Including VAT)', 'Installation Date', - # 'Handover Date', 'Percentage', 'Reference Number']) + # dict_keys([ 'City/County', 'District/Town', + # 'Local Authority', 'Trustmark Lodgement ID', 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', + # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone', + # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone', + # 'Trustmark Licence Number', + # 'Company Name', 'Retrofit Designer Name', + # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat + # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures + # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) # Populate the output row data - if extracted_contents["elmhurst epr"]: + + update_dictionary_with_check( + output_row_data, + { + "Funding Stream": funding_stream, + "Property Address": property_folder.split(")")[1].strip(), + "Osm. ID": property_folder.split(")")[0].strip().lstrip("(").strip(), + } + ) + + if extracted_contents.get("elmhurst epr"): total_floor_area = sum( [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] + # Get the conservatory floor area @@ -170,33 +168,45 @@ def handler(): extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area ) - to_insert = { - "Property Address": property_folder.split(")")[1].strip(), - "Osm. 
ID": property_folder.split(")")[0].strip().lstrip("(").strip(), + epr_to_insert = { "Postcode": extracted_contents["elmhurst epr"]["Postcode"], "City/County": None, "District/Town": None, - "Funding Stream": funding_stream, "Local Authority": None, - 'Property Age': extracted_contents["elmhurst epr"]["Property Age"], 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], 'Pre Heat Transfer': pre_heat_transfer, 'Pre Total Floor Area': total_floor_area, 'Pre Heat Demand': pre_heat_demand, + "R. Assessor - Name": extracted_contents["elmhurst epr"]["Assessor Name"], + "Retrofit Assessment Date": extracted_contents["elmhurst epr"]["Assessment Date"], } + update_dictionary_with_check( + output_row_data, + epr_to_insert + ) - output_row_data["Property Address"] = property_folder.split(")")[1].strip() - output_row_data["Osm. ID"] = property_folder.split(")")[0].strip().lstrip("(").strip() - output_row_data["Postcode"] = extracted_contents["elmhurst epr"]["Postcode"] - output_row_data["City/County"] = () - output_row_data["Batch"] = () - output_row_data["Funding Stream"] = funding_stream - output_row_data["Risk Path"] = () - - if extracted_contents["full sap xml"]: - to_insert = { + if extracted_contents.get("full sap xml"): + xml_to_insert = { "Property Type": extracted_contents["full sap xml"]["Property Type"], "Property Detachment": extracted_contents["full sap xml"]["Built Form"], - "Property Age": extracted_contents["full sap xml"]["Age Band"], + "Property age": extracted_contents["full sap xml"]["Age Band"], } + update_dictionary_with_check( + output_row_data, + xml_to_insert + ) + + if extracted_contents.get("osmosis condition report"): + cr_to_insert = { + "No. of Bedrooms": extracted_contents["osmosis condition report"]["No. 
of Bedrooms"], + # "Risk Path": extracted_contents["osmosis condition report"]["Risk Assessment Pathway"], + } + update_dictionary_with_check( + output_row_data, + cr_to_insert + ) + + extracted.append(output_row_data) + + extracted_df = pd.DataFrame(extracted) diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 601907ed..75c63b26 100644 --- a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -5,4 +5,6 @@ openpyxl boto3 usaddress==0.5.11 fuzzywuzzy==0.18.0 -python-dotenv \ No newline at end of file +python-dotenv +python-docx +pymupdf diff --git a/utils/OsmosisCondtionReportParser.py b/utils/OsmosisCondtionReportParser.py new file mode 100644 index 00000000..4d8873a2 --- /dev/null +++ b/utils/OsmosisCondtionReportParser.py @@ -0,0 +1,49 @@ +import re +import boto3 +import PyPDF2 +import fitz + + +class OsmosisConditionReportParser: + + def __init__(self, filekey, bucket_name=None): + self.s3_client = boto3.client('s3') + self.bucket_name = bucket_name + self.filekey = filekey + self.pdf_text = None + + self._read_file() + + def _read_file(self): + """ + Reads the XML file either locally or from S3 and parses it using minidom. + + Raises: + ValueError: If the file cannot be found, read, or parsed. + """ + + chunk_size = 10 + + try: + if self.bucket_name: + # Read from S3 + raise NotImplementedError("Imeplement me") + else: + + with fitz.open(self.filekey) as pdf: + text = "" + for page in pdf: + text += page.get_text() + + # Parse the XML content using minidom + self.pdf_text = text + except FileNotFoundError: + raise ValueError(f"Local file not found: {self.filekey}") + except Exception as e: + raise ValueError(f"An error occurred while reading or parsing the XML: {e}") + + def extract(self): + return { + "No. of Bedrooms": int(re.search(r"No\. 
of Bedrooms \(Total\)\s*(\d+)", self.pdf_text).group(1)), + "Risk Assessment Pathway": re.search(r"Risk\s*Assessment\s*Pathway\s*([A-Z])", self.pdf_text).group(1) + } diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index ae75735b..2337ea9d 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -100,8 +100,8 @@ def is_xml(filename): class ElmhurstEprExtractor: """ - A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). - """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ def __init__(self, file_path): self.file_path = file_path @@ -388,6 +388,7 @@ class ElmhurstEprExtractor: text = "".join(page.extract_text() for page in reader.pages) data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip() + data["Assessment Date"] = re.search(r"\nAssessment Date\s*(.*?)\n", text).group(1).strip() # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) @@ -467,3 +468,194 @@ class ElmhurstEprExtractor: data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data + + +class ElmhurstSummaryReportExtractor: + """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + """ + Extracts specific data from the provided PDF file. 
+ Data includes: + - Current SAP rating + - Fuel Bill + - Address + """ + + # Expected keys: + # dict_keys(['Total Number of Doors', 'Number of Insulated + # Doors', 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL', 'Windows', + # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', + # 'Water Heating Code']) + + data = { + + } + + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Match and extract + name_match = re.search(r"Name:\s*([A-Za-z\s]+)\s*Title:\s*([A-Za-z\.]+)", text) + if not name_match: + raise ValueError("Couldn't extract surveyor name") + data["Assessor Name"] = name_match.group(2).strip() + " " + name_match.group(1).strip() + data["Assessment Date"] = re.search(r"Inspection Date:\s*(.*?)\n", text).group(1).strip() + + # Address and postcode + postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + + # Clean extracted values and remove any prefixes + address_parts = [ + house_no.group(1).strip() if house_no else "", + house_name.group(1).strip() if house_name else "", + street.group(1).strip() if street else "", + locality.group(1).strip() if locality else "", + town.group(1).strip() if town else "", + county.group(1).strip() if county else "", + region.group(1).strip() if region else "", + postcode.group(1).strip() if postcode else "" + ] + + # Join non-empty parts with a comma + data["Address"] = ", ".join([part for part in address_parts if 
part]) + data["Postcode"] = postcode.group(1).strip() + + # Extract Current SAP rating + sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) + if not sap_match: + raise ValueError("Could not extract SAP rating") + data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + + # We don't have primary energy in the summary report + data['Primary Energy Use Intensity (kWh/m2/yr)'] = None + + # Number of storeys + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + raise ValueError("Could not extract number of storeys") + data["Number of Storeys"] = int(storeys_match.group(1)) + + # Extract Fuel Bill + fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) + if not fuel_bill_match: + raise ValueError("Could not extract fuel bill") + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) + if not total_doors_match: + raise ValueError("Could not extract total number of doors") + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) + if not insulated_doors_match: + raise ValueError("Could not extract number of insulated doors") + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) + + # Extract heating system + # Extract Primary Heating Data + # Extract Primary Heating Section + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if 
primary_heating_section1 else primary_heating_section2 + + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1) + data["Existing Primary Heating Controls"] = re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) + ) + + # Extract Secondary Heating Section + secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + + if secondary_heating_section is None: + data["Existing Secondary Heating System"] = "" + data["Existing Secondary Heating PCDF Reference"] = "" + data["Existing Secondary Heating Controls"] = "" + data["Existing Secondary Heating % of Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", 
text) + + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" + + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + dimensions = extract_building_parts_summary(text) + data.update(dimensions) + + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + + extracted_roof_data = extract_roof_details_summary(text) + main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] + data["Main Roof Type"] = main_roof_data["Roof Type"] + data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] + + walls_data = extract_wall_details_summary(text) + # Get the main building wall data + main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] + data["Main Wall Type"] = main_building_walls["Wall Type"] + data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] + data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] + data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] + data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] + data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] + data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] + data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] + + return data From 
f141aa4d842a38d8133bdf9b586224333f5372be Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 08:42:53 +0000 Subject: [PATCH 118/255] extracting windows --- utils/file_data_extraction.py | 71 ++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index 2337ea9d..d444bff8 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -478,6 +478,59 @@ class ElmhurstSummaryReportExtractor: def __init__(self, file_path): self.file_path = file_path + @staticmethod + def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. + + Parameters: + windows_text (str): The text section containing window data. + + Returns: + dict: A dictionary with the most common window age description and its proportion. + """ + # Clean up windows_text by removing line breaks for better pattern matching + windows_text = windows_text.replace("\n", "") + + # Define possible window age descriptions + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + + # Count occurrences of each description + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + # Determine the most common description and calculate its proportion + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + # Get the second most common and the proportion + if window_proportion == 100: + second_most_common_description = None + 
second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) + } + def extract(self): """ Extracts specific data from the provided PDF file. @@ -488,8 +541,7 @@ class ElmhurstSummaryReportExtractor: """ # Expected keys: - # dict_keys(['Total Number of Doors', 'Number of Insulated - # Doors', 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL', 'Windows', + # dict_keys(['Windows', # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', # 'Water Heating Code']) @@ -569,10 +621,15 @@ class ElmhurstSummaryReportExtractor: raise ValueError("Could not extract number of insulated doors") data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + # lighting + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - data.update(window_data) + if not windows_section: + raise ValueError("Failed to extract window data.") + data["Windows"] = self.extract_window_age_description(windows_section.group(1)) # Extract heating system # Extract Primary Heating Data @@ -636,10 +693,6 @@ class ElmhurstSummaryReportExtractor: dimensions = extract_building_parts_summary(text) data.update(dimensions) - data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) - data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) - data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] - extracted_roof_data = extract_roof_details_summary(text) main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] data["Main Roof Type"] = main_roof_data["Roof Type"] From d489b4346fd6e1f940de4fb1f61ca6cd6b10cf24 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 09:14:38 +0000 Subject: [PATCH 119/255] extracting secondary heating --- utils/file_data_extraction.py | 116 ++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index d444bff8..20590afd 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -531,6 +531,62 @@ class ElmhurstSummaryReportExtractor: "Number of Windows": sum(description_counts.values()) } + @staticmethod + def extract_primary_heating(text): + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, 
re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 + if primary_heating_section is None: + raise ValueError("Failed to extract primary heating data.") + + primary_text = primary_heating_section.group(1) + + output = { + 'System': re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(1).strip(), + 'PCDF Reference': re.search(r"PCDF boiler Reference\s*(\d+)", primary_text).group(1), + 'Controls': re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(1).strip(), + '% of Heat': int(re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)) + } + return output + + @staticmethod + def extract_secondary_heating_details(text): + secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + + # Defaults + output = { + "System": "", + "PCDF Reference": "", + "Controls": "", + "% of Heat": 0, + "Heating Code": "" + } + if secondary_heating_section is not None: + # Overwrite defaults + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + output["System"] = main_heating_code_match_secondary.group(1).strip() + output["PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) + + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + output["Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + output["% of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + if output["System"] != "": + output["Heating Code"] = ( + 
secondary_heating_code_match.group(1).strip() if secondary_heating_code_match else "" + ) + + return output + def extract(self): """ Extracts specific data from the provided PDF file. @@ -541,13 +597,11 @@ class ElmhurstSummaryReportExtractor: """ # Expected keys: - # dict_keys(['Windows', + # dict_keys([ # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', # 'Water Heating Code']) - data = { - - } + data = {} with (open(self.file_path, "rb") as file): reader = PyPDF2.PdfReader(file) @@ -631,62 +685,14 @@ class ElmhurstSummaryReportExtractor: raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) - # Extract heating system - # Extract Primary Heating Data - # Extract Primary Heating Section - primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) - primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 - - primary_text = primary_heating_section.group(1) - - data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( - 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1) - data["Existing Primary Heating Controls"] = re.search( - r"Main Heating Controls\s*(.*?)\n", primary_text - ).group(1).strip() - data["Existing Primary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) - ) + data["Primary Heating"] = self.extract_primary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating_details(text) # Extract Secondary Heating Section - secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - - if 
secondary_heating_section is None: - data["Existing Secondary Heating System"] = "" - data["Existing Secondary Heating PCDF Reference"] = "" - data["Existing Secondary Heating Controls"] = "" - data["Existing Secondary Heating % of Heat"] = 0 - - else: - secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text - ) - data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - data["Existing Secondary Heating Controls"] = ( - second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" - ) - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) - ) # Extract Secondary Heating and Water Heating Codes - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) - water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - if data["Existing Secondary Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) data["Water Heating Code"] = water_heating_code_match.group(1).strip() From 26e0206f378c4bedc8bca9e42d43e5d7bfcc196f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 09:20:44 +0000 Subject: [PATCH 120/255] extracted roof and walls --- utils/file_data_extraction.py | 233 ++++++++++++++++++++++++++++++++-- 1 file changed, 219 insertions(+), 14 deletions(-) diff --git a/utils/file_data_extraction.py 
b/utils/file_data_extraction.py index 20590afd..80c0c19b 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -587,6 +587,220 @@ class ElmhurstSummaryReportExtractor: return output + @staticmethod + def extract_building_parts(text): + """ + Extracts building parts and associated dimensions from the summary report PDF. + This includes Main Property, multiple extensions if they exist, and Room in Roof areas. + """ + data = [] + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to extract each building part, starting from Main Property and including extensions + building_part_pattern = re.compile( + r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", + re.DOTALL + ) + + # Loop through each building part match, including Main Property and extensions + for match in building_part_pattern.finditer(dimensions_text): + part_name = match.group(1) + floor_data = match.group(2) + + # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length + floor_pattern = re.compile( + r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract data for each floor within the building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data list + data.append( + { + "Building Part": part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": 
perimeter, + "Party Wall Length (m)": party_wall_length + } + ) + + # Check specifically for "Room(s) in Roof" entries, which only have Floor Area + room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") + room_in_roof_match = room_in_roof_pattern.search(floor_data) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + data.append( + { + "Building Part": part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + } + ) + + # Calculate aggregated dimensions + main_property = [part for part in data if "Main Property" in part["Building Part"]] + first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] + dimensions = { + "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), + "Total Ground Floor Area (m2)": sum( + [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] + ), + "RIR Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] + ), + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if + x["Perimeter (m)"] and x["Room Height (m)"]]), + "First Extension Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if + x["Perimeter (m)"] and x["Room Height (m)"]] + ), + } + + return dimensions + + @staticmethod + def extract_roof_details(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the 8.0 Roofs section of the summary report. 
+ """ + # Define data structure to hold results + roof_data = [] + + # Locate the entire 8.0 Roofs section + roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) + if not roof_section_match: + return roof_data # Return empty if no roof section is found + + # Extract the roof section and append "9.0 Floors:" as the boundary + roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" + + # Define pattern to match each building part's roof entry + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, + # or end + r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation + r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness + re.DOTALL + ) + + # Extract each building part's data + for match in building_part_pattern.finditer(roof_section): + part_name = match.group(1).strip() # Building part label + roof_type = match.group(2).strip() # Roof Type + roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation + roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness + + # Cleaning to handle annoying cases when it comes out like this: + # 'A Another dwelling above\n1st Extension' + if roof_type.startswith("A Another dwelling above"): + roof_type = "A Another dwelling above" + + # Store results for this building part + roof_data.append( + { + "Building Part": part_name, + "Roof Type": roof_type, + "Roof Insulation": roof_insulation, + "Roof Insulation Thickness": roof_insulation_thickness, + } + ) + + return roof_data + + @staticmethod + def extract_wall_details(text): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part, + including any alternative 
wall details within the 7.0 Walls section of the summary PDF text. + """ + # Define data structure to hold all building part wall entries + wall_data = [] + + # Locate the entire 7.0 Walls section + wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1) + + # Define pattern to match each building part's wall entry within the section + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)\n" # Matches main wall Type + r"Insulation\s+(.*?)\n" # Matches main wall Insulation + r"(Dry-lining\s+(.*?)\n)?" # Optional main wall Dry-lining + r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown + r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness + re.DOTALL + ) + + # Define pattern to capture alternative wall details, if present + alternative_wall_pattern = re.compile( + r"Alternative Wall Area.*?\n" # Matches start of alternative wall section + r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type + r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation + r"(Alternative Dry-lining\s+(.*?)\n)?" 
# Optional Alternative Dry-lining + r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown + r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness + re.DOTALL + ) + + # Find all building part entries within the 7.0 Walls section + for match in building_part_pattern.finditer(wall_section): + wall_label = match.group(1).strip() + main_wall_type = match.group(2).strip() + main_wall_insulation = match.group(3).strip() + main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A" + main_wall_thickness_unknown = match.group(6).strip() + main_wall_thickness = int(match.group(7)) + + # Initialize dictionary for this wall entry + wall_entry = { + "Building Part": wall_label, + "Wall Type": main_wall_type, + "Wall Insulation": main_wall_insulation, + "Wall Dry-lining": main_wall_dry_lining, + "Wall Thickness Unknown": main_wall_thickness_unknown, + "Wall Thickness (mm)": main_wall_thickness, + "Alternative Wall Type": None, + "Alternative Wall Insulation": None, + "Alternative Wall Dry-lining": "N/A", + "Alternative Wall Thickness Unknown": None, + "Alternative Wall Thickness (mm)": None, + } + + # Check if there's an alternative wall section following this wall entry + alt_match = alternative_wall_pattern.search(wall_section, match.end()) + if alt_match: + wall_entry["Alternative Wall Type"] = alt_match.group(1).strip() + wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip() + wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A" + wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip() + wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6)) + + # Append each building part as a dictionary in the wall_data list + wall_data.append(wall_entry) + + return wall_data + def extract(self): """ Extracts specific data from the provided PDF file. 
@@ -687,25 +901,16 @@ class ElmhurstSummaryReportExtractor: data["Primary Heating"] = self.extract_primary_heating(text) data["Secondary Heating"] = self.extract_secondary_heating_details(text) - - # Extract Secondary Heating Section - - # Extract Secondary Heating and Water Heating Codes + data["Building Parts"] = self.extract_building_parts(text) + data["Roof Details"] = self.extract_roof_details(text) + data["Wall Details"] = self.extract_wall_details(text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + if not water_heating_code_match: + raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() - dimensions = extract_building_parts_summary(text) - data.update(dimensions) - - extracted_roof_data = extract_roof_details_summary(text) - main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] - data["Main Roof Type"] = main_roof_data["Roof Type"] - data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] - data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] - - walls_data = extract_wall_details_summary(text) # Get the main building wall data main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] data["Main Wall Type"] = main_building_walls["Wall Type"] From 8b875cbccfc2ce5b0f00ed55f4466af7fec165f2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 09:44:26 +0000 Subject: [PATCH 121/255] done with summary report extraction --- utils/file_data_extraction.py | 75 +++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index 80c0c19b..f5e014a4 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -801,6 +801,64 @@ class ElmhurstSummaryReportExtractor: return wall_data + @staticmethod + def 
extract_conservatory(text): + """ + Extracts conservatory data from the provided text. + The section is located between "5.0 Conservatory" and "7.0 Walls". + + Args: + text (str): The full text of the Summary Report PDF. + + Returns: + dict: A dictionary with conservatory details: + - "Conservatory Present" + - "Conservatory Separated" + - "Conservatory Floor Area" + - "Conservatory Double Glazed" + - "Conservatory Glazed Perimeter" + - "Heated Conservatory Height" + """ + + # Extract the section between "5.0 Conservatory" and "7.0 Walls" + conservatory_match = re.search(r"5\.0 Conservatory:(.*?)7\.0 Walls:", text, re.DOTALL) + if not conservatory_match: + logger.error("Failed to extract conservatory data.") + raise ValueError("Could not extract conservatory data.") + + conservatory_text = conservatory_match.group(1) + + # Check if conservatory is present + present_match = re.search(r"Is there a conservatory\?\s*(Yes|No)", conservatory_text, re.IGNORECASE) + + if not present_match or present_match.group(1).strip().lower() == "no": + return { + "Conservatory Present": "No", + "Conservatory Separated": "", + "Conservatory Floor Area": 0, + "Conservatory Double Glazed": "", + "Conservatory Glazed Perimeter": 0, + "Heated Conservatory Height": "", + } + + # If we get here, raise a temporary exception since we've not seen a case of this, so should make sure + # this is correct + + separated_match = re.search(r"Is it thermally separated\?\s*(Yes|No)", conservatory_text, re.IGNORECASE) + floor_area_match = re.search(r"Floor Area \[m2\]\s*([\d.]+)", conservatory_text, re.IGNORECASE) + double_glazed_match = re.search(r"Double Glazed\s*(Yes|No)", conservatory_text, re.IGNORECASE) + glazed_perimeter_match = re.search(r"Glazed Perimeter \[m\]\s*([\d.]+)", conservatory_text, re.IGNORECASE) + height_match = re.search(r"Room Height\s*(.*?)(?=\n|$)", conservatory_text, re.IGNORECASE) + + return { + "Conservatory Present": "Yes", + "Conservatory Separated": 
separated_match.group(1).strip() if separated_match else "", + "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0, + "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "", + "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0, + "Heated Conservatory Height": height_match.group(1).strip() if height_match else "", + } + def extract(self): """ Extracts specific data from the provided PDF file. @@ -810,11 +868,6 @@ class ElmhurstSummaryReportExtractor: - Address """ - # Expected keys: - # dict_keys([ - # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', - # 'Water Heating Code']) - data = {} with (open(self.file_path, "rb") as file): @@ -904,6 +957,7 @@ class ElmhurstSummaryReportExtractor: data["Building Parts"] = self.extract_building_parts(text) data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) + data["Conservatory"] = self.extract_conservatory(text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) if not water_heating_code_match: @@ -911,15 +965,4 @@ class ElmhurstSummaryReportExtractor: data["Water Heating Code"] = water_heating_code_match.group(1).strip() - # Get the main building wall data - main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] - data["Main Wall Type"] = main_building_walls["Wall Type"] - data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] - data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] - data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] - data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] - data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] - data["Main Building Alternative Wall 
Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] - data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] - return data From 5a2ffe646ccecd9559b8c39d211d726ed8c547d8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 12:00:43 +0000 Subject: [PATCH 122/255] implementing summary report extraction --- etl/lodgement/app.py | 33 ++++++++++++++++- utils/file_data_extraction.py | 69 ++++++++++++++++++++--------------- 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 3688ca19..629c10e0 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -170,8 +170,8 @@ def handler(): epr_to_insert = { "Postcode": extracted_contents["elmhurst epr"]["Postcode"], - "City/County": None, - "District/Town": None, + "City/County": extracted_contents["elmhurst epr"]["County"], + "District/Town": extracted_contents["elmhurst epr"]["Town"], "Local Authority": None, 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], 'Pre Heat Transfer': pre_heat_transfer, @@ -207,6 +207,35 @@ def handler(): cr_to_insert ) + if extracted_contents.get("elmhurst summary report"): + total_floor_area = sum( + [x["Floor Area (m2)"] for x in extracted_contents["elmhurst summary report"]["Building Parts"]] + + # Get the conservatory floor area + [extracted_contents["elmhurst summary report"]["Conservatory"]["Conservatory Floor Area"]] + ) + + pre_heat_transfer = ( + extracted_contents["elmhurst summary report"]["Primary Energy Use Intensity (kWh/m2/yr)"] + ) + pre_heat_demand = None # Don't have this + + summary_to_insert = { + "Postcode": extracted_contents["elmhurst summary report"]["Postcode"], + "City/County": extracted_contents["elmhurst summary report"]["County"], + "District/Town": extracted_contents["elmhurst summary report"]["Town"], + 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst summary 
report"]["Current SAP Rating"], + 'Pre Heat Transfer': pre_heat_transfer, + 'Pre Total Floor Area': total_floor_area, + 'Pre Heat Demand': pre_heat_demand, + "R. Assessor - Name": extracted_contents["elmhurst summary report"]["Assessor Name"], + "Retrofit Assessment Date": extracted_contents["elmhurst summary report"]["Assessment Date"], + } + + update_dictionary_with_check( + output_row_data, + summary_to_insert + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index f5e014a4..c3cc8a10 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -398,6 +398,15 @@ class ElmhurstEprExtractor: data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() + # TODO: + data["Region"] = None + data["House Name"] = None + data["House No"] = None + data["Street"] = None + data["Locality"] = None + data["Town"] = None + data["County"] = None + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) if not sap_match: logger.error("Failed to extract SAP rating.") @@ -657,26 +666,7 @@ class ElmhurstSummaryReportExtractor: } ) - # Calculate aggregated dimensions - main_property = [part for part in data if "Main Property" in part["Building Part"]] - first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] - dimensions = { - "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area (m2)": sum( - [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] - ), - "RIR Floor Area": sum( - [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] - ), - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if - x["Perimeter (m)"] and x["Room Height (m)"]]), - "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height 
(m)"] for x in first_extensions if - x["Perimeter (m)"] and x["Room Height (m)"]] - ), - } - - return dimensions + return data @staticmethod def extract_roof_details(text): @@ -869,7 +859,6 @@ class ElmhurstSummaryReportExtractor: """ data = {} - with (open(self.file_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" @@ -885,29 +874,51 @@ class ElmhurstSummaryReportExtractor: # Address and postcode postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + postcode = postcode.group(1).strip() if postcode else "" + region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + region = region.group(1).strip() if region else "" + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_name = house_name.group(1).strip() if house_name else "" + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + house_no = house_no.group(1).strip() if house_no else "" + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + street = street.group(1).strip() if street else "" + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + locality = locality.group(1).strip() if locality else "" + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + town = town.group(1).strip() if town else "" + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + county = county.group(1).strip() if county else "" # Clean extracted values and remove any prefixes address_parts = [ - house_no.group(1).strip() if house_no else "", - house_name.group(1).strip() if house_name else "", - street.group(1).strip() if street else "", - locality.group(1).strip() if locality else "", - town.group(1).strip() if town else "", - county.group(1).strip() if county else "", - region.group(1).strip() if region else "", - postcode.group(1).strip() if postcode else "" + house_no, + house_name, + street, + locality, + town, + county, + region, + postcode ] # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) data["Postcode"] = 
postcode.group(1).strip() + data["Region"] = region + data["House Name"] = house_name + data["House No"] = house_no + data["Street"] = street + data["Locality"] = locality + data["Town"] = town + data["County"] = county # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) From 5e7827f706d9ad6f55f518d5b8dc9acb04ebd50e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 14:02:55 +0000 Subject: [PATCH 123/255] added ocr extraction for permeability report --- etl/lodgement/app.py | 37 ++++++++++-- etl/lodgement/requirements.txt | 3 + utils/file_data_extraction.py | 100 +++++++++++++++++++++++++++++---- 3 files changed, 124 insertions(+), 16 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 629c10e0..6fe9fdc4 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -85,6 +85,10 @@ def handler(): customer_phone = "0345 678 9000" customer_email = "affordablewarmth@shropshire.gov.uk" + # TODO: In order for this to go live, we need to use Poppler, which needs to be installed + # w/ brew install poppler + # We also need to install Tesseract: brew install tesseract + # List the folders in the source data path folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))] @@ -94,25 +98,28 @@ def handler(): "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, + "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor } extracted = [] for property_folder in folders: - coordinator_folder = os.path.join(source_data_path, property_folder, "2. RA Coordinator Info") - # Check if this folder exists - if not os.path.exists(coordinator_folder): - coordinator_folder = os.path.join(source_data_path, property_folder, "1. 
RA Coordinator Info") + property_folder_path = os.path.join(source_data_path, property_folder) + # List the folders in the source data path + subfolders = [ + x for x in os.listdir(property_folder_path) if os.path.isdir(os.path.join(property_folder_path, x)) + ] + coord_folder = os.path.join(property_folder_path, [f for f in subfolders if "RA Coordinator Info" in f][0]) # Get the contents of the folder coordinator_folder_contents = [ - file for file in os.listdir(coordinator_folder) if os.path.isfile(os.path.join(coordinator_folder, file)) + file for file in os.listdir(coord_folder) if os.path.isfile(os.path.join(coord_folder, file)) ] # We detect the various file types extracted_contents = {} for filename in coordinator_folder_contents: - filepath = os.path.join(coordinator_folder, filename) + filepath = os.path.join(coord_folder, filename) if file_extraction_tools.is_pdf(filepath): report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) if report_type is None: @@ -134,6 +141,24 @@ def handler(): extracted_contents[xml_type] = file_extractor(filepath).extract() + att_folder = os.path.join(property_folder_path, [f for f in subfolders if "Air Tightness Tests" in f][0]) + att_folder_contents = [ + file for file in os.listdir(att_folder) if os.path.isfile(os.path.join(att_folder, file)) + ] + + for filename in att_folder_contents: + filepath = os.path.join(att_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type is None: + raise ValueError(f"Unknown report type for {filename}") + file_extractor = extractors[report_type] + + if file_extractor is None: + continue + + extracted_contents[report_type] = file_extractor(filepath).extract() + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 75c63b26..09e475fe 100644 --- 
a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -8,3 +8,6 @@ fuzzywuzzy==0.18.0 python-dotenv python-docx pymupdf +pytesseract +pdf2image +pillow diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index c3cc8a10..c60f01b4 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -3,6 +3,8 @@ import re from collections import Counter from utils.logger import setup_logger from xml.dom.minidom import parseString +from pdf2image import convert_from_path +from pytesseract import image_to_string logger = setup_logger() @@ -41,11 +43,17 @@ def is_elmhurst_evidence_report(text): return text.startswith("RdSAP Evidence Report") +def is_pulse_air_permeability(text): + """ + Determines if the provided text indicates that the PDF is a Pulse Air Permeability Report. + """ + return text.startswith("Air Permeability Test Report @O PULSE") + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. 
:param pdf_path: String path to the PDF file - :param pdf_file: String name of the PDF file :return: String type of the report ("epr", "summary", or None) """ # Attempt to read the first page of the PDF to determine type @@ -53,14 +61,23 @@ def detect_pdf_report_type(pdf_path): reader = PyPDF2.PdfReader(file) first_page_text = reader.pages[0].extract_text() if reader.pages else "" - if is_elmhurst_energy_report(first_page_text): - return "elmhurst epr" - elif is_elmhurst_summary_report(first_page_text): - return "elmhurst summary report" - elif is_osmosis_condition_report(first_page_text): - return "osmosis condition report" - elif is_elmhurst_evidence_report(first_page_text): - return "elmhurst evidence report" + if first_page_text == "": + # Convert PDF pages to images + logger.info("Extracting text from PDF images..., this may take a moment.") + pages = convert_from_path(pdf_path, dpi=300) + if pages: + first_page_text = image_to_string(pages[0]) + + if is_elmhurst_energy_report(first_page_text): + return "elmhurst epr" + elif is_elmhurst_summary_report(first_page_text): + return "elmhurst summary report" + elif is_osmosis_condition_report(first_page_text): + return "osmosis condition report" + elif is_elmhurst_evidence_report(first_page_text): + return "elmhurst evidence report" + elif is_pulse_air_permeability(first_page_text): + return "pulse air permeability" return None @@ -911,7 +928,7 @@ class ElmhurstSummaryReportExtractor: # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) - data["Postcode"] = postcode.group(1).strip() + data["Postcode"] = postcode data["Region"] = region data["House Name"] = house_name data["House No"] = house_no @@ -977,3 +994,66 @@ class ElmhurstSummaryReportExtractor: data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data + + +class PulseAirPermeabilityExtractor: + """ + A utility class for extracting specific data from Pulse Air Permeability Test 
Reports. + """ + + def __init__(self, file_path): + self.file_path = file_path + + @staticmethod + def extract_table(text): + patterns = { + "Air Leakage Rate": r"Air Leakage Rate\s*([\d,@.]+)\s*m/h\s*([\d,@.]+)\s*m3/h", + "Air Permeability": r"Air Permeability\s*([\d,@.]+)\s*=.*?\s*([\d,@.]+)\s*m\?/m\?h", + "Air Changes per Hour": r"Air Changes per Hour\s*([\d,@.]+)\s*([\d,@.]+)", + "Equivalent Leakage Area": r"Equivalent Leakage Area\s*([\d,@.]+)\s*([\d,@.]+)", + "Calculation Uncertainty": r"Calculation Uncertainty\s*([\d,@.]+)\s*([\d,@.]+)", + } + + # Initialize results dictionary + table_data = [] + + # Parse each metric using the corresponding regex + for metric, pattern in patterns.items(): + match = re.search(pattern, text) + if match: + # Extract the two column values + first_value = match.group(1) + second_value = match.group(2) + + # Post-process values: replace '@' with '0' and remove commas + first_value = first_value.replace("@", "0").replace(",", "") + second_value = second_value.replace("@", "0").replace(",", "") + + table_data.append( + { + "Metric": metric, + "Measured @ 4PA": first_value, + "Extrapolated @ 50PA": second_value, + } + ) + else: + raise ValueError(f"Could not extract metric: {metric}") + + return table_data + + def extract(self): + # Extract the pdf using tesseract + logger.info("Extracting data from pdf image - this may take a while...") + pages = convert_from_path(self.file_path, dpi=300) + # Extract all of the pages + text = "" + for page in pages: + text += image_to_string(page) + + # We extract the air permeability reading + results_table = self.extract_table(text) + data = { + "Results Table": results_table + } + + return data From 2cfc88104441c8a2d48015a4b8d3631f9c260259 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 14:08:31 +0000 Subject: [PATCH 124/255] extracted air tightness --- etl/lodgement/app.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/etl/lodgement/app.py 
b/etl/lodgement/app.py index 6fe9fdc4..c75ece4c 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -261,6 +261,15 @@ def handler(): summary_to_insert ) + if extracted_contents.get("pulse air permeability"): + # We extract the AP50 number + results_table = extracted_contents["pulse air permeability"]["Results Table"] + ap50 = [x["Extrapolated @ 50PA"] for x in results_table if x["Metric"] == "Air Permeability"][0] + update_dictionary_with_check( + output_row_data, + {"Pre Air Tightness": ap50} + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) From 3cd9670d1aa49b7b71b9fa59739b82ab2b9e62dd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 15:53:48 +0000 Subject: [PATCH 125/255] adding file detection for elmhurst project handover --- etl/lodgement/app.py | 29 ++++++++++++++++++++++++++++- utils/file_data_extraction.py | 21 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index c75ece4c..2bdeb3d7 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -98,7 +98,8 @@ def handler(): "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, - "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor + "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor, + "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor, } extracted = [] @@ -159,6 +160,32 @@ def handler(): extracted_contents[report_type] = file_extractor(filepath).extract() + lodgement_folder = os.path.join( + property_folder_path, [f for f in subfolders if "TrustMark Lodgement" in f][0] + ) + # Within the lodgement folder, we want the required documents sub-folder + lodgement_subfolders = [ + file for file in os.listdir(lodgement_folder) if os.path.isdir(os.path.join(lodgement_folder, file)) + ] + required_documents_folder = 
os.path.join( + lodgement_folder, [f for f in lodgement_subfolders if "required documents" in f.lower()][0] + ) + # List the contents + required_documents_contents = [ + file for file in os.listdir(required_documents_folder) if + os.path.isfile(os.path.join(required_documents_folder, file)) + ] + + # There are only a few file types we actually want to process in here for the moment + for filename in required_documents_contents: + filepath = os.path.join(required_documents_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type != "elmhurst project handover": + continue + blah + file_extractor = extractors[report_type] + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index c60f01b4..ef02e7f0 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -50,6 +50,13 @@ def is_pulse_air_permeability(text): return text.startswith("Air Permeability Test Report @O PULSE") +def is_elmhurst_project_handover(text): + """ + Determines if the provided text indicates that the PDF is an Elmhurst Project Handover Report. + """ + return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. 
@@ -78,6 +85,8 @@ def detect_pdf_report_type(pdf_path): return "elmhurst evidence report" elif is_pulse_air_permeability(first_page_text): return "pulse air permeability" + elif is_elmhurst_project_handover(first_page_text): + return "elmhurst project handover" return None @@ -1057,3 +1066,15 @@ class PulseAirPermeabilityExtractor: } return data + + +class ElmhurstProjectHandoverExtractor: + """ + A utility class for extracting specific data from The Elmhurst Project Handover document + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + pass From c6e02836a88cd2a4af7dc8a6ee10e160d6e60f68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 29 Nov 2024 12:10:29 +0000 Subject: [PATCH 126/255] poc done for now --- etl/lodgement/app.py | 34 +++++++++++++--- etl/lodgement/requirements.txt | 1 + utils/file_data_extraction.py | 72 +++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 6 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 2bdeb3d7..c1da35dd 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -29,6 +29,7 @@ output_template = { "Tenant - Name": None, "Tenant - Phone": None, "R. Assessor - Name": None, + "R. 
Coordinator - Name": None, "Trustmark Licence Number": None, "Retrofit Assessment Date": None, "Company Name": None, @@ -100,6 +101,7 @@ def handler(): "full sap xml": FullSapParser, "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor, "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor, + "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor, } extracted = [] @@ -183,9 +185,10 @@ def handler(): report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) if report_type != "elmhurst project handover": continue - blah file_extractor = extractors[report_type] + extracted_contents[report_type] = file_extractor(filepath).extract() + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', @@ -193,11 +196,9 @@ def handler(): # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone', # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone', # 'Trustmark Licence Number', - # 'Company Name', 'Retrofit Designer Name', # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat - # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures - # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) - # Populate the output row data + # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', + # 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) update_dictionary_with_check( output_row_data, @@ -297,6 +298,29 @@ def handler(): {"Pre Air Tightness": ap50} ) + if extracted_contents.get("elmhurst project handover"): + handover_to_insert = { + "Number of Eligible Measures Installed": len( + extracted_contents["elmhurst project handover"]["Measures Fitted"] + ), + "Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"], + "Company Name": extracted_contents["elmhurst project 
handover"]["Installer Name"], + "R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"], + } + update_dictionary_with_check(output_row_data, handover_to_insert) + + if extracted_contents.get("core logic pas assessment report"): + cr_to_insert = { + "No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"], + } + update_dictionary_with_check( + output_row_data, + cr_to_insert + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) + + extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv", + index=False) diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 09e475fe..412aed3b 100644 --- a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -11,3 +11,4 @@ pymupdf pytesseract pdf2image pillow +pdfplumber diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index ef02e7f0..2e849ef5 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -1,5 +1,6 @@ import PyPDF2 import re +import pdfplumber from collections import Counter from utils.logger import setup_logger from xml.dom.minidom import parseString @@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text): return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text +def is_core_logic_pas_assessment_report(text): + """ + Determines if the provided text indicates that the PDF is a PAS Assessment Report. + """ + return text.startswith("Generated Using CoreLogic UK PAS Assessment") + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. 
@@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path): return "pulse air permeability" elif is_elmhurst_project_handover(first_page_text): return "elmhurst project handover" + elif is_core_logic_pas_assessment_report(first_page_text): + return "core logic pas assessment report" return None @@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor: self.file_path = file_path def extract(self): - pass + + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + data = {} + + # Regex patterns + patterns = { + "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)", + "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)", + "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:", + "Designer Name": r"Designer Name\(s\):\s*(.+)", + "Installer Name": r"Installer Name\(s\):\s*(.+)", + } + + # Extract data + for key, pattern in patterns.items(): + match = re.search(pattern, text) + if not match: + raise ValueError(f"Could not match {key}") + if match: + if key == "Measures Fitted": + # Special handling for multiline measures + measures = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1)) + measures = [m.strip() for m in measures] + data[key] = measures + else: + data[key] = match.group(1).strip() if match else "" + + return data + + +class CoreLogicPasAssessmentReportExtractor: + """ + A utility class for extracting specific data from CoreLogic PAS Assessment Reports. 
+ """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + data = {} + + with pdfplumber.open(self.file_path) as pdf: + for page in pdf.pages: + tables = page.extract_tables() + if tables: # If tables are detected on the page + for table in tables: + for row in table: + # Check if the row contains "Number of bedrooms" + if any("Number of bedrooms" in str(cell) for cell in row): + # Extract the corresponding value by filtering out None and non-relevant cells + for cell in row: + if cell and cell.strip().isdigit(): # Check if cell contains a numeric value + data["Number of bedrooms"] = int(cell.strip()) + break # Stop further processing once value is found + + return data From c806ef71516d7fda620f854262b7360937b48b10 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 29 Nov 2024 15:12:14 +0000 Subject: [PATCH 127/255] modified the hug postcodes data --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/access_reporting/app.py | 0 etl/customers/gla/hug_postcodes.py | 29 ++++++++++++++++++++++++++++- 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 etl/access_reporting/app.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 9b63b142..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index acd935c1..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/access_reporting/app.py b/etl/access_reporting/app.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py index 85783d62..ac2d1e3c 100644 --- a/etl/customers/gla/hug_postcodes.py +++ b/etl/customers/gla/hug_postcodes.py @@ -3,6 +3,7 @@ import pandas as pd from pathlib import Path from tqdm import tqdm from etl.epc.settings import EARLIEST_EPC_DATE +from etl.spatial.OpenUprnClient import OpenUprnClient src_file_path = inspect.getfile(lambda: None) @@ -22,6 
+23,7 @@ for directory in tqdm(epc_directories): data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] data = data[~pd.isnull(data["uprn"])] + data["uprn"] = data["uprn"].astype(int) # Take just the newest EPC per uprn, based on lodgement-date data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") # Take EPC D and below @@ -31,16 +33,41 @@ for directory in tqdm(epc_directories): # Take homes that don't have a gas boiler off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)] + if off_gas.empty: + continue + + # Remote properties with conservation area issues + uprns = off_gas["uprn"].unique() + # Get data + ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") + off_gas = off_gas.merge( + ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename( + columns={"UPRN": "uprn"} + ), + how="left", + on="uprn", + ) + # Remove any restricted units + off_gas = off_gas[ + (off_gas["conservation_status"] != True) + & (off_gas["is_listed_building"] != True) + & (off_gas["is_heritage_building"] != True) + ] + region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") aggregation.append(region_summary) postal_region_aggregation = pd.concat(aggregation) +# Re-aggregate +postal_region_aggregation = postal_region_aggregation.groupby("postal_region")["count"].sum().reset_index() + postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False) postal_region_aggregation = postal_region_aggregation.rename( columns={"postal_region": "Postcode Region", "count": "Number of Homes"} ) postal_region_aggregation.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions - without conservation " + "area.xlsx", index=False ) From 3e8a1bc4fdf54d9647bafc90806ba9e7f731e69e Mon Sep 17 00:00:00 
2001 From: Khalim Conn-Kowlessar Date: Mon, 2 Dec 2024 17:50:08 +0000 Subject: [PATCH 128/255] set up cottons asset list --- etl/access_reporting/app.py | 394 ++++++++++++++++++++ etl/access_reporting/requirements.txt | 11 + etl/customers/cottons/remote_assessments.py | 102 +++++ etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 33 +- 5 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 etl/access_reporting/requirements.txt create mode 100644 etl/customers/cottons/remote_assessments.py diff --git a/etl/access_reporting/app.py b/etl/access_reporting/app.py index e69de29b..830f4370 100644 --- a/etl/access_reporting/app.py +++ b/etl/access_reporting/app.py @@ -0,0 +1,394 @@ +import os +from msal import ConfidentialClientApplication +from datetime import datetime, timedelta +import requests +from functools import wraps +import time +import logging +from io import BytesIO +import pandas as pd + +# Configure logging +logger = logging.getLogger(__name__) +if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def handle_error(response): + """ + Handle errors based on HTTP status codes and log detailed information. + """ + try: + error_json = response.json().get('error', {}) + except ValueError: + error_json = {} + + error_code = error_json.get('code', 'unknownError') + error_message = error_json.get('message', 'No detailed error message provided.') + inner_error = error_json.get('innererror', {}) + details = error_json.get('details', []) + + logger.error(f"Error Code: {error_code}") + logger.error(f"Error Message: {error_message}") + if inner_error: + logger.error(f"Inner Error: {inner_error}") + if details: + logger.error(f"Error Details: {details}") + + if response.status_code == 401: + logger.error("Unauthorized. 
Token might be invalid.") + elif response.status_code == 403: + logger.error("Forbidden. Access denied to the requested resource.") + elif response.status_code == 404: + logger.error("Not Found. The requested resource doesn’t exist.") + elif response.status_code == 429: + retry_after = int(response.headers.get('Retry-After', 5)) # Default to 5 seconds if not provided + logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return 'retry' + elif response.status_code in (500, 503): + retry_after = int(response.headers.get('Retry-After', 5)) # Default to 5 seconds if not provided + logger.error(f"Server error. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return 'retry' + else: + raise ValueError(f"API request failed with status code {response.status_code} - {error_message}") + + raise ValueError(f"API request failed with status code {response.status_code} - {error_message}") + + +def api_call_decorator(func): + """ + Handles various aspects of the API call, including refreshing the access token if needed and handling pagination. + :param func: The function to be decorated. + :return: The wrapped function. 
+ """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + try: + # Check and refresh the access token if needed + if self.is_access_token_expired(): + self.retrieve_access_token() + logger.info("Access token refreshed.") + + # Get the HTTP method, URL, and optionally data from the function + http_method, url, data = func(self, *args, **kwargs) + + # Initialize the results list and handle pagination if page_size is provided + results = [] + page_size = kwargs.get('page_size', None) + response_data = {} + + while url: + response = requests.request(http_method, url, headers=self.headers, json=data) + + # Handle the response + if response.status_code == 200: + response_json = response.json() # Store the response JSON + if page_size: + results.extend(response_json.get('value', [])) + url = response_json.get('@odata.nextLink', None) + else: + response_data = response_json # Capture the full response for consistency + break + else: + retry = handle_error(response) + if retry == 'retry': + continue + + if page_size: + response_data = {'value': results} + + return response_data + + except Exception as e: + logger.exception("An error occurred during the API call.") + raise e + + return wrapper + + +class SharePointClient: + access_token = None + access_token_request_timestamp = None + access_token_expiry = None + headers = None + + TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + + def __init__(self, tenant_id, client_id, client_secret, site_id, access_token=None, + access_token_expiration_details=None): + """ + Initializes the SharePointClient with necessary credentials and site information. + :param tenant_id: The tenant ID. + :param client_id: The client ID. + :param client_secret: The client secret. + :param site_id: The site ID. 
+ :param access_token: The access token (optional) + :param access_token_expiration_details: The access token expiration details (optional) + """ + self.tenant_id = tenant_id + self.client_id = client_id + self.client_secret = client_secret + + if access_token: + if not access_token_expiration_details: + raise ValueError("Access token expiration details must be provided.") + self.access_token = access_token + self.set_access_token_expiration_details(access_token_expiration_details) + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + else: + self.retrieve_access_token() + + # Retrieve static identifiers + self.site_id = site_id + self.document_drive = self.get_documents_drive() + + def get_token_expiration_details(self): + """ + Returns the access token expiration details. Converts the datetime objects to strings for serialization. + :return: + """ + return { + 'access_token_request_timestamp': datetime.strftime( + self.access_token_request_timestamp, self.TIMESTAMP_FORMAT + ), + 'access_token_expiry': datetime.strftime(self.access_token_expiry, self.TIMESTAMP_FORMAT) + } + + def set_access_token_expiration_details(self, access_token_expiration_details): + """ + Sets the access token expiration details from a serialized dictionary. + :param access_token_expiration_details: The serialized access token expiration details. + :return: + """ + self.access_token_request_timestamp = datetime.strptime( + access_token_expiration_details['access_token_request_timestamp'], self.TIMESTAMP_FORMAT + ) + self.access_token_expiry = datetime.strptime( + access_token_expiration_details['access_token_expiry'], self.TIMESTAMP_FORMAT + ) + + def is_access_token_expired(self): + """ + Checks if the access token has expired. If it has, a new access token is retrieved. + :return: True if expired, False otherwise. 
+ """ + return datetime.now() >= self.access_token_expiry + + def retrieve_access_token(self, refresh=False): + """ + Implements authentication using MSAL. + :param refresh: If True, force a refresh of the access token. + :return: None + """ + app = ConfidentialClientApplication( + self.client_id, + authority=f"https://login.microsoftonline.com/{self.tenant_id}", + client_credential=self.client_secret + ) + + scope = ["https://graph.microsoft.com/.default"] + + access_token_request_timestamp = datetime.now() + + if refresh: + logger.info("Forcing refresh of access token.") + token = app.acquire_token_for_client(scopes=scope) + else: + # Check if a token is already cached + token = app.acquire_token_silent(scope, account=None) + + if not token: + token = app.acquire_token_for_client(scopes=scope) + + if "access_token" not in token: + logger.error("Authentication failed.") + raise ValueError("Authentication failed") + + access_token_expiry = access_token_request_timestamp + timedelta( + seconds=token['expires_in'] - 20 + ) + + self.access_token = token + self.access_token_request_timestamp = access_token_request_timestamp + self.access_token_expiry = access_token_expiry + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + + logger.info("Access token retrieved successfully.") + + @api_call_decorator + def get_documents_drive(self): + """ + Get the document drive of the SharePoint site. + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive" + logger.info(f"Getting document drive from URL: {url}") + return 'GET', url, None + + @api_call_decorator + def list_folder_contents(self, drive_id, folder_path: str, page_size: int = 100): + """ + This function will list the contents of a folder in SharePoint. + :param drive_id: The ID of the drive. + :param folder_path: The path of the folder. 
+ :param page_size: The number of items per page (default is 100). + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{folder_path}:/children?$top={page_size}" + logger.info(f"Listing folder contents from URL: {url}") + return 'GET', url, None + + @staticmethod + def download_sharepoint_file(download_url): + """ + Downloads a file from the given URL and returns its content. + + :param download_url: The URL to download the file from. + :return: The content of the downloaded file. + """ + response = requests.get(download_url, stream=True) + response.raise_for_status() # Check if the request was successful + + file_content = BytesIO() + + # Read the file content into memory + for chunk in response.iter_content(chunk_size=8192): + file_content.write(chunk) + + file_content.seek(0) # Reset the file pointer to the beginning + + return file_content + + +def app(): + # Customers for WC 18/11/2024 + # + # ----- Eastlight location ----- + # No data this week, low on data + # Housing Associations/Eastlight/Survey Outcomes/ + # + # ----- Settle location ----- + # No data this week, in separate files + # Housing Associations/Settle/Survey Outcomes/ + # + # ----- Community Housing ----- + # In separate files - will we get to a singular form? 
+ # Housing Associations/Community Housing/Survey Outcomes/ + # + # ----- ACIS location ----- + # Doesn't have this week's data + # Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx + # + # ----- Southern location ----- + # + # + # ------ Unitas location ------ + # Does have this week's data + # Unitas location: Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx + + locations = { + "Unitas": "Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx", + "Eastlight": "Housing Associations/Eastlight/Survey Outcomes/", + "Settle": "Housing Associations/Settle/Survey Outcomes/", + "Community Housing": "Housing Associations/Community Housing/Survey Outcomes/", + "ACIS": "Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx", + "Southern": None, + } + + SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) + SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) + WARMFRONT_SHAREPOINT_SITE_ID = os.getenv("WARMFRONT_SHAREPOINT_SITE_ID", None) + + sharepoint_client = SharePointClient( + tenant_id=SHAREPOINT_TENANT_ID, + client_id=SHAREPOINT_CLIENT_ID, + client_secret=SHAREPOINT_CLIENT_SECRET, + site_id=WARMFRONT_SHAREPOINT_SITE_ID + ) + + results = [] + for customer, location in locations.items(): + if location is None: + continue + + if location.endswith(".xlsx"): + # Read in the file + # List the contents of the folder + location_folder = os.path.dirname(location) + contents = sharepoint_client.list_folder_contents( + drive_id=sharepoint_client.document_drive["id"], + folder_path=location_folder + ) + filepaths = contents["value"] + + download_url = next( + (file['@microsoft.graph.downloadUrl'] for file in filepaths + if '@microsoft.graph.downloadUrl' in file and file['name'] == os.path.basename(location)), + None + ) + + if download_url is None: + raise ValueError("File not found in the SharePoint folder.") + + 
file_content = sharepoint_client.download_sharepoint_file(download_url) + + # Convert to pandas dataframe since file is an excel file + df = pd.read_excel(file_content) + df["Outcome"] = df["Outcome"].str.strip().str.lower() + + # We cannot group by funding type accurately because any job that is not funded will have a NaN value + # and therefore we have a 100% acces rate for funded jobs and 0% otherwise + surveyor_outcomes = [] + for (week, surveyor, funding), group in df.groupby(["Week Commencing", "DEA/REA"]): + funding_type = [x for x in group["Funding Type"].unique() if not pd.isnull(x)] + if funding_type: + funding_type = " + ".join(funding_type) + else: + funding_type = "No Funding" + surveyed = group[group["Outcome"] == "surveyed"] + no_answer = group[ + group["Outcome"] == "no answer" + ] + other_issue = group[~group["Outcome"].isin(["surveyed", "no answer"])] + + surveyor_outcomes.append( + { + "Surveyor": surveyor, + "Week": week, + "Funding": funding_type, + "Surveyed": surveyed.shape[0], + "No Answer": no_answer.shape[0], + "Other Issue": other_issue.shape[0], + } + ) + + surveyor_outcomes = pd.DataFrame(surveyor_outcomes) + surveyor_outcomes["Week"] = pd.to_datetime(surveyor_outcomes["Week"]) + + weekly_access = ( + surveyor_outcomes.drop(columns=["Surveyor"]).groupby(["Week", "Funding"]).sum().reset_index() + ) + # Sort by week and surveyor ascending + surveyor_outcomes = surveyor_outcomes.sort_values(["Week", "Surveyor"], ascending=[True, True]) + surveyor_outcomes["Access Rate"] = 100 * surveyor_outcomes["Surveyed"] / ( + surveyor_outcomes["Surveyed"] + surveyor_outcomes["No Answer"] + surveyor_outcomes["Other Issue"] + ) + + weekly_access["Total"] = ( + weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"] + ) + weekly_access["Access Rate"] = 100 * weekly_access["Surveyed"] / ( + weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"] + ) diff --git 
a/etl/access_reporting/requirements.txt b/etl/access_reporting/requirements.txt new file mode 100644 index 00000000..8e6dbb08 --- /dev/null +++ b/etl/access_reporting/requirements.txt @@ -0,0 +1,11 @@ +python-docx==0.8.11 +PyPDF2==3.0.1 +boto3 +requests +pandas +pyarrow==12.0.1 +openpyxl==3.1.2 +usaddress==0.5.10 +pdfplumber==0.10.3 +msgpack==1.0.5 +msal \ No newline at end of file diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py new file mode 100644 index 00000000..fe195f7d --- /dev/null +++ b/etl/customers/cottons/remote_assessments.py @@ -0,0 +1,102 @@ +import os +import time + +from tqdm import tqdm +import pandas as pd +from dotenv import load_dotenv +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.s3 import save_csv_to_s3 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +PORTFOLIO_ID = 121 +USER_ID = 8 + + +def app(): + """ + Prepares the inputs to produce the remote assessments for Cottons + :return: + """ + + # Read in the asset list + cottons_asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull.xlsx" + ) + # A number are missing EPCs due to the space in the postcode + # Breakdowns: + # C 119 + # D 106 + # E 26 + # B 5 + # + # Take the EPC D/E properties + asset_list = cottons_asset_list[ + cottons_asset_list["EPC rating on register"].isin(["D", "E"]) + ] + asset_list = asset_list.reset_index(drop=True) + asset_list["row_id"] = asset_list.index + asset_list["uprn"] = asset_list["uprn"].astype(int) + + extracted_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + add1 = home["address1"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key="" + ) + 
epc_searcher.find_property(skip_os=True) + + find_epc_searcher = RetrieveFindMyEpc(address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"]) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + extracted_data.append( + { + "uprn": home["uprn"], + **find_epc_data, + } + ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + } + print(body) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index b6394275..4db72b23 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -295,6 +295,7 @@ class RetrieveFindMyEpc: "Change room heaters to condensing boiler": ["boiler_upgrade"], "Cylinder thermostat": ["cylinder_thermostat"], "Heat recovery system for mixer showers": ["heat_recovery_shower"], + "Room-in-roof insulation": ["room_in_roof_insulation"], } survey = True diff --git a/etl/route_march_data_pull/app.py 
b/etl/route_march_data_pull/app.py index b53b36c2..0f3e0068 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -30,9 +30,12 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): postcode = home[postcode_column] house_number = home[address1_column] full_address = home[fulladdress_column] + house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) + if house_no is None: + house_no = house_number searcher = SearchEpc( - address1=str(house_number), + address1=str(house_no), postcode=postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="", @@ -46,6 +49,34 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) + + # Check if we have a flat or appartment + if searcher.newest_epc is None: + # Try again: + if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: + # Backup + add1 = full_address.split(",")[1].strip() + else: + add1 = str(house_number) + searcher = SearchEpc( + address1=add1, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + + if ( + "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in + house_number.lower() + ): + searcher.ordnance_survey_client.property_type = "Flat" + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: no_epc.append(home["row_id"]) continue From 477504abd136c17aaca4ba0ab8757d59bdf84e0a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Dec 2024 19:08:18 +0000 Subject: [PATCH 129/255] adding non-intrusive sap points and survey flag pick up for multiple recommendations --- backend/Property.py | 12 ++++++++++++ backend/app/assumptions.py | 1 + etl/customers/cottons/remote_assessments.py | 11 ++++++++++- recommendations/DraughtProofingRecommendations.py 
| 5 ++++- recommendations/HeatingRecommender.py | 13 ++++++++++++- recommendations/HotwaterRecommendations.py | 9 +++++---- recommendations/RoofRecommendations.py | 10 ++++++++-- recommendations/WallRecommendations.py | 10 ++++++++-- 8 files changed, 60 insertions(+), 11 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 31f207ab..cc5bf12b 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -426,6 +426,18 @@ class Property: if phase_epc_transformation[k] == v: continue + if k == "hotwater-description": + if ( + v == "From main system" + ) and ( + phase_epc_transformation["mainheat-description"] == "Electric storage heaters" + ) and ( + "Electric immersion" in phase_epc_transformation["hotwater-description"] + ): + # It means we've recommended HHR with electric immersion, and shouldn't overwrite + # the hot water description + continue + raise NotImplementedError( "Already have this key in the phase_epc_transformation - implement me" ) diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 79f2a087..44838a47 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -50,4 +50,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = { }, "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85}, "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1}, + "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, } diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py index fe195f7d..6ac895f1 100644 --- a/etl/customers/cottons/remote_assessments.py +++ b/etl/customers/cottons/remote_assessments.py @@ -40,6 +40,7 @@ def app(): asset_list["uprn"] = asset_list["uprn"].astype(int) extracted_data = [] + model_asset_list = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): add1 = home["address1"] pc = home["postcode"] @@ -63,6 +64,14 @@ def app(): } ) + model_asset_list.append( + { + "uprn": home["uprn"], + "address": 
epc_searcher.newest_epc["address1"], + "postcode": epc_searcher.newest_epc["postcode"], + } + ) + non_invasive_recommendations = [ { "uprn": r["uprn"], @@ -72,7 +81,7 @@ def app(): filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" save_csv_to_s3( - dataframe=pd.DataFrame(asset_list), + dataframe=pd.DataFrame(model_asset_list), bucket_name="retrofit-plan-inputs-dev", file_name=filename ) diff --git a/recommendations/DraughtProofingRecommendations.py b/recommendations/DraughtProofingRecommendations.py index 4bd85a03..a16a94f6 100644 --- a/recommendations/DraughtProofingRecommendations.py +++ b/recommendations/DraughtProofingRecommendations.py @@ -26,6 +26,9 @@ class DraughtProofingRecommendations: if not draught_proofing_recommendation_config: return + # Cost is based on a £50 cost per window, based on Checkatrade + cost = draught_proofing_recommendation_config.get("cost", self.property.number_of_windows * 50) + description = ( "Draught proof doors and windows to improve energy efficiency" if not draught_proofing_recommendation_config.get("description") @@ -48,7 +51,7 @@ class DraughtProofingRecommendations: "kwh_savings": 0, "co2_equivalent_savings": 0, "energy_cost_savings": 0, - "total": draught_proofing_recommendation_config["cost"], + "total": cost, # We use a very simple and rough estimate of 4 hours per unit "labour_hours": draught_proofing_recommendation_config.get("labour_hours", 8), "labour_days": draught_proofing_recommendation_config.get("labour_days", 1), # Assume 8 hour day diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index 7dc4f8b2..a4443bad 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -1,5 +1,6 @@ import re import backend.app.assumptions as assumptions +from etl.customers.immo.pilot.asset_list import non_invasive_recommendations from recommendations.Costs import Costs, BOILER_UPGRADE_SCHEME_ASHP_VALUE from recommendations.recommendation_utils import 
( check_simulation_difference, override_costs, combine_recommendation_configs @@ -981,6 +982,10 @@ class HeatingRecommender: self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"] ) + non_invasive_recommendation = next(( + r for r in self.property.non_invasive_recommendations if r["type"] == "boiler_upgrade" + ), {}) + if has_inefficient_space_heating or has_inefficient_water: boiler_size = self.estimate_boiler_size( property_type=self.property.data["property-type"], @@ -1079,12 +1084,13 @@ class HeatingRecommender: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": description_simulation, **boiler_costs, "system_type": "boiler_upgrade", + "survey": non_invasive_recommendation.get("survey", None) } # We recommend the heating controls @@ -1105,6 +1111,11 @@ class HeatingRecommender: if not controls_recommender.recommendation and not boiler_recommendation: return + # If this is true, we set SAP points to None and survey to False for the boiler recommendation + if boiler_recommendation: + boiler_recommendation["sap_points"] = None + boiler_recommendation["survey"] = False + if not system_change and len(boiler_recommendation): # If there is not a system change, we add the boiler recommendation at point. 
self.heating_recommendations.extend([boiler_recommendation]) diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index b86329e4..d8404cc1 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -20,6 +20,8 @@ class HotwaterRecommendations: :return: """ # Reset the recommendations + recommendations_phase = phase + self.recommendations = [] non_invasive_recommendations = self.property.non_invasive_recommendations if non_invasive_recommendations: @@ -28,7 +30,6 @@ class HotwaterRecommendations: r["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"] ] - recommendations_phase = phase for m in measures: non_invasive_rec = [ r for r in non_invasive_recommendations if r["type"] == m @@ -55,7 +56,7 @@ class HotwaterRecommendations: if self.property.hotwater["clean_description"] == "Gas boiler/circulator, no cylinder thermostat": # Handle this case specifically: - self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=phase) + self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=recommendations_phase) return # If there is no system present, but access to the mains, we @@ -68,14 +69,14 @@ class HotwaterRecommendations: (self.property.hotwater["no_system_present"] is None) & (len(has_tank_recommendation) == 0) ): - self.recommend_tank_insulation(phase=phase) + self.recommend_tank_insulation(phase=recommendations_phase) return has_cylinder_recommendation = [r for r in self.recommendations if r["type"] == "cylinder_thermostat"] if ((self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat") & (len(has_cylinder_recommendation) == 0)): - self.recommend_cylinder_thermostat(phase=phase) + self.recommend_cylinder_thermostat(phase=recommendations_phase) return def recommend_tank_insulation(self, phase, sap_points=None, survey=False, _return=False): diff --git a/recommendations/RoofRecommendations.py 
b/recommendations/RoofRecommendations.py index 51264b75..4e29083f 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -290,6 +290,11 @@ class RoofRecommendations: insulation_materials = pd.DataFrame(insulation_materials) + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + lowest_selected_u_value = None recommendations = [] for _, insulation_material_group in insulation_materials.groupby("description"): @@ -429,14 +434,15 @@ class RoofRecommendations: "description": self.make_roof_insulation_description(material), "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": non_invasive_recommendations.get("sap_points", 0), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { "roof-description": new_description, "roof-energy-eff": new_efficiency }, - **cost_result + **cost_result, + "survey": non_invasive_recommendations.get("survey", False) } ) diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index f77ae5a0..92147fb8 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -385,6 +385,11 @@ class WallRecommendations(Definitions): if insulation_thickness == "below average": cavity_width = cavity_width * (1 - PARTIALLY_FILLED_PERCENTAGE_ASSUMPTION) + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + # Test the different fill options lowest_selected_u_value = None recommendations = [] @@ -475,14 +480,15 @@ class WallRecommendations(Definitions): "description": description, "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": non_invasive_recommendations.get("sap_points", None), 
"already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { "walls-description": "Cavity wall, filled cavity", "walls-energy-eff": "Good" }, - **cost_result + **cost_result, + "survey": non_invasive_recommendations.get("survey", False) } ) From 703c4e3ac1deacbb48ec1a2432c1a0c0b631c980 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 5 Dec 2024 13:45:19 +0000 Subject: [PATCH 130/255] adding additional coverage to RetrieveFindMyEpc --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/app/plan/router.py | 3 +- etl/customers/cottons/remote_assessments.py | 19 +++++++++++-- etl/find_my_epc/RetrieveFindMyEpc.py | 14 +++++++++- etl/route_march_data_pull/app.py | 31 +++++++++++++-------- recommendations/HeatingRecommender.py | 7 ++--- recommendations/Recommendations.py | 12 ++++++-- 8 files changed, 65 insertions(+), 25 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 4a5b3bd4..dbef6435 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -366,7 +366,7 @@ def extract_property_request_data( property_non_invasive_recommendations["recommendations"] = str(transformed) property_valution = next(( - float(x["value"]) for x in valuation_data if + float(x["valuation"]) for x in valuation_data if (str(x["uprn"]) == str(uprn)) ), None) @@ -611,6 +611,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_instance=property_instance, all_predictions=all_predictions, recommendations=recommendations, + representative_recommendations=representative_recommendations ) ) diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py index 
6ac895f1..7855a1a9 100644 --- a/etl/customers/cottons/remote_assessments.py +++ b/etl/customers/cottons/remote_assessments.py @@ -10,8 +10,8 @@ from utils.s3 import save_csv_to_s3 load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -PORTFOLIO_ID = 121 USER_ID = 8 +PORTFOLIO_ID = 121 def app(): @@ -22,7 +22,8 @@ def app(): # Read in the asset list cottons_asset_list = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull.xlsx" + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull with " + "valuations.xlsx" ) # A number are missing EPCs due to the space in the postcode # Breakdowns: @@ -79,6 +80,9 @@ def app(): } for r in extracted_data ] + valuations_data = asset_list[["uprn", "Zoopla Valuation"]].copy().rename(columns={"Zoopla Valuation": "valuation"}) + valuations_data = valuations_data[~pd.isnull(valuations_data["valuation"])] + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" save_csv_to_s3( dataframe=pd.DataFrame(model_asset_list), @@ -94,6 +98,14 @@ def app(): file_name=non_invasive_recommendations_filename ) + # Store the valuations data in s3 + valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv" + save_csv_to_s3( + dataframe=valuations_data, + bucket_name="retrofit-plan-inputs-dev", + file_name=valuations_filename + ) + body = { "portfolio_id": str(PORTFOLIO_ID), "housing_type": "Social", @@ -103,9 +115,10 @@ def app(): "already_installed_file_path": "", "patches_file_path": "", "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, - "valuation_file_path": "", + "valuation_file_path": valuations_filename, "scenario_name": "Wave 3 Packages", "multi_plan": True, "budget": None, + "exclusions": ['air_source_heat_pump', 'boiler_upgrade', 'floor_insulation'] } print(body) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 4db72b23..3dd486b3 
100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -282,7 +282,8 @@ class RetrieveFindMyEpc: "Low energy lighting for all fixed outlets": ["low_energy_lighting"], "Cylinder thermostat recommendation": [], "Heating controls recommendation": [], - "Replace boiler with Band A condensing boiler": [], + "Replace boiler with Band A condensing boiler": ["boiler_upgrade"], + "Band A condensing gas boiler": ["boiler_upgrade"], "Solar panel recommendation": [], "Double glazing recommendation": [], "Solid wall insulation recommendation": [], @@ -296,6 +297,17 @@ class RetrieveFindMyEpc: "Cylinder thermostat": ["cylinder_thermostat"], "Heat recovery system for mixer showers": ["heat_recovery_shower"], "Room-in-roof insulation": ["room_in_roof_insulation"], + "Fan assisted storage heaters": [], + "Fan-assisted storage heaters": [], + "Step 1:": [], + "Biomass stove with boiler": [], + "Replace boiler with biomass boiler": [], + "Heating controls (room thermostat and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Heating controls (programmer, and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 0f3e0068..11dd19b8 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -25,6 +25,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): epc_data = [] errors = [] no_epc = [] + # home = asset_list[asset_list["row_id"] == errors[15]].squeeze() for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] @@ -94,7 +95,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): ) find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() except ValueError as e: - if "No EPC found" in str(e): + if 
"No EPC found" in str(e) and "address1" in searcher.newest_epc: find_epc_searcher = RetrieveFindMyEpc( address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] ) @@ -151,17 +152,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/" - DATA_FILENAME = "Cottons Asset List.xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = "postcode" - FULLADDRESS_COLUMN = "Property Address" - ADDRESS1_COLUMN = "address1" - ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = [] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford" + DATA_FILENAME = "BROMFORD - SOLAR PV ROOFs INSPECTED - Electric only properties getting to C list.xlsx" + SHEET_NAME = "MAIN" + POSTCODE_COLUMN = "Post Code" + FULLADDRESS_COLUMN = "Full Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_two_words" + ADDRESS_COLS_TO_CONCAT = ["House No", "Street", "District"] asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() + asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces @@ -249,6 +250,8 @@ def app(): [ "row_id", "uprn", + "address1", + "postcode", "property-type", "built-form", "inspection-date", @@ -256,6 +259,7 @@ def app(): "current-energy-efficiency", "roof-description", "walls-description", + "floor-description", "transaction-type", # New fields needed "secondheat-description", @@ -268,7 +272,7 @@ def app(): "energy-consumption-current", # kwh/m2 "photo-supply", ] - ] + ].rename(columns={"address1": "Address1 on EPC", "postcode": "Postcode on EPC"}) asset_list = asset_list.merge( epc_df, @@ -308,6 +312,7 @@ def app(): "number-habitable-rooms": "Number of Habitable Rooms", "walls-description": "Wall Construction", 
"roof-description": "Roof Construction", + "floor-description": "Floor Construction", "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", "transaction-type": "Reason for last EPC", @@ -363,3 +368,7 @@ def app(): # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" asset_list.to_excel(filename, index=False) + + matches_review = asset_list[ + [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address1 on EPC", "Postcode on EPC"] + ] diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index a4443bad..1eab7d42 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -1111,17 +1111,14 @@ class HeatingRecommender: if not controls_recommender.recommendation and not boiler_recommendation: return - # If this is true, we set SAP points to None and survey to False for the boiler recommendation - if boiler_recommendation: - boiler_recommendation["sap_points"] = None - boiler_recommendation["survey"] = False - if not system_change and len(boiler_recommendation): # If there is not a system change, we add the boiler recommendation at point. 
self.heating_recommendations.extend([boiler_recommendation]) if system_change: # We combine the heating and controls recommendations, in the case of a system change + # If this is true, we set SAP points to None and survey to False for the boiler recommendation + combined_recommendations = [] for controls_recommendation in controls_recommender.recommendation: combined_recommendation = self.combine_heating_and_controls( diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index ed6a8526..189581d8 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -311,7 +311,7 @@ class Recommendations: continue has_u_value = recommendations_by_type[0].get("new_u_value") is not None - has_sap_points = recommendations_by_type[0].get("sap_points") is not None + has_sap_points = all([r.get("sap_points") is not None for r in recommendations_by_type]) has_rank = recommendations_by_type[0].get("rank") is not None # When check if these recommendations have two different types, such as solid wall insulation @@ -449,6 +449,7 @@ class Recommendations: property_instance, all_predictions, recommendations, + representative_recommendations, ): """ @@ -473,6 +474,9 @@ class Recommendations: property_recommendations = recommendations[property_instance.id].copy() + representative_recs = representative_recommendations[property_instance.id].copy() + representative_ids = [r["recommendation_id"] for r in representative_recs] + increasing_variables = ["sap"] decreasing_variables = ["carbon", "heat_demand"] @@ -530,7 +534,9 @@ class Recommendations: else: - previous_phase_values_multiple = [x for x in impact_summary if x["phase"] == (rec["phase"] - 1)] + previous_phase_values_multiple = [ + x for x in impact_summary if x["phase"] == (rec["phase"] - 1) and x["representative"] + ] if len(previous_phase_values_multiple) != 1: # Take an average of each of the previous phases keys_to_median = ["sap", "carbon", "heat_demand"] @@ -628,7 
+634,9 @@ class Recommendations: impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], **current_phase_values } ) From c41891f0faaaf19753d6f5a2279918718ad34e29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 5 Dec 2024 14:20:43 +0000 Subject: [PATCH 131/255] adding manual uprn override --- etl/route_march_data_pull/app.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 11dd19b8..3c8cfa31 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -21,7 +21,7 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column): +def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map): epc_data = [] errors = [] no_epc = [] @@ -34,6 +34,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) if house_no is None: house_no = house_number + uprn = manual_uprn_map.get(full_address, None) searcher = SearchEpc( address1=str(house_no), @@ -43,7 +44,8 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): property_type=None, fast=True, full_address=full_address, - max_retries=5 + max_retries=5, + uprn=uprn ) # Force the skipping of estimating the EPC searcher.ordnance_survey_client.property_type = None @@ -52,7 +54,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.find_property(skip_os=True) # Check if we have a flat or appartment - if searcher.newest_epc is None: + if searcher.newest_epc is None and uprn is None: # Try again: if 
SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: # Backup @@ -161,6 +163,13 @@ def app(): ADDRESS1_METHOD = "first_two_words" ADDRESS_COLS_TO_CONCAT = ["House No", "Street", "District"] + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = { + "1 Ivy Court, The Gardens, Erdington, Birmingham": 100071442178, + "8 Ivy Court, The Gardens, Erdington, Birmingham": 10033393299, + "7 Ivy Court, The Gardens, Erdington, Birmingham": 100071442184, + } + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index @@ -193,7 +202,8 @@ def app(): asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP ) # We now retrieve any failed properties From 3c98cfa7cc31fa4db43a318726afe85d830775e7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 10 Dec 2024 17:02:59 +0000 Subject: [PATCH 132/255] reviewing stonewater assigned packages --- etl/customers/gla/hug_postcodes.py | 4 + .../stonewater/Wave 3 Preparation.py | 125 ++++++++++++++++++ .../whlg eligibile properties.py | 77 +++++++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 35 +++-- 5 files changed, 224 insertions(+), 18 deletions(-) create mode 100644 etl/customers/waltham_forest/whlg eligibile properties.py diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py index ac2d1e3c..fc89b6f2 100644 --- a/etl/customers/gla/hug_postcodes.py +++ b/etl/customers/gla/hug_postcodes.py @@ -54,6 +54,10 @@ for directory in tqdm(epc_directories): & (off_gas["is_heritage_building"] != True) ] + off_gas = off_gas[ + off_gas["tenure"].isin(["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"]) + ] + 
region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") aggregation.append(region_summary) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 77200e69..bd36d782 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2607,5 +2607,130 @@ def propsed_wave_3_sample(): len({v for v in units_in_bid if str(v) in u_aids}) len(list(set(units_in_bid))) + +def identify_incorrect_pacakges(): + """ + Due to limitations in the data collected during survey, we have some properties that do not have suitable packages + assigned. This function will identify those properties, which can be flagged for Stonewater's review + """ + + units_with_assigned_packages = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.20 V2.xlsx"), + header=2, + sheet_name="Individual Units Programme" + ) + + # This sheet contains information on the heating systems for properties, so we can flag any units that have + # been labelled as being electric but are actually gas + heating_survey_data = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "STOCKBOOK December 2024 data (5).xlsx"), + header=0, + sheet_name="Export" + ) + + units_with_assigned_packages = units_with_assigned_packages.merge( + heating_survey_data[["Asset Reference", "Heating Type"]], how="left", + left_on="Org. 
ref.", right_on="Asset Reference" + ) + + # Check the different heating types + units_with_assigned_packages["Gas properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"])) & ( + units_with_assigned_packages["Heating"].isin( + [ + "Heat Pump: Electric Heat " + "pumps: Air source heat pump " + "with flow temperature <= 35°C", + "Electric Storage Systems: Fan " + "storage heaters", + "Electric (direct acting) room " + "heaters: Panel, convector or " + "radiant heaters" + ] + ) + ) + ) + + units_with_assigned_packages["Electric properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Electric") & ( + units_with_assigned_packages["Heating"].isin( + [ + "Boiler: A rated Regular Boiler", + "Boiler: F rated Combi", + "No Heating", + "Boiler: A rated CPSU", + "Boiler: G rated Regular Boiler" + ] + ) + ) + ) + + units_with_assigned_packages["Ground Source properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Ground Source") & ( + units_with_assigned_packages["Heating"].isin( + [ + "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C", + "Electric Storage Systems: Fan storage heaters", + "Electric Storage Systems: High heat retention storage heaters" + ] + ) + ) + ) + + units_with_assigned_packages["LPG properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Lpg") & ( + units_with_assigned_packages["Main Fuel"].isin( + [ + "Gas: Mains Gas", "Solid Fuel: Wood Logs, Gas: Mains Gas" + ] + ) + ) + ) + + units_with_assigned_packages["Solid Fuel properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Solid Fuel") & ( + units_with_assigned_packages["Main Fuel"].isin( + [ + "Gas: Mains Gas" + ] + ) + ) + ) + + # The next check is to identify properties with specific features that are not condusive to specific packages. E.g. 
+ # Solar PV packages for properties that have another dwelling above + + z = units_with_assigned_packages[ + units_with_assigned_packages["Package Ref"].isin( + [ + "3A", "3B", "4", 4 + ] + ) + ] + z["Roof Type"].value_counts() + z["Survey: Main Roof Type"].value_counts() + + z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ + "Survey: Matching Address ID"].value_counts() + + zz = z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ + ["Survey: Matching Address ID", "Survey: Org. ref.", "Survey: Main Roof Type"] + ].drop_duplicates() + zz = zz.sort_values("Survey: Matching Address ID") + zz.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "3A, 3B or 4 Packages with a dwelling above.csv"), index=False) + + z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")]["Package Ref"].value_counts() + + # Label properties that have been matched to a package, during coordination, that includes Solar PV and has + # a property with a dwelling above + units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( + (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & ( + units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above") + ) + ) + + # Label properties that have a dwelling above in the Parity data, and weren't surveyed, but have been assigned + # a package that includes solar PV + # if __name__ == "__main__": # main() diff --git a/etl/customers/waltham_forest/whlg eligibile properties.py b/etl/customers/waltham_forest/whlg eligibile properties.py new file mode 100644 index 00000000..fee988c1 --- /dev/null +++ b/etl/customers/waltham_forest/whlg eligibile properties.py @@ -0,0 +1,77 @@ +""" +This is the list of properties, based on the EPC data, that look eligible for WHLG +""" +import pandas as pd +from etl.epc.settings import EARLIEST_EPC_DATE +from etl.spatial.OpenUprnClient import OpenUprnClient + +epc_data = pd.read_csv( + 
"/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates/domestic-E09000031-Waltham-Forest/certificates.csv" +) +epc_data.columns = [c.replace("_", "-").lower() for c in epc_data.columns] +epc_data = epc_data[epc_data["lodgement-date"] >= EARLIEST_EPC_DATE] + +epc_data = epc_data[~pd.isnull(epc_data["uprn"])] +epc_data["uprn"] = epc_data["uprn"].astype(int) + +epc_data = epc_data[epc_data["current-energy-rating"].isin(["D", "E", "F", "G"])] +epc_data = epc_data[epc_data["tenure"].isin( + ["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"]) +] + +whlg_eligible_postcodes = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", + sheet_name="Eligible postcodes", + header=1 +) +# Format: +whlg_eligible_postcodes = whlg_eligible_postcodes[['Postcode', 'Local Authority']] + +uprns = epc_data["uprn"].unique() +# Get data +ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") +epc_data = epc_data.merge( + ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename( + columns={"UPRN": "uprn"} + ), + how="left", + on="uprn", +) + +epc_data["has_conservation_restrictions"] = ( + (epc_data["conservation_status"] == True) + | (epc_data["is_listed_building"] == True) + | (epc_data["is_heritage_building"] == True) +) + +# Pathway 1: +# Match based on eligible postcodes +pathway1 = epc_data[epc_data["postcode"].isin(whlg_eligible_postcodes["Postcode"].values)] +pathway1 = pathway1[ + [ + "uprn", "address", "address1", "postcode", "current-energy-rating", "current-energy-efficiency", + "lodgement-date", + "has_conservation_restrictions", "walls-description", "roof-description", "mainheat-description" + ] +] + +pathway1 = pathway1.rename( + columns={ + "current-energy-rating": "EPC Rating", "current-energy-efficiency": "SAP Score", + "lodgement-date": "EPC Date", "has_conservation_restrictions": "Conservation Area Restrictions", + "walls-description": "Wall Type", 
"roof-description": "Roof Type", "mainheat-description": "Main Heating" + } +) + +pathway1["EPC Date"] = pd.to_datetime(pathway1["EPC Date"]).dt.strftime("%Y-%m-%d") +# Create a year EPC was lodged +pathway1["EPC Year"] = pd.to_datetime(pathway1["EPC Date"]).dt.year + +pathway1.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Waltham Forest WHLG - Pathway 1 Eligibility.csv", + index=False +) + +# Pathway 2 or 3 +# The household will need to be means tested +pathway2 = epc_data[~epc_data["uprn"].isin(pathway1["uprn"].values)] diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 3dd486b3..5ea35a64 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -308,6 +308,7 @@ class RetrieveFindMyEpc: "Heating controls (programmer, and thermostatic radiator valves)": [ "roomstat_programmer_trvs", "time_temperature_zone_control" ], + "Replacement warm air unit": [] } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 3c8cfa31..9ed55185 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -25,7 +25,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m epc_data = [] errors = [] no_epc = [] - # home = asset_list[asset_list["row_id"] == errors[15]].squeeze() + # home = asset_list[asset_list["row_id"] == errors[5]].squeeze() for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] @@ -154,21 +154,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford" - DATA_FILENAME = "BROMFORD - SOLAR PV ROOFs INSPECTED - Electric only properties getting to C list.xlsx" - SHEET_NAME = "MAIN" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Full Address" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_two_words" - ADDRESS_COLS_TO_CONCAT = ["House No", "Street", 
"District"] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Watford" + DATA_FILENAME = "JS Mailing List 10122024.xlsx" + SHEET_NAME = "Export" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Property Address" + ADDRESS1_COLUMN = "Address Line 1" + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = [] # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = { - "1 Ivy Court, The Gardens, Erdington, Birmingham": 100071442178, - "8 Ivy Court, The Gardens, Erdington, Birmingham": 10033393299, - "7 Ivy Court, The Gardens, Erdington, Birmingham": 100071442184, - } + MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() @@ -197,6 +193,7 @@ def app(): # Drop the dupes print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") asset_list = asset_list[~asset_list["deduper"].duplicated()] + asset_list = asset_list.drop(columns=["deduper"]) epc_data, errors, no_epc = get_data( asset_list=asset_list, @@ -212,7 +209,8 @@ def app(): asset_list=asset_list_failed, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP ) # Append the failed data to the main data @@ -261,6 +259,7 @@ def app(): "row_id", "uprn", "address1", + "address", "postcode", "property-type", "built-form", @@ -282,7 +281,7 @@ def app(): "energy-consumption-current", # kwh/m2 "photo-supply", ] - ].rename(columns={"address1": "Address1 on EPC", "postcode": "Postcode on EPC"}) + ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}) asset_list = asset_list.merge( epc_df, @@ -376,9 +375,9 @@ def app(): asset_list = asset_list.drop(columns=["row_id"]) # Store as an excel - filename = os.path.join(DATA_FOLDER, 
".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx" asset_list.to_excel(filename, index=False) matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address1 on EPC", "Postcode on EPC"] + [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] ] From 2b7ca82d09aea93737d5c93cb0619c55aba71063 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 10 Dec 2024 18:55:30 +0000 Subject: [PATCH 133/255] creating checking code for Stonewater --- .../stonewater/Wave 3 Preparation.py | 76 +++++++++++++------ 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bd36d782..4e336f23 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2699,28 +2699,6 @@ def identify_incorrect_pacakges(): # The next check is to identify properties with specific features that are not condusive to specific packages. E.g. # Solar PV packages for properties that have another dwelling above - - z = units_with_assigned_packages[ - units_with_assigned_packages["Package Ref"].isin( - [ - "3A", "3B", "4", 4 - ] - ) - ] - z["Roof Type"].value_counts() - z["Survey: Main Roof Type"].value_counts() - - z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ - "Survey: Matching Address ID"].value_counts() - - zz = z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ - ["Survey: Matching Address ID", "Survey: Org. 
ref.", "Survey: Main Roof Type"] - ].drop_duplicates() - zz = zz.sort_values("Survey: Matching Address ID") - zz.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "3A, 3B or 4 Packages with a dwelling above.csv"), index=False) - - z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")]["Package Ref"].value_counts() - # Label properties that have been matched to a package, during coordination, that includes Solar PV and has # a property with a dwelling above units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( @@ -2731,6 +2709,60 @@ def identify_incorrect_pacakges(): # Label properties that have a dwelling above in the Parity data, and weren't surveyed, but have been assigned # a package that includes solar PV + units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( + (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & ( + units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above") + ) + ) + + # We now iterate through postcodes and find anomalous properties based on the partiy data and survey data + fields_to_check = [ + 'Wall Type', 'Roof Type', 'Heating', 'Main Fuel', + 'Survey: Main Wall Type', + 'Survey: Main Roof Type', 'Survey: Primary Heating System' + ] + # Create an empty dictionary to store results + aggregated_results = {} + + units_with_assigned_packages['Wall Type'] = units_with_assigned_packages['Wall Type'].str.replace( + r'\s*\(.*?\)', '', regex=True + ) + + def check_mixed_types(row): + # Count distinct primary types with non-zero values + primary_types_present = set() + for col in field_counts.columns: + if ':' in col: + primary_type = col.split(':')[0] + if row[col] > 0: # Non-zero count means this type is present + primary_types_present.add(primary_type) + return len(primary_types_present) > 1 # True if more than one primary type + + # Process each field + for field in fields_to_check: + # Group by 
postcode and count occurrences of each unique value + field_counts = ( + units_with_assigned_packages.groupby(['Postcode', field]) + .size() + .unstack(fill_value=0) + .reset_index() + ) + + # Calculate dominant value and percentage before modifying the DataFrame + dominant_value = field_counts.iloc[:, 1:].idxmax(axis=1) + dominant_percentage = ( + (field_counts.iloc[:, 1:].max(axis=1) / field_counts.iloc[:, 1:].sum(axis=1)) * 100 + ) + number_of_properties = field_counts.iloc[:, 1:].sum(axis=1) + + # Add these as new columns after computation + field_counts['Dominant Value'] = dominant_value + field_counts['% Dominant'] = dominant_percentage + field_counts['Number of Properties'] = number_of_properties + field_counts['Mixed Type'] = field_counts.apply(check_mixed_types, axis=1) + + # Store the result in the dictionary + aggregated_results[field] = field_counts # if __name__ == "__main__": # main() From 000fe4dabb576e5ff22fe36a3bc0cdab46139f46 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 13 Dec 2024 09:34:15 +0000 Subject: [PATCH 134/255] minor stonewater work --- .../stonewater/Wave 3 Preparation.py | 182 ++++++++++++++++-- 1 file changed, 161 insertions(+), 21 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4e336f23..d2232f40 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -6,6 +6,8 @@ import numpy as np from tqdm import tqdm from collections import Counter from scipy.optimize import linprog + +from SearchEpc import SearchEpc from utils.s3 import read_pickle_from_s3 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" @@ -2608,7 +2610,7 @@ def propsed_wave_3_sample(): len(list(set(units_in_bid))) -def identify_incorrect_pacakges(): +def identify_incorrect_packages(): """ Due to limitations in the data collected during survey, we have some properties that do not 
have suitable packages assigned. This function will identify those properties, which can be flagged for Stonewater's review @@ -2635,21 +2637,23 @@ def identify_incorrect_pacakges(): # Check the different heating types units_with_assigned_packages["Gas properties: different to Parity"] = ( - (units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"])) & ( - units_with_assigned_packages["Heating"].isin( - [ - "Heat Pump: Electric Heat " - "pumps: Air source heat pump " - "with flow temperature <= 35°C", - "Electric Storage Systems: Fan " - "storage heaters", - "Electric (direct acting) room " - "heaters: Panel, convector or " - "radiant heaters" - ] + ( + units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"]) + ) & ( + units_with_assigned_packages["Heating"].isin( + [ + "Heat Pump: Electric Heat " + "pumps: Air source heat pump " + "with flow temperature <= 35°C", + "Electric Storage Systems: Fan " + "storage heaters", + "Electric (direct acting) room " + "heaters: Panel, convector or " + "radiant heaters" + ] + ) ) ) - ) units_with_assigned_packages["Electric properties: different to Parity"] = ( (units_with_assigned_packages["Heating Type"] == "Electric") & ( @@ -2717,17 +2721,26 @@ def identify_incorrect_pacakges(): # We now iterate through postcodes and find anomalous properties based on the partiy data and survey data fields_to_check = [ - 'Wall Type', 'Roof Type', 'Heating', 'Main Fuel', + 'Wall Type Category', + # 'Roof Type Category', - not very interesting + 'Heating', + 'Main Fuel', 'Survey: Main Wall Type', - 'Survey: Main Roof Type', 'Survey: Primary Heating System' + # 'Survey: Main Roof Type', + 'Survey: Primary Heating System' ] - # Create an empty dictionary to store results - aggregated_results = {} - units_with_assigned_packages['Wall Type'] = units_with_assigned_packages['Wall Type'].str.replace( + units_with_assigned_packages['Wall Type Category'] = units_with_assigned_packages['Wall Type'].str.replace( 
r'\s*\(.*?\)', '', regex=True ) + # Create roof type category by splitting in colon and taking the first part + units_with_assigned_packages['Roof Type Category'] = units_with_assigned_packages['Roof Type'].str.split(':').str[0] + + units_with_assigned_packages["Street, Region and Postcode"] = ( + units_with_assigned_packages["Street and Region"] + ", " + units_with_assigned_packages["Postcode"] + ) + def check_mixed_types(row): # Count distinct primary types with non-zero values primary_types_present = set() @@ -2738,11 +2751,11 @@ def identify_incorrect_pacakges(): primary_types_present.add(primary_type) return len(primary_types_present) > 1 # True if more than one primary type - # Process each field + aggregated_results = {} for field in fields_to_check: # Group by postcode and count occurrences of each unique value field_counts = ( - units_with_assigned_packages.groupby(['Postcode', field]) + units_with_assigned_packages.groupby(['Street, Region and Postcode', field]) .size() .unstack(fill_value=0) .reset_index() @@ -2764,5 +2777,132 @@ def identify_incorrect_pacakges(): # Store the result in the dictionary aggregated_results[field] = field_counts + # Let's fetch the EPC data + # Read in the existing EPC data we stored + import json + from utils.s3 import read_from_s3, read_pickle_from_s3 + def read_epc_data(): + epc_data = json.loads( + read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/epc_data.json" + ) + ) + epc_data = pd.DataFrame(epc_data) + + epc_data["uprn"] = np.where( + epc_data["internal_id"] == 1091, + 83143766, + epc_data["uprn"] + ) + epc_data_batch_2 = read_pickle_from_s3( + s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + bucket_name="retrofit-data-dev" + ) + epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) + + complete_epcs = pd.concat([epc_data, epc_data_batch_2]) + + return complete_epcs + + epc_data = read_epc_data() + # Get just the fields we want from the EPC: Uprn, Wall, 
Roof, Heating, Fuel, SAP Score, EPC Band, Date of EPC + epc_data_to_append = epc_data[ + [ + "uprn", "walls-description", "roof-description", "mainheat-description", "main-fuel", + "current-energy-efficiency", "current-energy-rating", "lodgement-date", + "estimated" + ] + ].rename( + columns={ + "uprn": "UPRN", + "walls-description": "EPC: Wall Type", + "roof-description": "EPC: Roof Type", + "mainheat-description": "EPC: Heating", + "mainfuel": "EPC: Main Fuel", + "current-energy-efficiency": "EPC: SAP Score", + "current-energy-rating": "EPC: EPC Band", + "lodgement-date": "EPC: Date of EPC", + "estimated": "EPC Estimated based on Nearby Properties" + } + ) + # Find entries where the SAP score is not an integer + non_integer_sap = epc_data_to_append[~epc_data_to_append["EPC: SAP Score"].astype(str).str.isnumeric()] + non_integer_sap["UPRN"].values[0] + + epc_data_to_append["EPC: Date of EPC"] = pd.to_datetime(epc_data_to_append["EPC: Date of EPC"]) + # Years since the EPC was lodged + epc_data_to_append["Years since EPC"] = (pd.Timestamp.now() - epc_data_to_append["EPC: Date of EPC"]).dt.days / 365 + epc_data_to_append = epc_data_to_append[epc_data_to_append["UPRN"] != ""] + epc_data_to_append["UPRN"] = epc_data_to_append["UPRN"].astype(int) + + units_with_assigned_packages = units_with_assigned_packages.merge( + epc_data_to_append, how="left", on="UPRN", + ) + + # Read in the wave 2.1 data + wave_2_data = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Stonewater 2.1 SAP Pre & Post.xlsx" + ), + header=3 + ) + # Remove any where the work is outstanding + wave_2_data = wave_2_data[wave_2_data["Retrofit Assessment"] == "Completed"] + wave_2_data = wave_2_data[~pd.isnull(wave_2_data["Package Approved (Client)"])] + wave_2_data["house_number"] = wave_2_data["Name"].apply(lambda x: SearchEpc.get_house_number(x, "")) + + # Filter postcodes in the units_with_assigned_packages, to find overlapping postcodes + related_to_wave_2 = units_with_assigned_packages[ + 
units_with_assigned_packages["Postcode"].isin( + wave_2_data["Post Code"].values + ) & ( + ~units_with_assigned_packages["Confidence Tier"].isin( + [ + "1 - same archetype, same postal region", "1 - property was surveyed" + ] + ) + ) + ] + + wave2_matches = [] + for _, home in related_to_wave_2.iterrows(): + # Get the related homes + assigned_wave_2_packages = wave_2_data[ + wave_2_data["Post Code"] == home["Postcode"] + ] + + if assigned_wave_2_packages.shape[0] != 1: + # In this case, we get the closest match based on door number + hn = SearchEpc.get_house_number(home["Name"], home["Postcode"]) + + assigned_wave_2_packages = assigned_wave_2_packages[ + abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn)) == min( + abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn))) + ] + + wave2_matches.append( + { + "UPRN": home["UPRN"], + "2.1 matched address": assigned_wave_2_packages["Name"].values[0], + "2.1 matched address: Package Ref": assigned_wave_2_packages["Package Approved (Client)"].values[0], + "2.1 matched address: Wall Insulation": assigned_wave_2_packages["Wall Insulation"].values[0], + "2.1 matched address: Loft Insulation": assigned_wave_2_packages["Loft Insulation"].values[0], + "2.1 matched address: Ventilation": assigned_wave_2_packages["Ventilation"].values[0], + "2.1 matched address: Windows": assigned_wave_2_packages["Windwos Upgrade"].values[0] + } + ) + + # Store each results to CSV + for field, df in aggregated_results.items(): + df.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, f"{field} - aggregated results.csv"), index=False + ) + + # Store units_with_assigned_packages + units_with_assigned_packages.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Units with assigned packages - with flags.csv"), index=False + ) + # if __name__ == "__main__": # main() From ea5e888a82cf7ab0ebf1beffcb896cb55698458b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 18 Dec 2024 10:27:12 +0000 Subject: [PATCH 135/255] Adding 
funding class --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/Funding.py | 297 ++++++++++++++++++ backend/app/plan/router.py | 34 ++ etl/customers/cambridge/remote_assessment.py | 138 ++++++++ .../stonewater/Wave 3 Preparation.py | 7 +- recommendations/RoofRecommendations.py | 1 + 7 files changed, 476 insertions(+), 5 deletions(-) create mode 100644 backend/Funding.py create mode 100644 etl/customers/cambridge/remote_assessment.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/Funding.py b/backend/Funding.py new file mode 100644 index 00000000..21430f35 --- /dev/null +++ b/backend/Funding.py @@ -0,0 +1,297 @@ +import pandas as pd +import numpy as np +from typing import List + +from backend.app.plan.schemas import HousingType + + +class Funding: + """ + Given a property, this class identifies if the home is possibly eligible for funding under + the various funding schemes. 
It will also calculate the expected amount of funding available + and flag any tenant specific requirements that need to be considered to the funding to be attained + """ + + ECO_SAP_SCORE_THREHOLDS = [ + {'Band': 'High_A', 'From': 96.0, 'Up to': 100.0, 'Mid-point': 98.0}, + {'Band': 'Low_A', 'From': 92.0, 'Up to': 96.0, 'Mid-point': 94.0}, + {'Band': 'High_B', 'From': 86.0, 'Up to': 91.0, 'Mid-point': 88.5}, + {'Band': 'Low_B', 'From': 81.0, 'Up to': 86.0, 'Mid-point': 83.5}, + {'Band': 'High_C', 'From': 74.5, 'Up to': 80.0, 'Mid-point': 77.25}, + {'Band': 'Low_C', 'From': 69.0, 'Up to': 74.5, 'Mid-point': 71.75}, + {'Band': 'High_D', 'From': 61.5, 'Up to': 68.0, 'Mid-point': 64.75}, + {'Band': 'Low_D', 'From': 55.0, 'Up to': 61.5, 'Mid-point': 58.25}, + {'Band': 'High_E', 'From': 46.5, 'Up to': 54.0, 'Mid-point': 50.25}, + {'Band': 'Low_E', 'From': 39.0, 'Up to': 46.5, 'Mid-point': 42.75}, + {'Band': 'High_F', 'From': 29.5, 'Up to': 38.0, 'Mid-point': 33.75}, + {'Band': 'Low_F', 'From': 21.0, 'Up to': 29.5, 'Mid-point': 25.25}, + {'Band': 'High_G', 'From': 10.5, 'Up to': 20.0, 'Mid-point': 15.25}, + {'Band': 'Low_G', 'From': 1.0, 'Up to': 10.5, 'Mid-point': 5.75} + ] + + def __init__( + self, + tenure: HousingType, + starting_epc, + starting_sap, + floor_area, + council_tax_band, + property_recommendations, + project_scores_matrix, + gbis_abs_rate: int, + eco4_abs_rate: int, + ): + """ + Use Pydantic to validate the parameter types + :param tenure: Indicates if the property is a social or private home + :param starting_epc: The current EPC rating of the property + :param starting_sap: The current SAP score for the property + :param floor_area: The total floor area of the property + :param gbis_abs_rate: The assumed £/abs achieved by the installer for GBIS + :param eco4_abs_rate: The assumed £/abs achieved by the installer for ECO4 + """ + + # TODO: Things we need to include: + # 1) Amount of funding + # 2) Fundable measures, as a subset of measures may be 
fundable, not all + + self.tenure = tenure + self.starting_epc = starting_epc + self.starting_sap = starting_sap + self.starting_eco_band = self.sap_to_eco_band(self.starting_sap) + self.floor_area_segment = self.classify_floor_area(floor_area) + self.gbis_abs_rate = gbis_abs_rate + self.eco4_abs_rate = eco4_abs_rate + self.council_tax_band = council_tax_band + + self.recommendations = property_recommendations + + self.measure_types = [] + for recs in self.recommendations: + self.measure_types.extend([r["measure_type"] for r in recs]) + + # Load in the eco4 project scores matrix + # Filter the matrix on scores relevant to this property + self.project_scores_matrix = project_scores_matrix[ + (project_scores_matrix["Floor Area Segment"] == self.floor_area_segment) & + (project_scores_matrix["Starting Band"] == self.starting_eco_band) + ] + + # Store the final outputs + self.gbis_eligibiltiy = {} + self.eco4_eligibility = {} + self.whlg_eligibility = {} + + def output( + self, + measure_types: List[str], + estimated_funding: float, + notify_tenant_benefits_requirements: bool, + notify_council_tax_band_requirements: bool, + notify_tenant_low_income_requirements: bool, + ): + """" + """ + return { + "measure_types": measure_types, + "estimated_funding": estimated_funding, + "notify_tenant_benefits_requirements": notify_tenant_benefits_requirements, + "notify_council_tax_band_requirements": notify_council_tax_band_requirements, + "notify_tenant_low_income_requirements": notify_tenant_low_income_requirements + } + + @staticmethod + def classify_floor_area(floor_area): + if floor_area <= 72: + return "0-72" + + if floor_area <= 97: + return "73-97" + + if floor_area <= 199: + return "98-199" + + return "200" + + def eco4(self): + """ + Checks if a property is eligible for ECO4 + :return: + """ + pass + + def find_best_gbis_measure(self, measures): + """ + The best measure is one that: + 1) Creates some SAP movement, therefore enables eligiblity + 2) Generates the most 
funding + 3) Has a reasonable ROI + :return: + """ + measure_table = pd.DataFrame([ + m[0] for m in self.recommendations if m[0]["measure_type"] in measures + ]) + + measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap + # We classify the movement + measure_table["Finishing Band"] = measure_table["sap_points"].apply( + lambda points: self.sap_to_eco_band(points) + ) + # Remove any measures that generate zero SAP movement + measure_table = measure_table[measure_table["Finishing Band"] != self.starting_eco_band] + + if measure_table.empty: + raise NotImplementedError("No measures available, handle me!") + + # We merge on the project matrix, on post install band + measure_table = measure_table.merge( + self.project_scores_matrix, how="left", on="Finishing Band" + ) + # Cost Savings is the abs + measure_table["estimated_funding"] = measure_table["Cost Savings"] * self.gbis_abs_rate + # We cap any estimated funding at the install cost + measure_table["estimated_funding"] = np.where( + measure_table["estimated_funding"] >= measure_table["total"], + measure_table["total"], + measure_table["estimated_funding"] + ) + + # Sort by the measure that will cost the client the least, per sap point + measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"] + measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"] + measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False]) + # Recommend the measure, with estimated funding amount + recommended_measure = measure_table.head(1) + + return { + "measure_type": recommended_measure["measure_type"], + "estimated_funding": recommended_measure["estimated_funding"] + } + + def sap_to_eco_band(self, sap_points): + """ + Giuven a sap point score, this function will classify the points into the SAP half-band + :param sap_points: + :return: + """ + + if sap_points > 100: + 
return "High_A" + + classification = [ + x for x in self.ECO_SAP_SCORE_THREHOLDS if (x["From"] <= sap_points) and (sap_points <= x["Up to"]) + ] + + if len(classification) != 1: + raise Exception("We should have a single classifcation for SAP points to half band") + + return classification[0]['Band'] + + def gbis_prs(self): + """ + Checks if a private rental is eligible for GBIS. There are the following possible options + 1) General Eligibilty, contigent on EPC D-G and council tax band A-D. Excludes CWI, LI and heating + controls + 2) Low income group - contigent on EPC D-G and tenant must receive benefits. Excludes heating controls + 3) GBIS Flex route 1, 3 - Great British Insulation Scheme Routes 1 and 3 are for pre-installation + SAP bands D-G for owner-occupied households, D-E for private rented sector households + (Including F & G if exempt from MEES). If houseold is low income. Excludes heating controls + 4) GBIS Flex route 2 - EPC E - G and low income household. Excludes heating controls + + Eligible measures: + • Solid wall + • pitched roof + • flat roof + • under floor + • solid floor park home and + • room in-roof insulation + + :return: + """ + + valid_measures = [ + "internal_wall_insulation", + "external_wall_insulation", + "flat_roof_insulation", + "suspended_floor_insulation", + "room_roof_insulation", + # Not available for every eligiblity type + "cavity_wall_insulation", + "loft_insulation", + ] + + # General Eligibility + if ( + (self.starting_epc in ["G", "D", "E", "F"]) and + len( + [measure in valid_measures for measure in self.measure_types + if measure not in ["cavity_wall_insulation", "loft_insulation"]] + ) and + (self.council_tax_band in [None, "A", "B", "C", "D"]) + ): + # We find the best measure for GBIS + recommended_measure = self.find_best_gbis_measure( + measures=[m for m in valid_measures if m not in ["cavity_wall_insulation", "loft_insulation"]] + ) + # If the council tax band is missing, we nofify the customer that this is a 
requirement that + # should be checked + return self.output( + measure_types=[recommended_measure["measure_type"]], + estimated_funding=recommended_measure["estimated_funding"], + notify_tenant_benefits_requirements=False, + notify_council_tax_band_requirements=self.council_tax_band is None, + notify_tenant_low_income_requirements=False, + ) + + # Low income/flex + if ( + (self.starting_sap in ["G", "D", "E", "F"]) and + len([measure in valid_measures for measure in self.measure_types]) + ): + # Find the best measure, and can also include CWI/LI but requires the tenant to be + # low inome or on benefits + # We find the best measure for GBIS + recommended_measure = self.find_best_gbis_measure(measures=valid_measures) + return self.output( + measure_types=[recommended_measure["measure_type"]], + estimated_funding=recommended_measure["estimated_funding"], + notify_tenant_benefits_requirements=True, + notify_council_tax_band_requirements=False, + notify_tenant_low_income_requirements=True, + ) + + # Otherwise, no funding availability + return self.output( + measure_types=[], + estimated_funding=0, + notify_tenant_benefits_requirements=False, + notify_council_tax_band_requirements=False, + notify_tenant_low_income_requirements=False + ) + + def gbis(self): + """ + Check if a property is eligible for GBIS + :return: + """ + + if self.tenure == "Private": + self.gbis_eligibiltiy = self.gbis_prs() + return + + raise NotImplementedError("Implement social/oo") + + def eco4(self): + if self.tenure == "Private": + self.eco4_eligibiltiy = self.eco4_prs() + return + + def check_eligibiltiy(self): + """ + This function instigates the checking process + :return: + """ + + self.gbis() + self.eco4() + self.whlg() diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index dbef6435..056f7f1c 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -373,6 +373,16 @@ def extract_property_request_data( return patch, property_already_installed, 
property_non_invasive_recommendations, property_valution +def get_eco_project_scores_matrix(): + data = read_csv_from_s3( + bucket_name=get_settings().DATA_BUCKET, + filepath="funding/ECO4 Full Project Scores Matrix.csv", + ) + df = pd.DataFrame(data) + df.columns = ['Floor Area Segment', 'Starting Band', 'Finishing Band', 'Cost Savings'] + return df + + router = APIRouter( prefix="/plan", tags=["plan"], @@ -438,6 +448,12 @@ async def trigger_plan(body: PlanTriggerRequest): if not is_new and not body.multi_plan: continue + if epc_searcher.newest_epc is None: + raise ValueError( + "No EPCs found for this property and did not estimate - likely need to provide a" + "property type and built form" + ) + if is_new: create_property_targets( session, @@ -508,6 +524,7 @@ async def trigger_plan(body: PlanTriggerRequest): logger.info("Reading in materials and cleaned datasets") materials = get_materials(session) cleaned = get_cleaned() + eco_project_scores_matrix = get_eco_project_scores_matrix() kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True) @@ -730,6 +747,23 @@ async def trigger_plan(body: PlanTriggerRequest): ] recommendations[p.id] = final_recommendations + # ~~~~~~~~~~~~~~~~ + # Funding + # ~~~~~~~~~~~~~~~~ + from backend.Funding import Funding + for p in input_properties: + funding_calulator = Funding( + tenure=body.housing_type, + starting_epc=p.data["current-energy-rating"], + starting_sap=p.data["current-energy-efficiency"], + floor_area=p.floor_area, + council_tax_band=None, # This is seemingly always None at the moment + property_recommendations=recommendations[p.id], + project_scores_matrix=eco_project_scores_matrix, + gbis_abs_rate=20, + eco4_abs_rate=20, + ) + logger.info("Uploading recommendations to the database") # If we have any work to do, we create a new scenario engine_scenario = create_scenario( diff --git a/etl/customers/cambridge/remote_assessment.py b/etl/customers/cambridge/remote_assessment.py new file mode 
100644 index 00000000..3f152e79 --- /dev/null +++ b/etl/customers/cambridge/remote_assessment.py @@ -0,0 +1,138 @@ +import os +import time + +from tqdm import tqdm +import pandas as pd +from dotenv import load_dotenv +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.s3 import save_csv_to_s3 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +USER_ID = 8 +PORTFOLIO_ID = 122 + + +def app(): + asset_list = [ + { + "address": "12 Church Lane", "postcode": "CB23 8AF", "uprn": 100090136018, + "property_type": "House", "built-form": "Semi-Detached" + }, + { + "address": "21 High Street", "postcode": "CB23 8AB", "uprn": 100090136026 + }, + { + "address": "22 High Street", "postcode": "CB23 8AB", "uprn": 100090136027 + }, + { + "address": "5 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078615 + }, + { + "address": "6 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078616 + }, + { + "address": "7 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078617 + }, + { + "address": "32 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200075 + }, + { + "address": "33 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200076 + }, + { + "address": "35 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200078 + }, + { + "address": "36 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200079 + } + ] + asset_list = pd.DataFrame(asset_list) + + valuations_data = [ + {'uprn': 100090136018, "valuation": 586_000}, + {'uprn': 100090136026, "valuation": 551_000}, + {'uprn': 100090136027, "valuation": 844_000}, + {'uprn': 10008078615, "valuation": 763_000}, + {'uprn': 10008078616, "valuation": 616_000}, + {'uprn': 10008078617, "valuation": 593_000}, + {'uprn': 200004200075, "valuation": 450_000}, + {'uprn': 200004200076, "valuation": 457_000}, + {'uprn': 200004200078, "valuation": 304_000}, + {'uprn': 200004200079, "valuation": 313_000} + ] + 
+ # Pull the additional data + extracted_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + add1 = home["address"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key="" + ) + epc_searcher.find_property(skip_os=True) + if epc_searcher.newest_epc is None: + continue + + find_epc_searcher = RetrieveFindMyEpc(address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"]) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + extracted_data.append( + { + "uprn": home["uprn"], + **find_epc_data, + } + ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + # Store the valuations data in s3 + valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(valuations_data), + bucket_name="retrofit-plan-inputs-dev", + file_name=valuations_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "B", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": valuations_filename, + "scenario_name": "Wave 3 
Packages", + "multi_plan": True, + "budget": None, + "exclusions": [] + } + print(body) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d2232f40..0f757f7b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2826,9 +2826,10 @@ def identify_incorrect_packages(): "estimated": "EPC Estimated based on Nearby Properties" } ) - # Find entries where the SAP score is not an integer - non_integer_sap = epc_data_to_append[~epc_data_to_append["EPC: SAP Score"].astype(str).str.isnumeric()] - non_integer_sap["UPRN"].values[0] + # Take non-estimated EPCs? + # epc_data_to_append = epc_data_to_append[epc_data_to_append["EPC Estimated based on Nearby Properties"] != True] + # Take the newest EPC per UPRN, based on lodgement date + epc_data_to_append = epc_data_to_append.sort_values("EPC: Date of EPC", ascending=False).drop_duplicates("UPRN") epc_data_to_append["EPC: Date of EPC"] = pd.to_datetime(epc_data_to_append["EPC: Date of EPC"]) # Years since the EPC was lodged diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index 4e29083f..6778e886 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -496,6 +496,7 @@ class RoofRecommendations: roof_roof_insulation_materials = [ { "type": "room_roof_insulation", + "measure_type": "room_roof_insulation", "description": "Insulating the ceiling of the roof roof and re-decorate", "depths": [100], "depth_unit": "mm", From 82cf08eb988e1f933f41281cc27d172a65f202d5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 18 Dec 2024 11:16:35 +0000 Subject: [PATCH 136/255] implemented gbis for the moment --- backend/Funding.py | 16 +++++++--------- backend/app/plan/router.py | 1 + 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/backend/Funding.py b/backend/Funding.py index 21430f35..8a9b08ae 100644 --- 
a/backend/Funding.py +++ b/backend/Funding.py @@ -66,9 +66,7 @@ class Funding: self.recommendations = property_recommendations - self.measure_types = [] - for recs in self.recommendations: - self.measure_types.extend([r["measure_type"] for r in recs]) + self.measure_types = list({r["measure_type"] for r in property_recommendations if r["default"]}) # Load in the eco4 project scores matrix # Filter the matrix on scores relevant to this property @@ -129,12 +127,12 @@ class Funding: :return: """ measure_table = pd.DataFrame([ - m[0] for m in self.recommendations if m[0]["measure_type"] in measures + m for m in self.recommendations if m in measures and m["default"] ]) measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap # We classify the movement - measure_table["Finishing Band"] = measure_table["sap_points"].apply( + measure_table["Finishing Band"] = np.floor(measure_table["post_install_sap"]).apply( lambda points: self.sap_to_eco_band(points) ) # Remove any measures that generate zero SAP movement @@ -223,7 +221,7 @@ class Funding: # General Eligibility if ( (self.starting_epc in ["G", "D", "E", "F"]) and - len( + any( [measure in valid_measures for measure in self.measure_types if measure not in ["cavity_wall_insulation", "loft_insulation"]] ) and @@ -246,7 +244,7 @@ class Funding: # Low income/flex if ( (self.starting_sap in ["G", "D", "E", "F"]) and - len([measure in valid_measures for measure in self.measure_types]) + any([measure in valid_measures for measure in self.measure_types]) ): # Find the best measure, and can also include CWI/LI but requires the tenant to be # low inome or on benefits @@ -293,5 +291,5 @@ class Funding: """ self.gbis() - self.eco4() - self.whlg() + # self.eco4() + # self.whlg() diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 056f7f1c..ea831a31 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -380,6 +380,7 @@ def get_eco_project_scores_matrix(): ) df = 
pd.DataFrame(data) df.columns = ['Floor Area Segment', 'Starting Band', 'Finishing Band', 'Cost Savings'] + df["Cost Savings"] = df["Cost Savings"].astype(float) return df From 843be48ca4e50ca2991c26124134fb79196e4eb0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 18 Dec 2024 21:19:10 +0000 Subject: [PATCH 137/255] debugging funding eligibility --- backend/Property.py | 14 +++ backend/app/plan/router.py | 10 +- .../connells/pilot_remote_assessments.py | 108 ++++++++++++++++++ 3 files changed, 129 insertions(+), 3 deletions(-) create mode 100644 etl/customers/connells/pilot_remote_assessments.py diff --git a/backend/Property.py b/backend/Property.py index cc5bf12b..0b63b266 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -22,6 +22,7 @@ from recommendations.recommendation_utils import ( ) from backend.ml_models.AnnualBillSavings import AnnualBillSavings from backend.app.utils import sap_to_epc +from backend.Funding import Funding import backend.app.assumptions as assumptions ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev") @@ -202,6 +203,11 @@ class Property: # TODO: We keep this but only temporarily until we add bathrooms, bedrooms, building id to the condition data self.parse_kwargs(kwargs) + # Funding + self.gbis_eligibiltiy = None + self.eco4_eligibility = None + self.whlg_eligibility = None + @classmethod def extract_kwargs(cls, kwargs): """ @@ -1306,3 +1312,11 @@ class Property: ) return electric_consumption + + def insert_funding(self, funding_calulator: Funding): + """ + This method inserts the funding into the property object + """ + self.gbis_eligibiltiy = funding_calulator.gbis_eligibiltiy + self.eco4_eligibility = funding_calulator.eco4_eligibility + self.whlg_eligibility = funding_calulator.whlg_eligibility diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index ea831a31..849f7fd7 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -30,6 +30,7 @@ from backend.app.utils import 
epc_to_sap_lower_bound, sap_to_epc from backend.ml_models.api import ModelApi from backend.Property import Property +from backend.Funding import Funding from backend.apis.GoogleSolarApi import GoogleSolarApi from recommendations.optimiser.CostOptimiser import CostOptimiser @@ -751,12 +752,12 @@ async def trigger_plan(body: PlanTriggerRequest): # ~~~~~~~~~~~~~~~~ # Funding # ~~~~~~~~~~~~~~~~ - from backend.Funding import Funding + for p in input_properties: funding_calulator = Funding( tenure=body.housing_type, starting_epc=p.data["current-energy-rating"], - starting_sap=p.data["current-energy-efficiency"], + starting_sap=int(p.data["current-energy-efficiency"]), floor_area=p.floor_area, council_tax_band=None, # This is seemingly always None at the moment property_recommendations=recommendations[p.id], @@ -764,7 +765,10 @@ async def trigger_plan(body: PlanTriggerRequest): gbis_abs_rate=20, eco4_abs_rate=20, ) - + funding_calulator.check_eligibiltiy() + # Insert finding + p.insert_funding(funding_calulator) + logger.info("Uploading recommendations to the database") # If we have any work to do, we create a new scenario engine_scenario = create_scenario( diff --git a/etl/customers/connells/pilot_remote_assessments.py b/etl/customers/connells/pilot_remote_assessments.py new file mode 100644 index 00000000..9eace9c8 --- /dev/null +++ b/etl/customers/connells/pilot_remote_assessments.py @@ -0,0 +1,108 @@ +import os +import time + +from tqdm import tqdm +import pandas as pd +from dotenv import load_dotenv +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.s3 import save_csv_to_s3 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +USER_ID = 8 +PORTFOLIO_ID = 123 + + +def app(): + asset_list = [ + {"address": "1 Raven Crescent", "postcode": "WV11 2EX", "uprn": 100071188496}, + + {"address": "13 Bayliss Avenue", "postcode": "WV11 2EX", "uprn": 100071136271}, + + 
{"address": "30 Southbourne Road", "postcode": "WV10 6ET", "uprn": 100071194376}, + + {"address": "96 Marsh Lane", "postcode": "WV10 6RX", "uprn": 100071176297}, + ] + asset_list = pd.DataFrame(asset_list) + + valuations_data = [ + {'uprn': 100071188496, "valuation": 175_000}, + {'uprn': 100090136026, "valuation": 183_000}, + {'uprn': 100071194376, "valuation": 221_000}, + {'uprn': 100071176297, "valuation": 208_000}, + ] + + # Pull the additional data + extracted_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + add1 = home["address"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key="" + ) + epc_searcher.find_property(skip_os=True) + if epc_searcher.newest_epc is None: + continue + + find_epc_searcher = RetrieveFindMyEpc(address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"]) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + extracted_data.append( + { + "uprn": home["uprn"], + **find_epc_data, + } + ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + # Store the valuations data in s3 + valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(valuations_data), + 
bucket_name="retrofit-plan-inputs-dev", + file_name=valuations_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "B", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": valuations_filename, + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": [] + } + print(body) From 75c5f0a712faff25689b2a4ec15da95547449246 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 18 Dec 2024 21:26:33 +0000 Subject: [PATCH 138/255] Added VAT to ashp and solar pv --- recommendations/Costs.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 5554245f..ee4db7eb 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -758,32 +758,31 @@ class Costs: else: system_cost = [c for c in INSTALLER_SOLAR_COSTS if c["n_panels"] == n_panels][0]["cost"] - total_cost = array_cost if array_cost is not None else system_cost + subtotal = array_cost if array_cost is not None else system_cost if has_battery: battery_cost = [c for c in INSTALLER_SOLAR_BATTERY_COSTS if c["capacity_kwh"] == battery_kwh][0]["cost"] - total_cost += battery_cost + subtotal += battery_cost scaffolding_cost = [c for c in INSTALLER_SCAFFOLDING_COSTS if c["stories"] == n_floors][0]["cost"] - total_cost += scaffolding_cost + subtotal += scaffolding_cost if needs_inverter: - total_cost += INSTALLER_SOLAR_PV_INVERTER_COST + subtotal += INSTALLER_SOLAR_PV_INVERTER_COST # We also add an additional labour cost - total_cost += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST + subtotal += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST # We add an additional cost for scaffolding - - subtotal_before_vat = total_cost / (1 + self.VAT_RATE) - - vat = 
total_cost - subtotal_before_vat + # The costs from installers exclude VAT + vat = subtotal * self.VAT_RATE + total_cost = subtotal + vat # Labour hours are based on estimates from online research but an average team seems to consist of 3 people # and most jobs take around 2 days. Assuming an 8 hour day for 3 people across 2 days, gives us 48 hours of # labour return { "total": total_cost, - "subtotal": subtotal_before_vat, + "subtotal": subtotal, "vat": vat, "labour_hours": 48, "labour_days": 2, @@ -1163,17 +1162,18 @@ class Costs: cost = [x for x in INSTALLER_ASHP_COSTS if x][0]["cost"] # We add some contingency since there are additional costs such as resizing radiators, that could be required - total_cost = cost * (1 + self.CONTINGENCY) - subtotal_before_vat = total_cost / (1 + self.VAT_RATE) - vat = total_cost - subtotal_before_vat + subtotal = cost * (1 + self.CONTINGENCY) + # The costs from installers exclude VAT + vat = subtotal * self.VAT_RATE + total_cost = subtotal + vat # We assume 5 days installation labour_days = 5 labour_hours = labour_days * 8 return { - "total": total_cost, - "subtotal": subtotal_before_vat, + "total": subtotal, + "subtotal": subtotal, "vat": vat, "labour_hours": labour_hours, "labour_days": labour_days, From 0af0e3a22a87436a12750b7533497127f0e1c770 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 14 Jan 2025 14:29:04 +0000 Subject: [PATCH 139/255] cmabridge done for now --- .../db/functions/recommendations_functions.py | 2 +- backend/app/plan/router.py | 32 +++++++++---------- .../connells/pilot_remote_assessments.py | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/backend/app/db/functions/recommendations_functions.py b/backend/app/db/functions/recommendations_functions.py index d6e41c61..d26adf66 100644 --- a/backend/app/db/functions/recommendations_functions.py +++ b/backend/app/db/functions/recommendations_functions.py @@ -138,7 +138,7 @@ def upload_recommendations(session: Session, 
recommendations_to_upload, property "recommendation_id": recommendation_id, "material_id": part["id"], "depth": int(part["depth"]) if part["depth"] else None, - "quantity": part["quantity"], + "quantity": float(part["quantity"]), "quantity_unit": part["quantity_unit"], "estimated_cost": part["total"], } diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 849f7fd7..fb896659 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -753,22 +753,22 @@ async def trigger_plan(body: PlanTriggerRequest): # Funding # ~~~~~~~~~~~~~~~~ - for p in input_properties: - funding_calulator = Funding( - tenure=body.housing_type, - starting_epc=p.data["current-energy-rating"], - starting_sap=int(p.data["current-energy-efficiency"]), - floor_area=p.floor_area, - council_tax_band=None, # This is seemingly always None at the moment - property_recommendations=recommendations[p.id], - project_scores_matrix=eco_project_scores_matrix, - gbis_abs_rate=20, - eco4_abs_rate=20, - ) - funding_calulator.check_eligibiltiy() - # Insert finding - p.insert_funding(funding_calulator) - + # for p in input_properties: + # funding_calulator = Funding( + # tenure=body.housing_type, + # starting_epc=p.data["current-energy-rating"], + # starting_sap=int(p.data["current-energy-efficiency"]), + # floor_area=p.floor_area, + # council_tax_band=None, # This is seemingly always None at the moment + # property_recommendations=recommendations[p.id], + # project_scores_matrix=eco_project_scores_matrix, + # gbis_abs_rate=20, + # eco4_abs_rate=20, + # ) + # funding_calulator.check_eligibiltiy() + # # Insert finding + # p.insert_funding(funding_calulator) + logger.info("Uploading recommendations to the database") # If we have any work to do, we create a new scenario engine_scenario = create_scenario( diff --git a/etl/customers/connells/pilot_remote_assessments.py b/etl/customers/connells/pilot_remote_assessments.py index 9eace9c8..799bd805 100644 --- 
a/etl/customers/connells/pilot_remote_assessments.py +++ b/etl/customers/connells/pilot_remote_assessments.py @@ -28,7 +28,7 @@ def app(): valuations_data = [ {'uprn': 100071188496, "valuation": 175_000}, - {'uprn': 100090136026, "valuation": 183_000}, + {'uprn': 100071136271, "valuation": 183_000}, {'uprn': 100071194376, "valuation": 221_000}, {'uprn': 100071176297, "valuation": 208_000}, ] From ad3ba924754b33c2a509c4aa54550cf156a823a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 17 Jan 2025 18:53:04 +0000 Subject: [PATCH 140/255] fixing route march data pull --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/SearchEpc.py | 13 +- backend/app/plan/router.py | 58 +++--- backend/ml_models/AnnualBillSavings.py | 8 +- etl/customers/cambridge/remote_assessment.py | 8 +- etl/customers/l_and_g/ic_asset_list.py | 166 ++++++++++++++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 8 +- etl/route_march_data_pull/app.py | 18 +- recommendations/Recommendations.py | 13 ++ recommendations/RoofRecommendations.py | 4 + .../optimiser/optimiser_functions.py | 2 +- 12 files changed, 254 insertions(+), 48 deletions(-) create mode 100644 etl/customers/l_and_g/ic_asset_list.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 8ec4fdbe..d916f82f 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -693,9 +693,20 @@ class SearchEpc: estimated_epc[variable] = str(int(estimated_epc[variable])) # This is a string - estimated_epc["low-energy-fixed-light-count"] = str(estimated_epc["low-energy-fixed-light-count"]) + estimated_epc["low-energy-fixed-light-count"] = ( + str(estimated_epc["low-energy-fixed-light-count"]) if estimated_epc["low-energy-fixed-light-count"] else "" + 
) + # This is an int + estimated_epc["photo-supply"] = ( + int(np.round(estimated_epc["photo-supply"])) if estimated_epc["photo-supply"] else estimated_epc[ + "photo-supply"] + ) estimated_epc["postcode"] = self.postcode + if not self.uprn: + # Update self.uprn too + self.uprn = hash(self.address1 + self.postcode) + estimated_epc["uprn"] = self.uprn estimated_epc["address"] = self.full_address # Indicate that this epc was estimated diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index fb896659..1989a363 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -407,6 +407,7 @@ async def trigger_plan(body: PlanTriggerRequest): plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path) # Check for duplicate UPRNS input_uprns = [x.get("uprn") for x in plan_input if "uprn" in x and x.get("uprn")] + if input_uprns: # Check for dupes if len(input_uprns) != len(set(input_uprns)): @@ -680,37 +681,42 @@ async def trigger_plan(body: PlanTriggerRequest): input_measures = prepare_input_measures(recommendations[p.id], body.goal) - current_sap_points = int(p.data["current-energy-efficiency"]) - target_sap_points = epc_to_sap_lower_bound(body.goal_value) - sap_gain = CostOptimiser.calculate_sap_gain_with_slack(target_sap_points - current_sap_points) - - if not body.optimise: - if body.goal != "Increasing EPC": - raise NotImplementedError("Only EPC optimisation is currently supported") - solution = [] - for sub_list in input_measures: - # Select the entry with the highest gain, and if tied, choose the one with the lowest cost - best_measure = max(sub_list, key=lambda x: (x['gain'], -x['cost'])) - solution.append(best_measure) + if not input_measures[0]: + # This means that we have no defaults + selected_recommendations = {} else: - if body.budget: - optimiser = GainOptimiser( - input_measures, max_cost=body.budget, max_gain=sap_gain if sap_gain > 0 else 0 - ) + current_sap_points = 
int(p.data["current-energy-efficiency"]) + target_sap_points = epc_to_sap_lower_bound(body.goal_value) + sap_gain = CostOptimiser.calculate_sap_gain_with_slack(target_sap_points - current_sap_points) + + if not body.optimise: + if body.goal != "Increasing EPC": + raise NotImplementedError("Only EPC optimisation is currently supported") + solution = [] + for sub_list in input_measures: + # Select the entry with the highest gain, and if tied, choose the one with the lowest cost + best_measure = max(sub_list, key=lambda x: (x['gain'], -x['cost'])) + solution.append(best_measure) else: - # The minimum gain is the minimum number of SAP points required to get to the target SAP band - # If the gain is negative, the optimiser will return an empty solution - optimiser = CostOptimiser( - input_measures, - min_gain=sap_gain - ) - optimiser.setup() - optimiser.solve() - solution = optimiser.solution + if body.budget: + optimiser = GainOptimiser( + input_measures, max_cost=body.budget, max_gain=sap_gain if sap_gain > 0 else 0 + ) + else: + # The minimum gain is the minimum number of SAP points required to get to the target SAP band + # If the gain is negative, the optimiser will return an empty solution + optimiser = CostOptimiser( + input_measures, + min_gain=sap_gain + ) - selected_recommendations = {r["id"] for r in solution} + optimiser.setup() + optimiser.solve() + solution = optimiser.solution + + selected_recommendations = {r["id"] for r in solution} # If wall insulation is selected, we also include mechanical ventilation as a best practice measure if any(x in [r["type"] for r in solution] for x in [ diff --git a/backend/ml_models/AnnualBillSavings.py b/backend/ml_models/AnnualBillSavings.py index 211e5ea6..b22837d8 100644 --- a/backend/ml_models/AnnualBillSavings.py +++ b/backend/ml_models/AnnualBillSavings.py @@ -28,8 +28,8 @@ class AnnualBillSavings: # Latest price cap figures from Ofgem are for April 2024 # https://www.ofgem.gov.uk/energy-price-cap - 
ELECTRICITY_PRICE_CAP = 0.2236 - GAS_PRICE_CAP = 0.0548 + ELECTRICITY_PRICE_CAP = 0.2486 + GAS_PRICE_CAP = 0.0634 # This is the most recent export payment figure, at 9.28p/kWh # Smart export guarantee rates can be found here: # https://www.sunsave.energy/solar-panels-advice/exporting-to-the-grid/best-seg-rates @@ -39,8 +39,8 @@ class AnnualBillSavings: PRICE_FACTOR = 0.09549999999999999 # Daily standard charge, based on average across England, Scotland and Wales, and includes VAT - DAILY_STANDARD_CHARGE_GAS = 0.3143 - DAILY_STANDARD_CHARGE_ELECTRICITY = 0.601 + DAILY_STANDARD_CHARGE_GAS = 0.3165 + DAILY_STANDARD_CHARGE_ELECTRICITY = 0.6097 # Based on https://www.nottenergy.com/advice-and-tools/project-energy-cost-comparison # For July 2024. These quotes are based on the east midlands region, so we diff --git a/etl/customers/cambridge/remote_assessment.py b/etl/customers/cambridge/remote_assessment.py index 3f152e79..dc5beff5 100644 --- a/etl/customers/cambridge/remote_assessment.py +++ b/etl/customers/cambridge/remote_assessment.py @@ -21,10 +21,10 @@ def app(): "property_type": "House", "built-form": "Semi-Detached" }, { - "address": "21 High Street", "postcode": "CB23 8AB", "uprn": 100090136026 + "address": "21 High Street", "postcode": "CB23 8AB", "uprn": 100090144815 }, { - "address": "22 High Street", "postcode": "CB23 8AB", "uprn": 100090136027 + "address": "22 High Street", "postcode": "CB23 8AB", "uprn": 100090144816 }, { "address": "5 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078615 @@ -52,8 +52,8 @@ def app(): valuations_data = [ {'uprn': 100090136018, "valuation": 586_000}, - {'uprn': 100090136026, "valuation": 551_000}, - {'uprn': 100090136027, "valuation": 844_000}, + {'uprn': 100090144815, "valuation": 446_000}, + {'uprn': 100090144816, "valuation": 448_000}, {'uprn': 10008078615, "valuation": 763_000}, {'uprn': 10008078616, "valuation": 616_000}, {'uprn': 10008078617, "valuation": 593_000}, diff --git a/etl/customers/l_and_g/ic_asset_list.py 
b/etl/customers/l_and_g/ic_asset_list.py new file mode 100644 index 00000000..d0966bdf --- /dev/null +++ b/etl/customers/l_and_g/ic_asset_list.py @@ -0,0 +1,166 @@ +""" +This script prepares the asset list for modelling the properties from the L&Q dataset, for their January IC +""" + +import pandas as pd +import numpy as np + +from etl.route_march_data_pull.app import get_data +from utils.s3 import save_csv_to_s3 + +PORTFOLIO_ID = 124 +USER_ID = 8 + + +def app(): + asset_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon information for Domna/Basildon MDS v1.4 " + "(1).xlsx", + sheet_name="Basildon", + header=5 + ) + + asset_data = asset_data.head(-3) + + asset_data["address1"] = np.where( + pd.isnull(asset_data["Address 1"]), + asset_data["Address 2"], + asset_data["Address 1"] + ) + + asset_data["full_address"] = np.where( + pd.isnull(asset_data["Address 1"]), + asset_data["Address 2"] + ", " + asset_data["Address 3"], + asset_data["Address 1"] + ", " + asset_data["Address 2"] + ", " + asset_data["Address 3"], + ) + + asset_list = asset_data[["address1", "PostCode", "full_address", "Bedrooms"]] + + asset_list = asset_list.reset_index(drop=True) + + asset_list["row_id"] = asset_list.index + + # L&G's focus: + # Measures: loft and cavity insulation, replacement thermally efficient windows, PV cells, AS heat pumps. 
+ + epc_data, errors, no_epc = get_data( + asset_list=asset_list, + fulladdress_column="full_address", + address1_column="address1", + postcode_column="PostCode", + manual_uprn_map={} + ) + + missed = asset_list[ + asset_list["row_id"].isin(no_epc) + ] + + # We merge on the property types, where we have them + missed = missed.merge( + asset_data[["address1", "PostCode", "Property Type"]], + how="left", + on=["address1", "PostCode"] + ) + # Remap Block: Residential to Flat + missed["Property Type"] = np.where( + missed["Property Type"] == "Block: Residential", + "Flat", + missed["Property Type"] + ) + + # We create the asset list - we have some properties that genuninely never had an EPC + + epc_df = pd.DataFrame(epc_data) + fetched_asset_list = epc_df[["address1", "postcode", "uprn", "row_id"]] + fetched_asset_list = fetched_asset_list.merge( + asset_list[["row_id", "Bedrooms"]], + how="left", + on=["row_id"] + ) + + missed = missed.rename(columns={"PostCode": "postcode"}).drop(columns=["row_id"]) + + # missed.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/missed_epcs.csv") + missed_uprns = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/missed_epcs_uprn.csv", + ) + + missed = missed.merge( + missed_uprns[["address1", "postcode", "UPRN"]].rename( + columns={"UPRN": "uprn"}, + ), + how="left", + on=["address1", "postcode"] + ) + + fetched_asset_list = fetched_asset_list.drop(columns=["row_id"]) + # We concatename them + final_asset_list = pd.concat( + [fetched_asset_list, missed[["address1", "postcode", "Property Type", "Bedrooms", "uprn"]]] + ) + + final_asset_list = final_asset_list.rename( + columns={ + "address1": "address", + "Property Type": "property_type", + "Bedrooms": "n_bedrooms" + } + ) + + # Finally, we merge on the numeber of bedrooms + + # Extract the non-invasive recommendations: + non_invasive_recommendations = [] + for x in epc_data: + non_invasive_recommendations.append( + { + "uprn": x["uprn"], + 
"recommendations": x["find_my_epc_data"]["recommendations"] + } + ) + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(final_asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + # Store the valuations data in s3 + # valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv" + # save_csv_to_s3( + # dataframe=pd.DataFrame(valuations_data), + # bucket_name="retrofit-plan-inputs-dev", + # file_name=valuations_filename + # ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "A", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Retrofit Packages", + "multi_plan": True, + "budget": None, + "inclusions": [ + "cavity_wall_insulation", + "loft_insulation", + "windows", + "solar_pv", + "air_source_heat_pump" + ] + } + print(body) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 5ea35a64..cd960151 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -300,6 +300,8 @@ class RetrieveFindMyEpc: "Fan assisted storage heaters": [], "Fan-assisted storage heaters": [], "Step 1:": [], + "Step 2:": [], + 'Step 3:': [], "Biomass stove with boiler": [], "Replace boiler with biomass boiler": [], "Heating controls (room thermostat and thermostatic radiator valves)": [ @@ -308,7 +310,11 @@ class RetrieveFindMyEpc: "Heating controls (programmer, and 
thermostatic radiator valves)": [ "roomstat_programmer_trvs", "time_temperature_zone_control" ], - "Replacement warm air unit": [] + "Heating controls (programmer and TRVs)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Replacement warm air unit": [], + "Secondary glazing": ["secondary_glazing"] } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 9ed55185..f2889975 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,6 +1,5 @@ import os import time -from idlelib.iomenu import errors import pandas as pd import numpy as np @@ -25,7 +24,6 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m epc_data = [] errors = [] no_epc = [] - # home = asset_list[asset_list["row_id"] == errors[5]].squeeze() for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] @@ -154,13 +152,13 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Watford" - DATA_FILENAME = "JS Mailing List 10122024.xlsx" - SHEET_NAME = "Export" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches" + DATA_FILENAME = "Full Below SAP C Stock - RN Copy.xlsx" + SHEET_NAME = "Electric Properties" POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Property Address" - ADDRESS1_COLUMN = "Address Line 1" - ADDRESS1_METHOD = None + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_two_words" ADDRESS_COLS_TO_CONCAT = [] # Maps addresses to uprn in problematic cases @@ -372,7 +370,9 @@ def app(): how="left", on="row_id" ) - asset_list = asset_list.drop(columns=["row_id"]) + asset_list = asset_list.drop(columns=["row_id", "index"]) + + asset_list[asset_list["Assessor’s name"] == "Robin Bailey"]["Assessor's Email"].value_counts() # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + 
" EPC Data Pull - Main.xlsx" diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 189581d8..c34ff92b 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -28,6 +28,9 @@ class Recommendations: High level recommendations class, which sits above the measure specific recommendation classes """ + # Constant for carbon intensity calculation, as of 16th Jan 2025 + CARBON_INTENSITY = 0.232 + def __init__( self, property_instance: Property, @@ -691,6 +694,10 @@ class Recommendations: """ This method inserts the kwh savings and the bill savings that the customer will make from the recommendations based on the predictions from the ML model + + It also ensures we base our solar savings and solar carbon savings from the calculations based on + the solar API and size of the array, instead of ML model + :param property_instance: Instance of the Property class, for the home associated to property_id :param kwh_simulation_predictions: dictionary of predictions from the model apis :param property_recommendations: dictionary of recommendations for the property @@ -824,6 +831,12 @@ class Recommendations: if rec["type"] == "solar_pv": rec["kwh_savings"] = rec_impact["solar_kwh_savings"].values[0] + + # Calculate carbon savings from this + emissions_kg = rec["kwh_savings"] * cls.CARBON_INTENSITY # Calculate emissions in kg + emissions_tonnes = emissions_kg / 1000 + + rec["co2_equivalent_savings"] = emissions_tonnes rec["energy_cost_savings"] = ( rec_impact["solar_kwh_savings"].values[0] * AnnualBillSavings.ELECTRICITY_PRICE_CAP ) diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index 6778e886..b7e34406 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -138,6 +138,10 @@ class RoofRecommendations: u_value = self.property.roof["thermal_transmittance"] + # If we have a flat roof but we don't have flat roof as a measure, we 
exit + if self.property.roof["is_flat"] and "flat_roof_insulation" not in measures: + return + # We check if the roof is already insulated and if so, we exit # Building regulations part L recommend installing at least 270mm of insulation, however generally we diff --git a/recommendations/optimiser/optimiser_functions.py b/recommendations/optimiser/optimiser_functions.py index c1123e3d..223b1f82 100644 --- a/recommendations/optimiser/optimiser_functions.py +++ b/recommendations/optimiser/optimiser_functions.py @@ -31,7 +31,7 @@ def prepare_input_measures(property_recommendations, goal): "gain": rec[goal_key], "type": rec["type"] } - for rec in recs + for rec in recs if rec["energy_cost_savings"] >= 0 ] ) From edf9c00759cdddf647c5e0dd366493655e8237a4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 23 Jan 2025 08:15:47 +0000 Subject: [PATCH 141/255] L&G work and adding new AssetListEpcData class. Working on a remote asssessment --- backend/Property.py | 16 +- backend/app/assumptions.py | 9 +- backend/app/plan/router.py | 2 +- etl/customers/l_and_g/ic_slides.py | 239 ++++++++++++++++++ etl/customers/remote_assessments/app.py | 72 +++--- etl/find_my_epc/AssetListEpcData.py | 89 +++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 3 + etl/route_march_data_pull/app.py | 37 ++- recommendations/Recommendations.py | 10 +- .../optimiser/optimiser_functions.py | 4 + 10 files changed, 429 insertions(+), 52 deletions(-) create mode 100644 etl/customers/l_and_g/ic_slides.py create mode 100644 etl/find_my_epc/AssetListEpcData.py diff --git a/backend/Property.py b/backend/Property.py index 0b63b266..a495431f 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -133,9 +133,14 @@ class Property: self.energy_cost_estimates = {} self.energy_consumption_estimates = {} + # when storing the energy, we'll also self.energy = { "primary_energy_consumption": epc_record.get("energy_consumption_current"), - "co2_emissions": epc_record.get("co2_emissions_current"), + 
"epc_co2_emissions": epc_record.get("co2_emissions_current"), + # These will be added in once we estimate the amount of emissions from appliances - using the carbon + # intensity of electricity + "appliances_co2_emissions": None, + "co2_emissions": None } self.ventilation = { "ventilation": epc_record.get("mechanical_ventilation"), @@ -725,6 +730,15 @@ class Property: "unadjusted": unadjusted_kwh_estimates } + # Update carbon with appliances + self.energy["appliances_co2_emissions"] = ( + (unadjusted_kwh_estimates["appliances"] * assumptions.ELECTRICITY_CARBON_INTENSITY) / 1000 + ) + # Re-calculate total CO2 emissions + self.energy["co2_emissions"] = float(np.round( + self.energy["epc_co2_emissions"] + self.energy["appliances_co2_emissions"], 2 + )) + def set_spatial(self, spatial: pd.DataFrame): """ Sets whether the property is in a conservation area given the output of the ConservationAreaClient diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 44838a47..841ec2c1 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -1,7 +1,7 @@ -# Assumes that the average efficiency of an air source heat pump is 250%, taking the median of the 200-400% range, -# which is often quoted as a sensible efficiency range for air source heat pumps. +# We assume that the ASHP efficiency is 280%, which is the minimum that Cotswolds Energy Group achieves, as +# they target this PESSIMISTIC_ASHP_EFFICIENCY = 200 -AVERAGE_ASHP_EFFICIENCY = 250 +AVERAGE_ASHP_EFFICIENCY = 280 # Conservative estimate of the proportion of electricity that will be consumed, whereas the rest will # be exported. These are averages based on Google research. 
E.g @@ -14,6 +14,9 @@ RDSAP_AREA_PER_PANEL = 3.4 SOCIAL_TENURES = ["Rented (social)", "rental (social)"] +# Carbon intensity of electricity, as of 16th Jan 2025 +ELECTRICITY_CARBON_INTENSITY = 0.232 + DESCRIPTIONS_TO_FUEL_TYPES = { "Air source heat pump, radiators, electric": { "fuel": "Electricity", "cop": AVERAGE_ASHP_EFFICIENCY / 100 diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 1989a363..6ca5d3d0 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -121,7 +121,7 @@ def extract_portfolio_aggregation_data( # We can now calculate multiple outputs based on default recommendations carbon_savings = sum([r["co2_equivalent_savings"] for r in default_recommendations]) - pre_retrofit_co2 = p.data["co2-emissions-current"] + pre_retrofit_co2 = p.energy["co2_emissions"] post_retrofit_co2 = pre_retrofit_co2 - carbon_savings pre_retrofit_energy_bill = sum(p.current_energy_bill.values()) diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py new file mode 100644 index 00000000..71b0945c --- /dev/null +++ b/etl/customers/l_and_g/ic_slides.py @@ -0,0 +1,239 @@ +import pandas as pd +from backend.app.utils import sap_to_epc + +data = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/basildon_age_breakdowns/property_202501170837.csv" +) + +data["year_built"].value_counts() + +# 1991-2002 139 +# 2003-2006 50 +# 1996-2002 42 +# 1976-1982 37 +# 1967-1975 37 +# 1983-1990 33 +# 1950-1966 26 + +data["full_property_type"] = data["property_type"] + ": " + data["built_form"] + +data["full_property_type"].value_counts() +# House: Mid-Terrace 136 +# House: End-Terrace 83 +# House: Semi-Detached 55 +# Flat: Semi-Detached 24 +# Flat: End-Terrace 19 +# House: Detached 10 +# Flat: Mid-Terrace 9 +# Maisonette: Mid-Terrace 9 +# Maisonette: Semi-Detached 8 +# Maisonette: End-Terrace 6 +# Flat: Detached 4 +# Bungalow: Detached 1 + +epc_data = pd.read_csv( + 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/basildon_age_breakdowns/basildon EPC Data.csv" +) + +# Classify floor area in <73m2, 73-98, 99-200, 200+ +epc_data["floor_area_bracket"] = epc_data["total_floor_area"].apply( + lambda x: "<73" if x < 73 else "73-98" if x < 99 else "99-200" if x < 200 else "200+") + +# 73-98 185 +# <73 156 +# 99-200 23 + +epc_data["wall_type"] = epc_data["walls"].str.split(",").str[0] +epc_data["wall_type"].value_counts() + +# Cavity wall 343 +# Timber frame 15 +# System built 6 + +# we pull some additional data +# We want: +# 1) The list of properties included in the portfolio, with uprn +# 2) The recommendations against each property with costs, and whether or not the recommendation was defaulted +# 3) The properties without recommendations and why + +from tqdm import tqdm +import pandas as pd +import numpy as np +from sqlalchemy.orm import sessionmaker +from backend.app.db.connection import db_engine +from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations +from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel + + +def get_data(portfolio_id, scenario_ids): + session = sessionmaker(bind=db_engine)() + session.begin() + + # Get properties and their details for a specific portfolio + properties_query = session.query( + PropertyModel, + PropertyDetailsEpcModel + ).join( + PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id + ).filter( + PropertyModel.portfolio_id == portfolio_id # Filter by portfolio ID + ).all() + + # Transform properties data to include all fields dynamically + properties_data = [ + {**{col.name: getattr(prop.PropertyModel, col.name) for col in PropertyModel.__table__.columns}, + **{col.name: getattr(prop.PropertyDetailsEpcModel, col.name) for col in + PropertyDetailsEpcModel.__table__.columns}} + for prop in properties_query + ] + + # Get property IDs from fetched properties + + # Get plans linked to the fetched 
properties + plans_query = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all() + + # Transform plans data to include all fields dynamically + plans_data = [ + {col.name: getattr(plan, col.name) for col in Plan.__table__.columns} + for plan in plans_query + ] + + # Extract plan IDs for filtering recommendations through PlanRecommendations + plan_ids = [plan['id'] for plan in plans_data] + + # Get recommendations through PlanRecommendations for those plans and that are default + recommendations_query = session.query( + Recommendation, + Plan.scenario_id + ).join( + PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id + ).join( + Plan, Plan.id == PlanRecommendations.plan_id # Join with Plan to access scenario_id + ).filter( + PlanRecommendations.plan_id.in_(plan_ids), + Recommendation.default == True # Filtering for default recommendations + ).all() + + # Transform recommendations data to include all fields dynamically and include scenario_id + recommendations_data = [ + {**{col.name: getattr(rec.Recommendation, col.name) if hasattr(rec, 'Recommendation') else getattr(rec, + col.name) for + col in Recommendation.__table__.columns}, + "Scenario ID": rec.scenario_id} + for rec in recommendations_query + ] + + session.close() + + return properties_data, plans_data, recommendations_data + + +properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[199]) + +properties_df = pd.DataFrame(properties_data) +plans_df = pd.DataFrame(plans_data) +recommendations_df = pd.DataFrame(recommendations_data) + +recommended_measures_df = recommendations_df[ + ["property_id", "measure_type", "estimated_cost", "default"] +] +recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]] +recommended_measures_df = recommended_measures_df.drop(columns=["default"]) + +post_install_sap = recommendations_df[["property_id", "default", "sap_points"]] +post_install_sap = 
post_install_sap[post_install_sap["default"]] +# Sum up the sap points by property id +post_install_sap = post_install_sap.groupby("property_id")[["sap_points"]].sum().reset_index() + +recommendations_measures_pivot = recommended_measures_df.pivot( + index='property_id', + columns='measure_type', + values='estimated_cost' +) +recommendations_measures_pivot = recommendations_measures_pivot.reset_index() + +recommendations_measures_pivot = recommendations_measures_pivot.rename( + columns={ + "air_source_heat_pump": "Cost: Air Source Heat Pump", + "cavity_wall_insulation": "Cost: Cavity Wall Insulation", + "double_glazing": "Cost: Double Glazing", + "loft_insulation": "Cost: Loft Insulation", + "mechanical_ventilation": "Cost: Ventilation", + "solar_pv": "Cost: Solar PV" + } +) +recommendations_measures_pivot = recommendations_measures_pivot.fillna(0) +recommendations_measures_pivot["Recommendation: Air Source Heat Pump"] = ( + recommendations_measures_pivot["Cost: Air Source Heat Pump"] > 0 +) +recommendations_measures_pivot["Recommendation: Cavity Wall Insulation"] = ( + recommendations_measures_pivot["Cost: Cavity Wall Insulation"] > 0 +) +recommendations_measures_pivot["Recommendation: Double Glazing"] = ( + recommendations_measures_pivot["Cost: Double Glazing"] > 0 +) +recommendations_measures_pivot["Recommendation: Loft Insulation"] = ( + recommendations_measures_pivot["Cost: Loft Insulation"] > 0 +) +recommendations_measures_pivot["Recommendation: Ventilation"] = ( + recommendations_measures_pivot["Cost: Ventilation"] > 0 +) +recommendations_measures_pivot["Recommendation: Solar PV"] = ( + recommendations_measures_pivot["Cost: Solar PV"] > 0 +) + +df = properties_df[ + [ + "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows", + "current_epc_rating", + "current_sap_points", "total_floor_area", "number_of_rooms", + ] +].merge( + recommendations_measures_pivot, how="left", on="property_id" +).merge( + 
post_install_sap, how="left", on="property_id" +) + +df = df.drop(columns=["property_id"]) +df["sap_points"] = df["sap_points"].fillna(0) + +df = df.rename( + columns={ + "uprn": "UPRN", + "address": "Address", + "postcode": "Postcode", + "walls": "Walls", + "roof": "Roof", + "heating": "Heating", + "windows": "Windows", + "current_epc_rating": "Current EPC Rating", + "current_sap_points": "Current SAP Points", + "total_floor_area": "Total Floor Area", + "number_of_rooms": "Number of Habitable Rooms", + "floor_height": "Floor Height", + } +) + +df["Has Recommendations"] = ~pd.isnull(df["Cost: Air Source Heat Pump"]) + +# We fill missings: +for col in [ + "Recommendation: Air Source Heat Pump", "Recommendation: Cavity Wall Insulation", + "Recommendation: Double Glazing", "Recommendation: Loft Insulation", "Recommendation: Ventilation", + "Recommendation: Solar PV" +]: + df[col] = df[col].fillna(False) + +for col in [ + "Cost: Air Source Heat Pump", "Cost: Cavity Wall Insulation", + "Cost: Double Glazing", "Cost: Loft Insulation", "Cost: Ventilation", + "Cost: Solar PV" +]: + df[col] = df[col].fillna(0) + +# Calculate post SAP +df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] +df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() +df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) + +df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False) diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 59e0e868..ccbc9ac8 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -1,9 +1,15 @@ +import os import pandas as pd +from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 +from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 120 +PORTFOLIO_ID = 126 USER_ID = 8 +load_dotenv(dotenv_path="backend/.env") 
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + def app(): """ @@ -13,11 +19,20 @@ def app(): asset_list = [ { - "uprn": 100030334057, - "address": "5, Lynton Street", - "postcode": "DE22 3RW" + "address": "Garden Flat, 48 Bedminster Parade", + "postcode": "BS3 4HS", + "building_id": 1 + }, + { + "addresss": "Top Floor Flat, 48 Bedminster Parade", + "postcode": "BS3 4HS", + "building_id": 1 + }, + { + "address": "First Floor Flat, 48 Bedminster Parade", + "postcode": "BS3 4HS", + "building_id": 1 } - ] asset_list = pd.DataFrame(asset_list) @@ -29,40 +44,37 @@ def app(): file_name=filename ) - non_invasive_recommendations = [ - { - "uprn": 100030334057, - "recommendations": [ - { - "type": "internal_wall_insulation", - "sap_points": 9, - "survey": True - }, - { - "type": "external_wall_insulation", - "sap_points": 9, - "survey": True - }, - { - "type": "suspended_floor_insulation", - "sap_points": 2, - "survey": True - } - ] - } - ] + # Pull the non-invasive recommendations automatically + asset_list_epc_client = AssetListEpcData( + asset_list=asset_list, + epc_auth_token=EPC_AUTH_TOKEN + ) + asset_list_epc_client.get_data() + asset_list_epc_client.get_non_invasive_recommendations() + # Store non-invasive recommendations in S3 non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" save_csv_to_s3( - dataframe=pd.DataFrame(non_invasive_recommendations), + dataframe=pd.DataFrame(asset_list_epc_client.non_invasive_recommendations), bucket_name="retrofit-plan-inputs-dev", file_name=non_invasive_recommendations_filename ) valuation_data = [ { - "uprn": 100030334057, - "value": 133_000 + "address": "Garden Flat, 48 Bedminster Parade", + "postcode": "BS3 4HS", + "value": 337_000 + }, + { + "addresss": "Top Floor Flat, 48 Bedminster Parade", + "postcode": "BS3 4HS", + "value": 337_000 + }, + { + "address": "First Floor Flat, 48 Bedminster Parade", + "postcode": "BS3 4HS", + "value": 337_000 } ] # Store valuation data to s3 diff 
--git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py new file mode 100644 index 00000000..ba490161 --- /dev/null +++ b/etl/find_my_epc/AssetListEpcData.py @@ -0,0 +1,89 @@ +import time +import pandas as pd +from tqdm import tqdm +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.logger import setup_logger + +logger = setup_logger() + + +class AssetListEpcData: + + def __init__(self, asset_list: pd.DataFrame, epc_auth_token: str): + + """ + This class handles pulling data assocaited to an asset list and performs common functions like + getting EPC api data, retrieveing data form the find my epc website and extracting non-intrusive + recommendations + :param asset_list: + """ + + # Check the asset list contains the correct columns + + self.asset_list = self.check_asset_list(asset_list) + self.epc_auth_token = epc_auth_token + + self.extracted_data = None + self.non_invasive_recommendations = None + + @staticmethod + def check_asset_list(asset_list): + # TODO: Update this with pydantic + + return asset_list + + def get_non_invasive_recommendations(self): + + """ + Extracts non-invasive recommendations in a format that can be used by the engine + :return: + """ + + if self.extracted_data is None: + raise ValueError("Please run get_data first") + + self.non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in self.extracted_data + ] + + def get_data(self): + + logger.info("Retrieving data for given asset list") + + # Pull the additional data + extracted_data = [] + for _, home in tqdm(self.asset_list.iterrows(), total=len(self.asset_list)): + add1 = home["address"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, + uprn=home["uprn"], + auth_token=self.epc_auth_token, + os_api_key="" + ) + epc_searcher.find_property(skip_os=True) + if epc_searcher.newest_epc is 
None: + continue + + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + extracted_data.append( + { + "uprn": home["uprn"], + **find_epc_data, + } + ) + + logger.info("Data Extrction complete") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index cd960151..a172f27d 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -313,6 +313,9 @@ class RetrieveFindMyEpc: "Heating controls (programmer and TRVs)": [ "roomstat_programmer_trvs", "time_temperature_zone_control" ], + "Heating controls (programmer and room thermostat)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], "Replacement warm air unit": [], "Secondary glazing": ["secondary_glazing"] } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index f2889975..8d19aa84 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -27,8 +27,8 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] - house_number = home[address1_column] - full_address = home[fulladdress_column] + house_number = home[address1_column].strip() + full_address = home[fulladdress_column].strip() house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) if house_no is None: house_no = house_number @@ -56,7 +56,13 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m # Try again: if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: # Backup - add1 = full_address.split(",")[1].strip() + add1 = full_address.split(",") + if len(add1) > 1: + add1 = add1[1].strip() + else: + # Try 
splitting on space + add1 = full_address.split(" ")[0].strip() + else: add1 = str(house_number) searcher = SearchEpc( @@ -126,6 +132,10 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"): asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list + if method == "first_word": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + raise ValueError(f"Method {method} not recognized") @@ -152,17 +162,19 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches" - DATA_FILENAME = "Full Below SAP C Stock - RN Copy.xlsx" - SHEET_NAME = "Electric Properties" - POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Address" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern" + DATA_FILENAME = "January 2025 Additions Query.xlsx" + SHEET_NAME = "Jan 2025 additions" + POSTCODE_COLUMN = "Post Code" + FULLADDRESS_COLUMN = "Street / Block Name" ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_two_words" + ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} + MANUAL_UPRN_MAP = { + "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560 + } asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() @@ -211,6 +223,9 @@ def app(): manual_uprn_map=MANUAL_UPRN_MAP ) + no_data = asset_list[asset_list["row_id"].isin(no_epc)] + print(no_data[[FULLADDRESS_COLUMN, POSTCODE_COLUMN]]) + # Append the failed data to the main data epc_data.extend(epc_data_failed) @@ -372,8 +387,6 @@ def app(): ) asset_list = asset_list.drop(columns=["row_id", "index"]) - asset_list[asset_list["Assessor’s name"] == "Robin Bailey"]["Assessor's Email"].value_counts() - # Store as an 
excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx" asset_list.to_excel(filename, index=False) diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index c34ff92b..15614a0b 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -28,9 +28,6 @@ class Recommendations: High level recommendations class, which sits above the measure specific recommendation classes """ - # Constant for carbon intensity calculation, as of 16th Jan 2025 - CARBON_INTENSITY = 0.232 - def __init__( self, property_instance: Property, @@ -531,6 +528,9 @@ class Recommendations: previous_phase_values = { "sap": float(property_instance.data["current-energy-efficiency"]), + # For carbon, even though we generally use the updated figure which includes the carbon + # associated to appliances, for this scoring process we use the EPC carbon value. This means + # that we don't overestimate the impact since the model uses the EPC carbon value "carbon": float(property_instance.data["co2-emissions-current"]), "heat_demand": float(property_instance.data["energy-consumption-current"]), } @@ -832,8 +832,8 @@ class Recommendations: if rec["type"] == "solar_pv": rec["kwh_savings"] = rec_impact["solar_kwh_savings"].values[0] - # Calculate carbon savings from this - emissions_kg = rec["kwh_savings"] * cls.CARBON_INTENSITY # Calculate emissions in kg + # Calculate carbon savings from this - emissions in kg and convert to tonnes + emissions_kg = rec["kwh_savings"] * assumptions.ELECTRICITY_CARBON_INTENSITY emissions_tonnes = emissions_kg / 1000 rec["co2_equivalent_savings"] = emissions_tonnes diff --git a/recommendations/optimiser/optimiser_functions.py b/recommendations/optimiser/optimiser_functions.py index 223b1f82..8c15673d 100644 --- a/recommendations/optimiser/optimiser_functions.py +++ b/recommendations/optimiser/optimiser_functions.py @@ -23,6 +23,10 @@ def 
prepare_input_measures(property_recommendations, goal): # if the recommendation is a solar recommendation with a battery, we exclude it from the optimisation. recs = [r for r in recs if ~r["has_battery"]] + recs_to_append = [rec for rec in recs if rec["energy_cost_savings"] >= 0] + if not recs_to_append: + continue + input_measures.append( [ { From 020ac42c5f90330ea466b653a51c585901c03466 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 23 Jan 2025 08:19:52 +0000 Subject: [PATCH 142/255] allowing uprn to be optional --- etl/find_my_epc/AssetListEpcData.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index ba490161..7bd16090 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -45,7 +45,9 @@ class AssetListEpcData: self.non_invasive_recommendations = [ { - "uprn": r["uprn"], + "uprn": r.get("uprn"), + "address": r["address"], + "postcode": r["postcode"], "recommendations": r["recommendations"] } for r in self.extracted_data ] @@ -63,7 +65,7 @@ class AssetListEpcData: epc_searcher = SearchEpc( address1=add1, postcode=pc, - uprn=home["uprn"], + uprn=home.get("uprn"), auth_token=self.epc_auth_token, os_api_key="" ) @@ -81,7 +83,9 @@ class AssetListEpcData: extracted_data.append( { - "uprn": home["uprn"], + "uprn": home.get("uprn"), + "address": home["address"], + "postcode": home["postcode"], **find_epc_data, } ) From 0fad758fbbccba9acf08dd9d1bbcdbca2f5a23e1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 23 Jan 2025 20:57:27 +0000 Subject: [PATCH 143/255] added unit tests --- backend/SearchEpc.py | 83 ++++++++++++++++------- backend/apis/GoogleSolarApi.py | 27 ++++++-- backend/app/plan/router.py | 31 +++++++-- backend/tests/test_search_epc.py | 50 ++++++++++++++ etl/customers/l_and_g/ic_slides.py | 16 +++-- etl/customers/remote_assessments/app.py | 17 +++-- etl/find_my_epc/AssetListEpcData.py 
| 1 + recommendations/Costs.py | 5 +- recommendations/SolarPvRecommendations.py | 8 ++- recommendations/county_to_region.py | 7 +- 10 files changed, 190 insertions(+), 55 deletions(-) create mode 100644 backend/tests/test_search_epc.py diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index d916f82f..c74a0b1f 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -139,8 +139,8 @@ class SearchEpc: } NODATA = { - "status": 201, - "message": "No data", + "status": 204, + "message": "no data", "error": None } @@ -155,7 +155,7 @@ class SearchEpc: uprn: [int, None] = None, size=None, property_type=None, - fast=False + fast=False, ): """ Address lines 1 and postcode are mandatory fields. The other address lines are optional @@ -248,14 +248,10 @@ class SearchEpc: else: return None - def get_epc(self, params=None, size=None): - # Get the EPC data with retries - size = size if size is not None else self.size - if params is None: - if self.uprn: - params = {"uprn": self.uprn} - else: - params = {"address": self.address1, "postcode": self.postcode} + def _get_epc(self, params, size): + """ + To be called by get_epc() - not for external usage + """ url = os.path.join(self.client.domestic.host, "search") if size: @@ -268,24 +264,20 @@ class SearchEpc: if response: self.data = response - return self.SUCCESS + return { + "response": response, + "msg": self.SUCCESS + } if retry > 0: logger.info("Failed previous attempt but retry successful") # If we got nothing, final try if not response: return { - "status": 204, - "message": "no data", - "error": None + "response": response, + "msg": self.NODATA } - return { - "status": 200, - "message": "success", - "error": None - } - except Exception as e: if retry < self.max_retries - 1: # If not the last retry, wait for 3 seconds before retrying @@ -293,11 +285,54 @@ class SearchEpc: else: # If it's the last retry, we continue return { - "status": 500, - "message": "Could not retrieve EPC data", - "error": str(e) + 
"response": {}, + "msg": { + "status": 500, + "message": "Could not retrieve EPC data", + "error": str(e) + } } + def get_epc(self, params=None, size=None): + # Get the EPC data with retries + size = size if size is not None else self.size + if params: + output = self._get_epc(params=params, size=size) + if output["msg"]["status"] == 200: + self.data = output["response"] + return output["msg"] + + uprn_params = {"uprn": self.uprn} if self.uprn else {} + address_params = {"address": self.address1, "postcode": self.postcode} + + # We attempt the search with uprn params + + data = {"rows": []} + if uprn_params: + api_response = self._get_epc(params=uprn_params, size=size) + if api_response["msg"]["status"] == 200: + data["rows"].extend(api_response["response"]["rows"]) + + # If we were unsuccessful, we then make a second attempt to fetch the data. We find that + # properties are sometimes listed under the wrong UPRN + api_response = self._get_epc(params=address_params, size=size) + if api_response["msg"]["status"] == 200: + # We update the data with the correct uprn + if self.uprn: + for x in api_response["response"]["rows"]: + x["uprn"] = self.uprn + + data["rows"].extend(api_response["response"]["rows"]) + + # We no de-dupe on lmk-key to avoid duplicates + seen = set() + data["rows"] = [ + row for row in data["rows"] + if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) + ] + + return api_response["msg"] + def filter_rows(self, rows, property_type=None, address=None): """ This method should not be used when property_type and address are both not None diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index e2b7d933..183503d5 100644 --- a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -51,6 +51,9 @@ class GoogleSolarApi: MIN_UNIT_PANELS = 4 # Minimum number of panels we allow for a domestic building MIN_BUILDING_PANELS = 10 # Minimum number of panels we allow for a block of flats + # Max area of a roof 
space we allow panels for + PERCENTAGE_OF_ROOF_LIMIT = 0.8 + def __init__(self, api_key, max_retries=5): """ Initialize the GoogleSolarApi class with the provided API key and maximum retries. @@ -159,10 +162,11 @@ class GoogleSolarApi: # Automatically exclude north-facing segments self.exclude_north_facing_segments(property_instance=property_instance) # If a property is semi-detached, it's possible for us to include segments from an attached unit - if (property_instance.data["built-form"] == "Semi-Detached") and ( - property_instance.data["extension-count"] == 0 - ): - self.exclude_likely_duplicate_surfaces() + if property_instance is not None: + if (property_instance.data["built-form"] == "Semi-Detached") and ( + property_instance.data["extension-count"] == 0 + ): + self.exclude_likely_duplicate_surfaces() self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2'] self.floor_area = self.insights_data["solarPotential"]["wholeRoofStats"]['groundAreaMeters2'] @@ -179,7 +183,9 @@ class GoogleSolarApi: # We now start finding the solar panel configurations self.optimise_solar_configuration( - energy_consumption=energy_consumption, is_building=is_building, property_instance=property_instance + energy_consumption=energy_consumption, + is_building=is_building, + property_instance=property_instance ) # Finally, if we have a double property, we half the data we stored area @@ -295,7 +301,11 @@ class GoogleSolarApi: continue if cost_instance is None: - total_cost = MCS_SOLAR_PV_COST_DATA["average_cost_per_kwh"] * (wattage / 1000) + total_cost = Costs.solar_pv( + n_panels=roi_summary["n_panels"].sum(), + has_battery=False, + n_floors=3, # Assume the most amount of scaffolding + )["total"] else: total_cost = cost_instance.solar_pv( n_panels=roi_summary["n_panels"].sum(), @@ -491,6 +501,11 @@ class GoogleSolarApi: panel_performance = panel_performance.drop(columns=["n_panels_halved"]) panel_performance = panel_performance[panel_performance["n_panels"] 
>= min_panels] + # Finally, we prevent pannelled roof area being above a limit + panel_performance = panel_performance[ + panel_performance["panneled_roof_area"] <= self.roof_area * self.PERCENTAGE_OF_ROOF_LIMIT + ] + self.panel_performance = panel_performance def exclude_north_facing_segments(self, property_instance): diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 6ca5d3d0..855fd9d6 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -339,6 +339,9 @@ def extract_property_request_data( # Because we have some non-invasive recommendations that match on address and postcode, but not UPRN # we need to check existence of uprn has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True + if has_uprn: + has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None] + if has_uprn: property_non_invasive_recommendations = next(( x for x in non_invasive_recommendations if @@ -366,10 +369,21 @@ def extract_property_request_data( property_non_invasive_recommendations["recommendations"] = str(transformed) - property_valution = next(( - float(x["valuation"]) for x in valuation_data if - (str(x["uprn"]) == str(uprn)) - ), None) + # Check if the valuation data has uprn + valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else True + if valuation_has_uprn: + valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None] + + if valuation_has_uprn: + property_valution = next(( + float(x["valuation"]) for x in valuation_data if + (str(x["uprn"]) == str(uprn)) + ), None) + else: + property_valution = next(( + float(x["valuation"]) for x in valuation_data if + (x["address"] == config["address"]) and (x["postcode"] == config["postcode"]) + ), None) return patch, property_already_installed, property_non_invasive_recommendations, property_valution @@ -444,9 +458,12 @@ async def trigger_plan(body: PlanTriggerRequest): # Create a record in db property_id, is_new = create_property( - 
session, body.portfolio_id, epc_searcher.address_clean, epc_searcher.postcode_clean, - epc_searcher.uprn, - energy_assessment + session=session, + portfolio_id=body.portfolio_id, + address=epc_searcher.address_clean, + postcode=epc_searcher.postcode_clean, + uprn=epc_searcher.uprn, + energy_assessment=energy_assessment ) if not is_new and not body.multi_plan: continue diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py new file mode 100644 index 00000000..3b2e2a5b --- /dev/null +++ b/backend/tests/test_search_epc.py @@ -0,0 +1,50 @@ +import pytest +import os +from backend.SearchEpc import SearchEpc # Replace with your actual module name +from dotenv import load_dotenv + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +class TestSearchEpcIntegration: + @pytest.mark.parametrize( + "address, postcode, uprn, skip_os, expected_partial_address", + [ + # Test case 1: Valid address and postcode, skipping OS + # In this case, the property is an individual flat but the uprn associated to the + # EPC is for the building as a whole, possibly because there was a conversion of sorts + ("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True, + "260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1), + + # Test case 2: Another valid address and postcode + # In this case, the newest EPC, does not have a uprn associated to it. If we did a search by + # uprn, we would get an old EPC + ("Flat 8, Hainton House", "DN32 9AQ", 10090082018, True, + "bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 3), + + ], + ) + def test_find_property(self, address, postcode, uprn, skip_os, lmk_key, n_old_epcs): + """ + Integration test for `find_property`, making actual API calls. 
+ """ + # Provide your actual API keys or tokens here + os_api_key = "" + + # Initialize the SearchEpc instance + epc_searcher = SearchEpc( + address1=address, + postcode=postcode, + uprn=uprn, + auth_token=EPC_AUTH_TOKEN, + os_api_key=os_api_key, + ) + + # Execute the method + epc_searcher.find_property(skip_os=skip_os) + + # We check that we have the correct epc + assert epc_searcher.newest_epc["lmk-key"] == lmk_key + assert epc_searcher.newest_epc["uprn"] == uprn + assert len(epc_searcher.older_epcs) == n_old_epcs diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py index 71b0945c..72dfc2c0 100644 --- a/etl/customers/l_and_g/ic_slides.py +++ b/etl/customers/l_and_g/ic_slides.py @@ -7,16 +7,20 @@ data = pd.read_csv( data["year_built"].value_counts() -# 1991-2002 139 -# 2003-2006 50 -# 1996-2002 42 -# 1976-1982 37 -# 1967-1975 37 -# 1983-1990 33 # 1950-1966 26 +# 1967-1975 37 +# 1976-1982 37 +# 1983-1990 33 +# 1991-1995 139 +# 1996-2002 42 +# 2003-2006 50 data["full_property_type"] = data["property_type"] + ": " + data["built_form"] +houses = data[data["property_type"].isin(["House", "Bungalow"])] +houses["built_form"].value_counts() + +data["property_type"].value_counts() data["full_property_type"].value_counts() # House: Mid-Terrace 136 # House: End-Terrace 83 diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index ccbc9ac8..13cdc41b 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -21,17 +21,20 @@ def app(): { "address": "Garden Flat, 48 Bedminster Parade", "postcode": "BS3 4HS", - "building_id": 1 + "building_id": 1, + "uprn": 308249, }, { - "addresss": "Top Floor Flat, 48 Bedminster Parade", + "address": "Top Floor Flat, 48 Bedminster Parade", "postcode": "BS3 4HS", - "building_id": 1 + "building_id": 1, + "uprn": 308251 }, { "address": "First Floor Flat, 48 Bedminster Parade", "postcode": "BS3 4HS", - "building_id": 1 + 
"building_id": 1, + "uprn": 308250, } ] asset_list = pd.DataFrame(asset_list) @@ -64,17 +67,17 @@ def app(): { "address": "Garden Flat, 48 Bedminster Parade", "postcode": "BS3 4HS", - "value": 337_000 + "valuation": 337_000 }, { "addresss": "Top Floor Flat, 48 Bedminster Parade", "postcode": "BS3 4HS", - "value": 337_000 + "valuation": 337_000 }, { "address": "First Floor Flat, 48 Bedminster Parade", "postcode": "BS3 4HS", - "value": 337_000 + "valuation": 337_000 } ] # Store valuation data to s3 diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index 7bd16090..bce8cd1f 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -90,4 +90,5 @@ class AssetListEpcData: } ) + self.extracted_data = extracted_data logger.info("Data Extrction complete") diff --git a/recommendations/Costs.py b/recommendations/Costs.py index ee4db7eb..2312dff2 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -719,8 +719,9 @@ class Costs: "labour_days": labour_days } + @classmethod def solar_pv( - self, + cls, n_panels: int | float, has_battery: bool = False, array_cost=None, @@ -774,7 +775,7 @@ class Costs: # We add an additional cost for scaffolding # The costs from installers exclude VAT - vat = subtotal * self.VAT_RATE + vat = subtotal * cls.VAT_RATE total_cost = subtotal + vat # Labour hours are based on estimates from online research but an average team seems to consist of 3 people diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index 66c1d0c3..ed5554dc 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -106,10 +106,16 @@ class SolarPvRecommendations: roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / total_roof_area * 100) else: raise Exception("IMPLEMENT ME") + + n_floors = ( + self.property.number_of_storeys["number_of_storeys"] if + 
self.property.number_of_storeys["number_of_storeys"] is not None else 3 + ) + total_cost = self.costs.solar_pv( array_cost=recommendation_config.get("cost", None), n_panels=recommendation_config["n_panels"], - n_floors=self.property.number_of_storeys["number_of_storeys"], + n_floors=n_floors, needs_inverter=True, )["total"] / n_units diff --git a/recommendations/county_to_region.py b/recommendations/county_to_region.py index f7d5193f..e84b5698 100644 --- a/recommendations/county_to_region.py +++ b/recommendations/county_to_region.py @@ -111,8 +111,11 @@ county_to_region_map = { 'Windsor and Maidenhead': 'South East England', 'Woking': 'South East England', 'Wokingham': 'South East England', 'Worthing': 'South East England', 'Wycombe': 'South East England', 'Bath and North East Somerset': 'South West England', 'Bournemouth': 'South West England', - 'Bristol': 'South West England', 'Cheltenham': 'South West England', 'Christchurch': 'South West England', - 'City of Bristol': 'South West England', 'Cornwall': 'South West England', 'Cotswold': 'South West England', + 'Bristol': 'South West England', + 'Cheltenham': 'South West England', 'Christchurch': 'South West England', + 'City of Bristol': 'South West England', + 'Bristol, City of': 'South West England', + 'Cornwall': 'South West England', 'Cotswold': 'South West England', 'Devon': 'South West England', 'Dorset': 'South West England', 'East Devon': 'South West England', 'East Dorset': 'South West England', 'Exeter': 'South West England', 'Forest of Dean': 'South West England', 'Gloucester': 'South West England', 'Gloucestershire': 'South West England', From 3ccc5eae89d0f81eb2298fdb5747d36cc4c46b1b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 24 Jan 2025 10:24:23 +0000 Subject: [PATCH 144/255] adding enforcing of solar across the building if one unit needs it --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/app/plan/router.py | 26 ++++++++++++++++++++++++++ 
etl/find_my_epc/RetrieveFindMyEpc.py | 4 ++-- recommendations/HeatingRecommender.py | 24 +++++++++++++++++++----- 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 855fd9d6..1b72e10e 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -772,6 +772,32 @@ async def trigger_plan(body: PlanTriggerRequest): ] recommendations[p.id] = final_recommendations + # when we have buildings, we tweak our solar PV recommendations as if one unit needs it, we apply it to all + # of them + # TODO: We can probably do better and optimise at the building level - this is temp + logger.info("Adjusting solar PV recommendations for buildings") + building_ids = set([p.building_id for p in input_properties if p.building_id is not None]) + + for bid in building_ids: + # We check if any of them have solar PV + building = [p for p in input_properties if p.building_id == bid] + has_solar = False + for unit in building: + # Get default recommendations + has_solar = len([r for r in recommendations[unit.id] if r["default"] and r["type"] == "solar_pv"]) > 0 + if has_solar: + break + + if has_solar: + # We adjust the units within the building + for unit in building: + for rec in recommendations[unit.id]: + if rec["type"] == "solar_pv": + # This is straightforward, we just set the default to True, since when we're at a building + # level, we only allow 1 solar PV option for each unit. 
If we change this, this logic will + # need to be updated + rec["default"] = True + # ~~~~~~~~~~~~~~~~ # Funding # ~~~~~~~~~~~~~~~~ diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index a172f27d..f93a5a73 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -263,7 +263,7 @@ class RetrieveFindMyEpc: "roomstat_programmer_trvs", "time_temperature_zone_control" ], "Change heating to gas condensing boiler": ["boiler_upgrade"], - "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heaters"], + "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heater"], "Flat roof or sloping ceiling insulation": ["flat_roof_insulation"], "Heating controls (room thermostat)": [ "roomstat_programmer_trvs", "time_temperature_zone_control" @@ -291,7 +291,7 @@ class RetrieveFindMyEpc: "PV Cells recommendation": [], "Replacement glazing units": ["double_glazing"], "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"], - "High heat retention storage heaters": ["high_heat_retention_storage_heaters"], + "High heat retention storage heaters": ["high_heat_retention_storage_heater"], "Gas condensing boiler": ["boiler_upgrade"], "Change room heaters to condensing boiler": ["boiler_upgrade"], "Cylinder thermostat": ["cylinder_thermostat"], diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index 1eab7d42..c5c07f89 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -1,6 +1,5 @@ import re import backend.app.assumptions as assumptions -from etl.customers.immo.pilot.asset_list import non_invasive_recommendations from recommendations.Costs import Costs, BOILER_UPGRADE_SCHEME_ASHP_VALUE from recommendations.recommendation_utils import ( check_simulation_difference, override_costs, combine_recommendation_configs @@ -632,7 +631,8 
@@ class HeatingRecommender: heating_controls_only, system_change, system_type, - measure_type + measure_type, + non_intrusive_recommendation=None ): """ Given a recommendation for heating controls, and a recommendation for the heating system, we combine the two @@ -650,8 +650,13 @@ class HeatingRecommender: :param system_type: The type of heating system we are recommending :param measure_type: The type of measure we are recommending - more granular than the "type" field, allowing us to distinguish between different types of heating recommendations + :param non_intrusive_recommendation: A non-intrusive recommendation, which may specify the number of SAP points + or a cost for this recommendation """ + if non_intrusive_recommendation is None: + non_intrusive_recommendation = {} + # We produce recommendations with & without heating controls # We will also produce a recommendation for heating controls only heating_controls_switch = [True, False] if controls_recommendations else [False] @@ -699,13 +704,14 @@ class HeatingRecommender: "description": recommendation_description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_intrusive_recommendation.get("sap_points"), "already_installed": already_installed, **total_costs, "simulation_config": recommendation_simulation_config, "description_simulation": recommendation_description_simulation, # We insert the heating system type here - "system_type": system_type + "system_type": system_type, + "survey": non_intrusive_recommendation.get("survey", False) } output.append(recommendation) @@ -808,6 +814,13 @@ class HeatingRecommender: # No recommendation needed return + # We check if there is a high heat retention non-intrusive recommendation + non_intrusive_recommendation = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == "high_heat_retention_storage_heater"), + {} + ) + # We check if the property has dual heating in place with a boiler and storage heaters 
if self.dual_heating: new_heating_description = self.DUAL_HEATING_DESCRIPTIONS[ @@ -896,7 +909,8 @@ class HeatingRecommender: heating_controls_only=heating_controls_only, system_change=system_change, system_type="high_heat_retention_storage_heater", - measure_type="high_heat_retention_storage_heater" + measure_type="high_heat_retention_storage_heater", + non_intrusive_recommendation=non_intrusive_recommendation ) if _return: return recommendations From fe193305e672b49eeb3862b903a2552a8c21e334 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 24 Jan 2025 11:31:41 +0000 Subject: [PATCH 145/255] paused for the moment --- backend/Funding.py | 53 ++++++++++++++++++++++++++++++--- backend/app/plan/router.py | 61 ++++++++++++++++++++++---------------- etl/funding/app.py | 35 ++++++++++++++++++++++ 3 files changed, 120 insertions(+), 29 deletions(-) create mode 100644 etl/funding/app.py diff --git a/backend/Funding.py b/backend/Funding.py index 8a9b08ae..f0780c51 100644 --- a/backend/Funding.py +++ b/backend/Funding.py @@ -12,6 +12,8 @@ class Funding: and flag any tenant specific requirements that need to be considered to the funding to be attained """ + SCHEMES = ["eco4", "gbis", "whlg"] + ECO_SAP_SCORE_THREHOLDS = [ {'Band': 'High_A', 'From': 96.0, 'Up to': 100.0, 'Mid-point': 98.0}, {'Band': 'Low_A', 'From': 92.0, 'Up to': 96.0, 'Mid-point': 94.0}, @@ -34,10 +36,12 @@ class Funding: tenure: HousingType, starting_epc, starting_sap, + postcode, floor_area, council_tax_band, property_recommendations, project_scores_matrix, + whlg_eligible_postcodes, gbis_abs_rate: int, eco4_abs_rate: int, ): @@ -47,6 +51,10 @@ class Funding: :param starting_epc: The current EPC rating of the property :param starting_sap: The current SAP score for the property :param floor_area: The total floor area of the property + :param council_tax_band: The council tax band of the property + :param property_recommendations: The recommendations for the property + :param 
project_scores_matrix: The matrix of project scores for ECO4 + :param whlg_eligible_postcodes: The postcodes eligible for WHLG :param gbis_abs_rate: The assumed £/abs achieved by the installer for GBIS :param eco4_abs_rate: The assumed £/abs achieved by the installer for ECO4 """ @@ -58,6 +66,7 @@ class Funding: self.tenure = tenure self.starting_epc = starting_epc self.starting_sap = starting_sap + self.postcode = postcode self.starting_eco_band = self.sap_to_eco_band(self.starting_sap) self.floor_area_segment = self.classify_floor_area(floor_area) self.gbis_abs_rate = gbis_abs_rate @@ -75,6 +84,11 @@ class Funding: (project_scores_matrix["Starting Band"] == self.starting_eco_band) ] + # The postcode column is already lower case + self.whlg_eligible_postcodes = whlg_eligible_postcodes[ + whlg_eligible_postcodes["Postcode"] == self.postcode.lower() + ] + # Store the final outputs self.gbis_eligibiltiy = {} self.eco4_eligibility = {} @@ -82,6 +96,8 @@ class Funding: def output( self, + scheme: str, + eligible: bool, measure_types: List[str], estimated_funding: float, notify_tenant_benefits_requirements: bool, @@ -90,12 +106,18 @@ class Funding: ): """" """ + + if scheme not in self.SCHEMES: + raise ValueError("Scheme not recognised") + return { + "scheme": scheme, + "eligible": eligible, "measure_types": measure_types, "estimated_funding": estimated_funding, - "notify_tenant_benefits_requirements": notify_tenant_benefits_requirements, - "notify_council_tax_band_requirements": notify_council_tax_band_requirements, - "notify_tenant_low_income_requirements": notify_tenant_low_income_requirements + "requires_benefits": notify_tenant_benefits_requirements, + "requires_council_tax_band": notify_council_tax_band_requirements, + "requires_low_income": notify_tenant_low_income_requirements } @staticmethod @@ -234,6 +256,8 @@ class Funding: # If the council tax band is missing, we nofify the customer that this is a requirement that # should be checked return self.output( + 
scheme="gbis", + eligible=True, measure_types=[recommended_measure["measure_type"]], estimated_funding=recommended_measure["estimated_funding"], notify_tenant_benefits_requirements=False, @@ -251,6 +275,8 @@ class Funding: # We find the best measure for GBIS recommended_measure = self.find_best_gbis_measure(measures=valid_measures) return self.output( + scheme="gbis", + eligible=True, measure_types=[recommended_measure["measure_type"]], estimated_funding=recommended_measure["estimated_funding"], notify_tenant_benefits_requirements=True, @@ -260,6 +286,8 @@ class Funding: # Otherwise, no funding availability return self.output( + scheme="gbis", + eligible=False, measure_types=[], estimated_funding=0, notify_tenant_benefits_requirements=False, @@ -279,6 +307,23 @@ class Funding: raise NotImplementedError("Implement social/oo") + def whlg(self): + if self.tenure == "Social": + # We can't do anything for social housing + self.whlg_eligibility = self.output( + scheme="whlg", + eligible=False, + measure_types=[], + estimated_funding=0, + notify_tenant_benefits_requirements=False, + notify_council_tax_band_requirements=False, + notify_tenant_low_income_requirements=False + ) + return + + if not self.whlg_eligible_postcodes.empty: + print("Eligible implement me!") + def eco4(self): if self.tenure == "Private": self.eco4_eligibiltiy = self.eco4_prs() @@ -292,4 +337,4 @@ class Funding: self.gbis() # self.eco4() - # self.whlg() + self.whlg() diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 1b72e10e..04a2ef7f 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -388,15 +388,26 @@ def extract_property_request_data( return patch, property_already_installed, property_non_invasive_recommendations, property_valution -def get_eco_project_scores_matrix(): - data = read_csv_from_s3( +def get_funding_data(): + """ + This function retrieves the eco project scores matrix and the warm homes local grant funding data + :return: + """ + 
project_scores_matrix = read_csv_from_s3( bucket_name=get_settings().DATA_BUCKET, filepath="funding/ECO4 Full Project Scores Matrix.csv", ) - df = pd.DataFrame(data) - df.columns = ['Floor Area Segment', 'Starting Band', 'Finishing Band', 'Cost Savings'] - df["Cost Savings"] = df["Cost Savings"].astype(float) - return df + project_scores_matrix = pd.DataFrame(project_scores_matrix) + project_scores_matrix.columns = ['Floor Area Segment', 'Starting Band', 'Finishing Band', 'Cost Savings'] + project_scores_matrix["Cost Savings"] = project_scores_matrix["Cost Savings"].astype(float) + + whlg_eligible_postcodes = read_csv_from_s3( + bucket_name=get_settings().DATA_BUCKET, + filepath="funding/whlg eligible postcodes.csv", + ) + whlg_eligible_postcodes = pd.DataFrame(whlg_eligible_postcodes) + + return project_scores_matrix, whlg_eligible_postcodes router = APIRouter( @@ -544,7 +555,7 @@ async def trigger_plan(body: PlanTriggerRequest): logger.info("Reading in materials and cleaned datasets") materials = get_materials(session) cleaned = get_cleaned() - eco_project_scores_matrix = get_eco_project_scores_matrix() + eco_project_scores_matrix, whlg_eligible_postcodes = get_funding_data() kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True) @@ -688,9 +699,7 @@ async def trigger_plan(body: PlanTriggerRequest): # Insert the predictions into the recommendations and run the optimiser # TODO: If a recommendation has a negative impact on SAP, we should remove it - this seems to have become a - # possibility with heating system - # TODO: After optimising, if there are any cheap, quick win measures (e.g. insulate water tank with hot water - # cylinder jacket), we should add these to the recommendations as default + # possibility with heating system? 
for p in input_properties: if not recommendations.get(p.id): @@ -802,21 +811,23 @@ async def trigger_plan(body: PlanTriggerRequest): # Funding # ~~~~~~~~~~~~~~~~ - # for p in input_properties: - # funding_calulator = Funding( - # tenure=body.housing_type, - # starting_epc=p.data["current-energy-rating"], - # starting_sap=int(p.data["current-energy-efficiency"]), - # floor_area=p.floor_area, - # council_tax_band=None, # This is seemingly always None at the moment - # property_recommendations=recommendations[p.id], - # project_scores_matrix=eco_project_scores_matrix, - # gbis_abs_rate=20, - # eco4_abs_rate=20, - # ) - # funding_calulator.check_eligibiltiy() - # # Insert finding - # p.insert_funding(funding_calulator) + for p in input_properties: + funding_calulator = Funding( + tenure=body.housing_type, + starting_epc=p.data["current-energy-rating"], + starting_sap=int(p.data["current-energy-efficiency"]), + postcode=p.postcode, + floor_area=p.floor_area, + council_tax_band=None, # This is seemingly always None at the moment + property_recommendations=recommendations[p.id], + project_scores_matrix=eco_project_scores_matrix, + whlg_eligible_postcodes=whlg_eligible_postcodes, + gbis_abs_rate=20, + eco4_abs_rate=15, + ) + funding_calulator.check_eligibiltiy() + # Insert finding + p.insert_funding(funding_calulator) logger.info("Uploading recommendations to the database") # If we have any work to do, we create a new scenario diff --git a/etl/funding/app.py b/etl/funding/app.py new file mode 100644 index 00000000..fba48ca4 --- /dev/null +++ b/etl/funding/app.py @@ -0,0 +1,35 @@ +""" +This scipt prepares the data, required for us to perform funding calculations. 
The starting data should be stored +on the machine this is being run on, and this will prepare the information and upload if +""" +import pandas as pd +from utils.s3 import save_csv_to_s3 + +STAGE = "dev" +DATA_BUCKET = "retrofit-data-{stage}" +PROJECTS_SCORES_MATRIX_LOCATION = "/Users/khalimconn-kowlessar/Downloads/ECO4 Full Project Scores Matrix.csv" +WHLG_ELIGIBLE_POSTCODES = "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx" + + +def app(): + # Read in the project scores matrix + project_scores_matrix = pd.read_csv(PROJECTS_SCORES_MATRIX_LOCATION) + + # Store in AWS S3 + save_csv_to_s3( + dataframe=project_scores_matrix, + bucket_name=DATA_BUCKET.format(stage=STAGE), + file_name="funding/ECO4 Full Project Scores Matrix.csv" + ) + + # Read in the Warm Homes Local Grant eligible postcodes data + whlg_eligible_postcodes = pd.read_excel(WHLG_ELIGIBLE_POSTCODES, sheet_name="Eligible postcodes", header=1) + # We tidy up the data before we store + whlg_eligible_postcodes = whlg_eligible_postcodes[["Postcode"]] + whlg_eligible_postcodes["Postcode"] = whlg_eligible_postcodes["Postcode"].str.lower() + + save_csv_to_s3( + dataframe=whlg_eligible_postcodes, + bucket_name=DATA_BUCKET.format(stage=STAGE), + file_name="funding/whlg eligible postcodes.csv" + ) From 8922fc7b8fd84e582d3a464a9764c49881512db8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 24 Jan 2025 13:06:47 +0000 Subject: [PATCH 146/255] adjusted % of roof covered with panels for buildings --- recommendations/SolarPvRecommendations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index ed5554dc..95f189d3 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -103,7 +103,10 @@ class SolarPvRecommendations: for rank, recommendation_config in best_configurations.iterrows(): # If we dont have the panneled_roof_area in the 
recommendation_config we calculate it if recommendation_config.get("panneled_roof_area", None): - roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / total_roof_area * 100) + # We spread the coverage across the individual units + roof_coverage_percent = round( + ((recommendation_config["panneled_roof_area"] / total_roof_area) * 100) / n_units + ) else: raise Exception("IMPLEMENT ME") From 36bb4b0f275b402e7806f01cde788676e7090bd3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Jan 2025 15:10:23 +0000 Subject: [PATCH 147/255] pulled data needed for stonewater --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/access_reporting/app.py | 46 ++ .../stonewater/Wave 3 Preparation.py | 33 ++ etl/customers/stonewater/data_cleaning.py | 137 ++++++ .../stonewater/potential_eco_properties.py | 393 ++++++++++++------ .../whlg eligibile properties.py | 8 + 7 files changed, 495 insertions(+), 126 deletions(-) create mode 100644 etl/customers/stonewater/data_cleaning.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/access_reporting/app.py b/etl/access_reporting/app.py index 830f4370..8a8254a1 100644 --- a/etl/access_reporting/app.py +++ b/etl/access_reporting/app.py @@ -83,8 +83,11 @@ def api_call_decorator(func): results = [] page_size = kwargs.get('page_size', None) response_data = {} + n_calls = 0 while url: + logger.info("Making call for page: " + str(n_calls + 1)) + n_calls += 1 response = requests.request(http_method, url, headers=self.headers, json=data) # Handle the response @@ -93,6 +96,7 @@ def api_call_decorator(func): if page_size: results.extend(response_json.get('value', [])) url = response_json.get('@odata.nextLink', None) + logger.info(f"Next page URL: {url}") else: 
response_data = response_json # Capture the full response for consistency break @@ -270,6 +274,48 @@ class SharePointClient: return file_content + def download_sharepoint_folder(self, drive_id, folder_path, download_dir, excluded_file_types=None): + """ + Downloads all files in a SharePoint folder to the specified local directory. + + :param drive_id: The ID of the SharePoint drive. + :param folder_path: The path of the folder in SharePoint. + :param download_dir: The local directory to save the downloaded files. + :param excluded_file_types: A list of file types to exclude from download (default is None). + """ + + excluded_file_types = [] if excluded_file_types is None else excluded_file_types + + # Ensure the download directory exists + os.makedirs(download_dir, exist_ok=True) + + # List folder contents + folder_contents = self.list_folder_contents(drive_id, folder_path) + files = folder_contents.get('value', []) + + for item in files: + if item.get('folder'): # Check if it's a folder + # Recursively handle subfolders + subfolder_path = f"{folder_path}/{item['name']}" + subfolder_dir = os.path.join(download_dir, item['name']) + self.download_sharepoint_folder(drive_id, subfolder_path, subfolder_dir) + else: + # It's a file, download it + file_name = item['name'] + if file_name.split(".")[-1] in excluded_file_types: + continue + download_url = item['@microsoft.graph.downloadUrl'] + + logger.info(f"Downloading file: {file_name}") + file_content = self.download_sharepoint_file(download_url) + + # Save the file locally + file_path = os.path.join(download_dir, file_name) + with open(file_path, 'wb') as f: + f.write(file_content.read()) + + logger.info(f"File saved to: {file_path}") + def app(): # Customers for WC 18/11/2024 diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0f757f7b..8538188b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py 
@@ -2905,5 +2905,38 @@ def identify_incorrect_packages(): os.path.join(CUSTOMER_FOLDER_PATH, "Units with assigned packages - with flags.csv"), index=False ) + +def revised_model(): + """ + This function implements the revised model for Stonewater, where we are looking at new priority postcodes + This work was undertaken in January 2021. + """ + + # 1) Create the new list of properties + + new_priority_postcodes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 " + "priority list.xlsx" + ) + + original_archetypes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] + original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] + original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + + original_archetypes = original_archetypes[ + ["Address ID", "Archetype ID", ""] + ] + + # Check if we have all of the addresses + missed = original_archetypes[ + ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) + ]["Archetype ID"].unique() + assert + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py new file mode 100644 index 00000000..8751960c --- /dev/null +++ b/etl/customers/stonewater/data_cleaning.py @@ -0,0 +1,137 @@ +import os +import shutil +from tqdm import tqdm + + +def delete_large_files(): + """ + This function deletes photos, designs and other files which we don't need + :return: + """ + + folder_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys" + + # List the contents of this folder since in each sub-folder we have the property folders + contents = 
os.listdir(folder_path) + + for subfolder in contents: + if not os.path.isdir(os.path.join(folder_path, subfolder)): + continue + subfolder_path = os.path.join(folder_path, subfolder) + # List the contents + property_folders = os.listdir(subfolder_path) + + for property in tqdm(property_folders): + # Check if it's a directory + if not os.path.isdir(os.path.join(subfolder_path, property)): + continue + + property_path = os.path.join(subfolder_path, property) + property_contents = os.listdir(property_path) + # We delete the contents of the following folders: + # '1. RA Property Pics' + # '4. Air Tightness Tests' + # '5. RD Design Info' + for folder_to_delete in ["1. RA Property Pics", "4. Air Tightness Tests", "5. RD Design Info", + "1. RA Property PIcs", "Post EPC Photos", "4. RD Design Info", + "5. Installer Info", "6. Trustmark lodgement", "7.Post Install Inspection Photos", + "6. Trustmark Lodgement", "7. Post Inspection Photos"]: + if folder_to_delete not in property_contents: + continue + folder_to_delete_path = os.path.join(property_path, folder_to_delete) + if os.path.isdir(folder_to_delete_path): + # Delete the folder, even if it's not empty + shutil.rmtree(folder_to_delete_path) + + # We now check the '2. RA Coordinator Info' folder for any .MOV files and delete them + if "2. RA Coordinator Info" not in property_contents: + coordinator_folder = "1. RA Coordinator Info" + else: + coordinator_folder = "2. 
RA Coordinator Info" + coordinator_info_path = os.path.join(property_path, coordinator_folder) + coordinator_info_contents = os.listdir(coordinator_info_path) + # Look for .MOV files and .jpg files + for file in coordinator_info_contents: + if file.endswith(".MOV"): + os.remove(os.path.join(coordinator_info_path, file)) + + if file.endswith(".jpg"): + os.remove(os.path.join(coordinator_info_path, file)) + + if "Property Pics" in coordinator_info_contents: + # Delete folder and contents + shutil.rmtree(os.path.join(coordinator_info_path, "Property Pics")) + + +def download_data_from_sharepoint(): + # Given a sharepoint location, this function will download the retrofit assessment folders from the locations + # specified in the sharepoint location + from etl.access_reporting.app import SharePointClient + + sharepoint_client = SharePointClient( + tenant_id="10d5af8b-2cfd-4882-9ccd-b96e4812dacf", + client_id="6832a4c5-fb8c-4082-a746-4f51e1020f0d", + client_secret="xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ", + site_id="bc925a9a-ad0b-4de9-9a3c-e61014cc7489" + ) + + # Retrieve the data from Sharepoint and write to local machine + contents = sharepoint_client.list_folder_contents( + drive_id=sharepoint_client.document_drive["id"], + folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + ) + + len(contents["value"]) + folders_to_pull = [ + folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. 
Coventry"] + ] + for folder_to_pull in folders_to_pull: + # Get the contents + folder_contents = sharepoint_client.list_folder_contents( + drive_id=sharepoint_client.document_drive["id"], + folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" + + folder_to_pull["name"], + page_size=100 + ) + + property_folders = [f for f in folder_contents["value"]] + + for property_folder in property_folders: + # We go into each property folder and get the contents + property_folder_contents = sharepoint_client.list_folder_contents( + drive_id=sharepoint_client.document_drive["id"], + folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" + + folder_to_pull["name"] + "/" + property_folder["name"] + ) + # We look for the retrofit assessment folder: + property_sub_folders = [ + f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() + ] + + if not property_sub_folders: + continue + + # if we have this, we download the folder and store it on my laptop! 
+ property_sub_folder = property_sub_folders[0] + + property_folder_path = os.path.join( + "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) + + download_dir = os.path.join( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) + + # We download the folder + sharepoint_client.download_sharepoint_folder( + drive_id=sharepoint_client.document_drive["id"], + folder_path=property_folder_path, + download_dir=download_dir, + excluded_file_types=["MOV"] + ) diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index c0301e9a..bda9c30c 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -7,6 +7,8 @@ from tqdm import tqdm from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from utils.s3 import read_from_s3, read_pickle_from_s3 +import msoffcrypto +from io import BytesIO load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -64,6 +66,28 @@ def app(): This code creates a list of cavity properties, for review """ + # Read in the password protected master + # TODO: This file should be deleted! 
+ + # Path to the password-protected Excel file + file_path = ("/Users/khalimconn-kowlessar/Downloads/STONEWATER MASTER SHEET - UPDATED 20.5.24 - K- PASSWORD " + "PROTECTED.xlsx") + password = "STONE123" # Replace with the actual password + + # Open the file and decrypt it + with open(file_path, "rb") as f: + decrypted_file = BytesIO() + office_file = msoffcrypto.OfficeFile(f) + office_file.load_key(password=password) + office_file.decrypt(decrypted_file) + + # Read the decrypted file into a DataFrame + eco_rolling_master = pd.read_excel(decrypted_file, sheet_name="Sheet1", engine="openpyxl") + + eco_rolling_master = eco_rolling_master[ + ~eco_rolling_master['INSTALL/CANCELLATION DATE'].str.contains("CANCELLED") + ] + archetyped_properties = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 - " "Archetyped V3.1.xlsx", @@ -116,13 +140,16 @@ def app(): features_to_merge = features[ [ - "Address ID", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating", "Main Fuel", "Hot Water", + "Address ID", "Organisation Reference", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating", + "Main Fuel", + "Hot Water", "Renewables", "Total Floor Area" ] ] stonewater_cavity_properties = archetyped_properties[ - ["Name", "Postcode", "Osm. ID", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no", "Street name", + ["Name", "Postcode", "Osm. ID", "Org. 
ref.", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no", + "Street name", "Address line 2", "City/Town", "Is Cavity Property", "Survey shows CWI needed for Archetype"] ].merge( features_to_merge, how="left", on="Address ID" @@ -166,77 +193,137 @@ def app(): stonewater_cavity_properties["Reason Included"] ) + # We flag units that were installed under ECO3 + numeric_ids = eco_rolling_master[eco_rolling_master["STONEWATER UPRN"] != "NOT ON ASSET LIST"] + numeric_ids = numeric_ids[~pd.isnull(numeric_ids["STONEWATER UPRN"])] + numeric_ids["STONEWATER UPRN"] = numeric_ids["STONEWATER UPRN"].astype(int) + + stonewater_cavity_properties["Installed under ECO3"] = stonewater_cavity_properties["Org. ref."].isin( + numeric_ids['STONEWATER UPRN'].values + ) + + # Which postcodes were installed under ECO3 + priority_list_eco3 = stonewater_cavity_properties[ + stonewater_cavity_properties["Installed under ECO3"] + ]["Postcode"].unique() + + # These are properties that were not installed under ECO3, that have the same postcodes as properties + # installed under ECO3 + + # These are 66 properties we might want to start with as an immediate priority + stonewater_cavity_properties["Same Postcode as Installed under ECO3"] = ( + ~stonewater_cavity_properties["Installed under ECO3"] & ( + stonewater_cavity_properties["Postcode"].isin(priority_list_eco3) + ) + ) + # We get the EPC data - epc_data = json.loads( - read_from_s3( - bucket_name="retrofit-data-dev", - s3_file_name="customers/Stonewater/clustering/epc_data.json" - ) - ) - epc_data = pd.DataFrame(epc_data) - - epc_data["uprn"] = np.where( - epc_data["internal_id"] == 1091, - 83143766, - epc_data["uprn"] - ) - - epc_data_batch_2 = read_pickle_from_s3( - s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", - bucket_name="retrofit-data-dev" - ) - epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) - - complete_epcs = pd.concat([epc_data, epc_data_batch_2]) - - epcs_to_merge = complete_epcs[ - [ - "uprn", - 
"address", - "postcode", - "property-type", - "built-form", - "inspection-date", - "current-energy-rating", - "current-energy-efficiency", - "roof-description", - "walls-description", - "transaction-type", - "secondheat-description", - "total-floor-area", - "construction-age-band", - "floor-height", - "number-habitable-rooms", - "mainheat-description", - "energy-consumption-current" - ] - ].rename( - columns={ - "address": "Address", - "postcode": "Postcode", - "inspection-date": "Date of last EPC", - "current-energy-efficiency": "SAP score on register", - "current-energy-rating": "EPC rating on register", - "property-type": "Property Type", - "built-form": "Archetype", - "total-floor-area": "Property Floor Area", - "construction-age-band": "Property Age Band", - "floor-height": "Property Floor Height", - "number-habitable-rooms": "Number of Habitable Rooms", - "walls-description": "Wall Construction", - "roof-description": "Roof Construction", - "mainheat-description": "Heating Type", - "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)", - } - ) - # We de-dupe, taking the newest on the date the EPC was lod - epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"]) - epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) - epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") + # epc_data = json.loads( + # read_from_s3( + # bucket_name="retrofit-data-dev", + # s3_file_name="customers/Stonewater/clustering/epc_data.json" + # ) + # ) + # epc_data = pd.DataFrame(epc_data) + # + # epc_data["uprn"] = np.where( + # epc_data["internal_id"] == 1091, + # 83143766, + # epc_data["uprn"] + # ) + # + # epc_data_batch_2 = read_pickle_from_s3( + # s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + # bucket_name="retrofit-data-dev" + # ) + # epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) + # + # 
complete_epcs = pd.concat([epc_data, epc_data_batch_2]) + # + # epcs_to_merge = complete_epcs[ + # [ + # "uprn", + # "address", + # "postcode", + # "property-type", + # "built-form", + # "inspection-date", + # "current-energy-rating", + # "current-energy-efficiency", + # "roof-description", + # "walls-description", + # "transaction-type", + # "secondheat-description", + # "total-floor-area", + # "construction-age-band", + # "floor-height", + # "number-habitable-rooms", + # "mainheat-description", + # "energy-consumption-current" + # ] + # ].rename( + # columns={ + # "address": "Address", + # "postcode": "Postcode", + # "inspection-date": "Date of last EPC", + # "current-energy-efficiency": "SAP score on register", + # "current-energy-rating": "EPC rating on register", + # "property-type": "Property Type", + # "built-form": "Archetype", + # "total-floor-area": "Property Floor Area", + # "construction-age-band": "Property Age Band", + # "floor-height": "Property Floor Height", + # "number-habitable-rooms": "Number of Habitable Rooms", + # "walls-description": "Wall Construction", + # "roof-description": "Roof Construction", + # "mainheat-description": "Heating Type", + # "secondheat-description": "Secondary Heating", + # "transaction-type": "Reason for last EPC", + # "energy-consumption-current": "Heat Demand (kWh/m2)", + # } + # ) + # # We de-dupe, taking the newest on the date the EPC was lod + # epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"]) + # epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) + # epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str) + stonewater_cavity_properties["Reason Included"].value_counts() + # Find the postcodes where an Osmosis survey revealed a need for CWI + postcodes_found_needing_cwi = stonewater_cavity_properties[ + stonewater_cavity_properties["Reason 
Included"].isin( + [ + "Survey revealed potential need for CWI or extract and re-fill", + "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property", + "Survey showed this property needs CWI", + "Survey showed this property could need extract and re-fill" + ] + ) + ]["Postcode"].unique() + + stonewater_cavity_properties["Suspected Needs CWI - not surveyed"] = ( + ( + stonewater_cavity_properties[ + "Postcode"].isin( + postcodes_found_needing_cwi) + ) & ( + ~stonewater_cavity_properties[ + "Reason Included"].isin( + [ + "Survey revealed potential need " + "for CWI or extract and re-fill", + "Surveyed revealed potential " + "need for CWI or extract and " + "re-fill and is an as built " + "cavity property", + "Survey showed this property " + "needs CWI", + "Survey showed this property " + "could need extract and re-fill" + ] + ) + ) + ) # Merge the EPCs on, with the data we need stonewater_cavity_properties = stonewater_cavity_properties.rename( @@ -252,12 +339,12 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ).merge( - epcs_to_merge, - how="left", - left_on="UPRN", - right_on="uprn" - ) + ) # .merge( + # epcs_to_merge, + # how="left", + # left_on="UPRN", + # right_on="uprn" + # ) # We now flag the additional properties in the as built list @@ -288,8 +375,56 @@ def app(): additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID") additional_properties["row_id"] = additional_properties["Address ID"].copy() + # Flag any units in this list that were installed under ECO3 + additional_properties["Installed under ECO3"] = additional_properties["Organisation Reference"].isin( + numeric_ids['STONEWATER UPRN'].values + ) + + # Additional list ECO3 + additional_list_eco3 = additional_properties[additional_properties["Installed under ECO3"]]["Postcode"].unique() + + # These are properties that were not installed under ECO3, that have the same 
postcodes as properties + # installed under ECO3 + # These are 297 properties we might want to start with as an immediate priority + additional_properties["Same Postcode as Installed under ECO3"] = ( + ~additional_properties["Installed under ECO3"] & ( + additional_properties["Postcode"].isin(additional_list_eco3) + ) + ) + + # We do some additional manual checks, for ECO3 properties that were installed that didn't get matched to either + # dataaset + numeric_ids["In asset list"] = numeric_ids["STONEWATER UPRN"].isin( + stonewater_cavity_properties['Org. ref.'].astype(int).values + ) + numeric_ids["In asset list"] = numeric_ids["In asset list"] | ( + numeric_ids["STONEWATER UPRN"].isin( + additional_properties['Organisation Reference'].astype(int).values + ) + ) + + # eco3_installs_not_in_asset_list = numeric_ids[~numeric_ids["In asset list"]] + # # We now take samples of properties randomly and manually check the ID against the asset list + # print(eco3_installs_not_in_asset_list.sample(1)[["STONEWATER UPRN", "Post Code", "NO ", "Street / Block Name", ]]) + # # Checked STONEWATER UPRN + # # 9862, BH15 1NR, 33, THE QUAY FOYER [x] + # # 12785, S01 66PN, 57, SEACOLE GARDENS [x] + # # 26071, MK42 0TE, 51, De Havilland Avenue, Shortstown [x] + # # 18213, HR6 9UW, 20 Ford Street [x] + # # 24344, LU4 9FF, 6 SEAL CLOSE [x] + # # 31222, SN14 0QZ, 7 HARDBROOK COURT [x] + # # 9343, SP4 7XL, 10 OAK PLACE [x] + # # 34730, LU5 5TN, 4 TUDOR DRIVE [x] + # # 7021, BN27 2BZ, 32 BUTTS FIELD [] + # + # stonewater_cavity_properties[stonewater_cavity_properties['Org. 
ref.'] == 7021] + # stonewater_cavity_properties[stonewater_cavity_properties['Postcode'] == "BN27 2BZ"]["Name"] + # + # additional_properties[additional_properties['Organisation Reference'] == 7021] + # additional_properties[additional_properties['Postcode'] == "BN27 2BZ"][["Address"]] + # Pull the EPCs for these properties - additional_properties_epcs, errors = get_data(additional_properties) + # additional_properties_epcs, errors = get_data(additional_properties) # Save this data as a pickle # import pickle @@ -297,12 +432,20 @@ def app(): # "wb") as f: # pickle.dump(additional_properties_epcs, f) + additional_properties["Suspected Needs CWI - not surveyed"] = ( + ( + additional_properties["Postcode"].isin(postcodes_found_needing_cwi) + ) + ) + + additional_properties["Same Postcode as Installed under ECO3"].value_counts() + # We drop Full Address additional_properties = additional_properties.drop(columns=["Full Address"]) additional_properties2 = additional_properties[[ - "row_id", "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", - "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", - + "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", + "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3', + 'Same Postcode as Installed under ECO3' ]].rename( columns={ "SAP": "Parity - Predicted SAP", @@ -318,56 +461,58 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ).merge( - pd.DataFrame(additional_properties_epcs)[ - [ - "row_id", - "property-type", - "built-form", - "inspection-date", - "current-energy-rating", - "current-energy-efficiency", - "roof-description", - "walls-description", - "transaction-type", - "secondheat-description", - "total-floor-area", - "construction-age-band", - "floor-height", - "number-habitable-rooms", - "mainheat-description", - 
"energy-consumption-current" - ] - ].rename( - columns={ - "inspection-date": "Date of last EPC", - "current-energy-efficiency": "SAP score on register", - "current-energy-rating": "EPC rating on register", - "property-type": "Property Type", - "built-form": "Archetype", - "total-floor-area": "Property Floor Area", - "construction-age-band": "Property Age Band", - "floor-height": "Property Floor Height", - "number-habitable-rooms": "Number of Habitable Rooms", - "walls-description": "Wall Construction", - "roof-description": "Roof Construction", - "mainheat-description": "Heating Type", - "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)", - } - ), - how="left", - on="row_id" - ) + ) # .merge( + # pd.DataFrame(additional_properties_epcs)[ + # [ + # "row_id", + # "property-type", + # "built-form", + # "inspection-date", + # "current-energy-rating", + # "current-energy-efficiency", + # "roof-description", + # "walls-description", + # "transaction-type", + # "secondheat-description", + # "total-floor-area", + # "construction-age-band", + # "floor-height", + # "number-habitable-rooms", + # "mainheat-description", + # "energy-consumption-current" + # ] + # ].rename( + # columns={ + # "inspection-date": "Date of last EPC", + # "current-energy-efficiency": "SAP score on register", + # "current-energy-rating": "EPC rating on register", + # "property-type": "Property Type", + # "built-form": "Archetype", + # "total-floor-area": "Property Floor Area", + # "construction-age-band": "Property Age Band", + # "floor-height": "Property Floor Height", + # "number-habitable-rooms": "Number of Habitable Rooms", + # "walls-description": "Wall Construction", + # "roof-description": "Roof Construction", + # "mainheat-description": "Heating Type", + # "secondheat-description": "Secondary Heating", + # "transaction-type": "Reason for last EPC", + # "energy-consumption-current": "Heat Demand 
(kWh/m2)", + # } + # ), + # how="left", + # on="row_id" + # ) # We save the data locally stonewater_cavity_properties.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties.csv", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " + "postcodes.csv", index=False ) additional_properties2.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties.csv", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " + "non-priority postcodes.csv", index=False ) # Save the survey findings diff --git a/etl/customers/waltham_forest/whlg eligibile properties.py b/etl/customers/waltham_forest/whlg eligibile properties.py index fee988c1..9e1949f7 100644 --- a/etl/customers/waltham_forest/whlg eligibile properties.py +++ b/etl/customers/waltham_forest/whlg eligibile properties.py @@ -44,6 +44,10 @@ epc_data["has_conservation_restrictions"] = ( | (epc_data["is_heritage_building"] == True) ) +whlg_eligible_postcodes["Local Authority"].value_counts() + +whlg_eligible_postcodes = whlg_eligible_postcodes[whlg_eligible_postcodes["Local Authority"] == "Waltham Forest"] + # Pathway 1: # Match based on eligible postcodes pathway1 = epc_data[epc_data["postcode"].isin(whlg_eligible_postcodes["Postcode"].values)] @@ -67,6 +71,10 @@ pathway1["EPC Date"] = pd.to_datetime(pathway1["EPC Date"]).dt.strftime("%Y-%m-% # Create a year EPC was lodged pathway1["EPC Year"] = pd.to_datetime(pathway1["EPC Date"]).dt.year +low_epc = pathway1[pathway1["EPC Rating"].isin(["F", "G"])] +low_epc["EPC Rating"].value_counts() +low_epc.tail(1)[["address", "postcode"]] + pathway1.to_csv( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Waltham Forest WHLG - Pathway 1 Eligibility.csv", index=False From 11a4bc24a1903f4f384aef48fd006ca8c17c28e8 Mon Sep 17 00:00:00 2001 From: Khalim 
Conn-Kowlessar Date: Tue, 28 Jan 2025 17:42:19 +0000 Subject: [PATCH 148/255] anonymised sharepoint keys --- .../panacap_ventures/sample_remote_assessments.py | 1 + etl/customers/stonewater/Wave 3 Preparation.py | 1 - etl/customers/stonewater/data_cleaning.py | 15 ++++++++++----- 3 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 etl/customers/panacap_ventures/sample_remote_assessments.py diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py new file mode 100644 index 00000000..1a5ddff7 --- /dev/null +++ b/etl/customers/panacap_ventures/sample_remote_assessments.py @@ -0,0 +1 @@ +# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8538188b..b1bf0638 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2936,7 +2936,6 @@ def revised_model(): missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) ]["Archetype ID"].unique() - assert # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 8751960c..7ee06fcd 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -1,6 +1,7 @@ import os import shutil from tqdm import tqdm +from etl.access_reporting.app import SharePointClient def delete_large_files(): @@ -66,13 +67,17 @@ def delete_large_files(): def download_data_from_sharepoint(): # Given a sharepoint location, this function will download the retrofit assessment folders from the locations # specified in the sharepoint location - from etl.access_reporting.app import SharePointClient + + SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) 
+ SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) + OSMOSIS_SHAREPOINT_SITE_ID = os.getenv("OSMOSIS_SHAREPOINT_SITE_ID", None) sharepoint_client = SharePointClient( - tenant_id="10d5af8b-2cfd-4882-9ccd-b96e4812dacf", - client_id="6832a4c5-fb8c-4082-a746-4f51e1020f0d", - client_secret="xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ", - site_id="bc925a9a-ad0b-4de9-9a3c-e61014cc7489" + tenant_id=SHAREPOINT_TENANT_ID, + client_id=SHAREPOINT_CLIENT_ID, + client_secret=SHAREPOINT_CLIENT_SECRET, + site_id=OSMOSIS_SHAREPOINT_SITE_ID ) # Retrieve the data from Sharepoint and write to local machine From 86deed8115c8b630ca5516f113ec5beb585460e0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Jan 2025 18:13:07 +0000 Subject: [PATCH 149/255] setting up the stonewater assessment extraction process --- .../sample_remote_assessments.py | 1 - .../stonewater/Wave 3 Preparation.py | 116 +++++++++++++++++- 2 files changed, 112 insertions(+), 5 deletions(-) delete mode 100644 etl/customers/panacap_ventures/sample_remote_assessments.py diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py deleted file mode 100644 index 1a5ddff7..00000000 --- a/etl/customers/panacap_ventures/sample_remote_assessments.py +++ /dev/null @@ -1 +0,0 @@ -# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b1bf0638..105628e9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2928,14 +2928,122 @@ def revised_model(): original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) - 
original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", ""] - ] - # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) ]["Archetype ID"].unique() + assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} + + original_archetypes = original_archetypes[ + ["Address ID", "Archetype ID", "Archetype Group Rank"] + ] + + # Merge these archetypes on to the new priority postcodes + new_priority_postcodes = new_priority_postcodes.merge( + original_archetypes, how="left", on="Address ID" + ) + + # Basic check, should have no rows with missing Archetype ID, where + assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin( + original_archetypes["Address ID"] + ).sum()) == 0 + + # We pull together the survey data sheet + survey_folders = [] + + # Loop over each survey folder and list its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + wave_21_folders = [ + "1. Herefordshire", + "2. Bedfordshire", + "3. Wiltshire", + "4. Bournemouth", + "5. Coventry", + "6. West Sussex", + "7. Dorset", + "8. Cambridgeshire", + "9. Guildford", + "10. Little Island", + "11. 
CCS Dorset" + ] + + for wave_2_1_folder in wave_21_folders: + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder) + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in + os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + # We now do a large pull of all of the data + extracted_data = [] + for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + + # List the folders inside of the survey folder + survey_subfolders = [ + name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name)) + ] + + # Check if there's a "retrofit assessment" folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + + # If retrofit assessment folder exists, check if it has content + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) + if 
summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + continue + else: + # Then we have an empty Retrofit Assessment folder + continue + + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + + retrofit_assessment_data = pd.DataFrame(extracted_data) + # TODO - Save this data + # if __name__ == "__main__": # main() From ca7a0e9d107c7da66fd7a8d5066834b7dbf00978 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Jan 2025 22:15:53 +0000 Subject: [PATCH 150/255] debugging extract epr for old elmhurst epr --- .../stonewater/Wave 3 Preparation.py | 29 +++++++++++++++++-- etl/route_march_data_pull/app.py | 18 +++++------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 105628e9..ee314f17 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -747,12 +747,30 @@ def extract_epr(pdf_path): # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) - current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) - data["Current SAP Rating"] = current_sap + if sap_match is None: + # Handles the older format of the elmhurst EPR + # The text will look something like this: + # Least energy efficient - higher running costsD 61 - we extract D 61 + sap_match = re.search( + r"(?P[A-G])\s(?P\d{1,3})(?P[A-G])\s(?P\d{1,3})", + text) + 
data["Current EPC Band"] = sap_match.group("current_epc") + data["Current SAP Rating"] = int(sap_match.group("current_sap")) + else: + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap # Extract the primary energy use intensity additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) - data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + if additional_rating_match: + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + else: + # Handles the older format of the Elmhurst EPR + primary_energy_match = re.search(r"actual consumption\.\n(?P\d+)", text) + data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy")) + # We calculate the primary energy use intensity by dividing by floor area + floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") + data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) # Extract Number of Storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -2983,8 +3001,13 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + # Check that the survey folder is actually a folder + if not os.path.isdir(survey_folder_path): + continue + # List the folders inside of the survey folder survey_subfolders = [ name for name in os.listdir(survey_folder_path) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 8d19aa84..247ce98c 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -162,19 +162,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern" - DATA_FILENAME = "January 2025 Additions 
Query.xlsx" - SHEET_NAME = "Jan 2025 additions" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing" + DATA_FILENAME = "For Housing Data pull.xlsx" + SHEET_NAME = "Sheet1" POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Street / Block Name" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" - ADDRESS_COLS_TO_CONCAT = [] + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "NO." + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"] # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = { - "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560 - } + MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() From fd98721748c9da95c3660116f33b6aa00d1be01f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 15:24:02 +0000 Subject: [PATCH 151/255] debugging epr extraction when the dimensions are external --- etl/customers/stonewater/Wave 3 Preparation.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ee314f17..4db089e7 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -465,7 +465,11 @@ def extract_building_parts_summary(text): r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL ) if not dimensions_section: - raise ValueError("Failed to locate dimensions section in the text.") + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") dimensions_text = dimensions_section.group(1) @@ -898,11 +902,18 @@ def 
detect_report_type(pdf_path, pdf_file): """ # Attempt to read the first page of the PDF to determine type with open(pdf_path, "rb") as file: + # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter + # This is because the pdf is irregular. We could possibly try a library like fitz to handle this reader = PyPDF2.PdfReader(file) first_page_text = reader.pages[0].extract_text() if reader.pages else "" + n_pages = len(reader.pages) - if is_energy_report(first_page_text): + if is_energy_report(first_page_text) and n_pages > 3: + # The EPR should have more than 3 pages return "epr" + elif is_energy_report(first_page_text) and n_pages <= 3: + # This is a shortened version of the EPR which isn't massively useful + return "short_form_epr" elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): return "summary" elif is_condition_report(first_page_text): From 231069f4e3e4ca2a40e114db0963c55aa56b09b7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 20:37:06 +0000 Subject: [PATCH 152/255] matching algorithm wip --- .../stonewater/Wave 3 Preparation.py | 275 +++++++++++++++++- 1 file changed, 274 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4db089e7..904afd30 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3077,7 +3077,280 @@ def revised_model(): extracted_data.append(summary_data) retrofit_assessment_data = pd.DataFrame(extracted_data) - # TODO - Save this data + + # Remove some definite duplicates + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] + dupes = dupes.sort_values("Address") + # Get all of the folders that end with ROSS + to_drop = 
dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + + retrofit_assessment_data = retrofit_assessment_data[ + ~retrofit_assessment_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", + "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" + ] + to_drop + ) + ] + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + + # retrofit_assessment_data.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False + # ) + + # We can read in the data as needed + + # Next Step: Read in the coordinated measures and match to the extracted data + ############################################################ + # CCS + ############################################################# + ccs_coordination_sheet = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"), + header=4 + ) + ccs_coordination_sheet["contractor"] = "CCS" + # We split ccs into two sections - the first being + ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) + ccs_coordination_sheet = ccs_coordination_sheet.head(87) + ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) + + ############################################################ + # WATES + ############################################################# + wates_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" + ), + header=4 + ) + wates_coordination_sheet["contractor"] = "Wates" + # Break into the different sites: + # Wiltshire + wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267) + wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, 
:] + wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :] + wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :] + wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] + wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] + wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :] + wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972, :] + + wates_coordination = pd.concat( + [ + wates_coordination_sheet_wiltshere, + wates_coordination_sheet_herefordshire, + wates_coordination_sheet_coventry, + wates_coordination_sheet_bedfordshire, + wates_coordination_sheet_bournemouth, + wates_coordination_sheet_cambridgeshire, + wates_coordination_sheet_removed_from_programme, + wates_coordination_sheet_abeyance + ] + ) + + # Combine the data back + + ############################################################ + # NEW 450 COORDINATED RETROFIT ASSESSMENTS + ############################################################# + + retrofit_packages_board = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" + ), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # Take just the rows that have been surveyed + retrofit_packages_board = retrofit_packages_board[ + retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) + ] + + manual_filters = { + "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", + "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", + "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", + 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", + '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 
Raglan Court, Devizes Avenue-MK41 8QT', + '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', + 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', + 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB', + '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', + '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', + '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', + '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', + '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', + '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', + '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', + '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", + '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', + "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', + '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', + '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', + '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' + } + + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in manual_filters: + filtered 
= retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", + "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case wher some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + raise Exception("Fix me1") + # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + raise Exception("Fix me2") + # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + + if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] + + if filtered.empty: + continue + if filtered.shape[0] != 1: + raise Exception("something went wrong") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address 
ID": home["Address ID"], + "Name": home["Name"] + } + ) + matching_lookup = pd.DataFrame(matching_lookup) + + ccs_coordination = ccs_coordination.rename( + columns={"Post Code": "Postcode"} + ) + ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] + from fuzzywuzzy import fuzz + + ccs_manual_filters = {} + ccs_matching_lookup = [] + for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). 
+ str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = filtered["Address"].str.split(",").str[0:2].str.join("") == home["Name"] + + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 9 + ) + + if to_filter.sum() == 0: + blah + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + ccs_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID.1": home["Asset ID.1"], + "Name": home["Name"] + } + ) + continue + + blah2 + + # home["Name"] should be contained in the survey_folder + # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # # We have an edge case wher some properties have two outputs in Sharepoint + # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + # raise Exception("Fix me1") + # # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + # + # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + # raise Exception("Fix me2") + # # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + # + # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + # filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] + # + # if filtered.empty: + # continue + # if filtered.shape[0] != 1: + # raise Exception("something went wrong") + # + # matching_lookup.append( + # { + # "survey_folder": filtered["survey_folder"].values[0], + # "Address ID": home["Address ID"], + # "Name": home["Name"] + # } + # ) # if __name__ == "__main__": # main() From 7dd64781724df896badfd2170cba3ba5d2c283b9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 20:43:56 
+0000 Subject: [PATCH 153/255] Added more logic for matching --- etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 904afd30..ab640496 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3273,7 +3273,7 @@ def revised_model(): ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): # Handle the case that has the wrong postcode in the asset data - if home["Name"] in manual_filters: + if home["Name"] in ccs_manual_filters: filtered = retrofit_assessment_data[ retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] ].copy() @@ -3297,13 +3297,16 @@ def revised_model(): ) ) if to_filter.sum() == 0: - to_filter = filtered["Address"].str.split(",").str[0:2].str.join("") == home["Name"] - + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("") == home[ + "Name"] + if to_filter.sum() == 0: + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("") == home[ + "Name"] if to_filter.sum() == 0: # Do a fuzzy match on the name # Find the best filter - to_filter = filtered["Address"].str.split(",").str[0:2].str.join("").apply( - lambda x: fuzz.partial_ratio(home["Name"], x) > 9 + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 ) if to_filter.sum() == 0: From 0331d82f6ac687b55297e80f430a15fa148f5d67 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 20:55:36 +0000 Subject: [PATCH 154/255] added manual match --- .../stonewater/Wave 3 Preparation.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py 
b/etl/customers/stonewater/Wave 3 Preparation.py index ab640496..61344038 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3267,15 +3267,19 @@ def revised_model(): columns={"Post Code": "Postcode"} ) ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] + ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] from fuzzywuzzy import fuzz - ccs_manual_filters = {} + ccs_manual_filters = { + "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35" + } ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): + # Handle the case that has the wrong postcode in the asset data if home["Name"] in ccs_manual_filters: filtered = retrofit_assessment_data[ - retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]] ].copy() else: filtered = retrofit_assessment_data[ @@ -3297,11 +3301,15 @@ def revised_model(): ) ) if to_filter.sum() == 0: - to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("") == home[ - "Name"] + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) if to_filter.sum() == 0: - to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("") == home[ - "Name"] + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) if to_filter.sum() == 0: # Do a fuzzy match on the name # Find the best filter From 678a4b52d28194d1dcf7c2d86d3993dde0161f3f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:03:11 +0000 Subject: [PATCH 155/255] matching for all of ccs --- etl/customers/stonewater/Wave 3 Preparation.py | 13 +++++++++++++ 1 file 
changed, 13 insertions(+) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 61344038..fa548f0d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3316,6 +3316,19 @@ def revised_model(): to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( lambda x: fuzz.partial_ratio(home["Name"], x) > 93 ) + if to_filter.sum() == 0: + # We also some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) if to_filter.sum() == 0: blah From 7291f7128e6b5403132e5afdcc56330ea3d71f15 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:11:29 +0000 Subject: [PATCH 156/255] started wates matching --- .../stonewater/Wave 3 Preparation.py | 119 +++++++++++++----- 1 file changed, 91 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fa548f0d..cbbf04c6 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3331,7 +3331,7 @@ def revised_model(): ) if to_filter.sum() == 0: - blah + raise Exception("Error") filtered = filtered[to_filter] if filtered.empty: @@ -3347,34 +3347,97 @@ def revised_model(): ) continue - blah2 + raise Exception("No match") - # home["Name"] should be contained in the survey_folder - # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] - # # We have an edge 
case wher some properties have two outputs in Sharepoint - # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": - # raise Exception("Fix me1") - # # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] - # - # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - # raise Exception("Fix me2") - # # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] - # - # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': - # filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] - # - # if filtered.empty: - # continue - # if filtered.shape[0] != 1: - # raise Exception("something went wrong") - # - # matching_lookup.append( - # { - # "survey_folder": filtered["survey_folder"].values[0], - # "Address ID": home["Address ID"], - # "Name": home["Name"] - # } - # ) + ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup) + # We get a match for all records + assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] + assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() + + # We do the same for Wates + wates_coordination = wates_coordination.rename( + columns={"Post Code": "Postcode"} + ) + wates_coordination = wates_coordination[ + wates_coordination["Retrofit Assessment"].isin(["Completed"]) + ] + + wates_manual_filters = {} + wates_matching_lookup = [] + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in wates_manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and 
spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). + str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 + ) + if to_filter.sum() == 0: + # We also some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) + + if to_filter.sum() == 0: + raise Exception("Error") + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + raise Exception("No match") # if __name__ == "__main__": # main() From b1936521f6f3c3585057d5f2ce10d1998e558400 Mon Sep 17 00:00:00 2001 From: Khalim 
Conn-Kowlessar Date: Wed, 29 Jan 2025 21:16:18 +0000 Subject: [PATCH 157/255] added manual match --- etl/customers/stonewater/Wave 3 Preparation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index cbbf04c6..8a00604b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3362,7 +3362,9 @@ def revised_model(): wates_coordination["Retrofit Assessment"].isin(["Completed"]) ] - wates_manual_filters = {} + wates_manual_filters = { + "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View" + } wates_matching_lookup = [] for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): From 1814d7b6709cd7861db5c15ac6821a601708882e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 21:21:08 +0000 Subject: [PATCH 158/255] 11% through matching --- etl/customers/stonewater/Wave 3 Preparation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8a00604b..7cbf04f1 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3382,6 +3382,13 @@ def revised_model(): to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False ) + + if to_filter.sum() > 1: + to_filter = ( + filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() == + home["Name"].replace(r"[^\w\s]", "").lstrip().lower() + ) + if to_filter.sum() == 0: to_filter = ( filtered["survey_folder"]. 
From b4296db52d7b3c3e26ce3869ac31753bd731c379 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 00:51:39 +0000 Subject: [PATCH 159/255] adding quidos extraction functions --- .../stonewater/Wave 3 Preparation.py | 7 ++ survey_report/app.py | 44 +++++++++ .../extraction/detect_report_type.py | 19 ++++ survey_report/extraction/quidos.py | 99 +++++++++++++++++++ survey_report/requirements.txt | 0 5 files changed, 169 insertions(+) create mode 100644 survey_report/app.py create mode 100644 survey_report/extraction/detect_report_type.py create mode 100644 survey_report/extraction/quidos.py create mode 100644 survey_report/requirements.txt diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7cbf04f1..70c531c0 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3366,8 +3366,15 @@ def revised_model(): "24 Rabley Wood View": "Wave 2.1 Surveys/3. 
Wiltshire/24-25 Rabley Wood View" } wates_matching_lookup = [] + # Examples to skip when we cannot get the data + wates_to_skip = [ + "66 Abbatt Close", # File type is unusual, couldn't extract the data + ] for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + if home["Name"] in wates_to_skip: + continue + # Handle the case that has the wrong postcode in the asset data if home["Name"] in wates_manual_filters: filtered = retrofit_assessment_data[ diff --git a/survey_report/app.py b/survey_report/app.py new file mode 100644 index 00000000..825a3658 --- /dev/null +++ b/survey_report/app.py @@ -0,0 +1,44 @@ +import os +import PyPDF2 +from survey_report.extraction.detect_report_type import detect_report_type +from survey_report.extraction.quidos import SiteNotesExtractor + + +def handle(): + """ + Performs the data extraction process for the survey report + :return: + """ + + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2" + + folder_contents = os.listdir(data_folder) + # We look for the following files: + # Site notes + file_mapping = {} + for file in folder_contents: + # Check if it's a pdf file + if not file.endswith(".pdf"): + continue + filepath = os.path.join(data_folder, file) + with (open(filepath, "rb") as f): + pdf = PyPDF2.PdfReader(f) + first_page = pdf.pages[0].extract_text() + text = "" + for page in pdf.pages: + text += page.extract_text() + + # Check the report type + report_type = detect_report_type(first_page) + if report_type is not None: + file_mapping[report_type] = text + + # Check the report type + report_type = detect_report_type(os.path.join(data_folder, file)) + + # This is only set up to work with quido site notes so we must have it + if "quidos_site_notes" not in file_mapping: + raise ValueError("No quidos site notes found") + + site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) + site_notes = site_notes_extractor.extract_all() diff 
--git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py new file mode 100644 index 00000000..fe1600e7 --- /dev/null +++ b/survey_report/extraction/detect_report_type.py @@ -0,0 +1,19 @@ +import re + + +def detect_report_type(first_page): + """ + Detects the type of report based on the first page of the report + :param first_page: + :return: + """ + # Set up for the minute to handle quidos files. We have the Elmhurst logic so we can introduce + # this when we need + + if re.match( + r"^Created \d{2}/\d{2}/\d{4} for Quidos Ltd using Argyle software BRE approved calculator", + first_page + ): + return "quidos_site_notes" + + return None diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py new file mode 100644 index 00000000..f11ffcb1 --- /dev/null +++ b/survey_report/extraction/quidos.py @@ -0,0 +1,99 @@ +import re + + +class SiteNotesExtractor: + """ + Extracts SAP rating, carbon emissions, and building dimensions from an EPC summary report. + """ + + def __init__(self, pdf_text): + """ + Initializes the SiteNotesExtractor with the extracted PDF text. + """ + self.text = pdf_text + self.data = {} + + def extract_sap_rating(self): + """ + Extracts the current and potential SAP rating from the report. + """ + pattern = re.search(r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)", self.text) + + if not pattern: + raise ValueError("No SAP rating found in the report") + + self.data.update({ + "Current EPC Band": pattern.group(1), + "Current SAP Rating": int(pattern.group(2)), + "Potential EPC Band": pattern.group(3), + "Potential SAP Rating": int(pattern.group(4)), + }) + + def extract_carbon_emissions(self): + """ + Extracts the current and adjusted annual carbon emissions (TCO2). 
+ """ + pattern = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text) + + if not pattern: + raise ValueError("No carbon emissions found in the report") + + self.data.update({ + "Current Carbon Emissions (TCO2)": float(pattern.group(1)), + }) + + def extract_building_dimensions(self): + """ + Extracts dimensions for each building part and stores them in a list. + Handles Main Property and multiple extensions. + """ + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) " + r"Party Wall " + r"Length \(m\)\n" + r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL + ) + + if not dimensions_section: + raise ValueError("Failed to locate the dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.) + building_part_pattern = re.compile( + r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + building_parts = [] + for match in building_part_pattern.finditer(dimensions_text): + to_append = { + "Building Part": match.group(1).strip(), + "Part Floor Area (m2)": float(match.group(2)), + "Room Height (m)": float(match.group(3)), + "Loss Perimeter (m)": float(match.group(4)), + "Party Wall Length (m)": float(match.group(5)), + } + # We calculate the heat loss area + to_append["Heat Loss Area (m2)"] = to_append["Loss Perimeter (m)"] * to_append["Room Height (m)"] + building_parts.append(to_append) + + if not building_parts: + raise ValueError("No building dimensions found in the report") + + self.data["Building Dimensions"] = building_parts + # We calculate some totals + self.data["Total Building Dimensions"] = { + "floor_area": sum([part["Part Floor Area (m2)"] for part in building_parts]), + "heat_loss_area": sum([part["Heat Loss Area (m2)"] for part in building_parts]), + } + + def 
extract_all(self): + """ + Runs all extraction methods and returns a dictionary with extracted data. + """ + self.extract_sap_rating() + self.extract_carbon_emissions() + self.extract_building_dimensions() + return self.data diff --git a/survey_report/requirements.txt b/survey_report/requirements.txt new file mode 100644 index 00000000..e69de29b From 32b053e7db3b08445b1649d6c418f33c5b235647 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 00:54:56 +0000 Subject: [PATCH 160/255] extracting bills --- survey_report/extraction/quidos.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py index f11ffcb1..ae66dd0d 100644 --- a/survey_report/extraction/quidos.py +++ b/survey_report/extraction/quidos.py @@ -89,11 +89,23 @@ class SiteNotesExtractor: "heat_loss_area": sum([part["Heat Loss Area (m2)"] for part in building_parts]), } + def extract_bills_estimate(self): + """ + Extracts the estimated annual energy costs (£) from the report. + """ + pattern = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text) + + if not pattern: + raise ValueError("No bills estimate found in the report") + + self.data["Estimated Annual Energy Cost (£)"] = float(pattern.group(1).replace(",", "")) + def extract_all(self): """ Runs all extraction methods and returns a dictionary with extracted data. 
""" self.extract_sap_rating() self.extract_carbon_emissions() + self.extract_bills_estimate() self.extract_building_dimensions() return self.data From daabf2a586eec7bf31440696f014ad7035a0033e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 01:09:41 +0000 Subject: [PATCH 161/255] extracting epr --- survey_report/app.py | 20 ++++--- .../extraction/detect_report_type.py | 3 + survey_report/extraction/quidos.py | 55 +++++++++++++++++++ 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/survey_report/app.py b/survey_report/app.py index 825a3658..f59c9984 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,7 +1,7 @@ import os import PyPDF2 from survey_report.extraction.detect_report_type import detect_report_type -from survey_report.extraction.quidos import SiteNotesExtractor +from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor def handle(): @@ -33,12 +33,18 @@ def handle(): if report_type is not None: file_mapping[report_type] = text - # Check the report type - report_type = detect_report_type(os.path.join(data_folder, file)) - # This is only set up to work with quido site notes so we must have it - if "quidos_site_notes" not in file_mapping: - raise ValueError("No quidos site notes found") - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) site_notes = site_notes_extractor.extract_all() + + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr = epr_extractor.extract_all() + + # We now produce the combined data sheet which is the starting figure: + data_sheet = {**epr, **site_notes} + del data_sheet['Building Dimensions'] + # We unnest the Total Building Dimensions + data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + del data_sheet["Total Building Dimensions"] diff 
--git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py index fe1600e7..434a3fb4 100644 --- a/survey_report/extraction/detect_report_type.py +++ b/survey_report/extraction/detect_report_type.py @@ -16,4 +16,7 @@ def detect_report_type(first_page): ): return "quidos_site_notes" + if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page): + return "quidos_epr" + return None diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py index ae66dd0d..374df084 100644 --- a/survey_report/extraction/quidos.py +++ b/survey_report/extraction/quidos.py @@ -109,3 +109,58 @@ class SiteNotesExtractor: self.extract_bills_estimate() self.extract_building_dimensions() return self.data + + +class EPRExtractor: + """ + Extracts space heating, water heating, and address from an Energy Performance Report (EPR). + """ + + def __init__(self, pdf_text): + """ + Initializes the EPRExtractor with the extracted PDF text. + """ + self.text = pdf_text + self.data = {} + + def extract_heating_data(self): + """ + Extracts space heating and water heating values from the report. + """ + pattern = re.search( + r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)", + self.text, + re.DOTALL + ) + + if not pattern: + raise ValueError("No heating data found in the report") + + self.data.update({ + "Space Heating (KWH)": int(pattern.group(1).replace(",", "")), + "Water Heating (KWH)": int(pattern.group(2).replace(",", "")) + }) + + def extract_address(self): + """ + Extracts the full address from the report. + """ + pattern = re.search( + r"Address\s*(.*?)\nTown\s*(.*?)\n", + self.text, + re.DOTALL + ) + + if not pattern: + raise ValueError("No address found in the report") + + full_address = pattern.group(1).strip() + self.data["Address"] = full_address + + def extract_all(self): + """ + Runs all extraction methods and returns a dictionary with extracted data. 
+ """ + self.extract_address() + self.extract_heating_data() + return self.data From f6d8688698bfcdc1c9d1230b9040dfe071e2bf1e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Jan 2025 17:30:17 +0000 Subject: [PATCH 162/255] completed matching --- .../stonewater/Wave 3 Preparation.py | 89 +++++++++++-- etl/customers/stonewater/data_cleaning.py | 5 +- survey_report/app.py | 41 ++++++ survey_report/template.html | 123 ++++++++++++++++++ 4 files changed, 248 insertions(+), 10 deletions(-) create mode 100644 survey_report/template.html diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 70c531c0..d9b5c41d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3078,6 +3078,13 @@ def revised_model(): retrofit_assessment_data = pd.DataFrame(extracted_data) + # retrofit_assessment_data.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False + # ) + retrofit_assessment_data = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), + ) + # Remove some definite duplicates dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] @@ -3097,10 +3104,6 @@ def revised_model(): # Replace \n with "" retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") - # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False - # ) - # We can read in the data as needed # Next Step: Read in the coordinated measures and match to the extracted data @@ -3108,24 +3111,59 @@ def revised_model(): # CCS ############################################################# ccs_coordination_sheet = pd.read_excel( - 
os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"), + os.path.join( + CUSTOMER_FOLDER_PATH, + "Jan 2025 Project", + "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx" + ), header=4 ) + ccs_postcodes = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx" + ), + header=4 + ) + ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 'Asset ID.1']].merge( + ccs_coordination_sheet, how="left", on="Name" + ) + ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])] ccs_coordination_sheet["contractor"] = "CCS" # We split ccs into two sections - the first being ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) ccs_coordination_sheet = ccs_coordination_sheet.head(87) ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) + from urllib import parse + def extract_sharepoint_url(x): + if pd.isnull(x): + return "" + return "/".join(parse.urlparse( + x.split(" - http")[1] + ).path.replace("%20", " ").split("/")[-2:]) + + ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) + ############################################################ # WATES ############################################################# wates_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx" + ), + header=4 + ) + wates_postcodes = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" ), header=4 ) + wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])] + wates_coordination_sheet = wates_coordination_sheet.merge( + wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name" + ) + 
wates_coordination_sheet["contractor"] = "Wates" # Break into the different sites: # Wiltshire @@ -3136,7 +3174,7 @@ def revised_model(): wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :] - wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972, :] + wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :] wates_coordination = pd.concat( [ @@ -3151,12 +3189,15 @@ def revised_model(): ] ) + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( + lambda x: extract_sharepoint_url(x) + ) + # Combine the data back ############################################################ # NEW 450 COORDINATED RETROFIT ASSESSMENTS ############################################################# - retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, @@ -3361,17 +3402,49 @@ def revised_model(): wates_coordination = wates_coordination[ wates_coordination["Retrofit Assessment"].isin(["Completed"]) ] + wates_coordination = wates_coordination[ + ~pd.isnull(wates_coordination["Postcode"]) + ] wates_manual_filters = { - "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View" + "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1', + '2 Jephson Court': 'Wave 2.1 Surveys/5. 
Coventry/Jesphson Court 2', } wates_matching_lookup = [] # Examples to skip when we cannot get the data wates_to_skip = [ "66 Abbatt Close", # File type is unusual, couldn't extract the data + "Flat 69 Goddard Road", # Doesn't exist + "19 Garth House", # # File type is unusual, couldn't extract the data + '5 Gilpin Close', # No properly formatted EPR + '49 The Hide, Netherfield', # TODO: TEMP HERE + '19 Chanders Rd', + '5 Chanders Rd', + '23 Chanders Rd', + '3 Chanders Rd', + '1 Orchard Close', ] for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + # Search the folder + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False) + ] + if len(filtered) == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + if home["Name"] in wates_to_skip: continue diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 7ee06fcd..010902ce 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -86,7 +86,6 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" ) - len(contents["value"]) folders_to_pull = [ folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. 
Coventry"] ] @@ -108,6 +107,8 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" + folder_to_pull["name"] + "/" + property_folder["name"] ) + if not property_folder_contents.get("value"): + continue # We look for the retrofit assessment folder: property_sub_folders = [ f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() @@ -138,5 +139,5 @@ def download_data_from_sharepoint(): drive_id=sharepoint_client.document_drive["id"], folder_path=property_folder_path, download_dir=download_dir, - excluded_file_types=["MOV"] + excluded_file_types=["MOV", "jpg"] ) diff --git a/survey_report/app.py b/survey_report/app.py index f59c9984..87ce7864 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,9 +1,33 @@ import os import PyPDF2 +from string import Template from survey_report.extraction.detect_report_type import detect_report_type from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor +def generate_html_report(template_path, output_path, data): + """ + Reads an HTML template file, injects dynamic values, and generates a final HTML report. + + Args: + - template_path (str): Path to the HTML template file. + - output_path (str): Path to save the generated HTML file. + - data (dict): Dictionary containing dynamic values for the report. 
+ """ + # Read the template file + with open(template_path, "r", encoding="utf-8") as f: + html_template = Template(f.read()) # Use Template from string module + + # Replace placeholders with actual data + final_html = html_template.safe_substitute(data) # Use safe_substitute to prevent missing key errors + + # Save the generated HTML file + with open(output_path, "w", encoding="utf-8") as f: + f.write(final_html) + + print(f"HTML report generated successfully: {output_path}") + + def handle(): """ Performs the data extraction process for the survey report @@ -48,3 +72,20 @@ def handle(): data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] del data_sheet["Total Building Dimensions"] + + # Generate the HTML report + # Placeholder locations + template_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/template.html" + output_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/output/report.html" + logo_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/assets/logo.png" + generate_html_report( + template_path, output_path, + data={ + "address": data_sheet["Address"], + "logo_path": logo_path, + "current_epc": data_sheet["Current EPC Band"], + "current_sap": data_sheet["Current SAP Rating"], + "potential_epc": "A", # TODO PLACEHOLDER + "potential_sap": 91, # TODO PLACEHOLDER + } + ) diff --git a/survey_report/template.html b/survey_report/template.html new file mode 100644 index 00000000..5d3b6c63 --- /dev/null +++ b/survey_report/template.html @@ -0,0 +1,123 @@ + + + + + + Domna Energy Report + + + + +
+ +
+
+

Domna Energy Report

+

${address}

+
+ +
+ + +
+
+
Current EPC Rating
+
${current_epc}
+
SAP ${current_sap}
+
+ +
+
Potential EPC Rating
+
${potential_epc}
+
SAP ${potential_sap}
+
+
+ +
+ + + From 01a5077c17cd219ddc907c48eaae4158c9117cfb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Feb 2025 12:54:57 +0000 Subject: [PATCH 163/255] tidying up stonewater work --- .../stonewater/Wave 3 Preparation.py | 224 +++++++++++++++++- 1 file changed, 221 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d9b5c41d..5c4da35b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,4 +1,6 @@ import os +from pyexpat import features + import PyPDF2 import re import pandas as pd @@ -1704,7 +1706,6 @@ def append_stonewater_id(): ) model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])] model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int) - z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values() original_archetypes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " @@ -2942,7 +2943,6 @@ def revised_model(): """ # 1) Create the new list of properties - new_priority_postcodes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 " "priority list.xlsx" @@ -3188,7 +3188,13 @@ def revised_model(): wates_coordination_sheet_abeyance ] ) - + # We correct the Asset ID for 34 Kempster Close + wates_coordination["Asset ID"] = np.where( + wates_coordination["Name"] == "34 Kempster Close", + "12005", + wates_coordination["Asset ID"] + ) + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( lambda x: extract_sharepoint_url(x) ) @@ -3198,6 +3204,14 @@ def revised_model(): ############################################################ # NEW 450 COORDINATED RETROFIT ASSESSMENTS ############################################################# + features = 
pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + features["Address ID"] = features["Address ID"].astype(str).astype(int) + features_to_merge = features[["Address ID", "Organisation Reference"]] + retrofit_packages_board = pd.read_excel( os.path.join( CUSTOMER_FOLDER_PATH, @@ -3211,6 +3225,10 @@ def revised_model(): retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) ] + retrofit_packages_board = retrofit_packages_board.merge( + features_to_merge, how="left", on="Address ID" + ) + manual_filters = { "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", @@ -3527,6 +3545,206 @@ def revised_model(): continue raise Exception("No match") + wates_matching_lookup = pd.DataFrame(wates_matching_lookup) + + # Merge lookup tables onto the coordination sheets + wates_coordination = wates_coordination.merge( + wates_matching_lookup, how="left", on="Name" + ) + missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] + if not missed_asset_id.empty: + # We fill the missing ids + missing_lookup = { + "4 Sydnall Fields": 31231, + "12 Sydnall Fields": 31239, + "12 Athena Gardens": 28061, + "49 Banner Lane": 41189, + "4 Jonathan Road": 41232, + "8 Jonathan Road": 41236, + "1 Jonathan Road": 41229, + "96 Taunton Way": 31417, + "94 Taunton Way": 31418, + "1 Lady Lane": 29430, + "10 Jonathan Road": 41283, + "21 Jonathan Road": 41246, + "12 Ashcroft Close": 26399 + } + for name, asset_id in missing_lookup.items(): + wates_coordination["Asset ID_x"] = np.where( + wates_coordination["Name"] == name, + asset_id, + wates_coordination["Asset ID_x"] + ) + + ccs_coordination = ccs_coordination.merge( + ccs_matching_lookup, how="left", on="Name" + ) + + retrofit_packages_board = retrofit_packages_board.merge( + 
matching_lookup, how="left", on="Name" + ) + + # We combine this into a singular board + coordinated_packages = pd.concat( + [ + retrofit_packages_board[ + [ + "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating', + 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', 'Organisation Reference', + ] + ], + ccs_coordination[ + [ + # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, + # Solar PV + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", + ] + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID.1_y': 'Organisation Reference', + } + ), + wates_coordination[ + [ + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. 
Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' + + ] + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID_x': 'Organisation Reference', + } + ) + ] + ) + + coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int) + + # Merge the property features on + coordinated_packages = coordinated_packages.merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + how="left", + on="Organisation Reference" + ) + + # We need the features pertaining to these priority postcodes + + def find_nearest_matching_property(coordinated_packages, home): + filter_levels = [ + ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], + ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + ] + + for i, filters in enumerate(filter_levels): + match = coordinated_packages.copy() + + for col in filters: + match = match[match[col] == home[col]] + + if not match.empty: + return match + + return None # No match found + + coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() + new_priority_postcodes["Postal Region"] = 
new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() + + coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip() + new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip() + + coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0] + new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0] + + # For every property in the priority postcodes data, we look for a most appropriate matching property + no_match = [] + matches = [] + for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): + closest_match = find_nearest_matching_property(coordinated_packages, home) + if closest_match is None: + no_match.append(home["Organisation Reference"]) + continue + + to_extend = [ + { + "Organisation Reference": home["Organisation Reference"], + "Best Match Organisation Reference": m + } for m in closest_match["Organisation Reference"].values + ] + matches.extend(to_extend) + + no_match_summary = new_priority_postcodes[ + new_priority_postcodes["Organisation Reference"].isin( + no_match + ) + ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[ + "Organisation Reference"].count().reset_index() + + no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) + + # len(no_match) + # 8764, 5607 + # no_match_summary.shape + # (3953, 6), (2948, 6) + + # We match the properties to their closest match + + matches_df = pd.DataFrame(matches) + matches_df = matches_df.merge( + coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]], + left_on="Best Match Organisation Reference", right_on="Organisation Reference", + suffixes=("", " - Closest Match") + ) + # We want to aggregate the matches, when we have multiple + aggregated_matches_df = [] + for org_ref, 
mapped_matches in matches_df.groupby("Organisation Reference"): + if mapped_matches.shape[0] == 1: + mapped_matches["Number of matches"] = 1 + mapped_matches["Proportion"] + aggregated_matches_df.append(mapped_matches) + continue + + mapped_priority_list = new_priority_postcodes.merge( + matches_df, on="Organisation Reference", + ) + # We merge on the EPC ratings for the matched properties + mapped_priority_list = mapped_priority_list.merge( + + ) # if __name__ == "__main__": # main() From 04eba60961b0ea215701b2b35feaed74f9a5ef11 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Feb 2025 13:04:10 +0000 Subject: [PATCH 164/255] fixing cleaning for stonewater --- .../stonewater/Wave 3 Preparation.py | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5c4da35b..04078e47 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3194,7 +3194,32 @@ def revised_model(): "12005", wates_coordination["Asset ID"] ) - + + # We fill the missing ids + missing_lookup = { + "4 Sydnall Fields": 31231, + "12 Sydnall Fields": 31239, + "12 Athena Gardens": 28061, + "49 Banner Lane": 41189, + "4 Jonathan Road": 41232, + "8 Jonathan Road": 41236, + "1 Jonathan Road": 41229, + "96 Taunton Way": 31417, + "94 Taunton Way": 31418, + "1 Lady Lane": 29430, + "10 Jonathan Road": 41283, + "21 Jonathan Road": 41246, + "12 Ashcroft Close": 26399 + } + for name, asset_id in missing_lookup.items(): + wates_coordination["Asset ID_x"] = np.where( + wates_coordination["Name"] == name, + asset_id, + wates_coordination["Asset ID_x"] + ) + + wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( lambda x: extract_sharepoint_url(x) ) @@ -3412,6 +3437,7 @@ def revised_model(): # We get 
a match for all records assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() + assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum() # We do the same for Wates wates_coordination = wates_coordination.rename( @@ -3447,6 +3473,8 @@ def revised_model(): '3 Chanders Rd', '1 Orchard Close', ] + wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)] + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): # Search the folder @@ -3547,34 +3575,18 @@ def revised_model(): raise Exception("No match") wates_matching_lookup = pd.DataFrame(wates_matching_lookup) + # We get a match for all records + assert wates_matching_lookup.shape[0] == wates_coordination.shape[0] + assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum() + assert not wates_matching_lookup["Asset ID"].duplicated().sum() + # Merge lookup tables onto the coordination sheets wates_coordination = wates_coordination.merge( wates_matching_lookup, how="left", on="Name" ) missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] if not missed_asset_id.empty: - # We fill the missing ids - missing_lookup = { - "4 Sydnall Fields": 31231, - "12 Sydnall Fields": 31239, - "12 Athena Gardens": 28061, - "49 Banner Lane": 41189, - "4 Jonathan Road": 41232, - "8 Jonathan Road": 41236, - "1 Jonathan Road": 41229, - "96 Taunton Way": 31417, - "94 Taunton Way": 31418, - "1 Lady Lane": 29430, - "10 Jonathan Road": 41283, - "21 Jonathan Road": 41246, - "12 Ashcroft Close": 26399 - } - for name, asset_id in missing_lookup.items(): - wates_coordination["Asset ID_x"] = np.where( - wates_coordination["Name"] == name, - asset_id, - wates_coordination["Asset ID_x"] - ) + raise Exception("Missing Asset ID") ccs_coordination = ccs_coordination.merge( ccs_matching_lookup, how="left", on="Name" From 10bc433283417a2c15ffe2924537ded81af240d6 Mon Sep 17 00:00:00 2001 From: 
Khalim Conn-Kowlessar Date: Mon, 3 Feb 2025 16:06:47 +0000 Subject: [PATCH 165/255] assigning properties to bands --- .../stonewater/Wave 3 Preparation.py | 71 ++++++++++++++++--- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 04078e47..c623e9f7 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3212,10 +3212,10 @@ def revised_model(): "12 Ashcroft Close": 26399 } for name, asset_id in missing_lookup.items(): - wates_coordination["Asset ID_x"] = np.where( + wates_coordination["Asset ID"] = np.where( wates_coordination["Name"] == name, asset_id, - wates_coordination["Asset ID_x"] + wates_coordination["Asset ID"] ) wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] @@ -3596,6 +3596,16 @@ def revised_model(): matching_lookup, how="left", on="Name" ) + # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board + to_remove = wates_coordination[ + wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + assert to_remove.shape[0] == 4 + # Remove them from the wates board + wates_coordination = wates_coordination[ + ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + # We combine this into a singular board coordinated_packages = pd.concat( [ @@ -3662,6 +3672,7 @@ def revised_model(): ) coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int) + assert not coordinated_packages["Organisation Reference"].duplicated().sum() # Merge the property features on coordinated_packages = coordinated_packages.merge( @@ -3670,6 +3681,25 @@ def revised_model(): on="Organisation Reference" ) + # We match the properties to their closest match + # We clean up the SAP ratings 
in the coordinated packages + def sap_to_number(x): + try: + return int(x) + except: + if x[-1] in ["A", "B", "C", "D", "E", "F"]: + return int(x[:-1]) + + if x[0] in ["A", "B", "C", "D", "E", "F"]: + return int(x[1:]) + + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])] + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])] + + coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply( + lambda x: sap_to_number(x) + ) + # We need the features pertaining to these priority postcodes def find_nearest_matching_property(coordinated_packages, home): @@ -3729,11 +3759,9 @@ def revised_model(): no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) # len(no_match) - # 8764, 5607 + # 8764, 5607, 5646 # no_match_summary.shape - # (3953, 6), (2948, 6) - - # We match the properties to their closest match + # (3953, 6), (2948, 6), (2969, 7) matches_df = pd.DataFrame(matches) matches_df = matches_df.merge( @@ -3745,11 +3773,36 @@ def revised_model(): aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): if mapped_matches.shape[0] == 1: - mapped_matches["Number of matches"] = 1 - mapped_matches["Proportion"] - aggregated_matches_df.append(mapped_matches) + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": 1, + "Proportion": 100, + "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0], + "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0]) + } + ) continue + # We need to aggregate the matches, since we have multiple + average_rating = mapped_matches["Actual SAP Rating"].mean() + number_of_matches = mapped_matches.shape[0] + average_epc_rating = sap_to_epc(average_rating) + # proportion is the number of properties that have this EPC rating + proportion_with_this_epc = int( + 
mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100) + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": number_of_matches, + "Proportion": proportion_with_this_epc, + "Estimated SAP Rating": average_rating, + "Estimated EPC Rating": average_epc_rating + } + ) + + aggregated_matches_df = pd.DataFrame(aggregated_matches_df) + mapped_priority_list = new_priority_postcodes.merge( matches_df, on="Organisation Reference", ) From 139db23592ea885af14d8734d9cf2e36a1484a59 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Feb 2025 14:04:20 +0000 Subject: [PATCH 166/255] putting together outputs --- .../stonewater/Wave 3 Preparation.py | 346 +++++++++++++++--- etl/route_march_data_pull/app.py | 16 +- survey_report/app.py | 79 ++-- 3 files changed, 360 insertions(+), 81 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c623e9f7..1748f624 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,5 +1,6 @@ import os -from pyexpat import features +from urllib import parse +from fuzzywuzzy import fuzz import PyPDF2 import re @@ -2936,6 +2937,14 @@ def identify_incorrect_packages(): ) +def extract_sharepoint_url(x): + if pd.isnull(x): + return "" + return "/".join(parse.urlparse( + x.split(" - http")[1] + ).path.replace("%20", " ").split("/")[-2:]) + + def revised_model(): """ This function implements the revised model for Stonewater, where we are looking at new priority postcodes @@ -2956,6 +2965,7 @@ def revised_model(): original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + original_archetypes["UPRN"] = 
original_archetypes["UPRN"].astype("Int64").astype(str) # Check if we have all of the addresses missed = original_archetypes[ @@ -2965,7 +2975,7 @@ def revised_model(): assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", "Archetype Group Rank"] + ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"] ] # Merge these archetypes on to the new priority postcodes @@ -3104,6 +3114,42 @@ def revised_model(): # Replace \n with "" retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + retrofit_assessments_data_columns = [ + 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', + 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys', + 'Fuel Bill', 'Window Age Description', + 'Window Age Description Proportion (%)', + 'Secondary Window Age Description', + 'Secondary Window Age Description Proportion (%)', 'Number of Windows', + 'Total Number of Doors', 'Number of Insulated Doors', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference', + 'Existing Primary Heating Controls', + 'Existing Primary Heating % of Heat', + 'Existing Secondary Heating System', + 'Existing Secondary Heating PCDF Reference', + 'Existing Secondary Heating Controls', + 'Existing Secondary Heating % of Heat', 'Secondary Heating Code', + 'Water Heating Code', 'Total Floor Area (m2)', + 'Total Ground Floor Area (m2)', 'RIR Floor Area', + 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)', + 'Number of Light Fittings', 'Number of LEL Fittings', + 'Number of fittings needing LEL', 'Main Roof Type', + 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining', + 'Main Wall Thickness', 'Main Building Alternative Wall Type', + 'Main Building Alternative Wall Insulation', + 'Main Building Alternative Wall Dry-lining', + 
'Main Building Alternative Wall Thickness', 'Main Fuel' + ] + # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: + retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] + rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed)) + retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict) + retrofit_assessment_data["Survey: Current EPC Band"] = ( + retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x)) + ) + # We can read in the data as needed # Next Step: Read in the coordinated measures and match to the extracted data @@ -3134,14 +3180,6 @@ def revised_model(): ccs_coordination_sheet = ccs_coordination_sheet.head(87) ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) - from urllib import parse - def extract_sharepoint_url(x): - if pd.isnull(x): - return "" - return "/".join(parse.urlparse( - x.split(" - http")[1] - ).path.replace("%20", " ").split("/")[-2:]) - ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) ############################################################ @@ -3224,8 +3262,6 @@ def revised_model(): lambda x: extract_sharepoint_url(x) ) - # Combine the data back - ############################################################ # NEW 450 COORDINATED RETROFIT ASSESSMENTS ############################################################# @@ -3352,7 +3388,6 @@ def revised_model(): ) ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] - from fuzzywuzzy import fuzz ccs_manual_filters = { "35 Kittiwake Close": "Wave 2.1 Surveys/11. 
CCS Dorset/Kittiwake Close 35" @@ -3596,6 +3631,17 @@ def revised_model(): matching_lookup, how="left", on="Name" ) + # We now map the retrofit assessment data to the coordinated packages + wates_coordination = wates_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + retrofit_packages_board = retrofit_packages_board.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board to_remove = wates_coordination[ wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) @@ -3617,8 +3663,8 @@ def revised_model(): 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', 'Other measures', 'Organisation Reference', - ] - ], + ] + retrofit_assessments_data_columns_prefixed + ], ccs_coordination[ [ # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, @@ -3627,8 +3673,8 @@ def revised_model(): 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", - ] - ].rename( + ] + retrofit_assessments_data_columns_prefixed + ].rename( columns={ "SAP Band Pre": "Actual SAP Band", "SAP Rating Pre": "Actual SAP Rating", @@ -3651,8 +3697,8 @@ def revised_model(): 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. 
Doors Upgrade', 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' - ] - ].rename( + ] + retrofit_assessments_data_columns_prefixed + ].rename( columns={ "SAP Band Pre": "Actual SAP Band", "SAP Rating Pre": "Actual SAP Rating", @@ -3681,24 +3727,8 @@ def revised_model(): on="Organisation Reference" ) - # We match the properties to their closest match - # We clean up the SAP ratings in the coordinated packages - def sap_to_number(x): - try: - return int(x) - except: - if x[-1] in ["A", "B", "C", "D", "E", "F"]: - return int(x[:-1]) - - if x[0] in ["A", "B", "C", "D", "E", "F"]: - return int(x[1:]) - - coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])] - coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])] - - coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply( - lambda x: sap_to_number(x) - ) + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])] + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])] # We need the features pertaining to these priority postcodes @@ -3721,6 +3751,11 @@ def revised_model(): if not match.empty: return match + # Finally, we search for a property in the same Archetype + match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] + if not match.empty: + return match + return None # No match found coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() @@ -3732,6 +3767,12 @@ def revised_model(): coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0] new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0] + coordinated_packages = coordinated_packages.merge( + new_priority_postcodes[["Organisation Reference", 
"Archetype ID"]], + how="left", + on="Organisation Reference" + ) + # For every property in the priority postcodes data, we look for a most appropriate matching property no_match = [] matches = [] @@ -3759,16 +3800,17 @@ def revised_model(): no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) # len(no_match) - # 8764, 5607, 5646 + # 8764, 5607, 5646, 5071 # no_match_summary.shape - # (3953, 6), (2948, 6), (2969, 7) + # (3953, 6), (2948, 6), (2969, 7), (2575, 7) matches_df = pd.DataFrame(matches) matches_df = matches_df.merge( - coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]], + coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], left_on="Best Match Organisation Reference", right_on="Organisation Reference", suffixes=("", " - Closest Match") ) + # We want to aggregate the matches, when we have multiple aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): @@ -3778,19 +3820,21 @@ def revised_model(): "Organisation Reference": org_ref, "Number of matches": 1, "Proportion": 100, - "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0], - "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0]) + "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], + "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0] } ) continue # We need to aggregate the matches, since we have multiple - average_rating = mapped_matches["Actual SAP Rating"].mean() + average_rating = mapped_matches["Survey: Current SAP Rating"].mean() number_of_matches = mapped_matches.shape[0] average_epc_rating = sap_to_epc(average_rating) # proportion is the number of properties that have this EPC rating proportion_with_this_epc = int( - mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100) 
+ mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ + 0] / number_of_matches * 100 + ) aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3804,12 +3848,220 @@ def revised_model(): aggregated_matches_df = pd.DataFrame(aggregated_matches_df) mapped_priority_list = new_priority_postcodes.merge( - matches_df, on="Organisation Reference", + aggregated_matches_df, on="Organisation Reference", how="left" ) - # We merge on the EPC ratings for the matched properties - mapped_priority_list = mapped_priority_list.merge( + mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0] + + # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0 + + def remove_leading_zero(address): + return re.sub(r"^0([1-9]) ", r"\1 ", address) + + # Example usage + mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) + mapped_priority_list["address1"] = np.where( + mapped_priority_list["Organisation Reference"] == 37004, + "8 Mason Road", + mapped_priority_list["address1"] ) + mapped_priority_list["address1"] = np.where( + mapped_priority_list["Organisation Reference"] == 37003, + "9 Mason Road", + mapped_priority_list["address1"] + ) + + mapped_priority_list = mapped_priority_list.rename( + columns={"UPRN": "uprn"} + ) + mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + + # Let's get the newest EPC data for these properties + # We merge on UPRN, when we have it + # from etl.route_march_data_pull.app import get_data + # epc_data, errors, nodata = get_data( + # asset_list=mapped_priority_list, + # fulladdress_column="Address", + # address1_column="address1", + # postcode_column="Postcode", + # manual_uprn_map={}, + # epc_api_only=True + # ) + # + # epc_df = pd.DataFrame(epc_data) + # epc_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False + # ) + 
epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv")) + epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"}) + + # We now package up the data + + # Sheet 1 is the base coordination data + output_coordination_sheet = coordinated_packages[ + [ + "Name", "Postcode", 'Organisation Reference', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band', + 'Survey: Primary Energy Use (kWh/yr)', + 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', + 'Survey: Number of Storeys', 'Survey: Fuel Bill', + 'Survey: Window Age Description', + 'Survey: Window Age Description Proportion (%)', + 'Survey: Secondary Window Age Description', + 'Survey: Secondary Window Age Description Proportion (%)', + 'Survey: Number of Windows', 'Survey: Total Number of Doors', + 'Survey: Number of Insulated Doors', + 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating PCDF Reference', + 'Survey: Existing Primary Heating Controls', + 'Survey: Existing Primary Heating % of Heat', + 'Survey: Existing Secondary Heating System', + 'Survey: Existing Secondary Heating PCDF Reference', + 'Survey: Existing Secondary Heating Controls', + 'Survey: Existing Secondary Heating % of Heat', + 'Survey: Secondary Heating Code', 'Survey: Water Heating Code', + 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)', + 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)', + 'Survey: First Extension Wall Area (m2)', + 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings', + 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type', + 'Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall 
Type', + 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining', + 'Survey: Main Wall Thickness', + 'Survey: Main Building Alternative Wall Type', + 'Survey: Main Building Alternative Wall Insulation', + 'Survey: Main Building Alternative Wall Dry-lining', + 'Survey: Main Building Alternative Wall Thickness', + 'Survey: Main Fuel', + 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' + ] + ].rename( + columns={ + 'Walls': "Parity - Walls", + 'Roofs': "Parity - Roof", + 'Heating': "Parity - Heating", + 'Main Fuel': "Parity - Fuel", + 'Age': "Parity - Age Band", + 'Property Type': "Parity - Property Type" + } + ) + + # Sheet 2 is the lookup table which maps the properties to their closest match + # We need to bring in the parity attributes between the mapped properties so we can see side-by-side + mapped_lookup = matches_df[ + [ + 'Organisation Reference', + 'Best Match Organisation Reference', + 'Survey: Current EPC Band', + 'Survey: Current SAP Rating' + ] + ].rename( + columns={ + 'Best Match Organisation Reference': "Best Match - Organisation Reference", + "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band", + 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" + } + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + how="left", + on="Organisation Reference" + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + "Walls": "Best Match - Walls", + "Roofs": "Best Match - Roof", + "Heating": "Best Match - Heating", + "Main Fuel": "Best Match - Main Fuel", + "Age": "Best Match - Age", + "Property Type": "Best Match - Property Type" + } + ), + how="left", + on="Best Match - Organisation Reference" + ).merge( + coordinated_packages[ + [ + "Organisation Reference", 'Survey: Main Wall Type', 'Survey: 
Main Wall Insulation', + 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', + 'Survey: Existing Primary Heating System', + ] + ].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type', + 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation', + 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type', + 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness', + 'Survey: Existing Primary Heating System': 'Best Match - Survey: Existing Primary Heating System', + } + ), + how="left", + on="Best Match - Organisation Reference" + ) + + # Finally, we have the property, against the mapped home with the estimate SAP scores and the EPC data + worksheet = mapped_priority_list[ + [ + 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', + 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', + 'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating' + ] + ].rename( + columns={ + "SAP": "Parity - SAP Rating", + "SAP Band": "Parity - EPC Rating", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Walls", + "Roofs": "Parity - Roofs", + 'Glazing': "Parity - Glazing", + 'Heating': 'Parity - Heating', + 'Main Fuel': 'Parity - Main Fuel', + 'Hot Water': 'Parity - Hot Water', + } + ).merge( + epc_df[ + [ + "Organisation Reference", + "uprn", + "current-energy-efficiency", + "current-energy-rating", + "lodgement-date", + "construction-age-band", + "walls-description", + "roof-description", + "mainheat-description", + "windows-description", + "hotwater-description", + "main-fuel", + "total-floor-area", + ] + ].rename( + columns={ + "uprn": "Last EPC - uprn", + "current-energy-efficiency": "Last EPC - SAP Score", 
+ "current-energy-rating": "Last EPC - EPC Rating", + "lodgement-date": "Last EPC - Date Lodged", + "construction-age-band": "Last EPC - Age Band", + "walls-description": "Last EPC - Walls", + "roof-description": "Last EPC - Roof", + "mainheat-description": "Last EPC - Heating", + "windows-description": "Last EPC - Windows", + "hotwater-description": "Last EPC - Hot Water", + "main-fuel": "Last EPC - Main Fuel", + "total-floor-area": "Last EPC - Total Floor Area" + } + ), + how="left", + on='Organisation Reference' + ) + + worksheet["Years Since Last EPC"] # if __name__ == "__main__": # main() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 247ce98c..3432b744 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -20,7 +20,7 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map): +def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True): epc_data = [] errors = [] no_epc = [] @@ -33,6 +33,11 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m if house_no is None: house_no = house_number uprn = manual_uprn_map.get(full_address, None) + if uprn is None and home.get("uprn"): + uprn = home["uprn"] + + if pd.isnull(uprn): + uprn = None searcher = SearchEpc( address1=str(house_no), @@ -88,6 +93,15 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m no_epc.append(home["row_id"]) continue + if epc_api_only: + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy() + } + + epc_data.append(epc) + continue + # Look for EPC recommendatons try: property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) diff --git a/survey_report/app.py b/survey_report/app.py index 87ce7864..be31bd52 100644 --- 
a/survey_report/app.py +++ b/survey_report/app.py @@ -1,6 +1,9 @@ import os import PyPDF2 from string import Template + +import pandas as pd + from survey_report.extraction.detect_report_type import detect_report_type from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor @@ -34,44 +37,54 @@ def handle(): :return: """ - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2" + folders = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4", + "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5", + ] + data = [] + for data_folder in folders: - folder_contents = os.listdir(data_folder) - # We look for the following files: - # Site notes - file_mapping = {} - for file in folder_contents: - # Check if it's a pdf file - if not file.endswith(".pdf"): - continue - filepath = os.path.join(data_folder, file) - with (open(filepath, "rb") as f): - pdf = PyPDF2.PdfReader(f) - first_page = pdf.pages[0].extract_text() - text = "" - for page in pdf.pages: - text += page.extract_text() + folder_contents = os.listdir(data_folder) + # We look for the following files: + # Site notes + file_mapping = {} + for file in folder_contents: + # Check if it's a pdf file + if not file.endswith(".pdf"): + continue + filepath = os.path.join(data_folder, file) + with (open(filepath, "rb") as f): + pdf = PyPDF2.PdfReader(f) + first_page = pdf.pages[0].extract_text() + text = "" + for page in pdf.pages: + text += page.extract_text() - # Check the report type - report_type = detect_report_type(first_page) - if report_type is not None: - file_mapping[report_type] = text + # Check the report type + 
report_type = detect_report_type(first_page) + if report_type is not None: + file_mapping[report_type] = text - # This is only set up to work with quido site notes so we must have it - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) - site_notes = site_notes_extractor.extract_all() + # This is only set up to work with quido site notes so we must have it + site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) + site_notes = site_notes_extractor.extract_all() - # We also must have an EPR - epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) - epr = epr_extractor.extract_all() + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr = epr_extractor.extract_all() - # We now produce the combined data sheet which is the starting figure: - data_sheet = {**epr, **site_notes} - del data_sheet['Building Dimensions'] - # We unnest the Total Building Dimensions - data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] - data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] - del data_sheet["Total Building Dimensions"] + # We now produce the combined data sheet which is the starting figure: + data_sheet = {**epr, **site_notes} + del data_sheet['Building Dimensions'] + # We unnest the Total Building Dimensions + data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + del data_sheet["Total Building Dimensions"] + data.append(data_sheet) + data = pd.DataFrame(data) # Generate the HTML report # Placeholder locations From 7885467fa40240a2a2632b4b6e120cce5a047c61 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Feb 2025 14:35:24 +0000 Subject: [PATCH 167/255] formatting output --- .../stonewater/Wave 3 Preparation.py | 42 
+++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 1748f624..fcde164e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3777,6 +3777,21 @@ def revised_model(): no_match = [] matches = [] for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): + + # We check if the property was surveyed + survey_result = coordinated_packages[ + coordinated_packages["Organisation Reference"] == home["Organisation Reference"] + ] + if not survey_result.empty: + to_extend = [ + { + "Organisation Reference": home["Organisation Reference"], + "Best Match Organisation Reference": m, + "Was Surveyed": True + } for m in survey_result["Organisation Reference"].values + ] + matches.extend(to_extend) + closest_match = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) @@ -3785,7 +3800,8 @@ def revised_model(): to_extend = [ { "Organisation Reference": home["Organisation Reference"], - "Best Match Organisation Reference": m + "Best Match Organisation Reference": m, + "Was Surveyed": False } for m in closest_match["Organisation Reference"].values ] matches.extend(to_extend) @@ -4010,7 +4026,8 @@ def revised_model(): [ 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', - 'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating' + 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', + 'Estimated SAP Rating', 'Estimated EPC Rating' ] ].rename( columns={ @@ -4023,6 +4040,7 @@ def revised_model(): 'Heating': 'Parity - Heating', 'Main Fuel': 'Parity - Main Fuel', 'Hot Water': 'Parity - Hot Water', + 'Proportion': 'Proportion of matched properties 
with same EPC rating', } ).merge( epc_df[ @@ -4061,7 +4079,25 @@ def revised_model(): on='Organisation Reference' ) - worksheet["Years Since Last EPC"] + worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime( + worksheet["Last EPC - Date Lodged"]).dt.year + + worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str) + + worksheet["uprn"] = np.where( + pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]), + worksheet["Last EPC - uprn"], + worksheet["uprn"] + ) + + worksheet["uprn"] = worksheet["uprn"].replace("", "") + + # Save to Excel with multiple sheets + excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx") + with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: + worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) + mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) + output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True) # if __name__ == "__main__": # main() From 77844c625eb1b00f140c1f64224b4101a51e1ca5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 10 Feb 2025 15:41:33 +0000 Subject: [PATCH 168/255] minor --- etl/customers/panacap/assets.py | 61 +++++ etl/customers/remote_assessments/app.py | 34 +-- .../stonewater/Wave 3 Preparation.py | 16 +- .../stonewater/potential_eco_properties.py | 250 ++++++++---------- etl/find_my_epc/RetrieveFindMyEpc.py | 19 +- etl/route_march_data_pull/app.py | 149 ++++++++--- recommendations/Recommendations.py | 2 + 7 files changed, 324 insertions(+), 207 deletions(-) create mode 100644 etl/customers/panacap/assets.py diff --git a/etl/customers/panacap/assets.py b/etl/customers/panacap/assets.py new file mode 100644 index 00000000..ec57d9a4 --- /dev/null +++ b/etl/customers/panacap/assets.py @@ -0,0 +1,61 @@ +import os + +import pandas as pd +from dotenv import load_dotenv + +from 
etl.spatial.OpenUprnClient import OpenUprnClient +from etl.route_march_data_pull.app import get_data + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +addresses = [ + {"address": "3 Willis Road", "postcode": "CB1 2AQ"}, + {"address": "22 Catharine Street", "postcode": "CB1 3AW"}, + {"address": "332 Mill Road", "postcode": "CB1 3NN"}, + {"address": "330 Mill Road", "postcode": "CB1 3NN"}, + {"address": "328 Mill Road", "postcode": "CB1 3NN"}, + {"address": "71 Mill Road", "postcode": "CB1 2AS"}, + {"address": "78 Argyle Street", "postcode": "CB1 3LZ"}, + {"address": "9 Graham Road", "postcode": "CB4 2ZE"}, + {"address": "217 Mill Road", "postcode": "CB1 3BE"}, + {"address": "374 Mill Road", "postcode": "CB1 3NN"}, + {"address": "174 Thoday Street", "postcode": "CB1 3AX"}, + {"address": "37 Abbey Road", "postcode": "CB5 8HH"}, + {"address": "18 Upper Gwydir Street", "postcode": "CB1 2LR"}, + {"address": "21 Fulbourn Road Fulbourn", "postcode": "CB1 9JL"}, + {"address": "108 Argyle Street", "postcode": "CB1 3LS"}, + {"address": "115 Victoria Road", "postcode": "CB4 3BS"}, + {"address": "55 Ross Street", "postcode": "CB1 3BP"}, + {"address": "16 Kingston Street", "postcode": "CB1 2NU"}, + {"address": "13 Thoday Street", "postcode": "CB1 3AS"}, + {"address": "103 York Street", "postcode": "CB1 2PZ"}, +] + +asset_list = pd.DataFrame(addresses) +asset_list["row_id"] = asset_list.index + +epc_data, _, _ = get_data( + asset_list=asset_list, fulladdress_column="address", postcode_column="postcode", address1_column="address", + manual_uprn_map={}, epc_api_only=True +) + +epc_df = pd.DataFrame(epc_data) +epc_df.shape + +asset_list = asset_list.merge( + epc_df, how="left", on="row_id" +) + +asset_list = asset_list.rename(columns={"address_x": "Address", "postcode_x": "Postcode"}) +asset_list["uprn"] = asset_list["uprn"].astype(str) + +spatial_data = OpenUprnClient.get_spatial_data([x["uprn"] for x in epc_data], 
bucket_name="retrofit-data-dev") +spatial_data["UPRN"] = spatial_data["UPRN"].astype(str) + +asset_list = asset_list.merge( + spatial_data, how="left", left_on="uprn", right_on="UPRN" +) + +asset_list.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv", + index=False) diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 13cdc41b..e1298565 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 126 +PORTFOLIO_ID = 127 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,22 +19,9 @@ def app(): asset_list = [ { - "address": "Garden Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308249, - }, - { - "address": "Top Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308251 - }, - { - "address": "First Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308250, + "address": "49 Brailsford Road", + "postcode": "M14 6PT", + "uprn": 77145666, } ] asset_list = pd.DataFrame(asset_list) @@ -65,18 +52,7 @@ def app(): valuation_data = [ { - "address": "Garden Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 - }, - { - "addresss": "Top Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 - }, - { - "address": "First Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", + "uprn": 77145666, "valuation": 337_000 } ] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fcde164e..b2a92e4c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3777,7 +3777,6 @@ def revised_model(): no_match = 
[] matches = [] for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): - # We check if the property was surveyed survey_result = coordinated_packages[ coordinated_packages["Organisation Reference"] == home["Organisation Reference"] @@ -3791,6 +3790,7 @@ def revised_model(): } for m in survey_result["Organisation Reference"].values ] matches.extend(to_extend) + continue closest_match = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: @@ -3821,6 +3821,7 @@ def revised_model(): # (3953, 6), (2948, 6), (2969, 7), (2575, 7) matches_df = pd.DataFrame(matches) + matches_df = matches_df.merge( coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], left_on="Best Match Organisation Reference", right_on="Organisation Reference", @@ -3837,7 +3838,8 @@ def revised_model(): "Number of matches": 1, "Proportion": 100, "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], - "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0] + "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], + "Was Surveyed": mapped_matches["Was Surveyed"].values[0], } ) continue @@ -3857,7 +3859,8 @@ def revised_model(): "Number of matches": number_of_matches, "Proportion": proportion_with_this_epc, "Estimated SAP Rating": average_rating, - "Estimated EPC Rating": average_epc_rating + "Estimated EPC Rating": average_epc_rating, + "Was Surveyed": False } ) @@ -3973,7 +3976,8 @@ def revised_model(): 'Organisation Reference', 'Best Match Organisation Reference', 'Survey: Current EPC Band', - 'Survey: Current SAP Rating' + 'Survey: Current SAP Rating', + "Was Surveyed" ] ].rename( columns={ @@ -4027,7 +4031,7 @@ def revised_model(): 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', 'Heating', 'Main Fuel', 'Hot 
Water', 'Number of matches', 'Proportion', - 'Estimated SAP Rating', 'Estimated EPC Rating' + 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed" ] ].rename( columns={ @@ -4092,6 +4096,8 @@ def revised_model(): worksheet["uprn"] = worksheet["uprn"].replace("", "") + worksheet = worksheet.drop(columns=["Last EPC - uprn"]) + # Save to Excel with multiple sheets excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx") with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index bda9c30c..eef82eae 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -217,78 +217,7 @@ def app(): ) ) - # We get the EPC data - # epc_data = json.loads( - # read_from_s3( - # bucket_name="retrofit-data-dev", - # s3_file_name="customers/Stonewater/clustering/epc_data.json" - # ) - # ) - # epc_data = pd.DataFrame(epc_data) - # - # epc_data["uprn"] = np.where( - # epc_data["internal_id"] == 1091, - # 83143766, - # epc_data["uprn"] - # ) - # - # epc_data_batch_2 = read_pickle_from_s3( - # s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", - # bucket_name="retrofit-data-dev" - # ) - # epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) - # - # complete_epcs = pd.concat([epc_data, epc_data_batch_2]) - # - # epcs_to_merge = complete_epcs[ - # [ - # "uprn", - # "address", - # "postcode", - # "property-type", - # "built-form", - # "inspection-date", - # "current-energy-rating", - # "current-energy-efficiency", - # "roof-description", - # "walls-description", - # "transaction-type", - # "secondheat-description", - # "total-floor-area", - # "construction-age-band", - # "floor-height", - # "number-habitable-rooms", - # "mainheat-description", - # "energy-consumption-current" - # ] - # ].rename( - # columns={ - # "address": 
"Address", - # "postcode": "Postcode", - # "inspection-date": "Date of last EPC", - # "current-energy-efficiency": "SAP score on register", - # "current-energy-rating": "EPC rating on register", - # "property-type": "Property Type", - # "built-form": "Archetype", - # "total-floor-area": "Property Floor Area", - # "construction-age-band": "Property Age Band", - # "floor-height": "Property Floor Height", - # "number-habitable-rooms": "Number of Habitable Rooms", - # "walls-description": "Wall Construction", - # "roof-description": "Roof Construction", - # "mainheat-description": "Heating Type", - # "secondheat-description": "Secondary Heating", - # "transaction-type": "Reason for last EPC", - # "energy-consumption-current": "Heat Demand (kWh/m2)", - # } - # ) - # # We de-dupe, taking the newest on the date the EPC was lod - # epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"]) - # epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) - # epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") - stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str) - stonewater_cavity_properties["Reason Included"].value_counts() # Find the postcodes where an Osmosis survey revealed a need for CWI postcodes_found_needing_cwi = stonewater_cavity_properties[ stonewater_cavity_properties["Reason Included"].isin( @@ -339,12 +268,7 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ) # .merge( - # epcs_to_merge, - # how="left", - # left_on="UPRN", - # right_on="uprn" - # ) + ) # We now flag the additional properties in the as built list @@ -434,12 +358,11 @@ def app(): additional_properties["Suspected Needs CWI - not surveyed"] = ( ( - additional_properties["Postcode"].isin(postcodes_found_needing_cwi) + additional_properties["Postcode"].isin(postcodes_found_needing_cwi) & + ~additional_properties["Installed under ECO3"] ) ) - 
additional_properties["Same Postcode as Installed under ECO3"].value_counts() - # We drop Full Address additional_properties = additional_properties.drop(columns=["Full Address"]) additional_properties2 = additional_properties[[ @@ -461,65 +384,57 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ) # .merge( - # pd.DataFrame(additional_properties_epcs)[ - # [ - # "row_id", - # "property-type", - # "built-form", - # "inspection-date", - # "current-energy-rating", - # "current-energy-efficiency", - # "roof-description", - # "walls-description", - # "transaction-type", - # "secondheat-description", - # "total-floor-area", - # "construction-age-band", - # "floor-height", - # "number-habitable-rooms", - # "mainheat-description", - # "energy-consumption-current" - # ] - # ].rename( - # columns={ - # "inspection-date": "Date of last EPC", - # "current-energy-efficiency": "SAP score on register", - # "current-energy-rating": "EPC rating on register", - # "property-type": "Property Type", - # "built-form": "Archetype", - # "total-floor-area": "Property Floor Area", - # "construction-age-band": "Property Age Band", - # "floor-height": "Property Floor Height", - # "number-habitable-rooms": "Number of Habitable Rooms", - # "walls-description": "Wall Construction", - # "roof-description": "Roof Construction", - # "mainheat-description": "Heating Type", - # "secondheat-description": "Secondary Heating", - # "transaction-type": "Reason for last EPC", - # "energy-consumption-current": "Heat Demand (kWh/m2)", - # } - # ), - # how="left", - # on="row_id" - # ) + ) + + # Combine the data: + full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2]) + + # We not define the priority list for non-intrusives + full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2] + full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0] + + # Strip out anything we 
definitely don't want + full_dataset = full_dataset[~full_dataset["Installed under ECO3"]] + + areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique() + + priorities = full_dataset[ + full_dataset["Postal Region 2"].isin(areas) + ] + + region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index() + region_prevalance = region_prevalance[region_prevalance["count"] > 100] + df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)] + + df["Postal Region"].value_counts() + df["Postal Region 2"].value_counts() + + if df["Installed under ECO3"].sum(): + raise ValueError("There are properties in the priority list that were installed under ECO3") + + df.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - " + "revised list.xlsx", + index=False + ) # We save the data locally - stonewater_cavity_properties.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " - "postcodes.csv", - index=False - ) - additional_properties2.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " - "non-priority postcodes.csv", - index=False - ) - # Save the survey findings - needs_cwi.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", - index=False - ) + # stonewater_cavity_properties.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " + # "postcodes.csv", + # index=False + # ) + # additional_properties2.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " + # "non-priority postcodes.csv", + # index=False + # ) + # # Save the survey findings + # needs_cwi.to_csv( + # 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - + # WIP.csv", + # index=False + # ) def cross_reference_epc_programme(): @@ -528,6 +443,12 @@ def cross_reference_epc_programme(): "SURVEYED - ECO3 NOT COMPLETED.xlsx" ) + for _, x in eco3_fallout.iterrows(): + house_no = SearchEpc.get_house_number(x["ADDRESS"], "") + if house_no is None: + house_no = x["ADDRESS"].split(",")[0] + x["house_number"] = house_no + eco3_fallout["house_number"] = eco3_fallout.apply( lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1 ) @@ -558,3 +479,58 @@ def cross_reference_epc_programme(): stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90) ] match.head() + + +def finalise_list_for_non_intrusives(): + non_intrusives_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater " + "Non-Intrusives.xlsx" + ) + + # Remove anything installed under ECO3 + non_intrusives_list = non_intrusives_list[~non_intrusives_list["Installed under ECO3"]] + + # We make any properties that were surveyed by Osmosis + packages = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 " + "(1).xlsx", + header=13, + sheet_name="Modelled Packages" + ) + + non_intrusives_list["Surveyed by Osmosis"] = non_intrusives_list["Address ID"].isin( + packages["Address ID"].values + ) + # Removed 54 addresses + final_non_intrusives = non_intrusives_list[ + ~non_intrusives_list["Surveyed by Osmosis"] + ] + + features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + + # Add on the orgnisaion reference + final_non_intrusives = final_non_intrusives.merge( + features[["Organisation Reference", "Address ID"]], + how="left", + on="Address ID" + ) + + final_non_intrusives["Postal Region"] = 
final_non_intrusives["Postcode"].str.split(" ").str[0].str[0:2] + selected_regions = final_non_intrusives[ + final_non_intrusives["Include in non-intrusives"] + ]["Postcode"].unique() + + final_non_intrusives["Is in region"] = final_non_intrusives["Postcode"].isin(selected_regions) + + # Filter down: + final_non_intrusives = final_non_intrusives[ + final_non_intrusives["Is in region"] + ] + + final_non_intrusives.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives " + "List - final.xlsx") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index f93a5a73..eaba1058 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -25,6 +25,7 @@ class RetrieveFindMyEpc: self.postcode = postcode self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + self.walls = [] @staticmethod def extract_low_carbon_sources(soup): @@ -102,6 +103,8 @@ class RetrieveFindMyEpc: # 2) Bills estimates # 3) Recommendations and SAP points # 4) Low and zero carbon energy sources + # 5) The wall types of the property - used for determining if we have an extension wall insulation# + # recommendation ratings = address_res.find('desc', {'id': 'svg-desc'}).text current_rating = ratings.split(".")[0] @@ -208,6 +211,17 @@ class RetrieveFindMyEpc: if key not in assessment_data: raise ValueError(f"Missing key: {key}") + # The wall types of the property + property_features_table = address_res.find("tbody", class_="govuk-table__body") + property_features_table = property_features_table.find_all("tr") + + # Extract wall types + self.walls = [] + for row in property_features_table: + cells = row.find_all("td") + if row.find("th").text.strip() == "Wall": + self.walls.append(cells[0].text.strip()) + # Finally, we format the recommendations recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) @@ -229,8 +243,7 
@@ class RetrieveFindMyEpc: return resulting_data - @staticmethod - def format_recommendations(recommendations, assessment_data, sap_2012_date=None): + def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None): """ This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey :param recommendations: The recommendations from the EPC @@ -330,6 +343,8 @@ class RetrieveFindMyEpc: for rec in recommendations: mapped = measure_map[rec["measure"]] for measure in mapped: + if measure == "cavity_wall_insulation" and "solid brick" in self.walls[0].lower(): + measure = "extension_cavity_wall_insulation" to_append = { "type": measure, "sap_points": rec["sap_points"], diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 3432b744..cc50caae 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,5 +1,6 @@ import os import time +import pickle import pandas as pd import numpy as np @@ -20,7 +21,7 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True): +def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=False): epc_data = [] errors = [] no_epc = [] @@ -116,10 +117,14 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() except ValueError as e: if "No EPC found" in str(e) and "address1" in searcher.newest_epc: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) 
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_data = {} else: find_epc_data = {} except Exception as e: @@ -176,19 +181,33 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing" - DATA_FILENAME = "For Housing Data pull.xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "NO." + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People" + DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx" + SHEET_NAME = "Assets 1" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = "AddressLine1" ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"] + ADDRESS_COLS_TO_CONCAT = [] + MISSING_POSTCODES_METHOD = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) + + if MISSING_POSTCODES_METHOD is not None: + if MISSING_POSTCODES_METHOD == "last_two_words": + # Replace any double spaces + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) + asset_list["Postcode"] = np.where( + pd.isnull(asset_list["Postcode"]), + asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "), + asset_list["Postcode"] + ) + else: + raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized") + asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index @@ -217,29 +236,46 @@ def app(): asset_list = asset_list[~asset_list["deduper"].duplicated()] asset_list = asset_list.drop(columns=["deduper"]) - epc_data, errors, no_epc = get_data( - asset_list=asset_list, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - 
postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP - ) + # We chunk up this data into 5000 rows at a time + chunk_size = 5000 + epc_data = [] + errors = [] + no_epc = [] + skip = None # Used to skip already completed chunks + for i in range(0, len(asset_list), chunk_size): + print(f"Processing chunk {i} to {i + chunk_size}") + if skip is not None: + if i <= skip: + continue + chunk = asset_list[i:i + chunk_size] + epc_data_chunk, errors_chunk, no_epc_chunk = get_data( + asset_list=chunk, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP + ) - # We now retrieve any failed properties - asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] - epc_data_failed, _, _ = get_data( - asset_list=asset_list_failed, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP - ) + # We now retrieve any failed properties + chunk_failed = chunk[chunk["row_id"].isin(errors)] + epc_data_failed, _, _ = get_data( + asset_list=chunk_failed, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP, + epc_api_only=False + ) - no_data = asset_list[asset_list["row_id"].isin(no_epc)] - print(no_data[[FULLADDRESS_COLUMN, POSTCODE_COLUMN]]) + epc_data_chunk.extend(epc_data_failed) + errors.extend(errors_chunk) + no_epc.extend(no_epc_chunk) - # Append the failed data to the main data - epc_data.extend(epc_data_failed) + # Append the failed data to the main data + # Store the chunk locally as a csv + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + + epc_data.extend(epc_data_chunk) epc_df = pd.DataFrame(epc_data) @@ -339,7 +375,7 @@ def app(): "current-energy-efficiency": "SAP score on register", "current-energy-rating": "EPC rating on register", 
"property-type": "Property Type", - "built-form": "Archetype", + "built-form": "Archetype - EPC", "total-floor-area": "Property Floor Area", "construction-age-band": "Property Age Band", "floor-height": "Property Floor Height", @@ -375,7 +411,7 @@ def app(): num_floors=x["Estimated Number of Floors"], floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, perimeter=x["Estimated Perimeter (m)"], - built_form=x["Archetype"] + built_form=x["Archetype - EPC"] ), axis=1 ) @@ -406,3 +442,48 @@ def app(): matches_review = asset_list[ [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] ] + + +import requests +import base64 + +API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e" +URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20" +email = "itskruel@gmail.com" + +AUTH_TOKEN = base64.b64encode( + ":".join([email, API_KEY]).encode("utf-8") +) + +AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU=" + +headers = { + "Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN), + "Accept": "application/json", +} + +params = { + "UPRN": "766024370" +} + +response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370", + headers=headers) +response.json() + +data = response.json() + +from operator import itemgetter + +newest = sorted(data["rows"], key=itemgetter('lodgement-date')) +data["rows"][0]["lodgement-date"] +data["rows"][1]["lodgement-date"] + +import pandas as pd + +df = pd.DataFrame(data["rows"]) + +df["uprn"].values[2] + +df[df["uprn"] == "3455035000"]["property-type"] + +from backend.apis.GoogleSolarApi import GoogleSolarApi diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 15614a0b..03e651e8 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -503,7 +503,9 @@ class Recommendations: 
impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], "sap": sap + rec["sap_points"], "carbon": carbon - rec["co2_equivalent_savings"], "heat_demand": heat_demand - rec["heat_demand"], From 61544d01db865af74608e8d2e9d1ea3e9d727dde Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 10:14:14 +0000 Subject: [PATCH 169/255] updating data pull code --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 10 +- .../stonewater/potential_eco_properties.py | 12 +- etl/route_march_data_pull/app.py | 322 ++++++++++++++---- 5 files changed, 274 insertions(+), 74 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - +
diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index e1298565..f32dcea6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "49 Brailsford Road", - "postcode": "M14 6PT", - "uprn": 77145666, + "address": "19 Hillcrest Court", + "postcode": "IP21 4YJ", + "uprn": 2630134524, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 77145666, - "valuation": 337_000 + "uprn": 2630134524, + "valuation": 96_000 } ] # Store valuation data to s3 diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index eef82eae..6666ce15 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -368,9 +368,10 @@ def app(): additional_properties2 = additional_properties[[ "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3', - 'Same Postcode as Installed under ECO3' + 'Same Postcode as Installed under ECO3', "Organisation Reference", ]].rename( columns={ + "Organisation Reference": "Org. 
ref.", "SAP": "Parity - Predicted SAP", "SAP Band": "Parity - Predicted SAP Band", "Age": "Parity - Build Age", @@ -387,7 +388,12 @@ def app(): ) # Combine the data: - full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2]) + + stonewater_cavity_properties2 = stonewater_cavity_properties.merge( + features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference" + ) + full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2]) + full_dataset = full_dataset.drop(columns=['Osm. ID']) # We not define the priority list for non-intrusives full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2] @@ -414,7 +420,7 @@ def app(): df.to_csv( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - " - "revised list.xlsx", + "revised list.csv", index=False ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index cc50caae..dba85b3f 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,7 +1,6 @@ import os import time -import pickle - +from BaseUtility import Definitions import pandas as pd import numpy as np from tqdm import tqdm @@ -17,6 +16,10 @@ from recommendations.recommendation_utils import ( estimate_number_of_floors ) +from etl.epc_clean.epc_attributes.attribute_utils import ( + extract_thermal_transmittance +) + load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -158,6 +161,53 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"): raise ValueError(f"Method {method} not recognized") +def process_age_band(x, year_built_column): + year_built = float(x[year_built_column]) + + if pd.isnull(x["Property Age Band"]) or ( + x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES + ) or pd.isnull(year_built): + return "No EPC Age Band" + + # We check if we have a numeric data + if x["Property Age 
Band"].isdigit(): + if year_built == float(x["Property Age Band"]): + return "EPC Age Band Matches Year Built" + if year_built > float(x["Property Age Band"]): + return "EPC Age Band is older than Year Built" + if year_built < float(x["Property Age Band"]): + return "EPC Age Band is newer than Year Built" + + # Handle specific case + if x["Property Age Band"] == "England and Wales: 2007 onwards": + if year_built >= 2007: + return "EPC Age Band Matches Year Built" + if year_built < 2007: + return "EPC Age Band is older than Year Built" + + if x["Property Age Band"] == "England and Wales: before 1900": + if year_built < 1900: + return "EPC Age Band Matches Year Built" + if year_built >= 1900: + return "EPC Age Band is newer than Year Built" + + # Age band will be formatted as such: + # 'England and Wales: {upper date}-{lower date}' + # so we extract the lower and upper date + age_band = x["Property Age Band"].split(": ")[1] + lower_date, upper_date = age_band.split("-") + if year_built <= float(upper_date) and year_built <= float(upper_date): + return "EPC Age Band Matches Year Built" + + if year_built > float(upper_date): + return "EPC Age Band is older than Year Built" + + if year_built < float(upper_date): + return "EPC Age Band is newer than Year Built" + + raise Exception("Should not reach here") + + def app(): """ This app is EPC pulling data for some properties owned by Livewest @@ -179,17 +229,47 @@ def app(): Heat loss calculations EPC recommendations Property UPRN - """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People" - DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx" - SHEET_NAME = "Assets 1" + + # TODO: + # For cavity work: + # - Flag any entries that have a different wall type between non-intrusive data against EPC + # - Worth double checking entries that have a difference in wall construction + # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity + # 
- Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation + # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats + # are less than C75 + # - Flag anything pre SAP2012 + # - Flag anything over 5 years old + # - Look at year built vs age band + # + # For Solar: + # - Discount any that have solar PV - based on non-intrusives and from the inspections team + # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with + # electric room heaters but it might need to be an EPC E + # - Fabric - check the floor, wall and roof: + # - Filled or empty cavity is good + # - Insulated solid/timber/system built is good + # - SCIS/CEG needs solid floors + # - JJC don’t care + # - Anything with a loft 200 or below + # - Anything C75 and above won’t qualify + # - Insulated loft = 200mm + # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) + # - Or the insulation required is loft/cavity (floors should be solid) + + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight" + DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx" + SHEET_NAME = "Sheet1" POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = "AddressLine1" + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "HouseName" ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = [] + ADDRESS_COLS_TO_CONCAT = [ + "HouseName", "Block", "Address1" + ] MISSING_POSTCODES_METHOD = None + PROPERTY_YEAR_BUILT = 'Built In Year' # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -216,6 +296,7 @@ def app(): asset_list[col] = asset_list[col].astype(str) asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) + asset_list[col] = asset_list[col].str.strip() if ADDRESS1_COLUMN is None: 
ADDRESS1_COLUMN = "address1_extracted" @@ -226,7 +307,15 @@ def app(): if FULLADDRESS_COLUMN is None: FULLADDRESS_COLUMN = "fulladdress_extracted" # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas - asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + # Sometimes, some of the columns are empty, so we need to remove them + asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply( + lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1 + ) + + # We clean up portential non-breaking spaces, and double spaces + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str) + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False) + asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] @@ -237,8 +326,10 @@ def app(): asset_list = asset_list.drop(columns=["deduper"]) # We chunk up this data into 5000 rows at a time + # Create the chunks directory + if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")): + os.makedirs(os.path.join(DATA_FOLDER, "Chunks")) chunk_size = 5000 - epc_data = [] errors = [] no_epc = [] skip = None # Used to skip already completed chunks @@ -275,9 +366,19 @@ def app(): # Store the chunk locally as a csv pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) - epc_data.extend(epc_data_chunk) + # We read in and concatenate the created created chunks + chunks_folder = os.path.join(DATA_FOLDER, "Chunks") + # List the contents + chunk_files = os.listdir(chunks_folder) + epc_data = [] + for file in chunk_files: + csv_data = pd.read_csv(os.path.join(chunks_folder, file)) + # We need to convert the recommendations back to a list + csv_data["recommendations"] = 
csv_data["recommendations"].apply(eval) + csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) + epc_data.append(csv_data) - epc_df = pd.DataFrame(epc_data) + epc_df = pd.concat(epc_data) # We expand out the recommendations recommendations_df = epc_df[["row_id", "recommendations"]] @@ -302,9 +403,9 @@ def app(): transformed_data.append(row_data) transformed_df = pd.DataFrame(transformed_data) - # Drop the column that is "" - if "" in transformed_df.columns: - transformed_df = transformed_df.drop(columns=[""]) + # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation + # recommendations + transformed_df = transformed_df[["row_id", "Cavity wall insulation"]] # Get the find my epc data find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( @@ -342,7 +443,9 @@ def app(): "energy-consumption-current", # kwh/m2 "photo-supply", ] - ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}) + ].rename( + columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} + ) asset_list = asset_list.merge( epc_df, @@ -422,6 +525,138 @@ def app(): axis=1 ) + # We produce some additional fields + # 1) Is the SAP rating below C75 + asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75 + # 2) Flag anything where the EPC is older than 5 years + cutoff_year = pd.Timestamp.now().year - 5 + asset_list[f"EPC is pre {cutoff_year}"] = ( + pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year + ) + + # 3) If we have year in the asset list, we flag entries where the built year is different from the + # EPC Age band + if PROPERTY_YEAR_BUILT is not None: + asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( + lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 + ) + + # 4) Flag properties that look like they're good candidates for solar 
installs + # Firstly, flag if the fabric is completely done + + insulated_wall_substrings = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + insulated_roof_substrings = [ + "(another dwelling above)", "limited insulation", "(other premises above)", + ", no insulation", + ] + + def check_solar_insulation_conditions(x): + + if pd.isnull(x["Wall Construction"]): + return None + + if "average thermal transmittance" in x["Wall Construction"].lower(): + # We extract out the u-values + wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"] + roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"] + floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"] + + roof_uvalue = 0 if roof_uvalue is None else roof_uvalue + floor_uvalue = 0 if floor_uvalue is None else floor_uvalue + + # We apply some cutoffs + if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7: + return "Walls, Roof and Floor have U-values below 0.7" + + return "Confirm U-values" + + walls_insulated = any( + insulated_substring in x["Wall Construction"].lower() for insulated_substring in insulated_wall_substrings + ) + roof_is_numeric = False + if str(x["Roof Insulation Thickness"]).isdigit(): + roof_is_numeric = True + roof_insulated = int(x["Roof Insulation Thickness"]) >= 200 + else: + roof_insulated = any( + insulated_substring in x["Roof Construction"].lower() for insulated_substring in + insulated_roof_substrings + ) + + floor_is_solid = "solid" in x["Floor Construction"].lower() + + if walls_insulated and roof_insulated and floor_is_solid: + return "Walls Insulated, Roof Insulated, Floor Solid" + + if walls_insulated and floor_is_solid and roof_is_numeric: + return "Walls Insulated, Floor Solid, Loft need top-up" + + return "Not Fully Insulated or no data" + + asset_list["Solar Fabric Condition"] = 
asset_list.apply(check_solar_insulation_conditions, axis=1) + + asset_list["Good Solar Candidate"] = ( + asset_list["SAP Rating is 75 and below"] & + ~asset_list["Has Solar PV"] & + ( + asset_list["Heating Type"].isin( + [ + "Electric storage heaters", + "Room heaters, electric", + ] + ) | asset_list["Heating Type"].str.contains("heat pump", case=False) + ) & ( + asset_list["Solar Fabric Condition"].isin( + [ + "Walls Insulated, Roof Insulated, Floor Solid", + "Walls, Roof and Floor have U-values below 0.7", + "Walls Insulated, Floor Solid, Loft need top-up" + ] + ) + ) + ) + + def flat_analysis(asset_list): + + # We need to deduce the building name - we strip out the house number + def extract_building_name(x): + # TODO: This doesn't really work + if pd.isnull(x): + return None + house_no = SearchEpc.get_house_number(address=x, postcode=None) + if house_no: + return x.replace(house_no, "").strip() + return x.split(",")[0].strip() + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = asset_list.groupby(["Postcode", "Property Type"]) + + flat_data = [] + for _, group in grouped: + if "flat" in group["Property Type"].str.lower().values: + num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) + num_below_c75 = group["SAP score on register"].lt(75).sum() + + flat_data.append( + { + "Postcode": group["Postcode"].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) + } + ) + + flat_data = pd.DataFrame(flat_data) + + return flat_data + + flat_data = flat_analysis(asset_list) + # For all of the columns in transformed_df, prefix with "Recommendation: " for col in transformed_df.columns: if col == "row_id": @@ -436,54 +671,13 @@ def app(): asset_list = asset_list.drop(columns=["row_id", "index"]) # Store as an excel - filename = 
os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx" - asset_list.to_excel(filename, index=False) + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data + + with pd.ExcelWriter(filename) as writer: + asset_list.to_excel(writer, sheet_name="EPC Data", index=False) + flat_data.to_excel(writer, sheet_name="Flat Data", index=False) matches_review = asset_list[ [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] ] - - -import requests -import base64 - -API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e" -URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20" -email = "itskruel@gmail.com" - -AUTH_TOKEN = base64.b64encode( - ":".join([email, API_KEY]).encode("utf-8") -) - -AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU=" - -headers = { - "Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN), - "Accept": "application/json", -} - -params = { - "UPRN": "766024370" -} - -response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370", - headers=headers) -response.json() - -data = response.json() - -from operator import itemgetter - -newest = sorted(data["rows"], key=itemgetter('lodgement-date')) -data["rows"][0]["lodgement-date"] -data["rows"][1]["lodgement-date"] - -import pandas as pd - -df = pd.DataFrame(data["rows"]) - -df["uprn"].values[2] - -df[df["uprn"] == "3455035000"]["property-type"] - -from backend.apis.GoogleSolarApi import GoogleSolarApi From 959d29b675a6b8e6c57074d5a9fe5a3973ed1d96 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 15:20:55 +0000 Subject: [PATCH 170/255] allowing optional ashp cop parameter --- backend/app/plan/router.py | 5 +++-- 
backend/app/plan/schemas.py | 2 ++ etl/customers/l_and_g/ic_slides.py | 5 ++++- recommendations/Recommendations.py | 25 +++++++++++++++++++------ 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 04a2ef7f..f85ceacc 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -370,7 +370,7 @@ def extract_property_request_data( property_non_invasive_recommendations["recommendations"] = str(transformed) # Check if the valuation data has uprn - valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else True + valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False if valuation_has_uprn: valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None] @@ -692,7 +692,8 @@ async def trigger_plan(body: PlanTriggerRequest): Recommendations.calculate_recommendation_tenant_savings( property_instance=property_instance, kwh_simulation_predictions=kwh_simulation_predictions, - property_recommendations=property_recommendations + property_recommendations=property_recommendations, + ashp_cop=body.ashp_cop ) ) property_instance.current_energy_bill = property_current_energy_bill diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index f84912fe..618bec90 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -80,3 +80,5 @@ class PlanTriggerRequest(BaseModel): multi_plan: Optional[bool] = False optimise: Optional[bool] = True default_u_values: Optional[bool] = True + + ashp_cop: Optional[float] = 2.8 diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py index 72dfc2c0..a5cb3511 100644 --- a/etl/customers/l_and_g/ic_slides.py +++ b/etl/customers/l_and_g/ic_slides.py @@ -132,7 +132,7 @@ def get_data(portfolio_id, scenario_ids): return properties_data, plans_data, recommendations_data -properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[199]) 
+properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[205]) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) @@ -240,4 +240,7 @@ df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) +df["Recommendation: Air Source Heat Pump"].sum() +df["Cost: Air Source Heat Pump"].sum() + df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False) diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 03e651e8..42f4e783 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -649,7 +649,9 @@ class Recommendations: return property_recommendations, impact_summary @staticmethod - def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description): + def map_descriptions_to_fuel( + heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types + ): # Handle the case of community schemes if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"): @@ -662,7 +664,7 @@ class Recommendations: } raise NotImplementedError("Handle this case") - mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description] + mapped = descriptions_to_fuel_types[heating_description] heating_fuel = mapped["fuel"] if hotwater_description in [ @@ -682,7 +684,7 @@ class Recommendations: "heating_cop": mapped["cop"], "hotwater_cop": 1 } - mapped_hotwater = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description] + mapped_hotwater = descriptions_to_fuel_types[hotwater_description] return { "heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"], @@ -691,7 +693,7 @@ class Recommendations: @classmethod def 
calculate_recommendation_tenant_savings( - cls, property_instance, kwh_simulation_predictions, property_recommendations + cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None ): """ This method inserts the kwh savings and the bill savings that the customer will make from the recommendations @@ -703,9 +705,12 @@ class Recommendations: :param property_instance: Instance of the Property class, for the home associated to property_id :param kwh_simulation_predictions: dictionary of predictions from the model apis :param property_recommendations: dictionary of recommendations for the property + :param ashp_cop: The coefficient of performance for the air source heat pump. :return: """ + ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY + kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][ kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id) ].merge( @@ -774,12 +779,19 @@ class Recommendations: if kwh_impact_table.loc[i, col] > previous_phase[col].max(): kwh_impact_table.loc[i, col] = previous_phase[col].max() + descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES + # We will the air source heat pump efficiencies + ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()] + for k in ashp_keys: + descriptions_to_fuel_types[k]["cop"] = ashp_cop + # For heating system recommendations, this could result in a fuel type change so we reflect that fuel_mapping = pd.DataFrame([ { "id": epc["id"], **cls.map_descriptions_to_fuel( - epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"] + epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"], + descriptions_to_fuel_types ) } for epc in property_instance.updated_simulation_epcs ]) @@ -793,7 +805,8 @@ class Recommendations: **cls.map_descriptions_to_fuel( property_instance.data["mainheat-description"], 
property_instance.data["hotwater-description"], - property_instance.data["main-fuel"] + property_instance.data["main-fuel"], + descriptions_to_fuel_types ) } ] From 6396f081c15a56dcb799db1edd64edbb89c56921 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 16:19:52 +0000 Subject: [PATCH 171/255] stonewater extracting age --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/stonewater/Wave 3 Preparation.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b2a92e4c..24a8e9bb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -125,6 +125,7 @@ def extract_summary_report(pdf_path): - Address """ + blah data = { "Address": None, "Postcode": None, @@ -701,6 +702,7 @@ def extract_epr(pdf_path): "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, + "Main Building Age Band": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -779,6 +781,10 @@ def extract_epr(pdf_path): floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) + # Extract age band + age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text) + data["Main Building Age Band"] = age_band_match.group(1) + # Extract Number of Storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = 
int(storeys_match.group(1)) @@ -3022,7 +3028,6 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) # Check that the survey folder is actually a folder From 84d4070b490a04d0cf4fdefc20ab4aaaab1d7d05 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 12 Feb 2025 17:10:21 +0000 Subject: [PATCH 172/255] extracting from ima --- .../stonewater/Wave 3 Preparation.py | 61 ++++++++++++++++++- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 24a8e9bb..e471211c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -125,13 +125,13 @@ def extract_summary_report(pdf_path): - Address """ - blah data = { "Address": None, "Postcode": None, "Current SAP Rating": None, "Current EPC Band": None, "Fuel Bill": None, + "Main Building Age Band": None, "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -181,6 +181,10 @@ def extract_summary_report(pdf_path): sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + # Extract age + age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text) + data["Main Building Age Band"] = age_band_match.group(1) + # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) @@ -3027,6 +3031,7 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] + mtp_extracted_data = [] # Additional data to extract from the medium term plans for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) @@ -3048,6 
+3053,58 @@ def revised_model(): None ) + mtp_folder = next( + (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()), + None + ) + if mtp_folder: + # We have a mid term plan: + mtp_folder_path = os.path.join(survey_folder_path, mtp_folder) + # Get the contents - files and not folder + mtp_contents = [ + os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) + if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) + ] + # We check the the IMA + for file_name in mtp_contents: + filepath = os.path.join(survey_folder_path, file_name) + # We expect a pdf so try and parse it + try: + with open(filepath, "rb") as file: + reader = PyPDF2.PdfReader(file) + # Just the first page + text = reader.pages[0].extract_text() + + except Exception as e: + continue + + # We check if this is an IMA + ima_heading_search = re.search( + r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text + ) + + is_ima = bool(ima_heading_search) + if not is_ima: + continue + + # Otherwise, extract: RIR, PV + pv_search = re.search(r"PV \(\d+Kwp\)", text) + has_pv = bool(pv_search) + pv_system = pv_search.group(0) if has_pv else None + + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) + has_rir = bool(rir_search) + rir_spec = rir_search.group(0) if has_rir else None + + mtp_extracted_data.append({ + "survey_folder": survey_folder, + "has_pv": has_pv, + "PV System": pv_system, + "RIR Specification": rir_spec, + "has_rir": has_rir + }) + continue + # If retrofit assessment folder exists, check if it has content if retrofit_folder or ra_folder: if retrofit_folder: @@ -3094,7 +3151,7 @@ def revised_model(): retrofit_assessment_data = pd.DataFrame(extracted_data) # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 
Project/Retrofit Assessment Data Sheet 3.csv"), index=False # ) retrofit_assessment_data = pd.read_csv( os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), From 711db3f552e958128faeb49a22073e5461dbc4f6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 07:59:12 +0000 Subject: [PATCH 173/255] adding v1 extraction to stonewater --- .../stonewater/Wave 3 Preparation.py | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index e471211c..12158671 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -182,7 +182,10 @@ def extract_summary_report(pdf_path): data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] # Extract age - age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text) + age_band_match = re.search( + r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) data["Main Building Age Band"] = age_band_match.group(1) # Number of storeys @@ -786,7 +789,11 @@ def extract_epr(pdf_path): data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) # Extract age band - age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text) + age_band_match = re.search( + r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) + data["Main Building Age Band"] = age_band_match.group(1) # Extract Number of Storeys @@ -3065,8 +3072,21 @@ def revised_model(): os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) ] + + has_v1 = [ + f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower() 
+ ] + + if has_v1: + # Then we go one level deeper + mtp_contents = [ + os.path.join(has_v1[0], f) for f in + os.listdir(os.path.join(survey_folder_path, has_v1[0])) + ] + # We check the the IMA for file_name in mtp_contents: + filepath = os.path.join(survey_folder_path, file_name) # We expect a pdf so try and parse it try: @@ -3092,6 +3112,12 @@ def revised_model(): has_pv = bool(pv_search) pv_system = pv_search.group(0) if has_pv else None + # We perform a second search for PV: + if pv_search is None: + pv_search = re.search("solar pv", text.lower()) + has_pv = bool(pv_search) + pv_system = "Solar PV" if has_pv else None + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) has_rir = bool(rir_search) rir_spec = rir_search.group(0) if has_rir else None @@ -3149,12 +3175,20 @@ def revised_model(): extracted_data.append(summary_data) retrofit_assessment_data = pd.DataFrame(extracted_data) + mtp_df = pd.DataFrame(mtp_extracted_data) + # Save # retrofit_assessment_data.to_csv( # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False # ) + # mtp_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False + # ) retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), + ) + mtp_df = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), ) # Remove some definite duplicates @@ -3164,6 +3198,9 @@ def revised_model(): # Get all of the folders that end with ROSS to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + retrofit_assessment_data = retrofit_assessment_data[ 
~retrofit_assessment_data["survey_folder"].isin( [ @@ -3173,8 +3210,6 @@ def revised_model(): ] + to_drop ) ] - # Replace \n with "" - retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") retrofit_assessments_data_columns = [ 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', @@ -3685,9 +3720,17 @@ def revised_model(): if not missed_asset_id.empty: raise Exception("Missing Asset ID") + # We merge the mpt data on to the wates coordination + wates_coordination = wates_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( ccs_matching_lookup, how="left", on="Name" ) + ccs_coordination = ccs_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) retrofit_packages_board = retrofit_packages_board.merge( matching_lookup, how="left", on="Name" From b8a094106c7a8ff7260648ba18d8d48b8f8715e1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 17:28:47 +0000 Subject: [PATCH 174/255] updating stonewater --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 12 ++-- .../stonewater/Wave 3 Preparation.py | 72 ++++++++++++------- etl/customers/stonewater/data_cleaning.py | 59 ++++++++------- 5 files changed, 89 insertions(+), 58 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index f32dcea6..70ceb76d 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 
127 +PORTFOLIO_ID = 128 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "19 Hillcrest Court", - "postcode": "IP21 4YJ", - "uprn": 2630134524, + "address": "46", + "postcode": "BS6 7BD", + "uprn": 61091, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 2630134524, - "valuation": 96_000 + "uprn": 61091, + "valuation": 897_000 } ] # Store valuation data to s3 diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 12158671..94904aae 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3028,11 +3028,12 @@ def revised_model(): "10. Little Island", "11. CCS Dorset" ] + wave_21_folder_name = "Wave 2.1 Surveys - 2" for wave_2_1_folder in wave_21_folders: - folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder) + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) if os.path.isdir(folder_path): # Check if folder exists - folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in + folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in os.listdir(folder_path)] survey_folders.extend(folder_contents) # Append contents to the master list @@ -3179,18 +3180,32 @@ def revised_model(): # Save # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False # ) # mtp_df.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False # ) retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, 
"Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), ) mtp_df = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), ) + # There are a few duplicates we just manually drop + mtp_df = mtp_df.drop_duplicates() + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27" + ) & (~mtp_df["has_pv"])) + ] + + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5" + ) & (~mtp_df["has_pv"])) + ] + # Remove some definite duplicates dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] @@ -3487,7 +3502,7 @@ def revised_model(): ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] ccs_manual_filters = { - "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35" + "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35" } ccs_matching_lookup = [] for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): @@ -3583,13 +3598,13 @@ def revised_model(): ] wates_manual_filters = { - "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View", - "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft", - "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31 Rabley Wood View", - 'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13', - "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4", - '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1', - '2 Jephson Court': 'Wave 2.1 Surveys/5. 
Coventry/Jesphson Court 2', + "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1', + '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2', } wates_matching_lookup = [] # Examples to skip when we cannot get the data @@ -3720,6 +3735,9 @@ def revised_model(): if not missed_asset_id.empty: raise Exception("Missing Asset ID") + if wates_coordination["Asset ID_x"].duplicated().sum(): + raise Exception("Duplicated IDs in wates") + # We merge the mpt data on to the wates coordination wates_coordination = wates_coordination.merge( mtp_df, how="left", on="survey_folder" @@ -3839,29 +3857,31 @@ def revised_model(): def find_nearest_matching_property(coordinated_packages, home): filter_levels = [ - ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], - ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], - ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4), + (["Primary Property 
Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6), ] - for i, filters in enumerate(filter_levels): + max_confidence = max([confidence for (_, confidence) in filter_levels]) + + for i, (filters, match_confidence) in enumerate(filter_levels): match = coordinated_packages.copy() for col in filters: match = match[match[col] == home[col]] if not match.empty: - return match + return match, match_confidence # Finally, we search for a property in the same Archetype match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] if not match.empty: - return match + return match, max_confidence + 1 - return None # No match found + return None, None # No match found coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() @@ -3896,8 +3916,8 @@ def revised_model(): ] matches.extend(to_extend) continue - - closest_match = find_nearest_matching_property(coordinated_packages, home) + blah + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) if closest_match is None: no_match.append(home["Organisation Reference"]) continue diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 010902ce..a5da0c79 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -86,8 +86,14 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" ) + folders_to_keep = [ + "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth", + "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire", + "9. Guildford", "10. Little Island", "11. 
CCS Dorset", + ] + folders_to_pull = [ - folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. Coventry"] + folder for folder in contents["value"] if folder["name"] in folders_to_keep ] for folder_to_pull in folders_to_pull: # Get the contents @@ -109,35 +115,40 @@ def download_data_from_sharepoint(): ) if not property_folder_contents.get("value"): continue - # We look for the retrofit assessment folder: + # We look for the retrofit assessment folder or mtp folders: property_sub_folders = [ - f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() + f for f in property_folder_contents["value"] if + "ra coordinator info" in f["name"].lower() or + "retrofit assessment" in f["name"].lower() or + "ra info" in f["name"].lower() or + "mtp" in f["name"].lower() or + "mid-term" in f["name"].lower() ] if not property_sub_folders: continue - # if we have this, we download the folder and store it on my laptop! - property_sub_folder = property_sub_folders[0] + for property_sub_folder in property_sub_folders: + # if we have this, we download the folder and store it on my laptop! 
- property_folder_path = os.path.join( - "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + property_folder_path = os.path.join( + "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - download_dir = os.path.join( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + download_dir = os.path.join( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - # We download the folder - sharepoint_client.download_sharepoint_folder( - drive_id=sharepoint_client.document_drive["id"], - folder_path=property_folder_path, - download_dir=download_dir, - excluded_file_types=["MOV", "jpg"] - ) + # We download the folder + sharepoint_client.download_sharepoint_folder( + drive_id=sharepoint_client.document_drive["id"], + folder_path=property_folder_path, + download_dir=download_dir, + excluded_file_types=["MOV", "jpg"] + ) From bd131a2f663056fb46a906d8f148b2bcc06cd871 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 22:32:31 +0000 Subject: [PATCH 175/255] preparing outputs for stonewater --- .../stonewater/Wave 3 Preparation.py | 77 +++++++++++++++---- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 94904aae..50dadcaf 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2984,6 +2984,8 @@ def revised_model(): original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) 
original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str) + wave_21_folder_name = "Wave 2.1 Surveys - 2" + # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) @@ -3028,7 +3030,6 @@ def revised_model(): "10. Little Island", "11. CCS Dorset" ] - wave_21_folder_name = "Wave 2.1 Surveys - 2" for wave_2_1_folder in wave_21_folders: folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) @@ -3252,7 +3253,9 @@ def revised_model(): 'Main Wall Thickness', 'Main Building Alternative Wall Type', 'Main Building Alternative Wall Insulation', 'Main Building Alternative Wall Dry-lining', - 'Main Building Alternative Wall Thickness', 'Main Fuel' + 'Main Building Alternative Wall Thickness', + 'Main Fuel', + 'Main Building Age Band', ] # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] @@ -3795,7 +3798,8 @@ def revised_model(): "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y", + 'Ventilation', 'Heating', 'Other Measures', 'PV System', + "Asset ID.1_y", ] + retrofit_assessments_data_columns_prefixed ].rename( columns={ @@ -3811,6 +3815,7 @@ def revised_model(): 'Heating': 'Main Heating', 'Other Measures': 'Other measures', 'Asset ID.1_y': 'Organisation Reference', + "PV System": "Solar PV", } ), wates_coordination[ @@ -3818,8 +3823,7 @@ def revised_model(): "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', 'SAP Band Install Package', 'Package Approved (Client)', 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. 
Doors Upgrade', - 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x' - + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System" ] + retrofit_assessments_data_columns_prefixed ].rename( columns={ @@ -3835,6 +3839,7 @@ def revised_model(): 'Heating': 'Main Heating', 'Other Measures': 'Other measures', 'Asset ID_x': 'Organisation Reference', + "PV System": "Solar PV", } ) ] @@ -3857,12 +3862,12 @@ def revised_model(): def find_nearest_matching_property(coordinated_packages, home): filter_levels = [ - (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1), - (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), - (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), - (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4), - (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5), - (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6), + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7), ] max_confidence = max([confidence for (_, confidence) in filter_levels]) @@ -3911,12 +3916,13 @@ def revised_model(): { "Organisation Reference": home["Organisation Reference"], "Best Match Organisation Reference": m, + "match_confidence": 1, "Was Surveyed": True } for m in survey_result["Organisation Reference"].values ] matches.extend(to_extend) continue - blah + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) 
if closest_match is None: no_match.append(home["Organisation Reference"]) @@ -3926,6 +3932,7 @@ def revised_model(): { "Organisation Reference": home["Organisation Reference"], "Best Match Organisation Reference": m, + "match_confidence": match_confidence, "Was Surveyed": False } for m in closest_match["Organisation Reference"].values ] @@ -3953,10 +3960,29 @@ def revised_model(): suffixes=("", " - Closest Match") ) + measures_columns = [ + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures' + ] + # We want to aggregate the matches, when we have multiple aggregated_matches_df = [] for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): + + measures = coordinated_packages[ + ( + coordinated_packages["Organisation Reference"].isin( + mapped_matches['Best Match Organisation Reference'].values + ) + ) + ][measures_columns] + if mapped_matches.shape[0] == 1: + # Get the measures for this property + measures = measures.squeeze() + aggregated_matches_df.append( { "Organisation Reference": org_ref, @@ -3965,6 +3991,7 @@ def revised_model(): "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], "Was Surveyed": mapped_matches["Was Surveyed"].values[0], + **measures } ) continue @@ -3978,6 +4005,17 @@ def revised_model(): mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ 0] / number_of_matches * 100 ) + + measures_aggregated = {} + for m in measures_columns: + if any(~pd.isnull(measures[m])): + # Check if we have 2 unique values + vals = measures[~pd.isnull(measures[m])][m].unique() + if len(vals) > 1: + measures_aggregated[m] = ", ".join(vals) + else: + measures_aggregated[m] = vals[0] + aggregated_matches_df.append( { "Organisation 
Reference": org_ref, @@ -3985,7 +4023,8 @@ def revised_model(): "Proportion": proportion_with_this_epc, "Estimated SAP Rating": average_rating, "Estimated EPC Rating": average_epc_rating, - "Was Surveyed": False + "Was Surveyed": False, + **measures_aggregated } ) @@ -4002,7 +4041,6 @@ def revised_model(): def remove_leading_zero(address): return re.sub(r"^0([1-9]) ", r"\1 ", address) - # Example usage mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) mapped_priority_list["address1"] = np.where( mapped_priority_list["Organisation Reference"] == 37004, @@ -4020,6 +4058,13 @@ def revised_model(): ) mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + # Flag where 2 out of the three columns have consensus + mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = ( + (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) | + (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) | + (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"]) + ) + # Let's get the newest EPC data for these properties # We merge on UPRN, when we have it # from etl.route_march_data_pull.app import get_data @@ -4081,6 +4126,7 @@ def revised_model(): 'Survey: Main Building Alternative Wall Dry-lining', 'Survey: Main Building Alternative Wall Thickness', 'Survey: Main Fuel', + 'Survey: Main Building Age Band', 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' ] ].rename( @@ -4133,7 +4179,8 @@ def revised_model(): [ "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', - 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', + 'Survey: Main Building Wall Area (m2)', ] ].rename( columns={ From 846cd99631923224d4ba8d776bdeaed35b08884a 
Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 15 Feb 2025 16:05:57 +0000 Subject: [PATCH 176/255] switch off solar PV if property is listed/heritage or in a conservation area --- backend/Property.py | 5 ++++ backend/app/plan/router.py | 6 ++-- etl/customers/lambeth/re-knocks.py | 23 +++++++++++++++ .../stonewater/Wave 3 Preparation.py | 28 +++++++++++++------ etl/route_march_data_pull/app.py | 22 +++++++-------- 5 files changed, 62 insertions(+), 22 deletions(-) create mode 100644 etl/customers/lambeth/re-knocks.py diff --git a/backend/Property.py b/backend/Property.py index a495431f..e19970eb 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -395,6 +395,7 @@ class Property: primary_recommendation_id=rec["recommendation_id"], non_invasive_recommendations=self.non_invasive_recommendations, ) + self.recommendations_scoring_data.append(scoring_dict) simulation_epc = self.epc_record.prepared_epc.copy() @@ -1258,6 +1259,10 @@ class Property: if (self.building_id is not None) and (self.solar_panel_configuration is not None): return True + # If the property is in a conservation area, don't recommend + if self.restricted_measures: + return False + is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"] is_valid_roof_type = ( self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"] diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index f85ceacc..949c8e4c 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -639,8 +639,10 @@ async def trigger_plan(body: PlanTriggerRequest): recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) recommendations_scoring_data = recommendations_scoring_data.drop( - columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", - "carbon_ending"] + columns=[ + "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", + 
"carbon_ending" + ] ) all_predictions = await model_api.async_paginated_predictions( diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py new file mode 100644 index 00000000..1de91b50 --- /dev/null +++ b/etl/customers/lambeth/re-knocks.py @@ -0,0 +1,23 @@ +import pandas as pd + +data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx", sheet_name="Possible Route", + header=1 +) + +data["Outcomes"].value_counts() + +# Strip out: No + +df = data[data["Outcomes"] == "See notes"] +notes_df = df[ + ("Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)")].value_counts().to_frame() + +example = df[df["Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)"] == ('Access to rear of property only through number 10. Overgrown athe rear of property ' + 'installer wont be able to access') + ] + +# 18 did not attend +# diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 50dadcaf..95fe4fcd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -4093,7 +4093,9 @@ def revised_model(): 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', - 'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band', + 'Solar PV', 'Other measures', + 'Survey: Current SAP Rating', + 'Survey: Current EPC Band', 'Survey: Primary Energy Use (kWh/yr)', 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', 'Survey: Number of Storeys', 'Survey: Fuel Bill', @@ -4148,7 +4150,8 @@ def revised_model(): 'Best Match Organisation Reference', 'Survey: Current EPC Band', 'Survey: Current SAP Rating', - "Was Surveyed" + "Was Surveyed", + 
"match_confidence", ] ].rename( columns={ @@ -4157,11 +4160,13 @@ def revised_model(): 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" } ).merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]], how="left", on="Organisation Reference" ).merge( - features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]].rename( columns={ "Organisation Reference": "Best Match - Organisation Reference", "Walls": "Best Match - Walls", @@ -4169,7 +4174,8 @@ def revised_model(): "Heating": "Best Match - Heating", "Main Fuel": "Best Match - Main Fuel", "Age": "Best Match - Age", - "Property Type": "Best Match - Property Type" + "Property Type": "Best Match - Property Type", + "Total Floor Area": "Best Match - Total Floor Area" } ), how="left", @@ -4180,7 +4186,8 @@ def revised_model(): "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', - 'Survey: Main Building Wall Area (m2)', + 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)', + 'Survey: Main Building Age Band', ] ].rename( columns={ @@ -4203,7 +4210,12 @@ def revised_model(): 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', - 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed" + 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed", + 
'Main Wall Insulation', + 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', + 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', + 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', + 'Other measures', "2 of 3 Data Sources Have Consensus on EPC" ] ].rename( columns={ @@ -4271,7 +4283,7 @@ def revised_model(): worksheet = worksheet.drop(columns=["Last EPC - uprn"]) # Save to Excel with multiple sheets - excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx") + excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx") with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index dba85b3f..1b937b2d 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -258,18 +258,16 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight" - DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "HouseName" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" + DATA_FILENAME = "Stonewater All Props for EPC Check 10.02.25.xlsx" + SHEET_NAME = "stonewater sap, insta" + POSTCODE_COLUMN = "Post Code" + FULLADDRESS_COLUMN = "Name" + ADDRESS1_COLUMN = "Name" ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = [ - "HouseName", "Block", "Address1" - ] + ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - 
PROPERTY_YEAR_BUILT = 'Built In Year' + PROPERTY_YEAR_BUILT = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -633,7 +631,7 @@ def app(): # We want to deduce if flats have 50% of the properties below C75 # We group by postcode and property type - grouped = asset_list.groupby(["Postcode", "Property Type"]) + grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) flat_data = [] for _, group in grouped: @@ -643,7 +641,7 @@ def app(): flat_data.append( { - "Postcode": group["Postcode"].iloc[0], + "Postcode": group[POSTCODE_COLUMN].iloc[0], "Property Type": "Flat", "Number of Flats with EPC": num_flats, "Number of Flats below C75": num_below_c75, From ebed7027ac721353593f089e015a9467ae6fa43e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 15 Feb 2025 22:33:01 +0000 Subject: [PATCH 177/255] adding minimums for the number of SAP points solar PV will deliver --- backend/Property.py | 4 +++- recommendations/Recommendations.py | 7 +++++++ recommendations/SolarPvRecommendations.py | 21 ++++++++++++++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index e19970eb..eaffd54d 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1259,7 +1259,9 @@ class Property: if (self.building_id is not None) and (self.solar_panel_configuration is not None): return True - # If the property is in a conservation area, don't recommend + # If the property is in a conservation area, is listed or is a heriage building, solar panels + # become a difficult measure to generally get through planning restrictions and so we do not recommend + # solar panels if self.restricted_measures: return False diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 42f4e783..715332a5 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -623,6 +623,13 @@ class Recommendations: if li_sap_limit is not None: 
property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit) + if rec["type"] == "solar_pv": + # We use the SAP points in the recommendation as a minimum + property_phase_impact["sap"] = ( + rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else + property_phase_impact["sap"] + ) + # Insert this information into the recommendation. if not rec.get("survey", False): rec["sap_points"] = property_phase_impact["sap"] diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index 95f189d3..a97dbcb3 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -14,11 +14,16 @@ class SolarPvRecommendations: # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group SOLAR_PANEL_WATTAGE = 400 + # For domestic properties, we don't recommend a solar PV system with wattage outside of these + # bounds MAX_SYSTEM_WATTAGE = 6000 MIN_SYSTEM_WATTAGE = 1000 + # the maximum area of root we allow to be covered in solar panels for our recommendations. MAX_ROOF_AREA_PERCENTAGE = 0.7 + SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1 + def __init__(self, property_instance): """ :param property_instance: Instance of the Property class, for the home associated to property_id @@ -212,6 +217,20 @@ class SolarPvRecommendations: roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / roof_area * 100) # We round up to the nearest 5 roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5 + + # Typically, we've observed that every 5% of additional roof coverage will result in at least + # an additional 1 SAP points (though often 2 points) Given this, we can add a reasonable minimum + # for the number of SAP points we might expect. We've observed that for some cases where properties + # are hitting the higher SAP scores (e.g. 
EPC A and above), the model can sometimes under-predict + # the number of SAP points. This appears to be due to a relatively small number of properties + # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a + # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels. + # Because panels are the final recommendation, they are often the measure that takes the home + # into the medium to high EPC A ranges and so because of a lack of training data, this means that + # we might sometime under-predict. This minimum is intended to try and reduce the negative impact + # of this. This minimum is used in Recommendations.calculate_recommendation_impact + minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE + for has_battery in [False, True]: cost_result = self.costs.solar_pv( has_battery=has_battery, @@ -240,7 +259,7 @@ class SolarPvRecommendations: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": minimum_sap_points, "already_installed": already_installed, **cost_result, # This is required for simulating the SAP impact. 
solar_pv_percentage is between 0 & 1 so we From 89d49690b5c9ca4efb89f3879bb7c414098e5ea2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 16 Feb 2025 17:02:51 +0000 Subject: [PATCH 178/255] added extraction of windows sap point --- etl/customers/remote_assessments/app.py | 12 ++++++------ recommendations/WindowsRecommendations.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 70ceb76d..cce0f4fb 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 128 +PORTFOLIO_ID = 129 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,9 @@ def app(): asset_list = [ { - "address": "46", - "postcode": "BS6 7BD", - "uprn": 61091, + "address": "19", + "postcode": "IP21 4YJ", + "uprn": 2630134524, } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 61091, - "valuation": 897_000 + "uprn": 2630134524, + "valuation": 96_000 } ] # Store valuation data to s3 diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py index 1f755369..46e56c93 100644 --- a/recommendations/WindowsRecommendations.py +++ b/recommendations/WindowsRecommendations.py @@ -215,21 +215,29 @@ class WindowsRecommendations: "glazed-type": glazed_type_ending, } + measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing" + + non_invasive_recommendation = next( + (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]), + {} + ) + self.recommendation = [ { "phase": phase, "parts": [], "type": "windows_glazing", - "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing", + 
"measure_type": measure_type, "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, **cost_result, "is_secondary_glazing": is_secondary_glazing, "description_simulation": description_simulation, "simulation_config": simulation_config, + "survey": non_invasive_recommendation.get("survey", None), } ] From c09b693922c8c3c8ac55648de2772312f319d487 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 16 Feb 2025 18:25:17 +0000 Subject: [PATCH 179/255] minor tweaks to engine during remote assessments --- backend/app/assumptions.py | 1 + backend/app/plan/router.py | 2 +- etl/customers/remote_assessments/app.py | 14 ++++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 841ec2c1..8d0c05be 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -54,4 +54,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = { "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85}, "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1}, "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, + "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85}, } diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 949c8e4c..76c172ee 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -338,7 +338,7 @@ def extract_property_request_data( # Because we have some non-invasive recommendations that match on address and postcode, but not UPRN # we need to check existence of uprn - has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True + has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False if has_uprn: has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None] diff --git 
a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index cce0f4fb..ad97fd41 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 129 +PORTFOLIO_ID = 132 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,9 +19,11 @@ def app(): asset_list = [ { - "address": "19", - "postcode": "IP21 4YJ", - "uprn": 2630134524, + "address": "3", + "postcode": "BB8 0JF", + "uprn": 100010509503, + "property_type": "House", + "built_form": "End-Terrace", } ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +54,8 @@ def app(): valuation_data = [ { - "uprn": 2630134524, - "valuation": 96_000 + "uprn": 100010509503, + "valuation": 116_000 } ] # Store valuation data to s3 From 764dc7901f2e7fc117a4df1053b7d9fe7eb9ad34 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 18 Feb 2025 12:20:04 +0000 Subject: [PATCH 180/255] setting up EPC data extraction process for creation of reports --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/remote_assessments/app.py | 14 +-- etl/route_march_data_pull/app.py | 16 +-- survey_report/app.py | 152 +++++++++++++++++++++--- 5 files changed, 151 insertions(+), 35 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index ad97fd41..15f59c5e 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import 
AssetListEpcData -PORTFOLIO_ID = 132 +PORTFOLIO_ID = 133 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,11 +19,9 @@ def app(): asset_list = [ { - "address": "3", - "postcode": "BB8 0JF", - "uprn": 100010509503, - "property_type": "House", - "built_form": "End-Terrace", + "address": "40", + "postcode": "PE4 5BB", + "uprn": 100090220519, } ] asset_list = pd.DataFrame(asset_list) @@ -54,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 100010509503, - "valuation": 116_000 + "uprn": 100090220519, + "valuation": 135_000 } ] # Store valuation data to s3 diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1b937b2d..f9cb7cbb 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -258,16 +258,16 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" - DATA_FILENAME = "Stonewater All Props for EPC Check 10.02.25.xlsx" - SHEET_NAME = "stonewater sap, insta" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Name" - ADDRESS1_COLUMN = "Name" - ADDRESS1_METHOD = None + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing" + DATA_FILENAME = "Community Housing PV data pull.xlsx" + SHEET_NAME = "Community Housing" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Full Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = None + PROPERTY_YEAR_BUILT = "Build_Date" # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} diff --git a/survey_report/app.py b/survey_report/app.py index be31bd52..774d2a15 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,4 +1,5 @@ import os +import requests import PyPDF2 from string import Template @@ 
-31,31 +32,135 @@ def generate_html_report(template_path, output_path, data): print(f"HTML report generated successfully: {output_path}") +class PlacidApi: + # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors + ERROR_CODES = { + 400: "Bad request", + 401: "Unauthorized", + 404: "Template Not found", + 422: "Validation error", + 429: "Rate limit exceeded", + 500: "Internal server error", + } + + def __init__(self, api_key): + self.api_key = api_key + + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + "Accept": "application/json", + } + + def create_pdf( + self, + template_uuid: str, + current_epc_rating: str, + current_epc_rating_colour: str, + post_retrofit_epc_rating: str, + post_retrofit_epc_rating_colour: str, + ): + url = "https://api.placid.app/api/rest/pdfs" + + body = { + "webhook_success": None, + "passthrough": None, + "pages": [ + { + "template_uuid": template_uuid, + "layers": { + "current_epc_rating": { + "text": current_epc_rating, + "text_color": current_epc_rating_colour, + }, + "post_retrofit_epc_rating": { + "text": post_retrofit_epc_rating, + "text_color": post_retrofit_epc_rating_colour, + } + }, + }, + ] + } + + response = requests.post( + url, + headers=self.headers, + json=body + ) + + response_body = response.json() + pdf_id = response_body["id"] + + def get_pdf(self, pdf_id: str): + """ + Poll the API every 5 seconds until the PDF is ready + """ + url = f"https://api.placid.app/api/rest/pdfs/{pdf_id}" + + response = requests.get( + url, + headers=self.headers + ) + response_body = response.json() + + url = response_body["pdf_url"] + # Download the PDF form this uurl + pdf_download = requests.get(url) + with open("output.pdf", "wb") as f: + f.write(pdf_download.content) + + def handle(): """ Performs the data extraction process for the survey report :return: """ + PLACID_API_KEY = "placid-mpkwidzer2mens9h-hifa3dmbxpfeghpa" + TEMPLATE_UUID = "hnwqgtumckfbf" + placid_api = 
PlacidApi(PLACID_API_KEY) + + EPC_COLOURS = { + "A": "#117d58", + "B": "#2da55c", + "C": "#8dbd40", + "D": "#f7cd14", + "E": "#f3a96a", + "F": "#ef8026", + "G": "#e41e3b", + } + folders = [ - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4", - "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5", + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 " + "WILLIS ROAD FLAT 1 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 WILLIS " + "ROAD FLAT 1 PRE EPR PDF.pdf", + "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 " + "WILLIS ROAD FLAT 1 POST EPR PDF.pdf" + }, + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " + "WILLIS ROAD FLAT 2 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS " + "ROAD FLAT 2 PRE EPR PDF.pdf", + "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " + "WILLIS ROAD FLAT 2 POST EPR PDF.pdf" + }, + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " + "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS " + "ROAD FLAT 3 PRE EPR PDF.pdf", + "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " + "WILLIS ROAD FLAT 3 POST EPR PDF.pdf" + }, ] data = [] 
- for data_folder in folders: + for data_config in folders: - folder_contents = os.listdir(data_folder) - # We look for the following files: - # Site notes file_mapping = {} - for file in folder_contents: - # Check if it's a pdf file - if not file.endswith(".pdf"): - continue - filepath = os.path.join(data_folder, file) + for filename, filepath in data_config.items(): with (open(filepath, "rb") as f): pdf = PyPDF2.PdfReader(f) first_page = pdf.pages[0].extract_text() @@ -66,16 +171,27 @@ def handle(): # Check the report type report_type = detect_report_type(first_page) if report_type is not None: - file_mapping[report_type] = text + file_mapping[filename] = text # This is only set up to work with quido site notes so we must have it - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) + site_notes_extractor = SiteNotesExtractor(file_mapping["site_notes"]) site_notes = site_notes_extractor.extract_all() # We also must have an EPR - epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr_extractor = EPRExtractor(file_mapping["epr"]) epr = epr_extractor.extract_all() + scenario_epr = EPRExtractor(file_mapping["scenario_epr"]) + scenario_epr = scenario_epr.extract_all() + + report_data = { + "template_uuid": TEMPLATE_UUID, + "current_epc_rating": site_notes["Current EPC Band"], + "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]], + post_retrofit_epc_rating: str, + post_retrofit_epc_rating_colour: str, + } + # We now produce the combined data sheet which is the starting figure: data_sheet = {**epr, **site_notes} del data_sheet['Building Dimensions'] @@ -83,7 +199,9 @@ def handle(): data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] del data_sheet["Total Building Dimensions"] + data.append(data_sheet) + data = pd.DataFrame(data) # Generate the HTML report From 
0de14c4e286b05ecd881aa05f81f1f6172472589 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 18 Feb 2025 19:49:29 +0000 Subject: [PATCH 181/255] quidos site notes extraction --- backend/ml_models/Valuation.py | 26 ++++++++- etl/route_march_data_pull/app.py | 69 ++++++++++++++++++---- survey_report/app.py | 92 ++++++++++++++++++++++------- survey_report/extraction/quidos.py | 94 +++++++++++++++++++++++++++++- 4 files changed, 243 insertions(+), 38 deletions(-) diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 720005d3..6d4852b2 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -1,5 +1,4 @@ import numpy as np -from scipy.constants import value class PropertyValuation: @@ -216,6 +215,30 @@ class PropertyValuation: cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn) ) + current_epc = property_instance.data["current-energy-rating"] + + if not current_value: + return { + "current_value": 0, + "lower_bound_increased_value": 0, + "upper_bound_increased_value": 0, + "average_increased_value": 0, + "average_increase": 0 + } + + return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost) + + @classmethod + def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None): + """ + This function estimates the value of a property based on the current EPC rating and the target EPC rating + :param current_value: + :param current_epc: + :param target_epc: + :param total_cost: + :return: + """ + if not current_value: return { "current_value": 0, @@ -225,7 +248,6 @@ class PropertyValuation: "average_increase": 0 } - current_epc = property_instance.data["current-energy-rating"] # We get the spectrum of ratings between the current and target EPC epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1] diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 
f9cb7cbb..ee6a46d3 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -24,21 +24,24 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=False): +def get_data( + asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None, + epc_api_only=False +): epc_data = [] errors = [] no_epc = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] - house_number = home[address1_column].strip() + house_number = str(home[address1_column]).strip() full_address = home[fulladdress_column].strip() house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) if house_no is None: house_no = house_number uprn = manual_uprn_map.get(full_address, None) - if uprn is None and home.get("uprn"): - uprn = home["uprn"] + if uprn is None and home.get(uprn_column): + uprn = home[uprn_column] if pd.isnull(uprn): uprn = None @@ -149,7 +152,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m return epc_data, errors, no_epc -def extract_address1(asset_list, full_address_col, method="first_two_words"): +def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): if method == "first_two_words": asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list @@ -158,6 +161,13 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"): asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] return asset_list + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + raise 
ValueError(f"Method {method} not recognized") @@ -258,16 +268,29 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing" - DATA_FILENAME = "Community Housing PV data pull.xlsx" - SHEET_NAME = "Community Housing" - POSTCODE_COLUMN = "Postcode" + # For Westward + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + # DATA_FILENAME = "WESTWARD - completed list..xlsx" + # SHEET_NAME = "Sheet1" + # POSTCODE_COLUMN = "WFT EDIT Postcode" + # FULLADDRESS_COLUMN = "Address" + # ADDRESS1_COLUMN = None + # ADDRESS1_METHOD = "house_number_extraction" + # ADDRESS_COLS_TO_CONCAT = [] + # MISSING_POSTCODES_METHOD = None + # PROPERTY_YEAR_BUILT = "Build date" + # UPRN_COLUMN = "UPRN" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = 'Full Address.1' FULLADDRESS_COLUMN = "Full Address" ADDRESS1_COLUMN = None ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build_Date" + PROPERTY_YEAR_BUILT = "Build Date" + UPRN_COLUMN = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -299,7 +322,10 @@ def app(): if ADDRESS1_COLUMN is None: ADDRESS1_COLUMN = "address1_extracted" asset_list = extract_address1( - asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD + asset_list=asset_list, + full_address_col=FULLADDRESS_COLUMN, + postcode_col=POSTCODE_COLUMN, + method=ADDRESS1_METHOD ) if FULLADDRESS_COLUMN is None: @@ -315,6 +341,23 @@ def app(): asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False) asset_list[FULLADDRESS_COLUMN] = 
asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) + if UPRN_COLUMN is not None: + # Check if it's numeric and if so, make sure it's an integer + def convert_uprn(x): + + if pd.isnull(x): + return x + + # check if numeric + if np.isreal(x): + return str(int(x)) + + if str(x).isdigit(): + return str(int(x)) + return x + + asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn) + # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] if asset_list["deduper"].duplicated().sum(): @@ -342,7 +385,8 @@ def app(): fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP + manual_uprn_map=MANUAL_UPRN_MAP, + uprn_column=UPRN_COLUMN ) # We now retrieve any failed properties @@ -535,6 +579,7 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: + raise Exception("THIS WAS WRONG!") asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 ) diff --git a/survey_report/app.py b/survey_report/app.py index 774d2a15..f6eddb8d 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -32,6 +32,15 @@ def generate_html_report(template_path, output_path, data): print(f"HTML report generated successfully: {output_path}") +def stringify_number(num: int, rounding: bool = True) -> str: + if num < 100000: # 5 figures or fewer + rounded_num = ((num + 99) // 100) * 100 if rounding else num + return f"{rounded_num:,}" + else: # More than 5 figures + rounded_num = ((num + 999) // 1000) * 1000 if rounding else num + return f"{rounded_num // 1000}k" + + class PlacidApi: # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors ERROR_CODES = { @@ -89,7 +98,8 @@ class PlacidApi: ) response_body = response.json() - pdf_id = 
response_body["id"] + + return response_body def get_pdf(self, pdf_id: str): """ @@ -106,20 +116,22 @@ class PlacidApi: url = response_body["pdf_url"] # Download the PDF form this uurl pdf_download = requests.get(url) - with open("output.pdf", "wb") as f: + with open("survey_report/example_data/output.pdf", "wb") as f: f.write(pdf_download.content) -def handle(): +def handler(): """ Performs the data extraction process for the survey report :return: """ PLACID_API_KEY = "placid-mpkwidzer2mens9h-hifa3dmbxpfeghpa" - TEMPLATE_UUID = "hnwqgtumckfbf" + TEMPLATE_UUID = "5bst9mh1q9lk9" placid_api = PlacidApi(PLACID_API_KEY) + current_property_value = 250000 # Needs to be an input + EPC_COLOURS = { "A": "#117d58", "B": "#2da55c", @@ -136,26 +148,27 @@ def handle(): "WILLIS ROAD FLAT 1 PRE EPR SITE NOTES.pdf", "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 WILLIS " "ROAD FLAT 1 PRE EPR PDF.pdf", - "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 " - "WILLIS ROAD FLAT 1 POST EPR PDF.pdf" + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 1/3 WILLIS ROAD FLAT 1 POST EPR SITE NOTES.pdf" }, { "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " "WILLIS ROAD FLAT 2 PRE EPR SITE NOTES.pdf", "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS " "ROAD FLAT 2 PRE EPR PDF.pdf", - "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 " - "WILLIS ROAD FLAT 2 POST EPR PDF.pdf" + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 2/3 WILLIS ROAD FLAT 2 POST EPR SITE NOTES.pdf" }, { "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " "WILLIS ROAD FLAT 3 PRE EPR SITE 
NOTES.pdf", "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS " "ROAD FLAT 3 PRE EPR PDF.pdf", - "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " - "WILLIS ROAD FLAT 3 POST EPR PDF.pdf" + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 3/3 WILLIS ROAD FLAT 3 POST EPR SITE NOTES.pdf" }, ] + data = [] for data_config in folders: @@ -181,26 +194,61 @@ def handle(): epr_extractor = EPRExtractor(file_mapping["epr"]) epr = epr_extractor.extract_all() - scenario_epr = EPRExtractor(file_mapping["scenario_epr"]) - scenario_epr = scenario_epr.extract_all() + # Valuation simulation + scenario_site_notes_extractor = SiteNotesExtractor(file_mapping["scenario_site_notes"]) + scenario_site_notes = scenario_site_notes_extractor.extract_all() + + from backend.ml_models.Valuation import PropertyValuation + valuation_uplift = PropertyValuation.estimate_valuation_improvement( + current_value=current_property_value, + current_epc=site_notes["Current EPC Band"], + target_epc=scenario_site_notes["Current EPC Band"], + ) + # TODO - should convert this, when it's more than 5 figures and we should certainly stringify this + + valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value) + + # Prepare the data for output + bill_savings = round( + site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)'] + ) + + carbon_savings = round( + site_notes["Current Carbon Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"], + 2 + ) + + payback_period = None + if payback_period is None: + raise NotImplementedError("Implement me") + + # We extract the measures from the site notes report_data = { - "template_uuid": TEMPLATE_UUID, "current_epc_rating": site_notes["Current EPC Band"], "current_epc_rating_colour": 
EPC_COLOURS[site_notes["Current EPC Band"]], - post_retrofit_epc_rating: str, - post_retrofit_epc_rating_colour: str, + "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"], + "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]], + "bill_savings": stringify_number(bill_savings), + "valuation_improvement": stringify_number(valuation_difference), + "carbon_savings": carbon_savings, + } # We now produce the combined data sheet which is the starting figure: - data_sheet = {**epr, **site_notes} - del data_sheet['Building Dimensions'] - # We unnest the Total Building Dimensions - data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] - data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] - del data_sheet["Total Building Dimensions"] + # data_sheet = {**epr, **site_notes} + # del data_sheet['Building Dimensions'] + # # We unnest the Total Building Dimensions + # data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + # data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + # del data_sheet["Total Building Dimensions"] - data.append(data_sheet) + create_pdf_response = placid_api.create_pdf( + template_uuid=TEMPLATE_UUID, **report_data + ) + # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None} + # Download locally + placid_api.get_pdf(create_pdf_response["id"]) data = pd.DataFrame(data) diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py index 374df084..2e772886 100644 --- a/survey_report/extraction/quidos.py +++ b/survey_report/extraction/quidos.py @@ -108,8 +108,98 @@ class SiteNotesExtractor: self.extract_carbon_emissions() self.extract_bills_estimate() self.extract_building_dimensions() + + # Extract specific measures + # Primary 
wall + # Secondary wall + # Roof + # Floor + # Heating system + # Hot water system + # Windows + # Doors + # Lighting + # Ventilation + # Solar + return self.data + def extract_walls(self): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part, + including any alternative wall details within the 7.0 Walls section of the summary PDF text. + """ + + text = self.text + wall_data = [] + + # Isolate the 7.0 Walls section + wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", text, re.DOTALL) + if not wall_section_match: + raise ValueError("Failed to locate the walls section in the text.") + + wall_section = wall_section_match.group(1) + + # Define patterns to match walls for each building part + wall_pattern = re.compile( + r"(?P
Main Property(?: Alternative)?|Extension \d+)\s*\n" + r"(?:Construction\s*(?P[^\n]*)\n)?" + r"(?:Insulation\s*(?P[^\n]*)\n)?" + r"(?:Insulation Thickness\(mm\)\s*(?P[^\n]*)\n)?" + r"(?:Wall Thickness Measured\?\s*(?P[^\n]*)\n)?" + r"(?:Wall Thickness\(mm\)\s*(?P\d+))?", + re.MULTILINE + ) + + # TODO: We aren't effectively picking up alternative walls + # alt_wall_pattern = re.compile( + # r"Alternative Wall Sheltered\s*.*?\n" + # r".*?Construction\s*(?P[^\n]*)\n" + # r"Insulation\s*(?P[^\n]*)\n" + # r"Insulation Thickness\(mm\)\s*(?P[^\n]*)\n" + # r"Wall Thickness Measured\?\s*(?P[^\n]*)\n" + # r"Wall Thickness\(mm\)\s*(?P\d+)?", + # re.MULTILINE + # ) + + for match in wall_pattern.finditer(wall_section): + building_part = match.group("section") + # has_alternative_wall = "Alternative" in building_part + building_part = "Main Property" if "Main Property" in building_part else building_part + + wall_entry = { + "Building Part": building_part, + "Wall Type": match.group("construction") or "Unknown", + "Wall Insulation": match.group("insulation") or "Unknown", + "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown", + "Wall Thickness Measured": match.group("thickness_measured") or "Unknown", + "Wall Thickness (mm)": int(match.group("thickness")) if match.group("thickness") and match.group( + "thickness").isdigit() else None, + "Alternative Wall Type": None, + "Alternative Wall Insulation": None, + "Alternative Insulation Thickness (mm)": None, + "Alternative Wall Thickness Measured": None, + "Alternative Wall Thickness (mm)": None, + } + + # Check if an alternative wall section exists + # if has_alternative_wall: + # alt_match = alt_wall_pattern.search(wall_section, match.end()) + # if alt_match: + # wall_entry["Alternative Wall Type"] = alt_match.group("alt_construction") or "Unknown" + # wall_entry["Alternative Wall Insulation"] = alt_match.group("alt_insulation") or "Unknown" + # wall_entry["Alternative Insulation Thickness (mm)"] = 
alt_match.group( + # "alt_insulation_thickness") or "Unknown" + # wall_entry["Alternative Wall Thickness Measured"] = alt_match.group( + # "alt_thickness_measured") or "Unknown" + # wall_entry["Alternative Wall Thickness (mm)"] = int( + # alt_match.group("alt_thickness")) if alt_match.group("alt_thickness") and alt_match.group( + # "alt_thickness").isdigit() else None + + wall_data.append(wall_entry) + + return wall_data + class EPRExtractor: """ @@ -123,7 +213,7 @@ class EPRExtractor: self.text = pdf_text self.data = {} - def extract_heating_data(self): + def extract_heating_consumption(self): """ Extracts space heating and water heating values from the report. """ @@ -162,5 +252,5 @@ class EPRExtractor: Runs all extraction methods and returns a dictionary with extracted data. """ self.extract_address() - self.extract_heating_data() + self.extract_heating_consumption() return self.data From 55d2df17877d184b3bd9874a6da47cab6d3e6450 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 10:12:22 +0000 Subject: [PATCH 182/255] debygging epc searcher --- backend/SearchEpc.py | 3 + etl/route_march_data_pull/app.py | 95 +++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index c74a0b1f..e8a9dfaa 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -331,6 +331,9 @@ class SearchEpc: if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] + if data: + api_response["msg"] = self.SUCCESS + return api_response["msg"] def filter_rows(self, rows, property_type=None, address=None): diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index ee6a46d3..57239989 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -4,6 +4,7 @@ from BaseUtility import Definitions import pandas as pd import numpy as np from tqdm import tqdm +from datetime import datetime from dotenv import load_dotenv from 
backend.SearchEpc import SearchEpc @@ -172,7 +173,10 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t def process_age_band(x, year_built_column): - year_built = float(x[year_built_column]) + if isinstance(x[year_built_column], datetime): + year_built = x[year_built_column].year + else: + year_built = float(x[year_built_column]) if pd.isnull(x["Property Age Band"]) or ( x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES @@ -195,6 +199,12 @@ def process_age_band(x, year_built_column): if year_built < 2007: return "EPC Age Band is older than Year Built" + if x["Property Age Band"] == "England and Wales: 2012 onwards": + if year_built >= 2012: + return "EPC Age Band Matches Year Built" + if year_built < 2012: + return "EPC Age Band is older than Year Built" + if x["Property Age Band"] == "England and Wales: before 1900": if year_built < 1900: return "EPC Age Band Matches Year Built" @@ -206,7 +216,7 @@ def process_age_band(x, year_built_column): # so we extract the lower and upper date age_band = x["Property Age Band"].split(": ")[1] lower_date, upper_date = age_band.split("-") - if year_built <= float(upper_date) and year_built <= float(upper_date): + if year_built <= float(upper_date) and year_built >= float(lower_date): return "EPC Age Band Matches Year Built" if year_built > float(upper_date): @@ -269,28 +279,33 @@ def app(): # - Or the insulation required is loft/cavity (floors should be solid) # For Westward - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - # DATA_FILENAME = "WESTWARD - completed list..xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = "WFT EDIT Postcode" - # FULLADDRESS_COLUMN = "Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "house_number_extraction" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build date" - # UPRN_COLUMN = "UPRN" - DATA_FOLDER = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + DATA_FILENAME = "WESTWARD - completed list..xlsx" SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = 'Full Address.1' - FULLADDRESS_COLUMN = "Full Address" + POSTCODE_COLUMN = "WFT EDIT Postcode" + FULLADDRESS_COLUMN = "Address" ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" + ADDRESS1_METHOD = "house_number_extraction" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build Date" - UPRN_COLUMN = None + PROPERTY_YEAR_BUILT = "Build date" + UPRN_COLUMN = "UPRN" + # If we have the non-intrusives data, this should be true + HAS_NON_INTRUSIVES = True + + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # SHEET_NAME = "Sheet1" + # POSTCODE_COLUMN = 'Full Address.1' + # FULLADDRESS_COLUMN = "Full Address" + # ADDRESS1_COLUMN = None + # ADDRESS1_METHOD = "first_word" + # ADDRESS_COLS_TO_CONCAT = [] + # MISSING_POSTCODES_METHOD = None + # PROPERTY_YEAR_BUILT = "Build Date" + # UPRN_COLUMN = None + # # If we have the non-intrusives data, this should be true + # HAS_NON_INTRUSIVES = True # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -358,6 +373,20 @@ def app(): asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn) + # We attempt to process the year built column + if PROPERTY_YEAR_BUILT is not None: + # We check if we have a datetime + if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime): + # We treat any string columns - with common values we see + datetime_remap = { + "Pre 1900": datetime(year=1899, month=12, day=31), + } + asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap) + + asset_list[PROPERTY_YEAR_BUILT] = 
pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT]) + # Convert this to year + asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year + # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] if asset_list["deduper"].duplicated().sum(): @@ -579,11 +608,35 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: - raise Exception("THIS WAS WRONG!") asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 ) + if HAS_NON_INTRUSIVES: + # Empty cavity: + # 1) Has been flagged on the non-intrusives as being empty or partially filled + # 2) The age is before 1995 + # 3) Remove anything that likley has access issues + asset_list["Suitable for Cavity Fill"] = ( + (asset_list["Construction"] == "CAVITY") & + asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & + ( + (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995 + ) + ) + + # asset_list["Suitable for Extraction"] = + asset_list[ + (asset_list["Construction"] == "Cavity") & + asset_list["Insulated"].isin(["RETRO DRILLED"]) & + ( + (asset_list[PROPERTY_YEAR_BUILT] <= 1995) + ) & + ( + asset_list[] + ) + ] + # 4) Flag properties that look like they're good candidates for solar installs # Firstly, flag if the fabric is completely done From 8432b7d202c24962bae64b04023600de13a6a03d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 11:50:28 +0000 Subject: [PATCH 183/255] creating the asset list class --- asset_list/AssetList.py | 64 ++++++++++++ etl/route_march_data_pull/app.py | 166 +++++++++++++++++++++---------- 2 files changed, 180 insertions(+), 50 deletions(-) create mode 100644 asset_list/AssetList.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py new file mode 100644 index 00000000..2a16e82f --- 
/dev/null +++ b/asset_list/AssetList.py @@ -0,0 +1,64 @@ +import os +import pandas as pd + + +class AssetList: + """ + This class is used to standardise asset lists so that we can process the core information in a consistent manner. + """ + + # These are the accepted methods we have for cleaning the address1 column + ADDRESS_1_CLEANING_METHODS = [ + "first_two_words", # This method will split on the fist two words, where the separator is a space + "first_word", # This method will split on the first word, where the separator is a space + "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber + "address1_extraction" # This method will use the NLP model to extract address1 + ] + + def __init__( + self, + local_filepath, + sheet_name, + address1_colname, + postcode_colname, + full_address_colname, + full_address_cols_to_concat=None, + missing_postcodes_method=None, + landlord_year_built=None, + landlord_uprn=None, + header=0 + ): + self.local_filepath = local_filepath + self.sheet_name = sheet_name + self.standardised_asset_list = None + # Read in the data + self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + + # We detect the presence of the non-intrusive columns + self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False + + # Names of columns + self.address1_colname = address1_colname + self.postcode_colname = postcode_colname + self.full_address_colname = full_address_colname + self.landlord_year_built = landlord_year_built + self.landlord_uprn = landlord_uprn + + # parameters for cleaning + self.full_address_cols_to_concat = full_address_cols_to_concat + self.missing_postcodes_method = missing_postcodes_method + + def standardise(self): + """ + This function is used to standardise the asset list + :return: standardised asset list + """ + + # We keep just the columns we care about and will work through the various columns and 
standardise + self.standardised_asset_list = self.raw_asset_list[ + [ + + ] + ] + + raise NotImplementedError diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 57239989..06082774 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -5,6 +5,7 @@ import pandas as pd import numpy as np from tqdm import tqdm from datetime import datetime +from asset_list.AssetList import AssetList from dotenv import load_dotenv from backend.SearchEpc import SearchEpc @@ -172,60 +173,107 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t raise ValueError(f"Method {method} not recognized") -def process_age_band(x, year_built_column): - if isinstance(x[year_built_column], datetime): - year_built = x[year_built_column].year - else: - year_built = float(x[year_built_column]) +def process_age_band(asset_list, year_built_column): + processed_age_band = [] + for _, x in asset_list.iterrows(): - if pd.isnull(x["Property Age Band"]) or ( - x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES - ) or pd.isnull(year_built): - return "No EPC Age Band" + if pd.isnull(x["Property Age Band"]) or ( + x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES + ): + processed_age_band.append({ + "row_id": x["row_id"], + "epc_year_lower_bound": None, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": "No EPC Age Band" + }) + continue - # We check if we have a numeric data - if x["Property Age Band"].isdigit(): - if year_built == float(x["Property Age Band"]): - return "EPC Age Band Matches Year Built" - if year_built > float(x["Property Age Band"]): - return "EPC Age Band is older than Year Built" - if year_built < float(x["Property Age Band"]): - return "EPC Age Band is newer than Year Built" + # We exatract the upper and lower bounds + if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]: + year_lower_bound = 2007 if x["Property Age 
Band"] == "England and Wales: 2007 onwards" else 2012 - # Handle specific case - if x["Property Age Band"] == "England and Wales: 2007 onwards": - if year_built >= 2007: - return "EPC Age Band Matches Year Built" - if year_built < 2007: - return "EPC Age Band is older than Year Built" + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound + else "EPC Age Band is older than Year Built" + ) - if x["Property Age Band"] == "England and Wales: 2012 onwards": - if year_built >= 2012: - return "EPC Age Band Matches Year Built" - if year_built < 2012: - return "EPC Age Band is older than Year Built" + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": year_lower_bound, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue - if x["Property Age Band"] == "England and Wales: before 1900": - if year_built < 1900: - return "EPC Age Band Matches Year Built" - if year_built >= 1900: - return "EPC Age Band is newer than Year Built" + if x["Property Age Band"] == "England and Wales: before 1900": - # Age band will be formatted as such: - # 'England and Wales: {upper date}-{lower date}' - # so we extract the lower and upper date - age_band = x["Property Age Band"].split(": ")[1] - lower_date, upper_date = age_band.split("-") - if year_built <= float(upper_date) and year_built >= float(lower_date): - return "EPC Age Band Matches Year Built" + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] < 1900 + else "EPC Age Band is newer than Year Built" + ) - if year_built > float(upper_date): - return "EPC Age Band is older than Year Built" + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": None, + 
"epc_year_upper_bound": 1899, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue - if year_built < float(upper_date): - return "EPC Age Band is newer than Year Built" + if x["Property Age Band"].isdigit(): - raise Exception("Should not reach here") + if pd.isnull(x[year_built_column]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"]) + else "EPC Age Band is different from Year Built" + ) + + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": int(x["Property Age Band"]), + "epc_year_upper_bound": int(x["Property Age Band"]), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + # Oherwise, we extract the upper and lower bounds + age_band = x["Property Age Band"].split(": ")[1] + lower_date, upper_date = age_band.split("-") + + age_band_matches = ( + "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and ( + x[year_built_column] <= float(upper_date) + ) + else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date) + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + "row_id": x["row_id"], + "epc_year_lower_bound": int(lower_date), + "epc_year_upper_bound": int(upper_date), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + + processed_age_band = pd.DataFrame(processed_age_band) + + return processed_age_band def app(): @@ -282,16 +330,27 @@ def app(): DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" DATA_FILENAME = "WESTWARD - completed list..xlsx" SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = "WFT EDIT Postcode" FULLADDRESS_COLUMN = "Address" ADDRESS1_COLUMN = None ADDRESS1_METHOD = "house_number_extraction" + ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None PROPERTY_YEAR_BUILT = "Build date" UPRN_COLUMN = "UPRN" # If we have the 
non-intrusives data, this should be true HAS_NON_INTRUSIVES = True + PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + + invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] + + asset_list = AssetList( + local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + header=0, + sheet_name=SHEET_NAME + ) # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" @@ -608,8 +667,10 @@ def app(): # 3) If we have year in the asset list, we flag entries where the built year is different from the # EPC Age band if PROPERTY_YEAR_BUILT is not None: - asset_list["Does Age Match EPC Age Band?"] = asset_list.apply( - lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1 + # We process the age band and merge it on + processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT) + asset_list = asset_list.merge( + processed_age_band, how="left", on="row_id" ) if HAS_NON_INTRUSIVES: @@ -621,7 +682,12 @@ def app(): (asset_list["Construction"] == "CAVITY") & asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & ( - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995 + # Shold we defer to the year built provided by the HA? 
+ (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995) + ) & + ( + # We check if the property type column contains one of the invalid property types + ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary)) ) ) @@ -633,9 +699,9 @@ def app(): (asset_list[PROPERTY_YEAR_BUILT] <= 1995) ) & ( - asset_list[] + asset_list[PROPERTY_TYPE_COLUMN] ) - ] + ] # 4) Flag properties that look like they're good candidates for solar installs # Firstly, flag if the fabric is completely done From 7e9347e530cc52fe38ceef66163447d6fd556b5e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 12:53:09 +0000 Subject: [PATCH 184/255] setting up libpostal --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 71 +++++++++- asset_list/README.md | 172 +++++++++++++++++++++++ asset_list/requirements.txt | 3 + asset_list/tests/test_standardisation.py | 9 ++ etl/route_march_data_pull/app.py | 18 ++- 7 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 asset_list/README.md create mode 100644 asset_list/requirements.txt create mode 100644 asset_list/tests/test_standardisation.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2a16e82f..35da9c3b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,5 +1,10 @@ import os +import usaddress import pandas as pd +from utils.logger import setup_logger +from backend.SearchEpc import SearchEpc + +logger = setup_logger() class AssetList: @@ -15,6 +20,15 @@ class AssetList: "address1_extraction" # This method will use the NLP model to extract address1 ] + STANDARD_PROPERTY_TYPES = [ + "house", + 
"flat", + "bungalow", + "maisonette", + "park home", + "block house", + ] + def __init__( self, local_filepath, @@ -26,6 +40,10 @@ class AssetList: missing_postcodes_method=None, landlord_year_built=None, landlord_uprn=None, + landlord_property_type=None, + landlord_wall_construction=None, + landlord_heating_system=None, + landlord_existing_pv=None, header=0 ): self.local_filepath = local_filepath @@ -43,21 +61,72 @@ class AssetList: self.full_address_colname = full_address_colname self.landlord_year_built = landlord_year_built self.landlord_uprn = landlord_uprn + self.landlord_property_type = landlord_property_type + self.landlord_wall_construction = landlord_wall_construction + self.landlord_heating_system = landlord_heating_system + self.landlord_existing_pv = landlord_existing_pv # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method + self.debug_information = { + "property_type": None, + "wall_construction": None, + "heating_system": None, + "existing_pv": None + } + + @classmethod + def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"): + + if method not in cls.ADDRESS_1_CLEANING_METHODS: + raise ValueError(f"Method {method} for producing address1 not recognized") + + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + if method == "first_word": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + + if method == "address1_extraction": + + x = asset_list_df[FULLADDRESS_COLUMN].values[0] + parsed = usaddress.parse(x) + + def extract_address_1(): 
+ + + raise ValueError(f"Method {method} not recognized") + + @staticmethod + def _address1_extraction(x): + + def standardise(self): """ This function is used to standardise the asset list :return: standardised asset list """ + if self.address1_colname is None: + # If we do not have this, we produce it + + # We keep just the columns we care about and will work through the various columns and standardise self.standardised_asset_list = self.raw_asset_list[ [ - + self.address1_colname, self.postcode_colname, self.full_address_colname, + self.landlord_year_built, self.landlord_uprn, self.landlord_property_type ] ] diff --git a/asset_list/README.md b/asset_list/README.md new file mode 100644 index 00000000..1bf734a4 --- /dev/null +++ b/asset_list/README.md @@ -0,0 +1,172 @@ +# libpostal Installation Guide for macOS M1 + +## Overview + +`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide +provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python. + +--- + +## 📌 Prerequisites + +Before installing `libpostal`, ensure you have the necessary dependencies installed. + +### **1️⃣ Install Required Dependencies** + +Open a terminal and run: + +```bash +brew install curl autoconf automake libtool pkg-config +``` + +### **2️⃣ Clone the libpostal Repository** + +```bash +git clone https://github.com/openvenues/libpostal.git +cd libpostal +``` + +### **3️⃣ Run Bootstrap Script** + +```bash +./bootstrap.sh +``` + +### **4️⃣ Configure the Build (Important for M1 Macs)** + +Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility. 
+ +```bash +./configure --disable-sse2 --datadir=/usr/local/libpostal_data +``` + +*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)* + +### **5️⃣ Compile and Install** + +```bash +make -j$(sysctl -n hw.ncpu) +sudo make install +``` + +### **6️⃣ Install Python Bindings** + +Once `libpostal` is installed, install the Python package: + +```bash +pip install postal +``` + +--- + +## ✅ **Verify Installation** + +To check if `libpostal` was installed successfully, run: + +```bash +python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))" +``` + +**Expected Output:** + +``` +[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')] +``` + +--- + +## 📌 **Usage Example in Python** + +### **Address Parsing** + +```python +from postal.parser import parse + +address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL" +parsed_address = dict(parse(address)) + +print(parsed_address) +``` + +**Expected Output:** + +```python +{ + 'house_number': '23', + 'road': 'Clifton Hill', + 'city': 'Newtown', + 'city': 'Exeter', + 'postcode': 'EX1 2DL' +} +``` + +### **Address Normalization** + +```python +from postal.normalize import normalize_string + +address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL" +normalized = normalize_string(address) + +print(normalized) +``` + +--- + +## 📌 **Troubleshooting** + +### **1️⃣ libpostal Not Found?** + +If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure: + +- You ran `sudo make install` +- Your Python environment recognizes `postal`. Try: + ```bash + pip install postal --no-cache-dir + ``` +- If using a virtual environment (`venv`), activate it before running Python. + +### **2️⃣ Compilation Issues on macOS?** + +If `make` fails, try running: + +```bash +brew reinstall autoconf automake libtool pkg-config +``` + +Then restart the installation process. 
+ +### **3️⃣ Can't Find libpostal Data Directory?** + +Ensure `libpostal_data` exists in the correct directory: + +```bash +ls /usr/local/libpostal_data +``` + +If missing, re-run `./configure` with the correct path. + +--- + +## 🛠 **Uninstallation** + +To remove `libpostal`, run: + +```bash +sudo rm -rf /usr/local/lib/libpostal* +sudo rm -rf /usr/local/include/libpostal* +rm -rf ~/libpostal +pip uninstall postal +``` + +--- + +## 📌 **Additional Resources** + +- [Libpostal GitHub](https://github.com/openvenues/libpostal) +- [Libpostal Python Bindings](https://pypi.org/project/postal/) +- [Homebrew](https://brew.sh/) + +--- + +### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt new file mode 100644 index 00000000..d77c8a58 --- /dev/null +++ b/asset_list/requirements.txt @@ -0,0 +1,3 @@ +postal +pandas +usaddress \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py new file mode 100644 index 00000000..f0e6ce11 --- /dev/null +++ b/asset_list/tests/test_standardisation.py @@ -0,0 +1,9 @@ +from asset_list.AssetList import AssetList + + +def test_address1_extraction(): + example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' + + AssetList._extract_address1( + example, + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 06082774..74dc28e0 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -346,10 +346,24 @@ def app(): invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] - asset_list = AssetList( + self = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, - sheet_name=SHEET_NAME + sheet_name=SHEET_NAME, + address1_colname=ADDRESS1_COLUMN, + postcode_colname=POSTCODE_COLUMN, + full_address_colname=FULLADDRESS_COLUMN, + 
full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, + missing_postcodes_method=MISSING_POSTCODES_METHOD, + landlord_year_built=PROPERTY_YEAR_BUILT, + landlord_uprn=UPRN_COLUMN, + landlord_property_type=PROPERTY_TYPE_COLUMN, + landlord_wall_construction="Wall Construction (EPC)", + landlord_heating_system="Heat Source", + landlord_existing_pv="PV (Y/N)" + ) + self.standardised_asset_list( + # In here, we might want to pass some specific remaps ) # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" From cb0194c3b96f839e5050073eb76e2f23e822c87f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 14:12:57 +0000 Subject: [PATCH 185/255] working on address extraction --- asset_list/AssetList.py | 119 +++++++++++++--- asset_list/README.md | 172 ----------------------- asset_list/requirements.txt | 7 +- asset_list/tests/test_standardisation.py | 9 +- backend/SearchEpc.py | 14 +- backend/tests/test_search_epc.py | 9 ++ etl/route_march_data_pull/app.py | 2 + 7 files changed, 130 insertions(+), 202 deletions(-) delete mode 100644 asset_list/README.md diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 35da9c3b..1a3f6180 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -17,7 +17,7 @@ class AssetList: "first_two_words", # This method will split on the fist two words, where the separator is a space "first_word", # This method will split on the first word, where the separator is a space "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber - "address1_extraction" # This method will use the NLP model to extract address1 + # "address1_extraction" # This method will use the NLP model to extract address1 ] STANDARD_PROPERTY_TYPES = [ @@ -29,6 +29,19 @@ class AssetList: "block house", ] + # Standard column Names + STANDARD_ADDRESS_1 = "domna_address_1" + STANDARD_POSTCODE = "domna_postcode" + STANDARD_FULL_ADDRESS = "domna_full_address" + 
STANDARD_YEAR_BUILT = "domna_year_built" + STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_PROPERTY_TYPE = "landlord_property_type" + STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" + STANDARD_HEATING_SYSTEM = "landlord_heating_system" + STANDARD_EXISTING_PV = "landlord_existing_pv" + + DOMNA_PROPERTY_ID = "domna_property_id" + def __init__( self, local_filepath, @@ -36,8 +49,10 @@ class AssetList: address1_colname, postcode_colname, full_address_colname, + landlord_property_id=None, full_address_cols_to_concat=None, missing_postcodes_method=None, + address1_extraction_method=None, landlord_year_built=None, landlord_uprn=None, landlord_property_type=None, @@ -48,14 +63,15 @@ class AssetList: ): self.local_filepath = local_filepath self.sheet_name = sheet_name - self.standardised_asset_list = None # Read in the data self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + self.standardised_asset_list = self.raw_asset_list.copy() # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False # Names of columns + self.landlord_property_id = landlord_property_id self.address1_colname = address1_colname self.postcode_colname = postcode_colname self.full_address_colname = full_address_colname @@ -69,6 +85,7 @@ class AssetList: # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method + self.address1_extraction_method = address1_extraction_method self.debug_information = { "property_type": None, @@ -77,40 +94,50 @@ class AssetList: "existing_pv": None } - @classmethod - def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"): + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): - if method not in cls.ADDRESS_1_CLEANING_METHODS: + if method not in 
self.ADDRESS_1_CLEANING_METHODS: raise ValueError(f"Method {method} for producing address1 not recognized") if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0] return asset_list if method == "house_number_extraction": - asset_list["address1_extracted"] = asset_list.apply( + asset_list[self.address1_colname] = asset_list.apply( lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), axis=1 ) return asset_list - if method == "address1_extraction": - - x = asset_list_df[FULLADDRESS_COLUMN].values[0] - parsed = usaddress.parse(x) - - def extract_address_1(): - - - raise ValueError(f"Method {method} not recognized") + raise ValueError(f"Method {method} not recognized") @staticmethod def _address1_extraction(x): + pass + def create_property_id(self): + """ + This function creates the domna property ID, which is simply a hash of the full address and postcode + We want all figures to be positive + :return: + """ + import sys + self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( + self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[ + self.postcode_colname] + ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width) + + @staticmethod + def _strip_postcode_from_full_address(full_address, postcode): + cleaned = full_address.replace(postcode, "") + # Remove any trailing commas and spaces + cleaned = cleaned.rstrip(", ").strip(",").strip() + return cleaned def standardise(self): """ @@ -118,15 +145,63 @@ class AssetList: :return: standardised asset list """ - if self.address1_colname is None: - # If we do 
not have this, we produce it + # Remove rows without a postcode + if self.postcode_colname is not None: + self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname]) + # We clean up portential non-breaking spaces, and double spaces + for col in [ + c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if + c is not None + ]: + self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False) + + if self.address1_colname is None: + if self.address1_extraction_method is None: + raise ValueError("Missing address 1 - please specify an extraction method") + self.address1_colname = self.STANDARD_ADDRESS_1 + # If we do not have this, we produce it + self.standardised_asset_list = self._extract_address1( + asset_list=self.standardised_asset_list, + full_address_col=self.full_address_colname, + postcode_col=self.postcode_colname, + method=self.address1_extraction_method + ) + + if self.full_address_colname is None: + if not self.full_address_cols_to_concat: + raise ValueError("Missing full address - please specify columns to concatenate") + self.full_address_colname = self.STANDARD_FULL_ADDRESS + self.standardised_asset_list[self.full_address_colname] = ( + self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1) + ) + else: + + # Make sure to strip the postcode out of the full address + self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply( + lambda x: self._strip_postcode_from_full_address( + full_address=x[self.full_address_colname], + postcode=x[self.postcode_colname] + ), + axis=1 + ) + + # We create the domna property id + self.create_property_id() # We keep just the columns we care about 
and will work through the various columns and standardise - self.standardised_asset_list = self.raw_asset_list[ + self.standardised_asset_list = self.standardised_asset_list[ [ - self.address1_colname, self.postcode_colname, self.full_address_colname, - self.landlord_year_built, self.landlord_uprn, self.landlord_property_type + self.landlord_property_id, + self.DOMNA_PROPERTY_ID, + self.address1_colname, + self.postcode_colname, + self.full_address_colname, + self.landlord_year_built, + self.landlord_uprn, + self.landlord_property_type, ] ] diff --git a/asset_list/README.md b/asset_list/README.md deleted file mode 100644 index 1bf734a4..00000000 --- a/asset_list/README.md +++ /dev/null @@ -1,172 +0,0 @@ -# libpostal Installation Guide for macOS M1 - -## Overview - -`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide -provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python. - ---- - -## 📌 Prerequisites - -Before installing `libpostal`, ensure you have the necessary dependencies installed. - -### **1️⃣ Install Required Dependencies** - -Open a terminal and run: - -```bash -brew install curl autoconf automake libtool pkg-config -``` - -### **2️⃣ Clone the libpostal Repository** - -```bash -git clone https://github.com/openvenues/libpostal.git -cd libpostal -``` - -### **3️⃣ Run Bootstrap Script** - -```bash -./bootstrap.sh -``` - -### **4️⃣ Configure the Build (Important for M1 Macs)** - -Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility. 
- -```bash -./configure --disable-sse2 --datadir=/usr/local/libpostal_data -``` - -*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)* - -### **5️⃣ Compile and Install** - -```bash -make -j$(sysctl -n hw.ncpu) -sudo make install -``` - -### **6️⃣ Install Python Bindings** - -Once `libpostal` is installed, install the Python package: - -```bash -pip install postal -``` - ---- - -## ✅ **Verify Installation** - -To check if `libpostal` was installed successfully, run: - -```bash -python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))" -``` - -**Expected Output:** - -``` -[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')] -``` - ---- - -## 📌 **Usage Example in Python** - -### **Address Parsing** - -```python -from postal.parser import parse - -address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL" -parsed_address = dict(parse(address)) - -print(parsed_address) -``` - -**Expected Output:** - -```python -{ - 'house_number': '23', - 'road': 'Clifton Hill', - 'city': 'Newtown', - 'city': 'Exeter', - 'postcode': 'EX1 2DL' -} -``` - -### **Address Normalization** - -```python -from postal.normalize import normalize_string - -address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL" -normalized = normalize_string(address) - -print(normalized) -``` - ---- - -## 📌 **Troubleshooting** - -### **1️⃣ libpostal Not Found?** - -If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure: - -- You ran `sudo make install` -- Your Python environment recognizes `postal`. Try: - ```bash - pip install postal --no-cache-dir - ``` -- If using a virtual environment (`venv`), activate it before running Python. - -### **2️⃣ Compilation Issues on macOS?** - -If `make` fails, try running: - -```bash -brew reinstall autoconf automake libtool pkg-config -``` - -Then restart the installation process. 
- -### **3️⃣ Can't Find libpostal Data Directory?** - -Ensure `libpostal_data` exists in the correct directory: - -```bash -ls /usr/local/libpostal_data -``` - -If missing, re-run `./configure` with the correct path. - ---- - -## 🛠 **Uninstallation** - -To remove `libpostal`, run: - -```bash -sudo rm -rf /usr/local/lib/libpostal* -sudo rm -rf /usr/local/include/libpostal* -rm -rf ~/libpostal -pip uninstall postal -``` - ---- - -## 📌 **Additional Resources** - -- [Libpostal GitHub](https://github.com/openvenues/libpostal) -- [Libpostal Python Bindings](https://pypi.org/project/postal/) -- [Homebrew](https://brew.sh/) - ---- - -### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀 diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index d77c8a58..d6d64471 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,3 +1,8 @@ postal pandas -usaddress \ No newline at end of file +usaddress +pydantic-settings==2.6.0 +epc-api-python==1.0.2 +fuzzywuzzy +boto3 +openpyxl \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py index f0e6ce11..1a083bbc 100644 --- a/asset_list/tests/test_standardisation.py +++ b/asset_list/tests/test_standardisation.py @@ -1,9 +1,12 @@ from asset_list.AssetList import AssetList +from backend.SearchEpc import + def test_address1_extraction(): example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' - AssetList._extract_address1( - example, - ) + # AssetList._extract_address1( + # example, + # ) + pass diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index e8a9dfaa..79a041ec 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -208,9 +208,14 @@ class SearchEpc: try: # Updated regex to catch house numbers including alphanumeric ones pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' - match = re.search(pattern, address) - if match: - return next(g for g 
in match.groups() if g is not None) + match1 = re.search(pattern, address) + if match1: + return next(g for g in match1.groups() if g is not None) + + pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)' + match2 = re.search(pattern2, address) + if match2: + return match2.group(2) parsed = usaddress.parse(address) # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected @@ -221,7 +226,8 @@ class SearchEpc: continue if part == postcode.split(" ")[1]: continue - return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + return part.rstrip( + ",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py index 3b2e2a5b..562585ad 100644 --- a/backend/tests/test_search_epc.py +++ b/backend/tests/test_search_epc.py @@ -48,3 +48,12 @@ class TestSearchEpcIntegration: assert epc_searcher.newest_epc["lmk-key"] == lmk_key assert epc_searcher.newest_epc["uprn"] == uprn assert len(epc_searcher.older_epcs) == n_old_epcs + + def test_search_housenumber(self): + eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter' + res1 = SearchEpc.get_house_number(eg1, None) + assert res1 == "A11" + + eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL' + res2 = SearchEpc.get_house_number(eg2, None) + assert res2 == "A9" diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 74dc28e0..fcf11765 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -352,9 +352,11 @@ def app(): sheet_name=SHEET_NAME, address1_colname=ADDRESS1_COLUMN, postcode_colname=POSTCODE_COLUMN, + landlord_property_id="UPRN", full_address_colname=FULLADDRESS_COLUMN, full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, missing_postcodes_method=MISSING_POSTCODES_METHOD, + 
address1_extraction_method=ADDRESS1_METHOD, landlord_year_built=PROPERTY_YEAR_BUILT, landlord_uprn=UPRN_COLUMN, landlord_property_type=PROPERTY_TYPE_COLUMN, From 0a643d80adb412ea4069664cc12efaf9e71fad42 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 14:21:29 +0000 Subject: [PATCH 186/255] building out multi-unit flagging --- asset_list/AssetList.py | 16 ++++++++++++++-- asset_list/tests/test_standardisation.py | 11 ++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 1a3f6180..fde24fe2 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,5 +1,4 @@ -import os -import usaddress +import re import pandas as pd from utils.logger import setup_logger from backend.SearchEpc import SearchEpc @@ -42,6 +41,9 @@ class AssetList: DOMNA_PROPERTY_ID = "domna_property_id" + # Regular expression for identifying if the address might point to multiple units + MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') + def __init__( self, local_filepath, @@ -139,6 +141,14 @@ class AssetList: cleaned = cleaned.rstrip(", ").strip(",").strip() return cleaned + @classmethod + def _identify_multi_address(cls, address): + # We check if the address is comma separated + if "," in address: + address1_section = address.split(",")[0] + # We look for string in the form (x-y) + return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) + def standardise(self): """ This function is used to standardise the asset list @@ -205,4 +215,6 @@ class AssetList: ] ] + # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) + raise NotImplementedError diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py index 1a083bbc..b6d9a391 100644 --- a/asset_list/tests/test_standardisation.py +++ b/asset_list/tests/test_standardisation.py @@ -1,12 +1,5 @@ from asset_list.AssetList import AssetList -from 
backend.SearchEpc import - -def test_address1_extraction(): - example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL' - - # AssetList._extract_address1( - # example, - # ) - pass +def test_multi_unit_address_flagging(): + assert AssetList._identify_multi_address('Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL') From ecf8e46c65ae7e09725258bcb578690d1156bf14 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:12:29 +0000 Subject: [PATCH 187/255] getting asset list class live --- .idea/terraform.xml | 6 + asset_list/AssetList.py | 321 +++++++++++++++++++++++-- asset_list/app.py | 1 + asset_list/mappings/exising_pv.py | 8 + asset_list/mappings/heating_systems.py | 46 ++++ asset_list/mappings/property_type.py | 16 ++ asset_list/mappings/walls.py | 38 +++ asset_list/requirements.txt | 4 +- etl/route_march_data_pull/app.py | 5 +- 9 files changed, 420 insertions(+), 25 deletions(-) create mode 100644 .idea/terraform.xml create mode 100644 asset_list/app.py create mode 100644 asset_list/mappings/exising_pv.py create mode 100644 asset_list/mappings/heating_systems.py create mode 100644 asset_list/mappings/property_type.py create mode 100644 asset_list/mappings/walls.py diff --git a/.idea/terraform.xml b/.idea/terraform.xml new file mode 100644 index 00000000..cd46a3d3 --- /dev/null +++ b/.idea/terraform.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index fde24fe2..e61cc89b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,16 +1,200 @@ +import os import re +from datetime import datetime +from openai import OpenAI +import tiktoken +import numpy as np import pandas as pd +from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc +import asset_list.mappings.property_type as property_type_mappings +import asset_list.mappings.walls as walls_mappings +import 
asset_list.mappings.heating_systems as heating_mappings +import asset_list.mappings.exising_pv as existing_pv_mappings logger = setup_logger() +# OpenAI API Key (set this in your environment variables for security) +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + + +class DataRemapper: + def __init__(self, standard_values, standard_map=None, max_tokens=1000): + """ + Initialize the remapper with standard values and a predefined mapping. + + :param standard_values: Set of allowed standardized values. + :param standard_map: Dictionary of common remappings {raw_value: standard_value}. + """ + self.standard_values = {v.lower() for v in standard_values} # Normalize to lowercase + self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()} # Predefined mappings + self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity + self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing + + # Tokenizer for counting tokens + self.tokenizer = tiktoken.encoding_for_model(self.ai_model) + + # Track token usage and remap dictionary + self.total_tokens_used = 0 + self.total_cost = 0 + self.remap_dict = {} # {original_value: standardized_value} + self.max_tokens = 1000 # Limit for OpenAI API + + # Memoization for AI calls + self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} + # Capture the reponse for debugging + self.ai_response = None + + # OpenAI pricing (as of Feb 2024) + self.pricing = { + "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000}, + "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, + } + + self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + + @staticmethod + def clean_string(text): + """Basic text cleaning: remove extra spaces, punctuation, and normalize case.""" + if not isinstance(text, str): + return None + text = text.strip().lower() + text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + return text + + def fuzzy_match(self, text): + """Use fuzzy 
matching to find the closest standard value.""" + match, score = process.extractOne(text, self.standard_values) if text else (None, 0) + return match if score >= self.fuzzy_threshold else None + + def count_tokens(self, text): + """Estimate the number of tokens in a given text.""" + return len(self.tokenizer.encode(text)) if text else 0 + + def ai_standardize(self, unmapped_values): + """Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.""" + if not unmapped_values: + return {} + + unmapped_tuple = tuple(sorted(unmapped_values)) # Ensure consistency for memoization + if unmapped_tuple in self.ai_cache: + return self.ai_cache[unmapped_tuple] # Return memoized result + + prompt = f""" + You are an expert in data classification. Standardize each of these values into one of the categories: + {list(self.standard_values)}. + + Return only a JSON dictionary where: + - The keys are the original values. + - The values are the standardized ones. + + Strictly return JSON **without markdown formatting** or extra text. 
+ + Example Output: + {{ + "BLKHOUS": "block house", + "BEDSIT": "bedsit" + }} + + Values to standardize: + {unmapped_values} + """ + + # Count input tokens + input_tokens = self.count_tokens(prompt) + if input_tokens > self.max_tokens: + raise ValueError("Input tokens exceed the maximum limit.") + + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + + output_text = response.choices[0].message.content.strip() + output_tokens = self.count_tokens(output_text) # Count output tokens + + # Track total token usage + self.total_tokens_used += input_tokens + output_tokens + + # Estimate cost + input_cost = input_tokens * self.pricing[self.ai_model]["input"] + output_cost = output_tokens * self.pricing[self.ai_model]["output"] + self.total_cost += input_cost + output_cost + + try: + # Parse response as dictionary + mapping = eval(output_text) # OpenAI should return a valid dictionary + except: + mapping = {val: "unknown" for val in unmapped_values} # Fallback + + # Memoize the AI response + self.ai_cache[unmapped_tuple] = mapping + # We store the raw AI response for debugging + logger.debug(f"AI Response: {mapping}") + self.ai_response = output_text + + return mapping + + def standardize_list(self, values_to_remap): + """ + Standardizes a list of values and returns a dictionary {original_value: standardized_value}. + + :param values_to_remap: List of raw values to standardize. + :return: Dictionary {original_value: standardized_value}. 
+ """ + unique_values = set(values_to_remap) # Process only unique values + + unmapped_values = [] + for value in unique_values: + if pd.isna(value): # Handle NaN values + self.remap_dict[value] = "unknown" + continue + + cleaned_value = self.clean_string(value) + + # Rule-Based Check (Predefined Mapping) + if cleaned_value in self.standard_map: + self.remap_dict[value] = self.standard_map[cleaned_value] + continue + + # Exact Match in Standard Values + if cleaned_value in self.standard_values: + self.remap_dict[value] = cleaned_value + continue + + # Fuzzy Matching + fuzzy_match = self.fuzzy_match(cleaned_value) + if fuzzy_match: + self.remap_dict[value] = fuzzy_match + continue + + # Capture anything that wasn't mapped + unmapped_values.append(value) + + # AI Model - remap anything unmapped (batch request) + ai_mapping = self.ai_standardize(unmapped_values) + self.remap_dict.update(ai_mapping) + + return self.remap_dict + + def report_usage(self): + """Prints a summary of token usage and cost.""" + print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}") + print(f"💰 Estimated Cost: ${self.total_cost:.4f}") + class AssetList: """ This class is used to standardise asset lists so that we can process the core information in a consistent manner. 
""" + DATETIME_REMAP = { + "Pre 1900": datetime(year=1899, month=12, day=31), + } + # These are the accepted methods we have for cleaning the address1 column ADDRESS_1_CLEANING_METHODS = [ "first_two_words", # This method will split on the fist two words, where the separator is a space @@ -19,15 +203,6 @@ class AssetList: # "address1_extraction" # This method will use the NLP model to extract address1 ] - STANDARD_PROPERTY_TYPES = [ - "house", - "flat", - "bungalow", - "maisonette", - "park home", - "block house", - ] - # Standard column Names STANDARD_ADDRESS_1 = "domna_address_1" STANDARD_POSTCODE = "domna_postcode" @@ -44,6 +219,15 @@ class AssetList: # Regular expression for identifying if the address might point to multiple units MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') + # List of columns relating to the non-intrusive data + NON_INTRUSIVES_COLNAMES = [ + "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required", + "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION", + "Any further surveyor notes", 'Surveyors Name' + ] + + #### Mapping for wall construction + def __init__( self, local_filepath, @@ -96,6 +280,8 @@ class AssetList: "existing_pv": None } + self.variable_mappings = {} + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -149,7 +335,7 @@ class AssetList: # We look for string in the form (x-y) return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) - def standardise(self): + def init_standardise(self): """ This function is used to standardise the asset list :return: standardised asset list @@ -202,19 +388,110 @@ class AssetList: self.create_property_id() # We keep just the columns we care about and will work through the various columns and standardise - self.standardised_asset_list = self.standardised_asset_list[ - [ - self.landlord_property_id, - self.DOMNA_PROPERTY_ID, - self.address1_colname, 
- self.postcode_colname, - self.full_address_colname, - self.landlord_year_built, - self.landlord_uprn, - self.landlord_property_type, - ] + variables = [ + self.landlord_property_id, + self.DOMNA_PROPERTY_ID, + self.address1_colname, + self.postcode_colname, + self.full_address_colname, + self.landlord_uprn, + self.landlord_property_type, + self.landlord_year_built, + self.landlord_wall_construction, + self.landlord_heating_system, + self.landlord_existing_pv ] + rename = {} + + if self.non_intrusives_present: + variables += self.NON_INTRUSIVES_COLNAMES + rename = { + **rename, + **dict( + zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) + ) + } + + self.standardised_asset_list = self.standardised_asset_list[variables].rename( + columns=rename + ) # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) + self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ + self.full_address_colname + ].apply(lambda x: self._identify_multi_address(x)) - raise NotImplementedError + # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and + # we see instances of "average thermal transmittance" in the description + self.standardised_asset_list[self.landlord_wall_construction] = np.where( + self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( + "average thermal transmittance" + ), + "new build - average thermal transmittance", + self.standardised_asset_list[self.landlord_wall_construction] + ) + + # Clear our build year column + + # We attempt to process the year built column + if self.landlord_year_built is not None: + # We check if we have a datetime + if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime): + # We treat any string columns - with common values we see + self.standardised_asset_list[self.landlord_year_built] = ( + 
self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP) + ) + + self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime( + self.standardised_asset_list[self.landlord_year_built] + ) + # Convert this to year + self.standardised_asset_list[self.landlord_year_built] = ( + self.standardised_asset_list[self.landlord_year_built].dt.year + ) + else: + raise NotImplementedError("Year built column must be a datetime - implement me") + + # We now create standard lookups + to_remap = { + self.landlord_property_type: { + "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES, + "standard_map": property_type_mappings.PROPERTY_MAPPING + }, + self.landlord_wall_construction: { + "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS, + "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS + }, + self.landlord_heating_system: { + "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS, + "standard_map": heating_mappings.HEATING_MAPPINGS + }, + self.landlord_existing_pv: { + "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV, + "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS + } + } + + for variable, config in to_remap.items(): + logger.info("Standardising variable: %s", variable) + values_to_remap = self.standardised_asset_list[variable].unique() + # We want to map this to our standardised list of property types we're interested in + remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"]) + remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist()) + self.variable_mappings[variable] = remap_dictionary + + # We now print out the variable mappings, which can be reviewed by the user, before the final standardised + # asset list is returned + + def apply_standardiation(self, override_empty_mappings=False): + """ + This function applies the standardisation to the asset list + :param override_empty_mappings: If true, will 
override the check for empty mappings. This is only relevant + if there are no categories which need remapping which is highly unlikely + :return: + """ + if not self.variable_mappings and not override_empty_mappings: + raise ValueError("Please run init_standardise first") + + def create_lookup_mappings(self): + pass diff --git a/asset_list/app.py b/asset_list/app.py new file mode 100644 index 00000000..21b405d8 --- /dev/null +++ b/asset_list/app.py @@ -0,0 +1 @@ +import os diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py new file mode 100644 index 00000000..1e45bd83 --- /dev/null +++ b/asset_list/mappings/exising_pv.py @@ -0,0 +1,8 @@ +STANDARD_EXISTING_PV = { + "already has PV", "no PV", "unknown" +} + +EXISTING_PV_MAPPINGS = { + "NO": "no PV", + "YES": "already has PV", +} diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py new file mode 100644 index 00000000..4fce39ab --- /dev/null +++ b/asset_list/mappings/heating_systems.py @@ -0,0 +1,46 @@ +STANDARD_HEATING_SYSTEMS = { + "gas combi boiler", + "electric storage heaters", + "district heating", + "gas condensing boiler", + "oil boiler", + "gas condensing combi", + "air source heat pump", + "boiler - other fuel", + "ground source heat pump", + "electric radiators", + "other", + "electric boiler", + "unknown", + "communal gas boiler", +} + +HEATING_MAPPINGS = { + "Combi - GAS": "gas combi boiler", + "E7 Storage Heaters": "electric storage heaters", + "District heating system": "district heating", + "Condensing Boiler - GAS": "gas condensing boiler", + "Boiler Oil/other": "oil boiler", + "Condensing Combi - Gas": "gas condensing combi", + "Air Source Source Heat Pump": "air source heat pump", + "Biomass Boiler": "boiler - other fuel", + "Ground Source Heat Pump": "ground source heat pump", + "Electric Oil filled radiators": "electric radiators", + "Solid Fuel": "other", + "LPG Boiler": "boiler - other fuel", + "Electric Boiler": "electric 
boiler", + "No data": "unknown", + "Boiler Communal/Commercial - GAS": "communal gas boiler", + "Eco Electric Radiators": "electric radiators", + "Gas fire": "other", + "Backboiler - Solid fuel": "other", +} + +# array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system', +# 'Condensing Boiler - GAS', 'Boiler Oil/other', +# 'Condensing Combi - Gas', 'Air Source Source Heat Pump', +# 'Biomass Boiler', 'Ground Source Heat Pump', +# 'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler', +# 'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS', +# 'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'], +# dtype=object) diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py new file mode 100644 index 00000000..bcad9ede --- /dev/null +++ b/asset_list/mappings/property_type.py @@ -0,0 +1,16 @@ +# These are the standard categories for property types +STANDARD_PROPERTY_TYPES = { + "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", + "unknown", "other" +} + +# This is a basic mapping that we use to map values that we've seen commonly to standard values +PROPERTY_MAPPING = { + "HOUSE": "house", + "FLAT": "flat", + "MAISONET": "maisonette", + "BUNGALOW": "bungalow", + "BLKHOUS": "block house", + "BEDSIT": "bedsit", + "COACHSE": "coach house", +} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py new file mode 100644 index 00000000..7dec7d12 --- /dev/null +++ b/asset_list/mappings/walls.py @@ -0,0 +1,38 @@ +STANDARD_WALL_CONSTRUCTIONS = { + "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", + "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", + "new build - average thermal transmittance", +} + +WALL_CONSTRUCTION_MAPPINGS = { + "New Build - Average Thermal Transmittance": "new build - average thermal transmittance", + 'Average thermal transmittance 0.25 
W/m?K': 'unknown', + 'Cavity wall, as built, insulated (assumed)': 'filled cavity', + 'Average thermal transmittance 0.31 W/m?K': 'unknown', + 'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m?K': 'unknown', + 'Average thermal transmittance 0.27 W/m²K': 'unknown', + 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m?K': 'unknown', + 'Granite or whin, with internal insulation': 'granite or whinstone', + 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', + 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown', + 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'Average thermal transmittance 0.33 W/m?K': 'unknown', 'Cavity wall,': 'unknown', + 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown', + 'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 
W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown', + 'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown', + 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', + 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', + 'Cavity wall, with internal insulation': 'filled cavity', + 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown' +} diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index d6d64471..0c16c43a 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -5,4 +5,6 @@ pydantic-settings==2.6.0 epc-api-python==1.0.2 fuzzywuzzy boto3 -openpyxl \ No newline at end of file +openpyxl +openai +tiktoken \ No newline at end of file diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index fcf11765..ca5195d6 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -364,10 +364,11 @@ def app(): landlord_heating_system="Heat Source", landlord_existing_pv="PV (Y/N)" ) - self.standardised_asset_list( - # In here, we might want to pass some specific remaps + self.init_standardise( ) + self.apply_transformations() + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" # SHEET_NAME = "Sheet1" From 978deb286bc411a563631e81685319a38ef9061e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:32:05 +0000 Subject: [PATCH 188/255] debugging remapper --- asset_list/AssetList.py | 19 ++++++++++---- asset_list/mappings/exising_pv.py | 4 +++ asset_list/mappings/heating_systems.py | 17 
++++++------- asset_list/mappings/property_type.py | 2 ++ asset_list/mappings/walls.py | 34 +++++++++++++++++++++++++- etl/route_march_data_pull/app.py | 5 ++-- 6 files changed, 63 insertions(+), 18 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index e61cc89b..8f905a33 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -27,8 +27,8 @@ class DataRemapper: :param standard_values: Set of allowed standardized values. :param standard_map: Dictionary of common remappings {raw_value: standard_value}. """ - self.standard_values = {v.lower() for v in standard_values} # Normalize to lowercase - self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()} # Predefined mappings + self.standard_values = standard_values + self.standard_map = standard_map self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing @@ -39,7 +39,7 @@ class DataRemapper: self.total_tokens_used = 0 self.total_cost = 0 self.remap_dict = {} # {original_value: standardized_value} - self.max_tokens = 1000 # Limit for OpenAI API + self.max_tokens = max_tokens # Limit for OpenAI API # Memoization for AI calls self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} @@ -61,6 +61,8 @@ class DataRemapper: return None text = text.strip().lower() text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + # Replace double strings + text = re.sub(r'\s+', ' ', text) return text def fuzzy_match(self, text): @@ -106,6 +108,7 @@ class DataRemapper: if input_tokens > self.max_tokens: raise ValueError("Input tokens exceed the maximum limit.") + logger.info("Calling OpenAI API for standardization...") response = self.openai_client.chat.completions.create( model=self.ai_model, messages=[{"role": "user", "content": prompt}], @@ -156,8 +159,14 @@ class DataRemapper: cleaned_value = self.clean_string(value) # Rule-Based Check (Predefined Mapping) - if 
cleaned_value in self.standard_map: - self.remap_dict[value] = self.standard_map[cleaned_value] + if cleaned_value in self.standard_map or value in self.standard_map: + self.remap_dict[value] = ( + self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value] + ) + continue + + if value.lower() in self.standard_map: + self.remap_dict[value] = self.standard_map[value.lower()] continue # Exact Match in Standard Values diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py index 1e45bd83..06e77bba 100644 --- a/asset_list/mappings/exising_pv.py +++ b/asset_list/mappings/exising_pv.py @@ -5,4 +5,8 @@ STANDARD_EXISTING_PV = { EXISTING_PV_MAPPINGS = { "NO": "no PV", "YES": "already has PV", + "no": "no PV", + "yes": "already has PV", + True: "already has PV", + False: "no PV", } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 4fce39ab..2fbdff70 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -34,13 +34,12 @@ HEATING_MAPPINGS = { "Eco Electric Radiators": "electric radiators", "Gas fire": "other", "Backboiler - Solid fuel": "other", + 'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters', + 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric radiators', + 'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler', + 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', + 'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other', } - -# 
array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system', -# 'Condensing Boiler - GAS', 'Boiler Oil/other', -# 'Condensing Combi - Gas', 'Air Source Source Heat Pump', -# 'Biomass Boiler', 'Ground Source Heat Pump', -# 'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler', -# 'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS', -# 'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'], -# dtype=object) diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index bcad9ede..ec569123 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -11,6 +11,8 @@ PROPERTY_MAPPING = { "MAISONET": "maisonette", "BUNGALOW": "bungalow", "BLKHOUS": "block house", + "blkhous": "block house", "BEDSIT": "bedsit", "COACHSE": "coach house", + "coachse": "coach house", } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 7dec7d12..33db1fef 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,3 +1,5 @@ +from asset_list.AssetList import DataRemapper + STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", @@ -18,6 +20,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown', 'Average thermal transmittance 0.18 W/m?K': 'unknown', 'Granite or whin, with internal insulation': 'granite or whinstone', + "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone", 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal 
transmittance 0.62 W/m?K': 'unknown', @@ -34,5 +37,34 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', 'Cavity wall, with internal insulation': 'filled cavity', - 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown' + 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown', + 'new build - average thermal transmittance': 'new build - average thermal transmittance', + 'average thermal transmittance 0.25 w/m?k': 'unknown', + 'cavity wall, as built, insulated (assumed)': 'filled cavity', + 'average thermal transmittance 0.31 w/m?k': 'unknown', + 'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown', + 'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown', + 'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown', + 'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m?k': 'unknown', + 'granite or whin, with internal insulation': 'granite or whinstone', + 'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', + 'average thermal 
transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', + 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown', + 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', + 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', + 'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown', + 'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown', + 'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown', + 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', + 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', + 'average thermal transmittance 0.28 w/m?k': 'unknown', } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index ca5195d6..1289fb09 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -346,7 +346,7 @@ def app(): invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] - self = AssetList( + asset_list = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME, @@ -364,8 +364,7 @@ def app(): landlord_heating_system="Heat Source", landlord_existing_pv="PV (Y/N)" ) - self.init_standardise( - ) + 
asset_list.init_standardise() self.apply_transformations() From 776285dd1592e037f9345a4396d83db671dedd03 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:35:21 +0000 Subject: [PATCH 189/255] added map printing --- asset_list/AssetList.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 8f905a33..87402924 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,8 +1,9 @@ import os import re +import tiktoken +from pprint import pprint from datetime import datetime from openai import OpenAI -import tiktoken import numpy as np import pandas as pd from fuzzywuzzy import process @@ -491,6 +492,12 @@ class AssetList: # We now print out the variable mappings, which can be reviewed by the user, before the final standardised # asset list is returned + for variable, mapping in self.variable_mappings.items(): + pprint(f"Variable: {variable}") + pprint(mapping) + # Print a space + print("\n") + pprint("=======================================") def apply_standardiation(self, override_empty_mappings=False): """ From 75e7c13a29ed98059a99e54245b72cebd9c52f48 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Feb 2025 22:51:48 +0000 Subject: [PATCH 190/255] modifying creation of ids --- asset_list/AssetList.py | 37 ++++++++++++++++++++++++++++---- etl/route_march_data_pull/app.py | 13 +++-------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 87402924..b153b624 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1,3 +1,4 @@ +import hashlib import os import re import tiktoken @@ -324,11 +325,24 @@ class AssetList: We want all figures to be positive :return: """ - import sys + + # We'll remove punctuation and whitespace from the address, before hashing to produce an ID + + def _make_hash(value): + """Generates a stable SHA256 hash suffix and appends it 
to a cleaned version of the value.""" + # Normalize and remove special characters for cleaner ID + cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower() + + # Generate SHA-256 hash and truncate it + short_hash = hashlib.sha256(value.encode()).hexdigest()[:12] + + return f"{cleaned_value}-{short_hash}" + + # Apply transformation self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( - self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[ - self.postcode_colname] - ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width) + self.standardised_asset_list[self.full_address_colname] + + self.standardised_asset_list[self.postcode_colname] + ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash) @staticmethod def _strip_postcode_from_full_address(full_address, postcode): @@ -509,5 +523,20 @@ class AssetList: if not self.variable_mappings and not override_empty_mappings: raise ValueError("Please run init_standardise first") + logger.info("Applying standardisation to asset list") + + for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) + + if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): + # Drop the dupes + pprint( + f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " + f"addresses - dropping" + ) + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ] + def create_lookup_mappings(self): pass diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 1289fb09..54ae2280 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -344,7 +344,8 @@ def app(): HAS_NON_INTRUSIVES = True PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits - 
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"] + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = {} asset_list = AssetList( local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), @@ -366,7 +367,7 @@ def app(): ) asset_list.init_standardise() - self.apply_transformations() + asset_list.apply_standardiation() # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" @@ -382,9 +383,6 @@ def app(): # # If we have the non-intrusives data, this should be true # HAS_NON_INTRUSIVES = True - # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) if MISSING_POSTCODES_METHOD is not None: @@ -464,11 +462,6 @@ def app(): # We check for duplicated addresses asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] - if asset_list["deduper"].duplicated().sum(): - # Drop the dupes - print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") - asset_list = asset_list[~asset_list["deduper"].duplicated()] - asset_list = asset_list.drop(columns=["deduper"]) # We chunk up this data into 5000 rows at a time # Create the chunks directory From fe6de36782bc3d413f7813ee54ad151e11bc929d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 07:46:52 +0000 Subject: [PATCH 191/255] creating new maps --- etl/route_march_data_pull/app.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 54ae2280..d520895d 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -6,6 +6,10 @@ import numpy as np from tqdm import tqdm from datetime import datetime from asset_list.AssetList import AssetList +from asset_list.mappings.property_type 
import PROPERTY_MAPPING +from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS +from asset_list.mappings.heating_systems import HEATING_MAPPINGS +from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS from dotenv import load_dotenv from backend.SearchEpc import SearchEpc @@ -367,6 +371,21 @@ def app(): ) asset_list.init_standardise() + # We produce the new maps, which can be saved for future useage + + new_property_type_map = PROPERTY_MAPPING.copy().update( + asset_list.variable_mappings[asset_list.landlord_property_type] + ) + new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_wall_construction] + ) + new_heating_map = HEATING_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_heating_system] + ) + new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_existing_pv] + ) + asset_list.apply_standardiation() # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" From 63dbda005d63d590b1d2e1b156d15d125a67c746 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 07:57:47 +0000 Subject: [PATCH 192/255] completing full rename --- asset_list/AssetList.py | 51 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index b153b624..8379cc2a 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -218,8 +218,9 @@ class AssetList: STANDARD_ADDRESS_1 = "domna_address_1" STANDARD_POSTCODE = "domna_postcode" STANDARD_FULL_ADDRESS = "domna_full_address" - STANDARD_YEAR_BUILT = "domna_year_built" + STANDARD_YEAR_BUILT = "landlord_year_built" STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id" STANDARD_PROPERTY_TYPE = "landlord_property_type" STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" STANDARD_HEATING_SYSTEM = 
"landlord_heating_system" @@ -293,6 +294,8 @@ class AssetList: self.variable_mappings = {} + self.rename_map = {} + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -359,6 +362,25 @@ class AssetList: # We look for string in the form (x-y) return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) + @staticmethod + def _convert_uprn(x): + """ + Used to convert UPRNS to integer strings + :param x: uprn to convert + :return: converted uprn + """ + + if pd.isnull(x): + return x + + # check if numeric + if np.isreal(x): + return str(int(x)) + + if str(x).isdigit(): + return str(int(x)) + return x + def init_standardise(self): """ This function is used to standardise the asset list @@ -411,6 +433,12 @@ class AssetList: # We create the domna property id self.create_property_id() + # Clean up the UPRN column, if the landlord has provided them + if self.landlord_uprn is not None: + self.standardised_asset_list[self.landlord_uprn] = ( + self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn) + ) + # We keep just the columns we care about and will work through the various columns and standardise variables = [ self.landlord_property_id, @@ -425,7 +453,21 @@ class AssetList: self.landlord_heating_system, self.landlord_existing_pv ] - rename = {} + # Keep just non-null variables (e.g landlord may not provide uprn + variables = [v for v in variables if v is not None] + rename = { + self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID, + self.address1_colname: self.STANDARD_ADDRESS_1, + self.postcode_colname: self.STANDARD_POSTCODE, + self.full_address_colname: self.STANDARD_FULL_ADDRESS, + self.landlord_uprn: self.STANDARD_UPRN, + self.landlord_property_type: self.STANDARD_PROPERTY_TYPE, + self.landlord_year_built: self.STANDARD_YEAR_BUILT, + self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION, + self.landlord_heating_system: 
self.STANDARD_HEATING_SYSTEM, + self.landlord_existing_pv: self.STANDARD_EXISTING_PV + } + rename = {k: v for k, v in rename.items() if k is not None} if self.non_intrusives_present: variables += self.NON_INTRUSIVES_COLNAMES @@ -538,5 +580,10 @@ class AssetList: ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() ] + # Apply renames to our standard names + self.standardised_asset_list = self.standardised_asset_list.rename( + columns=self.rename_map + ) + def create_lookup_mappings(self): pass From 47ad0e8275ce218b0cd44de6342ff619d83a0d81 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:21:59 +0000 Subject: [PATCH 193/255] refactoring get_data methodology --- asset_list/AssetList.py | 23 +++-- etl/route_march_data_pull/app.py | 149 +++++++++---------------------- 2 files changed, 53 insertions(+), 119 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 8379cc2a..14dce093 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -295,6 +295,7 @@ class AssetList: self.variable_mappings = {} self.rename_map = {} + self.keep_variables = [] def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): @@ -454,8 +455,8 @@ class AssetList: self.landlord_existing_pv ] # Keep just non-null variables (e.g landlord may not provide uprn - variables = [v for v in variables if v is not None] - rename = { + self.keep_variables = [v for v in variables if v is not None] + self.rename_map = { self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID, self.address1_colname: self.STANDARD_ADDRESS_1, self.postcode_colname: self.STANDARD_POSTCODE, @@ -467,21 +468,17 @@ class AssetList: self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, self.landlord_existing_pv: self.STANDARD_EXISTING_PV } - rename = {k: v for k, v in rename.items() if k is not None} + self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None} if 
self.non_intrusives_present: - variables += self.NON_INTRUSIVES_COLNAMES - rename = { - **rename, + self.keep_variables += self.NON_INTRUSIVES_COLNAMES + self.rename_map = { + **self.rename_map, **dict( zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) ) } - self.standardised_asset_list = self.standardised_asset_list[variables].rename( - columns=rename - ) - # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ self.full_address_colname @@ -498,10 +495,9 @@ class AssetList: ) # Clear our build year column - # We attempt to process the year built column if self.landlord_year_built is not None: - # We check if we have a datetime + # We check if we have a datetime - year built has not been renamed if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime): # We treat any string columns - with common values we see self.standardised_asset_list[self.landlord_year_built] = ( @@ -581,7 +577,8 @@ class AssetList: ] # Apply renames to our standard names - self.standardised_asset_list = self.standardised_asset_list.rename( + # Perform final variable selection and renaming: + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( columns=self.rename_map ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index d520895d..83e5e0ca 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -1,10 +1,10 @@ import os import time -from BaseUtility import Definitions +import json import pandas as pd import numpy as np from tqdm import tqdm -from datetime import datetime +from BaseUtility import Definitions from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS @@ -31,8 +31,8 @@ EPC_AUTH_TOKEN = 
os.getenv("EPC_AUTH_TOKEN") def get_data( - asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None, - epc_api_only=False + asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, + uprn_column=None, epc_api_only=False, row_id_name="row_id" ): epc_data = [] errors = [] @@ -103,12 +103,12 @@ def get_data( searcher.find_property(skip_os=True) if searcher.newest_epc is None: - no_epc.append(home["row_id"]) + no_epc.append(home[row_id_name]) continue if epc_api_only: epc = { - "row_id": home["row_id"], + row_id_name: home[row_id_name], **searcher.newest_epc.copy() } @@ -144,7 +144,7 @@ def get_data( time.sleep(np.random.uniform(0.1, 1)) epc = { - "row_id": home["row_id"], + row_id_name: home[row_id_name], **searcher.newest_epc.copy(), "recommendations": property_recommendations["rows"], "find_my_epc_data": find_epc_data, @@ -152,7 +152,7 @@ def get_data( epc_data.append(epc) except Exception as e: - errors.append(home["row_id"]) + errors.append(home[row_id_name]) time.sleep(5) return epc_data, errors, no_epc @@ -402,113 +402,48 @@ def app(): # # If we have the non-intrusives data, this should be true # HAS_NON_INTRUSIVES = True - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - - if MISSING_POSTCODES_METHOD is not None: - if MISSING_POSTCODES_METHOD == "last_two_words": - # Replace any double spaces - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) - asset_list["Postcode"] = np.where( - pd.isnull(asset_list["Postcode"]), - asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "), - asset_list["Postcode"] - ) - else: - raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized") - - asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() - asset_list["row_id"] = asset_list.index - - # We clean up portential non-breaking spaces, and double spaces - for 
col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: - asset_list[col] = asset_list[col].astype(str) - asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) - asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) - asset_list[col] = asset_list[col].str.strip() - - if ADDRESS1_COLUMN is None: - ADDRESS1_COLUMN = "address1_extracted" - asset_list = extract_address1( - asset_list=asset_list, - full_address_col=FULLADDRESS_COLUMN, - postcode_col=POSTCODE_COLUMN, - method=ADDRESS1_METHOD - ) - - if FULLADDRESS_COLUMN is None: - FULLADDRESS_COLUMN = "fulladdress_extracted" - # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas - # Sometimes, some of the columns are empty, so we need to remove them - asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply( - lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1 - ) - - # We clean up portential non-breaking spaces, and double spaces - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str) - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False) - asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False) - - if UPRN_COLUMN is not None: - # Check if it's numeric and if so, make sure it's an integer - def convert_uprn(x): - - if pd.isnull(x): - return x - - # check if numeric - if np.isreal(x): - return str(int(x)) - - if str(x).isdigit(): - return str(int(x)) - return x - - asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn) - - # We attempt to process the year built column - if PROPERTY_YEAR_BUILT is not None: - # We check if we have a datetime - if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime): - # We treat any string columns - with common values we see - datetime_remap = { - "Pre 1900": datetime(year=1899, month=12, day=31), - } - asset_list[PROPERTY_YEAR_BUILT] = 
asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap) - - asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT]) - # Convert this to year - asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year - - # We check for duplicated addresses - asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] + ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time # Create the chunks directory - if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")): - os.makedirs(os.path.join(DATA_FOLDER, "Chunks")) - chunk_size = 5000 - errors = [] - no_epc = [] + force_retrieve_data = False skip = None # Used to skip already completed chunks - for i in range(0, len(asset_list), chunk_size): + chunk_size = 5000 + filename = "Chunk {i}.csv" + download_folder = os.path.join(DATA_FOLDER, "Chunks") + if not os.path.exists(download_folder): + os.makedirs(download_folder) + + chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) + downloaded_files = {filename.format(i=i) for i in chunk_indexes} + + # We check if we have files associated to these files already and if we do, and we do not want to force the + # fetching of the data, we skip + folder_contents = os.listdir(download_folder) + if all(x in folder_contents for x in downloaded_files): + skip = max(chunk_indexes) + + for i in range(0, len(asset_list.standardised_asset_list), chunk_size): print(f"Processing chunk {i} to {i + chunk_size}") - if skip is not None: + if skip is not None and not force_retrieve_data: if i <= skip: continue - chunk = asset_list[i:i + chunk_size] + chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( asset_list=chunk, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, + 
address1_column=asset_list.STANDARD_ADDRESS_1, + postcode_column=asset_list.STANDARD_POSTCODE, manual_uprn_map=MANUAL_UPRN_MAP, - uprn_column=UPRN_COLUMN + uprn_column=asset_list.STANDARD_UPRN ) # We now retrieve any failed properties - chunk_failed = chunk[chunk["row_id"].isin(errors)] + chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] epc_data_failed, _, _ = get_data( asset_list=chunk_failed, + row_id_name=asset_list.DOMNA_PROPERTY_ID, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, postcode_column=POSTCODE_COLUMN, @@ -517,20 +452,22 @@ def app(): ) epc_data_chunk.extend(epc_data_failed) - errors.extend(errors_chunk) - no_epc.extend(no_epc_chunk) # Append the failed data to the main data # Store the chunk locally as a csv pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + # Store the errors and no-data locally + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: + json.dump(errors_chunk, f) + + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + json.dump(no_epc_chunk, f) # We read in and concatenate the created created chunks - chunks_folder = os.path.join(DATA_FOLDER, "Chunks") # List the contents - chunk_files = os.listdir(chunks_folder) epc_data = [] - for file in chunk_files: - csv_data = pd.read_csv(os.path.join(chunks_folder, file)) + for file in downloaded_files: + csv_data = pd.read_csv(os.path.join(download_folder, file)) # We need to convert the recommendations back to a list csv_data["recommendations"] = csv_data["recommendations"].apply(eval) csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) From 591ce5445839780ea64db5376eb0457d27da3d34 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:26:09 +0000 Subject: [PATCH 194/255] hndling case where landlord uprn and landlord property id are the sames --- asset_list/AssetList.py | 6 ++++++ 
etl/route_march_data_pull/app.py | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 14dce093..5e8ff29c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -297,6 +297,12 @@ class AssetList: self.rename_map = {} self.keep_variables = [] + # Finally, we handle the case where the landlord's property ID is actually the OS UPRN + if self.landlord_uprn == self.landlord_property_id: + self.raw_asset_list[self.STANDARD_UPRN] = self.raw_asset_list[self.landlord_uprn].copy() + # Update the reference to landlord UPRn + self.landlord_uprn = self.STANDARD_UPRN + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 83e5e0ca..4bf9fe3a 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -511,6 +511,7 @@ def app(): find_my_epc_data["Solar photovoltaics"] = False # Retrieve just the data we need + epc_df = epc_df[ [ "row_id", @@ -527,21 +528,23 @@ def app(): "walls-description", "floor-description", "transaction-type", - # New fields needed "secondheat-description", "total-floor-area", "construction-age-band", "floor-height", "number-habitable-rooms", "mainheat-description", - # - "energy-consumption-current", # kwh/m2 + 'mainheatcont-description', + "energy-consumption-current", "photo-supply", ] ].rename( columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} ) + asset_list.merge_data(epc_df) + asset_list.insert_ + asset_list = asset_list.merge( epc_df, how="left", From 4a6802a5a24715ca0f047a70b680d6dc484cd7b4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:27:35 +0000 Subject: [PATCH 195/255] fixed bug to reference standardised data when copying uprn instead of raw --- asset_list/AssetList.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 5e8ff29c..86b1bf87 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -299,7 +299,7 @@ class AssetList: # Finally, we handle the case where the landlord's property ID is actually the OS UPRN if self.landlord_uprn == self.landlord_property_id: - self.raw_asset_list[self.STANDARD_UPRN] = self.raw_asset_list[self.landlord_uprn].copy() + self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy() # Update the reference to landlord UPRn self.landlord_uprn = self.STANDARD_UPRN From 37cc43adb1b331d267c724faaf804afaa0b7f2fc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:39:29 +0000 Subject: [PATCH 196/255] refactoring creation of epc dataset --- asset_list/AssetList.py | 42 +++++++++++++++++ etl/route_march_data_pull/app.py | 77 +++++++------------------------- 2 files changed, 59 insertions(+), 60 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 86b1bf87..88425e6d 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -202,6 +202,33 @@ class AssetList: This class is used to standardise asset lists so that we can process the core information in a consistent manner. 
""" + EPC_API_DATA_NAMES = { + "uprn": "epc_os_uprn", + "address1": "epc_address1", + "address": "epc_address", + "postcode": "epc_postcode", + "inspection-date": "epc_inspection_date", + "current-energy-efficiency": "epc_sap_score_on_register", + "current-energy-rating": "epc_rating_on_register", + "property-type": "epc_property_type", + "built-form": "epc_archetype", + "total-floor-area": "epc_total_floor_area", + "construction-age-band": "epc_age_band", + "floor-height": "epc_floor_height", + "number-habitable-rooms": "epc_number_habitable_rooms", + "walls-description": "epc_wall_construction", + "roof-description": "epc_roof_construction", + "floor-description": "epc_floor_construction", + "mainheat-description": "epc_heating_type", + 'mainheatcont-description': "epc_heating_controls", + "secondheat-description": "epc_secondary_heating", + "transaction-type": "epc_reason", + "energy-consumption-current": "epc_heat_demand", + } + FIND_EPC_DATA_NAMES = { + + } + DATETIME_REMAP = { "Pre 1900": datetime(year=1899, month=12, day=31), } @@ -590,3 +617,18 @@ class AssetList: def create_lookup_mappings(self): pass + + def merge_data(self, df: pd.DataFrame): + """ + Used to insert data into the standardised asset list, based on the domna property id + :return: + """ + if self.DOMNA_PROPERTY_ID not in df.columns: + raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") + + if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") + + self.standardised_asset_list = self.standardised_asset_list.merge( + df, how="left", on=self.DOMNA_PROPERTY_ID + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 4bf9fe3a..2e66c4aa 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -474,20 +474,22 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) + # TODO: TEMP!!! 
+ epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) # We expand out the recommendations - recommendations_df = epc_df[["row_id", "recommendations"]] + recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] unique_recommendations = set() for _, row in recommendations_df.iterrows(): unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) - columns = ["row_id"] + list(unique_recommendations) + columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) transformed_data = [] for _, row in recommendations_df.iterrows(): # Initialize a dictionary for this row with False for all recommendations row_data = {col: False for col in columns} - row_data["row_id"] = row["row_id"] + row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] # Set True for each recommendation present in this row for rec in row["recommendations"]: @@ -500,10 +502,11 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation # recommendations - transformed_df = transformed_df[["row_id", "Cavity wall insulation"]] + transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]] # Get the find my epc data - find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( + find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( + columns=["find_my_epc_data"]).join( pd.json_normalize(epc_df["find_my_epc_data"]) ) # We check if we get the solar pv column: @@ -513,46 +516,15 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ - [ - "row_id", - "uprn", - "address1", - "address", - "postcode", - "property-type", - "built-form", - "inspection-date", - "current-energy-rating", - "current-energy-efficiency", - "roof-description", - "walls-description", - "floor-description", - 
"transaction-type", - "secondheat-description", - "total-floor-area", - "construction-age-band", - "floor-height", - "number-habitable-rooms", - "mainheat-description", - 'mainheatcont-description', - "energy-consumption-current", - "photo-supply", - ] - ].rename( - columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"} + [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) + ].rename( + columns=asset_list.EPC_API_DATA_NAMES ) - asset_list.merge_data(epc_df) - asset_list.insert_ - - asset_list = asset_list.merge( - epc_df, - how="left", - on="row_id" - ).merge( + epc_df = epc_df.merge( find_my_epc_data[ [ - "row_id", "heating_text", "hot_water_text", 'Assessor’s name', + asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name', "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", "Assessor’s ID", "Solar photovoltaics" ] @@ -564,31 +536,16 @@ def app(): } ), how="left", - on="row_id" + on=asset_list.DOMNA_PROPERTY_ID ) + asset_list.merge_data(epc_df) + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) asset_list = asset_list.drop(columns=["photo-supply"]) # Rename the columns - asset_list = asset_list.rename(columns={ - "inspection-date": "Date of last EPC", - "current-energy-efficiency": "SAP score on register", - "current-energy-rating": "EPC rating on register", - "property-type": "Property Type", - "built-form": "Archetype - EPC", - "total-floor-area": "Property Floor Area", - "construction-age-band": "Property Age Band", - "floor-height": "Property Floor Height", - "number-habitable-rooms": "Number of Habitable Rooms", - "walls-description": "Wall Construction", - "roof-description": "Roof Construction", - "floor-description": "Floor Construction", - "mainheat-description": "Heating Type", - "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC", - 
"energy-consumption-current": "Heat Demand (kWh/m2)", - }) + asset_list = asset_list asset_list["Estimated Number of Floors"] = asset_list.apply( lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( From ecc9d9954073858685ef1877d574fc5fc73606b2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 08:45:15 +0000 Subject: [PATCH 197/255] major refactor of handling of epc data and starting to set up extract_attributes --- asset_list/AssetList.py | 23 ++++++++++++++++++----- etl/route_march_data_pull/app.py | 17 ++++------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 88425e6d..4ca4c2b8 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -226,7 +226,14 @@ class AssetList: "energy-consumption-current": "epc_heat_demand", } FIND_EPC_DATA_NAMES = { - + "heating_text": "epc_estiamted_heating_kwh", + "hot_water_text": "epc_estimated_hotwater_kwh", + 'Assessor’s name': "epc_assessor_name", + "Assessor's Telephone": "epc_assessor_telephone", + "Assessor's Email": "epc_assessor_email", + "Accreditation scheme": "epc_assessor_accreditation", + "Assessor’s ID": "epc_assessor_id", + "Solar photovoltaics": "epc_solar_pv" } DATETIME_REMAP = { @@ -265,7 +272,8 @@ class AssetList: "Any further surveyor notes", 'Surveyors Name' ] - #### Mapping for wall construction + # Attributes - these are columns that we produce, calcualted based on other pieces of data + ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" def __init__( self, @@ -615,9 +623,6 @@ class AssetList: columns=self.rename_map ) - def create_lookup_mappings(self): - pass - def merge_data(self, df: pd.DataFrame): """ Used to insert data into the standardised asset list, based on the domna property id @@ -632,3 +637,11 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list.merge( df, how="left", on=self.DOMNA_PROPERTY_ID ) + + def 
extract_attributes(self): + # Used to extracty the typical attributes that we use to identify viable work + + self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( + self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | + ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 2e66c4aa..8b112ea2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -522,25 +522,16 @@ def app(): ) epc_df = epc_df.merge( - find_my_epc_data[ - [ - asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name', - "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", - "Assessor’s ID", "Solar photovoltaics" - ] - ].rename( - columns={ - "Solar photovoltaics": "Has Solar PV", - "heating_text": "Heating Estimated kWh", - "hot_water_text": "Hot Water Estimated kWh", - } - ), + find_my_epc_data[[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.FIND_EPC_DATA_NAMES.keys())] + .rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", on=asset_list.DOMNA_PROPERTY_ID ) asset_list.merge_data(epc_df) + asset_list.extract_attributes() + asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) asset_list = asset_list.drop(columns=["photo-supply"]) From ed333e1714fa9ff3a4f09bc789e5aa37bca0bc8e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 09:04:26 +0000 Subject: [PATCH 198/255] refactored est no floors --- asset_list/AssetList.py | 27 +++++++++++++++++++++++++ etl/route_march_data_pull/app.py | 12 +++++------ recommendations/recommendation_utils.py | 7 +++++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 4ca4c2b8..74469c63 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -15,6 +15,12 @@ import 
asset_list.mappings.walls as walls_mappings import asset_list.mappings.heating_systems as heating_mappings import asset_list.mappings.exising_pv as existing_pv_mappings +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) @@ -224,6 +230,7 @@ class AssetList: "secondheat-description": "epc_secondary_heating", "transaction-type": "epc_reason", "energy-consumption-current": "epc_heat_demand", + "photo-supply": "epc_photo_supply" } FIND_EPC_DATA_NAMES = { "heating_text": "epc_estiamted_heating_kwh", @@ -274,6 +281,7 @@ class AssetList: # Attributes - these are columns that we produce, calcualted based on other pieces of data ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" + ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" def __init__( self, @@ -645,3 +653,22 @@ class AssetList: self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) ) + + accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"] + + # The logic here is: + # 1) Take the property type provided by the HA themselves + # 2) In absence of that, take the EPC property type + # 3) Otherwise use None + self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply( + lambda x: estimate_number_of_floors( + property_type=( + x[self.STANDARD_PROPERTY_TYPE].title() if + x[self.STANDARD_PROPERTY_TYPE].title() in accepted_epc_property_types else ( + x[self.EPC_API_DATA_NAMES["property-type"]] if not + pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None + ) + ) + ), + axis=1 + ) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 8b112ea2..9754e726 100644 --- a/etl/route_march_data_pull/app.py +++ 
b/etl/route_march_data_pull/app.py @@ -514,7 +514,6 @@ def app(): find_my_epc_data["Solar photovoltaics"] = False # Retrieve just the data we need - epc_df = epc_df[ [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) ].rename( @@ -529,15 +528,14 @@ def app(): ) asset_list.merge_data(epc_df) + # TODO: TEMP!!! + epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) + asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( + epc_df, how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" + ) asset_list.extract_attributes() - asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) - asset_list = asset_list.drop(columns=["photo-supply"]) - - # Rename the columns - asset_list = asset_list - asset_list["Estimated Number of Floors"] = asset_list.apply( lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( x["Property Type"]) else None, axis=1 diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 00da6107..602684cf 100644 --- a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -205,7 +205,7 @@ def get_wall_u_value( mapped_value = wall_uvalues_df[ wall_uvalues_df["Wall_type"] == mapped_description - ][age_band].values[0] + ][age_band].values[0] if pd.isnull(mapped_value) and "Park home" in mapped_description: # We don't know enough in this case so we default to 0 @@ -428,6 +428,9 @@ def estimate_number_of_floors(property_type): Using the property type, we estimate the number of floors in the property """ + if property_type is None: + return None + if property_type == "House": number_of_floors = 2 elif property_type in ["Flat", "Bungalow"]: @@ -560,7 +563,7 @@ def get_floor_u_value( insulation_lookup = s11[ s11["Age_band"].str.contains(age_band) & s11["Floor_construction"] == floor_type - ] + ] if insulation_lookup.empty: 
insulation_thickness = 0 else: From 8bf6aa5af23378c0a1a27f6f756f3440d89b6bc4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 09:20:25 +0000 Subject: [PATCH 199/255] refactoring construction of the attributes --- asset_list/AssetList.py | 65 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 74469c63..5f4436b8 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -21,6 +21,8 @@ from recommendations.recommendation_utils import ( estimate_number_of_floors ) +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) @@ -279,9 +281,19 @@ class AssetList: "Any further surveyor notes", 'Surveyors Name' ] + # This SAP threshold is a key search criteria for properties that may be eligible for extraction + SAP_RATING_THRESHOLD = 75 + # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable + EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 + # Attributes - these are columns that we produce, calcualted based on other pieces of data ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" + ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter" + ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" + ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" + ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below" + ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" def __init__( self, @@ -672,3 +684,56 @@ class AssetList: ), axis=1 ) + + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float) + ) + # Replace "" value with None + 
self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None) + ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float) + ) + + # Estimate the perimeter + self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + ), axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + floor_height=( + float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if + x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5 + ), + perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER], + built_form=x[self.EPC_API_DATA_NAMES["built-form"]] + ), + axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( + lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + "insulation_thickness"] if not pd.isnull( + x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, + axis=1 + ) + + # We produce some additional fields + # 1) Is the SAP rating below C75 + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.SAP_RATING_THRESHOLD + ) + # 2) Flag anything where the EPC is older than 5 years + + self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = ( + pd.to_datetime( + 
self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]] + ).dt.year < self.EPC_YEAR_THRESHOLD + ) From c0ebffb6cbab5d4f4e2d24f82f352cb8b7024638 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Feb 2025 20:50:05 +0000 Subject: [PATCH 200/255] coding up logic to identify work types --- asset_list/AssetList.py | 250 ++++++++++++++++++++++++- asset_list/mappings/heating_systems.py | 1 + asset_list/mappings/walls.py | 27 ++- etl/route_march_data_pull/app.py | 164 +--------------- 4 files changed, 270 insertions(+), 172 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 5f4436b8..81aa525a 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -10,6 +10,7 @@ import pandas as pd from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc +from BaseUtility import Definitions import asset_list.mappings.property_type as property_type_mappings import asset_list.mappings.walls as walls_mappings import asset_list.mappings.heating_systems as heating_mappings @@ -282,7 +283,9 @@ class AssetList: ] # This SAP threshold is a key search criteria for properties that may be eligible for extraction - SAP_RATING_THRESHOLD = 75 + FILLED_CAVITY_SAP_THRESHOLD = 75 + # This SAP the + EMPTY_CAVITY_SAP_THRESHOLD = 71 # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 @@ -292,9 +295,17 @@ class AssetList: ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter" ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" - ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below" + ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below" ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" + # These are the descriptions that we look for 
in the EPC data that are indicative of no insulation + EPC_NO_WALL_INSULATION_DESCRIPTIONS = [ + "cavity wall, as built, no insulation (assumed)", + "cavity wall, as built, partial insulation (assumed)", + "cavity wall, as built, partial insulation", + "cavity wall, as built, no insulation", + ] + def __init__( self, local_filepath, @@ -728,12 +739,241 @@ class AssetList: # 1) Is the SAP rating below C75 self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= - self.SAP_RATING_THRESHOLD + self.FILLED_CAVITY_SAP_THRESHOLD ) # 2) Flag anything where the EPC is older than 5 years - self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = ( pd.to_datetime( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]] + self.standardised_asset_list[self.EPC_API_DATA_NAMES["inspection-date"]] ).dt.year < self.EPC_YEAR_THRESHOLD ) + + self.process_age_band() + + def process_age_band(self): + processed_age_band = [] + for _, x in self.standardised_asset_list.iterrows(): + + if pd.isnull(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) or ( + x[self.EPC_API_DATA_NAMES["construction-age-band"]] in Definitions.DATA_ANOMALY_MATCHES + ): + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": None, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": "No EPC Age Band" + } + ) + continue + + # We exatract the upper and lower bounds + if x[self.EPC_API_DATA_NAMES["construction-age-band"]] in [ + "England and Wales: 2007 onwards", "England and Wales: 2012 onwards" + ]: + year_lower_bound = 2007 if x[self.EPC_API_DATA_NAMES[ + "construction-age-band"]] == "England and Wales: 2007 onwards" else 2012 + + if pd.isnull(x[self.STANDARD_YEAR_BUILT]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if 
x[self.STANDARD_YEAR_BUILT] >= year_lower_bound + else "EPC Age Band is older than Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": year_lower_bound, + "epc_year_upper_bound": None, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + if x[self.EPC_API_DATA_NAMES["construction-age-band"]] == "England and Wales: before 1900": + + if pd.isnull(x[self.STANDARD_YEAR_BUILT]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] < 1900 + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": None, + "epc_year_upper_bound": 1899, + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + if x[self.EPC_API_DATA_NAMES["construction-age-band"]].isdigit(): + + if pd.isnull(x[self.STANDARD_YEAR_BUILT]): + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] == int( + x[self.EPC_API_DATA_NAMES["construction-age-band"]] + ) + else "EPC Age Band is different from Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), + "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + continue + + # Oherwise, we extract the upper and lower bounds + age_band = x[self.EPC_API_DATA_NAMES["construction-age-band"]].split(": ")[1] + lower_date, upper_date = age_band.split("-") + + age_band_matches = ( + "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) and ( + x[self.STANDARD_YEAR_BUILT] <= float(upper_date) + ) + else "EPC Age Band is 
older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) + else "EPC Age Band is newer than Year Built" + ) + + processed_age_band.append( + { + self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], + "epc_year_lower_bound": int(lower_date), + "epc_year_upper_bound": int(upper_date), + "Does Age Match EPC Age Band?": age_band_matches + } + ) + + processed_age_band = pd.DataFrame(processed_age_band) + + self.standardised_asset_list = self.standardised_asset_list.merge( + processed_age_band, how="left" + ) + + def identify_worktypes(self): + + # If we have non-intrusives completed, we can use this to identify work types + + if self.non_intrusives_present: + ###################################################### + # Empty cavity: + ###################################################### + # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled + # 2) The age is before 1995 + # TODO: 3) Remove anything that likley has access issues + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & + self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) + ) + + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= 1995 + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) + ) + + ###################################################### + # Extraction + 
###################################################### + + # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged + # as needing a CIGA check. What is the logic we should be applying here? + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"]) + ) & ( + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + ) + + ###################################################### + # Solar + ###################################################### + # Criteria: + + # TODO: Standardise these columns with our cleaned_data object + + # Check 1: Does the property have a valid heating system? + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] + ) + ) + + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().str.contains("air source heat pump|ground source heat pump") + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters" + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES[ + "mainheatcont-description"]] == "Controls for high heat retention storage heaters" + ) + ) + ) + + # Check 2: Does the property have solar already + self.standardised_asset_list["property_has_solar"] = ( + (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | + 
(self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") | + (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + ) + + # Check 3: Does the property meet the fabric condition + # Solar PV installs are subject to the minimum insulation requirements which means: + # 1) one of the following insulation measures must be installed as part of the same + # ECO4 project: + # • roof insulation (flat roof, pitched roof, room-in-roof) + # • exterior facing wall insulation (cavity wall, solid wall) + # • party cavity wall insulation + # • floor insulation (solid and underfloor) + # + # OR + # + # all measures (except any exempted measure referred to in paragraph 4.28) + # listed in paragraph a) must already be installed + # + # With this in mind, we look for 2 clases + # 1) The property is fully insulated apart from the loft (<200mm insulation) + # 2) THe property is fully insulated + + self.standardised_asset_list["solar_landlord_walls_insulated"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( + ["filled cavity", "insulated solid brick"] + ) + ) + + EPC_INSULATED_WALLS_SUBSTRINGS = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + self.standardised_asset_list["landlord_wall_construction"].value_counts() + + EPC_INSULATED_ROOF_SUBSTRINGS = [ + "(another dwelling above)", "limited insulation", "(other premises above)", + ", no insulation", + ] diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 2fbdff70..89bfe0c4 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -13,6 +13,7 @@ STANDARD_HEATING_SYSTEMS = { "electric boiler", "unknown", "communal gas boiler", + "high heat retention storage heaters", } HEATING_MAPPINGS = { diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 33db1fef..c5cca599 100644 --- a/asset_list/mappings/walls.py +++ 
b/asset_list/mappings/walls.py @@ -1,8 +1,10 @@ from asset_list.AssetList import DataRemapper STANDARD_WALL_CONSTRUCTIONS = { - "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick", - "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", + "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", + "timber frame", "uninsulated solid brick", + "insulated solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", + "cob", "new build - average thermal transmittance", } @@ -26,7 +28,8 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown', 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', - 'Average thermal transmittance 0.33 W/m?K': 'unknown', 'Cavity wall,': 'unknown', + 'Average thermal transmittance 0.33 W/m?K': 'unknown', + 'Cavity wall,': "cavity unknown insulation", 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown', 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown', @@ -55,7 +58,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', - 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 
"cavity unknown insulation", 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', @@ -67,4 +70,20 @@ WALL_CONSTRUCTION_MAPPINGS = { 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', 'average thermal transmittance 0.28 w/m?k': 'unknown', + 'Cavity wall, filled cavity': 'filled cavity', + 'Cavity wall, filled cavity and external insulation': 'filled cavity', + 'Granite or whinstone, as built, no insulation (assumed)': 'granite or ' + 'whinstone', + 'Solid brick, as built, insulated (assumed)': 'insulated solid brick', + 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick', + 'Solid brick, with external insulation': 'insulated solid brick', + 'Solid brick, with internal insulation': 'insulated solid brick', + 'System built, as built, insulated (assumed)': 'system built', + 'System built, as built, no insulation (assumed)': 'system built', + 'System built, with external insulation': 'system built', + 'System built, with internal insulation': 'system built', + 'Timber frame, as built, insulated (assumed)': 'timber frame', + 'Timber frame, as built, no insulation (assumed)': 'timber frame', + 'Timber frame, as built, partial insulation (assumed)': 'timber frame', + 'Timber frame, with additional insulation': 'timber frame', } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 9754e726..fbf7e10d 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -4,7 +4,6 @@ import json import pandas as pd import numpy as np from tqdm import tqdm -from BaseUtility import 
Definitions from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS @@ -14,13 +13,6 @@ from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc -from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes - -from recommendations.recommendation_utils import ( - estimate_perimeter, - estimate_external_wall_area, - estimate_number_of_floors -) from etl.epc_clean.epc_attributes.attribute_utils import ( extract_thermal_transmittance @@ -177,109 +169,6 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t raise ValueError(f"Method {method} not recognized") -def process_age_band(asset_list, year_built_column): - processed_age_band = [] - for _, x in asset_list.iterrows(): - - if pd.isnull(x["Property Age Band"]) or ( - x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES - ): - processed_age_band.append({ - "row_id": x["row_id"], - "epc_year_lower_bound": None, - "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": "No EPC Age Band" - }) - continue - - # We exatract the upper and lower bounds - if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]: - year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012 - - if pd.isnull(x[year_built_column]): - age_band_matches = "No Year Built From Landlord" - else: - age_band_matches = ( - "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound - else "EPC Age Band is older than Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": year_lower_bound, - "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": age_band_matches - } - ) - continue - - if x["Property Age 
Band"] == "England and Wales: before 1900": - - if pd.isnull(x[year_built_column]): - age_band_matches = "No Year Built From Landlord" - else: - age_band_matches = ( - "EPC Age Band Matches Year Built" if x[year_built_column] < 1900 - else "EPC Age Band is newer than Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": None, - "epc_year_upper_bound": 1899, - "Does Age Match EPC Age Band?": age_band_matches - } - ) - continue - - if x["Property Age Band"].isdigit(): - - if pd.isnull(x[year_built_column]): - age_band_matches = "No Year Built From Landlord" - else: - age_band_matches = ( - "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"]) - else "EPC Age Band is different from Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": int(x["Property Age Band"]), - "epc_year_upper_bound": int(x["Property Age Band"]), - "Does Age Match EPC Age Band?": age_band_matches - } - ) - continue - - # Oherwise, we extract the upper and lower bounds - age_band = x["Property Age Band"].split(": ")[1] - lower_date, upper_date = age_band.split("-") - - age_band_matches = ( - "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and ( - x[year_built_column] <= float(upper_date) - ) - else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date) - else "EPC Age Band is newer than Year Built" - ) - - processed_age_band.append( - { - "row_id": x["row_id"], - "epc_year_lower_bound": int(lower_date), - "epc_year_upper_bound": int(upper_date), - "Does Age Match EPC Age Band?": age_band_matches - } - ) - - processed_age_band = pd.DataFrame(processed_age_band) - - return processed_age_band - - def app(): """ This app is EPC pulling data for some properties owned by Livewest @@ -531,62 +420,11 @@ def app(): # TODO: TEMP!!! 
epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( - epc_df, how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" + epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" ) asset_list.extract_attributes() - asset_list["Estimated Number of Floors"] = asset_list.apply( - lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( - x["Property Type"]) else None, axis=1 - ) - - asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) - # Replace "" value with None - asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) - asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) - - asset_list["Estimated Perimeter (m)"] = asset_list.apply( - lambda x: estimate_perimeter( - floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], - num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], - ), axis=1 - ) - - asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( - lambda x: estimate_external_wall_area( - num_floors=x["Estimated Number of Floors"], - floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, - perimeter=x["Estimated Perimeter (m)"], - built_form=x["Archetype - EPC"] - ), - axis=1 - ) - - asset_list["Roof Insulation Thickness"] = asset_list.apply( - lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( - x["Roof Construction"]) else None, - axis=1 - ) - - # We produce some additional fields - # 1) Is the SAP rating below C75 - asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75 - # 2) Flag anything where the EPC is older than 5 years - cutoff_year = pd.Timestamp.now().year - 5 - 
asset_list[f"EPC is pre {cutoff_year}"] = ( - pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year - ) - - # 3) If we have year in the asset list, we flag entries where the built year is different from the - # EPC Age band - if PROPERTY_YEAR_BUILT is not None: - # We process the age band and merge it on - processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT) - asset_list = asset_list.merge( - processed_age_band, how="left", on="row_id" - ) - if HAS_NON_INTRUSIVES: # Empty cavity: # 1) Has been flagged on the non-intrusives as being empty or partially filled From 4db9d48e366e121abcfe83e2dfd335d33151bc68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 12:39:06 +0000 Subject: [PATCH 201/255] adding the solar floor eligibiltiy criteria --- asset_list/AssetList.py | 85 ++++++++++++++++++++++++++++---- asset_list/requirements.txt | 3 +- etl/route_march_data_pull/app.py | 28 ++++++++++- 3 files changed, 105 insertions(+), 11 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 81aa525a..4666cf63 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -306,6 +306,17 @@ class AssetList: "cavity wall, as built, no insulation", ] + # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated + EPC_INSULATED_WALLS_SUBSTRINGS = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated + EPC_INSULATED_ROOF_SUBSTRINGS = [ + "(another dwelling above)", ", insulated", ", insulated (assumed) ", + ", ceiling insulated", + ] + def __init__( self, local_filepath, @@ -861,7 +872,10 @@ class AssetList: processed_age_band, how="left" ) - def identify_worktypes(self): + def identify_worktypes(self, cleaned): + + if not self.non_intrusives_present: + raise NotImplementedError("Need to implement the 
case for non-intrusives") # If we have non-intrusives completed, we can use this to identify work types @@ -892,6 +906,17 @@ class AssetList: ) ) + self.standardised_asset_list["empty_cavity"] = ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + # We add a reason + self.standardised_asset_list["empty_cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + "Non-Intrusive Data", + "EPC Data" + ) + ###################################################### # Extraction ###################################################### @@ -967,13 +992,55 @@ class AssetList: ) ) - EPC_INSULATED_WALLS_SUBSTRINGS = [ - ", insulated", "with external insulation", "with internal insulation", "filled cavity" - ] + # TODO: We don't have information about the roof from this landlord + self.standardised_asset_list["solar_epc_walls_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) - self.standardised_asset_list["landlord_wall_construction"].value_counts() + # We merge on the u-value for average thermal transmittance + roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) + roof_uvalue_data = roof_uvalue_data[ + ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["roof-description"], + "thermal_transmittance": "roof_u_value" + } + ) - EPC_INSULATED_ROOF_SUBSTRINGS = [ - "(another dwelling above)", "limited insulation", "(other premises above)", - ", no insulation", - ] + self.standardised_asset_list = self.standardised_asset_list.merge( + roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] + ) + + # If the u-value of a roof is less than 0.7 we consider it insulated + 
self.standardised_asset_list["solar_epc_roof_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False + ) | ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) >= 270 if str(x).isdigit() else False + ) + ) | ( + self.standardised_asset_list["roof_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) < 270 if str(x).isdigit() else False + ) + + self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains("solid") + self.standardised_asset_list["solar_epc_floor_is_solid"] = ( + self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False) + ) + + z = self.standardised_asset_list[ + self.standardised_asset_list["solar_epc_floor_is_solid"] == True + ] diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index 0c16c43a..fd045d46 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -7,4 +7,5 @@ fuzzywuzzy boto3 openpyxl openai -tiktoken \ No newline at end of file +tiktoken +msgpack \ No newline at end of file diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index fbf7e10d..32c36fe8 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -391,13 +391,28 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation # recommendations - transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]] + transformed_df = transformed_df[ + [ + asset_list.DOMNA_PROPERTY_ID, 
"Cavity wall insulation", "Floor insulation (solid floor)", + "Floor insulation", "Floor insulation (suspended floor)" + ] + ] + + transformed_df["epc_has_floor_recommendation"] = ( + transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | + transformed_df["Floor insulation (suspended floor)"] + ) # Get the find my epc data find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( columns=["find_my_epc_data"]).join( pd.json_normalize(epc_df["find_my_epc_data"]) ) + find_my_epc_data = find_my_epc_data.merge( + transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], + how="left", on=asset_list.DOMNA_PROPERTY_ID + ) + # We check if we get the solar pv column: if "Solar photovoltaics" not in find_my_epc_data.columns: find_my_epc_data["Solar photovoltaics"] = False @@ -425,6 +440,17 @@ def app(): asset_list.extract_attributes() + # TODO - Use this! + import msgpack + from utils.s3 import read_from_s3 + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + asset_list.identify_worktypes(cleaned) + if HAS_NON_INTRUSIVES: # Empty cavity: # 1) Has been flagged on the non-intrusives as being empty or partially filled From c544c95282df3a9c50fc84ab46bd387f889a4b4d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 14:35:12 +0000 Subject: [PATCH 202/255] working on solar criteria --- asset_list/AssetList.py | 105 +++++++++++++++++++++++++++---- etl/route_march_data_pull/app.py | 8 +-- 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 4666cf63..056f8b5d 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -936,9 +936,6 @@ class AssetList: # Solar ###################################################### # Criteria: - - # TODO: Standardise these columns with our cleaned_data object - # 
Check 1: Does the property have a valid heating system? self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( @@ -993,9 +990,35 @@ class AssetList: ) # TODO: We don't have information about the roof from this landlord + + # We merge on the u-value for average thermal transmittance + walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) + walls_uvalue_data = walls_uvalue_data[ + ~pd.isnull(walls_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["walls-description"], + "thermal_transmittance": "walls_u_value" + } + ) + self.standardised_asset_list = self.standardised_asset_list.merge( + walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"] + ) + self.standardised_asset_list["solar_epc_walls_insulated"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( - "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES[ + "walls-description"]].str.lower().str.contains( + "|".join( + self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) | ( + self.standardised_asset_list[ + "walls_u_value"].apply( + lambda x: x <= 0.3 if not pd.isnull( + x) else False + ) ) ) @@ -1034,13 +1057,69 @@ class AssetList: lambda x: int(x) < 270 if str(x).isdigit() else False ) - self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"] - ].str.lower().str.contains("solid") - self.standardised_asset_list["solar_epc_floor_is_solid"] = ( - self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False) + # TODO: Fill with False - should be temp! 
+ self.standardised_asset_list["epc_has_floor_recommendation"] = ( + self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) ) - z = self.standardised_asset_list[ - self.standardised_asset_list["solar_epc_floor_is_solid"] == True - ] + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("solid") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) + ) | ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.contains("solid") + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() + .str.contains(", insulated") + ) + ) + ) + + # We now put together the criteria: + # Flag properties that look eligible for solar, that have solid floors + # TODO: We'll need to revise this + self.standardised_asset_list["solar_eligible_solid_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + + # Solid floor but needs a loft top-up + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + 
self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + + # Suspended floor, fully insulated + + # ~self.standardised_asset_list["solar_epc_loft_needs_topup"] & diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 32c36fe8..0de85a27 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -389,11 +389,9 @@ def app(): transformed_data.append(row_data) transformed_df = pd.DataFrame(transformed_data) - # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation - # recommendations transformed_df = transformed_df[ [ - asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation", "Floor insulation (solid floor)", + asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", "Floor insulation", "Floor insulation (suspended floor)" ] ] @@ -425,7 +423,9 @@ def app(): ) epc_df = epc_df.merge( - find_my_epc_data[[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.FIND_EPC_DATA_NAMES.keys())] + find_my_epc_data[ + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ] .rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", on=asset_list.DOMNA_PROPERTY_ID From 84ae26a9133e91a3f1904db2407f2f84bfb7305a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:14:18 +0000 Subject: [PATCH 203/255] added the eligibility criteria for solar and aggregate figures: --- asset_list/AssetList.py | 117 ++++++++++++++++++++++++++++++- 
etl/route_march_data_pull/app.py | 1 - 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 056f8b5d..ffe53d40 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -341,6 +341,8 @@ class AssetList: # Read in the data self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) self.standardised_asset_list = self.raw_asset_list.copy() + # Will be used to store aggregated figures against the various work types + self.work_type_figures = {} # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -1062,6 +1064,23 @@ class AssetList: self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) ) + # We merge on the u-value for average thermal transmittance + floors_uvalue_data = pd.DataFrame(cleaned["floor-description"]) + floors_uvalue_data = floors_uvalue_data[ + ~pd.isnull(floors_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["floor-description"], + "thermal_transmittance": "floor_u_value" + } + ) + + # Merge on + self.standardised_asset_list = self.standardised_asset_list.merge( + floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"] + ) + + # We assume that a U-value of 0.5 or below is indicative of an insulated floor self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( ( ( @@ -1072,7 +1091,8 @@ class AssetList: ) ) | ( ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.contains("solid") + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid") ) & ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() .str.contains(", insulated") @@ -1080,6 +1100,33 @@ 
class AssetList: ) ) + # Check for other floor types, insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = ( + # The floor is suspended and insulated + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("suspended") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) + ) | ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains("suspended") + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains(", insulated") + ) + ) | ( + self.standardised_asset_list["floor_u_value"].apply( + lambda x: x <= 0.5 if not pd.isnull(x) else False + ) + ) + ) + # We now put together the criteria: # Flag properties that look eligible for solar, that have solid floors # TODO: We'll need to revise this @@ -1120,6 +1167,70 @@ class AssetList: self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] ) - # Suspended floor, fully insulated + # Other floor type, fully insulated + self.standardised_asset_list["solar_eligible_other_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) - # ~self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Other floor type, needs loft top-up + 
self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof need loft top-up + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Floor is not solid, but is insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) + + # Produce some aggregate figures + self.work_type_figures = { + # Empty cavity from non-intrusives + "Empty Cavity (non-intrusives)": ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum() + ), + "Empty Cavity (EPC)": ( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ).sum() + ), + "Cavity Extraction": ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"].sum() + ), + "Solar PV (Solid Floor)": ( + self.standardised_asset_list["solar_eligible_solid_floor"].sum() + ), + "Solar PV (Solid Floor, Needs Loft Top-up)": ( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"].sum() + ), + "Solar PV (Other Floor)": ( + self.standardised_asset_list["solar_eligible_other_floor"].sum() + ), + "Solar PV (Other Floor, Needs Loft Top-up)": ( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum() + ) + } diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 0de85a27..5960f69b 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -440,7 +440,6 @@ def 
app(): asset_list.extract_attributes() - # TODO - Use this! import msgpack from utils.s3 import read_from_s3 cleaned = read_from_s3( From 5df47a86ae889b4e26191550f79fc4720f2878a7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:15:58 +0000 Subject: [PATCH 204/255] removed cirular import --- asset_list/mappings/walls.py | 2 - etl/route_march_data_pull/app.py | 126 +------------------------------ 2 files changed, 3 insertions(+), 125 deletions(-) diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index c5cca599..1fc52fcb 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,5 +1,3 @@ -from asset_list.AssetList import DataRemapper - STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", "timber frame", "uninsulated solid brick", diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 5960f69b..7bf3cca8 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -4,6 +4,8 @@ import json import pandas as pd import numpy as np from tqdm import tqdm +import msgpack +from utils.s3 import read_from_s3 from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS @@ -440,8 +442,6 @@ def app(): asset_list.extract_attributes() - import msgpack - from utils.s3 import read_from_s3 cleaned = read_from_s3( s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name="retrofit-data-dev" @@ -450,114 +450,7 @@ def app(): asset_list.identify_worktypes(cleaned) - if HAS_NON_INTRUSIVES: - # Empty cavity: - # 1) Has been flagged on the non-intrusives as being empty or partially filled - # 2) The age is before 1995 - # 3) Remove anything that likley has access issues - asset_list["Suitable for Cavity Fill"] = ( - (asset_list["Construction"] == "CAVITY") & - 
asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) & - ( - # Shold we defer to the year built provided by the HA? - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995) - ) & - ( - # We check if the property type column contains one of the invalid property types - ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary)) - ) - ) - - # asset_list["Suitable for Extraction"] = - asset_list[ - (asset_list["Construction"] == "Cavity") & - asset_list["Insulated"].isin(["RETRO DRILLED"]) & - ( - (asset_list[PROPERTY_YEAR_BUILT] <= 1995) - ) & - ( - asset_list[PROPERTY_TYPE_COLUMN] - ) - ] - - # 4) Flag properties that look like they're good candidates for solar installs - # Firstly, flag if the fabric is completely done - - insulated_wall_substrings = [ - ", insulated", "with external insulation", "with internal insulation", "filled cavity" - ] - - insulated_roof_substrings = [ - "(another dwelling above)", "limited insulation", "(other premises above)", - ", no insulation", - ] - - def check_solar_insulation_conditions(x): - - if pd.isnull(x["Wall Construction"]): - return None - - if "average thermal transmittance" in x["Wall Construction"].lower(): - # We extract out the u-values - wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"] - roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"] - floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"] - - roof_uvalue = 0 if roof_uvalue is None else roof_uvalue - floor_uvalue = 0 if floor_uvalue is None else floor_uvalue - - # We apply some cutoffs - if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7: - return "Walls, Roof and Floor have U-values below 0.7" - - return "Confirm U-values" - - walls_insulated = any( - insulated_substring in x["Wall Construction"].lower() for insulated_substring in 
insulated_wall_substrings - ) - roof_is_numeric = False - if str(x["Roof Insulation Thickness"]).isdigit(): - roof_is_numeric = True - roof_insulated = int(x["Roof Insulation Thickness"]) >= 200 - else: - roof_insulated = any( - insulated_substring in x["Roof Construction"].lower() for insulated_substring in - insulated_roof_substrings - ) - - floor_is_solid = "solid" in x["Floor Construction"].lower() - - if walls_insulated and roof_insulated and floor_is_solid: - return "Walls Insulated, Roof Insulated, Floor Solid" - - if walls_insulated and floor_is_solid and roof_is_numeric: - return "Walls Insulated, Floor Solid, Loft need top-up" - - return "Not Fully Insulated or no data" - - asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1) - - asset_list["Good Solar Candidate"] = ( - asset_list["SAP Rating is 75 and below"] & - ~asset_list["Has Solar PV"] & - ( - asset_list["Heating Type"].isin( - [ - "Electric storage heaters", - "Room heaters, electric", - ] - ) | asset_list["Heating Type"].str.contains("heat pump", case=False) - ) & ( - asset_list["Solar Fabric Condition"].isin( - [ - "Walls Insulated, Roof Insulated, Floor Solid", - "Walls, Roof and Floor have U-values below 0.7", - "Walls Insulated, Floor Solid, Loft need top-up" - ] - ) - ) - ) - + # TODO: We should do this breakdown for flats def flat_analysis(asset_list): # We need to deduce the building name - we strip out the house number @@ -596,19 +489,6 @@ def app(): flat_data = flat_analysis(asset_list) - # For all of the columns in transformed_df, prefix with "Recommendation: " - for col in transformed_df.columns: - if col == "row_id": - continue - transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"}) - - asset_list = asset_list.merge( - transformed_df, - how="left", - on="row_id" - ) - asset_list = asset_list.drop(columns=["row_id", "index"]) - # Store as an excel filename = os.path.join(DATA_FOLDER, 
".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data From d86ab5ff8df50e58248bff92582084462fc2166b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:18:53 +0000 Subject: [PATCH 205/255] restructuing app location --- asset_list/app.py | 497 ++++++++++++++++++++ etl/route_march_data_pull/app.py | 502 --------------------- etl/route_march_data_pull/requirements.txt | 0 3 files changed, 497 insertions(+), 502 deletions(-) delete mode 100644 etl/route_march_data_pull/app.py delete mode 100644 etl/route_march_data_pull/requirements.txt diff --git a/asset_list/app.py b/asset_list/app.py index 21b405d8..1a7788fe 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -1 +1,498 @@ import os +import time +import json +import pandas as pd +import numpy as np +from tqdm import tqdm +import msgpack +from utils.s3 import read_from_s3 +from asset_list.AssetList import AssetList +from asset_list.mappings.property_type import PROPERTY_MAPPING +from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS +from asset_list.mappings.heating_systems import HEATING_MAPPINGS +from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS + +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data( + asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, + uprn_column=None, epc_api_only=False, row_id_name="row_id" +): + epc_data = [] + errors = [] + no_epc = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home[postcode_column] + house_number = str(home[address1_column]).strip() + full_address = home[fulladdress_column].strip() + house_no = 
SearchEpc.get_house_number(address=str(house_number), postcode=postcode) + if house_no is None: + house_no = house_number + uprn = manual_uprn_map.get(full_address, None) + if uprn is None and home.get(uprn_column): + uprn = home[uprn_column] + + if pd.isnull(uprn): + uprn = None + + searcher = SearchEpc( + address1=str(house_no), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5, + uprn=uprn + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + + # Check if we have a flat or appartment + if searcher.newest_epc is None and uprn is None: + # Try again: + if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: + # Backup + add1 = full_address.split(",") + if len(add1) > 1: + add1 = add1[1].strip() + else: + # Try splitting on space + add1 = full_address.split(" ")[0].strip() + + else: + add1 = str(house_number) + searcher = SearchEpc( + address1=add1, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + + if ( + "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in + house_number.lower() + ): + searcher.ordnance_survey_client.property_type = "Flat" + + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + no_epc.append(home[row_id_name]) + continue + + if epc_api_only: + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy() + } + + epc_data.append(epc) + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + try: + find_epc_searcher = RetrieveFindMyEpc( 
+ address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e) and "address1" in searcher.newest_epc: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_data = {} + else: + find_epc_data = {} + except Exception as e: + raise Exception(f"Error retrieving FindMyEPC data: {e}") + time.sleep(np.random.uniform(0.1, 1)) + + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"], + "find_my_epc_data": find_epc_data, + } + + epc_data.append(epc) + except Exception as e: + errors.append(home[row_id_name]) + time.sleep(5) + + return epc_data, errors, no_epc + + +def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + if method == "first_word": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + + if method == "house_number_extraction": + asset_list["address1_extracted"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + + raise ValueError(f"Method {method} not recognized") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + 
Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + """ + + # TODO: + # For cavity work: + # - Flag any entries that have a different wall type between non-intrusive data against EPC + # - Worth double checking entries that have a difference in wall construction + # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity + # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation + # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats + # are less than C75 + # - Flag anything pre SAP2012 + # - Flag anything over 5 years old + # - Look at year built vs age band + # + # For Solar: + # - Discount any that have solar PV - based on non-intrusives and from the inspections team + # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with + # electric room heaters but it might need to be an EPC E + # - Fabric - check the floor, wall and roof: + # - Filled or empty cavity is good + # - Insulated solid/timber/system built is good + # - SCIS/CEG needs solid floors + # - JJC don’t care + # - Anything with a loft 200 or below + # - Anything C75 and above won’t qualify + # - Insulated loft = 200mm + # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) + # - Or the insulation required is loft/cavity (floors should be solid) + + # For Westward + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + DATA_FILENAME = "WESTWARD - completed list..xlsx" + SHEET_NAME = "Sheet1" + + POSTCODE_COLUMN = "WFT EDIT Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "house_number_extraction" + + ADDRESS_COLS_TO_CONCAT = [] + MISSING_POSTCODES_METHOD = None + PROPERTY_YEAR_BUILT = "Build date" + UPRN_COLUMN = "UPRN" + # If we have 
the non-intrusives data, this should be true + HAS_NON_INTRUSIVES = True + PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = {} + + asset_list = AssetList( + local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + header=0, + sheet_name=SHEET_NAME, + address1_colname=ADDRESS1_COLUMN, + postcode_colname=POSTCODE_COLUMN, + landlord_property_id="UPRN", + full_address_colname=FULLADDRESS_COLUMN, + full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, + missing_postcodes_method=MISSING_POSTCODES_METHOD, + address1_extraction_method=ADDRESS1_METHOD, + landlord_year_built=PROPERTY_YEAR_BUILT, + landlord_uprn=UPRN_COLUMN, + landlord_property_type=PROPERTY_TYPE_COLUMN, + landlord_wall_construction="Wall Construction (EPC)", + landlord_heating_system="Heat Source", + landlord_existing_pv="PV (Y/N)" + ) + asset_list.init_standardise() + + # We produce the new maps, which can be saved for future useage + + new_property_type_map = PROPERTY_MAPPING.copy().update( + asset_list.variable_mappings[asset_list.landlord_property_type] + ) + new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_wall_construction] + ) + new_heating_map = HEATING_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_heating_system] + ) + new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( + asset_list.variable_mappings[asset_list.landlord_existing_pv] + ) + + asset_list.apply_standardiation() + + # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # SHEET_NAME = "Sheet1" + # POSTCODE_COLUMN = 'Full Address.1' + # FULLADDRESS_COLUMN = "Full Address" + # ADDRESS1_COLUMN = None + # ADDRESS1_METHOD = "first_word" + # ADDRESS_COLS_TO_CONCAT = [] + # MISSING_POSTCODES_METHOD = None + # PROPERTY_YEAR_BUILT = 
"Build Date" + # UPRN_COLUMN = None + # # If we have the non-intrusives data, this should be true + # HAS_NON_INTRUSIVES = True + + ### We retrieve the EPC data + + # We chunk up this data into 5000 rows at a time + # Create the chunks directory + force_retrieve_data = False + skip = None # Used to skip already completed chunks + chunk_size = 5000 + filename = "Chunk {i}.csv" + download_folder = os.path.join(DATA_FOLDER, "Chunks") + if not os.path.exists(download_folder): + os.makedirs(download_folder) + + chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) + downloaded_files = {filename.format(i=i) for i in chunk_indexes} + + # We check if we have files associated to these files already and if we do, and we do not want to force the + # fetching of the data, we skip + folder_contents = os.listdir(download_folder) + if all(x in folder_contents for x in downloaded_files): + skip = max(chunk_indexes) + + for i in range(0, len(asset_list.standardised_asset_list), chunk_size): + print(f"Processing chunk {i} to {i + chunk_size}") + if skip is not None and not force_retrieve_data: + if i <= skip: + continue + chunk = asset_list.standardised_asset_list[i:i + chunk_size] + epc_data_chunk, errors_chunk, no_epc_chunk = get_data( + asset_list=chunk, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, + address1_column=asset_list.STANDARD_ADDRESS_1, + postcode_column=asset_list.STANDARD_POSTCODE, + manual_uprn_map=MANUAL_UPRN_MAP, + uprn_column=asset_list.STANDARD_UPRN + ) + + # We now retrieve any failed properties + chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] + epc_data_failed, _, _ = get_data( + asset_list=chunk_failed, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP, + epc_api_only=False + ) + + 
epc_data_chunk.extend(epc_data_failed) + + # Append the failed data to the main data + # Store the chunk locally as a csv + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + # Store the errors and no-data locally + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: + json.dump(errors_chunk, f) + + with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + json.dump(no_epc_chunk, f) + + # We read in and concatenate the created created chunks + # List the contents + epc_data = [] + for file in downloaded_files: + csv_data = pd.read_csv(os.path.join(download_folder, file)) + # We need to convert the recommendations back to a list + csv_data["recommendations"] = csv_data["recommendations"].apply(eval) + csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) + epc_data.append(csv_data) + + epc_df = pd.concat(epc_data) + # TODO: TEMP!!! + epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) + + # We expand out the recommendations + recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = 
pd.DataFrame(transformed_data) + transformed_df = transformed_df[ + [ + asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", + "Floor insulation", "Floor insulation (suspended floor)" + ] + ] + + transformed_df["epc_has_floor_recommendation"] = ( + transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | + transformed_df["Floor insulation (suspended floor)"] + ) + + # Get the find my epc data + find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( + columns=["find_my_epc_data"]).join( + pd.json_normalize(epc_df["find_my_epc_data"]) + ) + find_my_epc_data = find_my_epc_data.merge( + transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], + how="left", on=asset_list.DOMNA_PROPERTY_ID + ) + + # We check if we get the solar pv column: + if "Solar photovoltaics" not in find_my_epc_data.columns: + find_my_epc_data["Solar photovoltaics"] = False + + # Retrieve just the data we need + epc_df = epc_df[ + [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) + ].rename( + columns=asset_list.EPC_API_DATA_NAMES + ) + + epc_df = epc_df.merge( + find_my_epc_data[ + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ] + .rename(columns=asset_list.FIND_EPC_DATA_NAMES), + how="left", + on=asset_list.DOMNA_PROPERTY_ID + ) + + asset_list.merge_data(epc_df) + # TODO: TEMP!!! 
+ epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) + asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( + epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" + ) + + asset_list.extract_attributes() + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + asset_list.identify_worktypes(cleaned) + + # TODO: We should do this breakdown for flats + def flat_analysis(asset_list): + + # We need to deduce the building name - we strip out the house number + def extract_building_name(x): + # TODO: This doesn't really work + if pd.isnull(x): + return None + house_no = SearchEpc.get_house_number(address=x, postcode=None) + if house_no: + return x.replace(house_no, "").strip() + return x.split(",")[0].strip() + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) + + flat_data = [] + for _, group in grouped: + if "flat" in group["Property Type"].str.lower().values: + num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) + num_below_c75 = group["SAP score on register"].lt(75).sum() + + flat_data.append( + { + "Postcode": group[POSTCODE_COLUMN].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) + } + ) + + flat_data = pd.DataFrame(flat_data) + + return flat_data + + flat_data = flat_analysis(asset_list) + + # Store as an excel + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data + + with pd.ExcelWriter(filename) as writer: + asset_list.to_excel(writer, sheet_name="EPC Data", index=False) + flat_data.to_excel(writer, sheet_name="Flat Data", index=False) + + matches_review = asset_list[ + [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] + ] diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py deleted file mode 100644 index 7bf3cca8..00000000 --- a/etl/route_march_data_pull/app.py +++ /dev/null @@ -1,502 +0,0 @@ -import os -import time -import json -import pandas as pd -import numpy as np -from tqdm import tqdm -import msgpack -from utils.s3 import read_from_s3 -from asset_list.AssetList import AssetList -from asset_list.mappings.property_type import PROPERTY_MAPPING -from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS -from asset_list.mappings.heating_systems import HEATING_MAPPINGS -from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS - -from dotenv import load_dotenv -from backend.SearchEpc import SearchEpc -from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc - -from etl.epc_clean.epc_attributes.attribute_utils import ( - extract_thermal_transmittance -) - -load_dotenv(dotenv_path="backend/.env") -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") - - -def get_data( - asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, - uprn_column=None, epc_api_only=False, row_id_name="row_id" -): - epc_data = [] - errors = [] - no_epc = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - try: - postcode = home[postcode_column] - house_number = str(home[address1_column]).strip() - full_address = home[fulladdress_column].strip() - house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) - if house_no is None: - house_no = house_number - uprn = manual_uprn_map.get(full_address, None) - if uprn is None and 
home.get(uprn_column): - uprn = home[uprn_column] - - if pd.isnull(uprn): - uprn = None - - searcher = SearchEpc( - address1=str(house_no), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5, - uprn=uprn - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - - # Check if we have a flat or appartment - if searcher.newest_epc is None and uprn is None: - # Try again: - if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: - # Backup - add1 = full_address.split(",") - if len(add1) > 1: - add1 = add1[1].strip() - else: - # Try splitting on space - add1 = full_address.split(" ")[0].strip() - - else: - add1 = str(house_number) - searcher = SearchEpc( - address1=add1, - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - - if ( - "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in - house_number.lower() - ): - searcher.ordnance_survey_client.property_type = "Flat" - - searcher.find_property(skip_os=True) - - if searcher.newest_epc is None: - no_epc.append(home[row_id_name]) - continue - - if epc_api_only: - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy() - } - - epc_data.append(epc) - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - # Retrieve data from FindMyEPC - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No 
EPC found" in str(e) and "address1" in searcher.newest_epc: - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No EPC found" in str(e): - find_epc_data = {} - else: - find_epc_data = {} - except Exception as e: - raise Exception(f"Error retrieving FindMyEPC data: {e}") - time.sleep(np.random.uniform(0.1, 1)) - - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"], - "find_my_epc_data": find_epc_data, - } - - epc_data.append(epc) - except Exception as e: - errors.append(home[row_id_name]) - time.sleep(5) - - return epc_data, errors, no_epc - - -def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): - if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") - return asset_list - - if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] - return asset_list - - if method == "house_number_extraction": - asset_list["address1_extracted"] = asset_list.apply( - lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), - axis=1 - ) - return asset_list - - raise ValueError(f"Method {method} not recognized") - - -def app(): - """ - This app is EPC pulling data for some properties owned by Livewest - - Data request contents: - Date of last EPC - Reason for EPC - SAP score on register - Property Type - Property Area - Property Age - Any Dimensions (HLP,PW,RH) - Property Wall Construction - Heating Type - Secondary Heating - Loft Insulation Depth - - Additional if possible: - Heat loss calculations - EPC recommendations - Property UPRN - """ - - # TODO: - # For cavity work: - # - Flag any entries that have a 
different wall type between non-intrusive data against EPC - # - Worth double checking entries that have a difference in wall construction - # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity - # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation - # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats - # are less than C75 - # - Flag anything pre SAP2012 - # - Flag anything over 5 years old - # - Look at year built vs age band - # - # For Solar: - # - Discount any that have solar PV - based on non-intrusives and from the inspections team - # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with - # electric room heaters but it might need to be an EPC E - # - Fabric - check the floor, wall and roof: - # - Filled or empty cavity is good - # - Insulated solid/timber/system built is good - # - SCIS/CEG needs solid floors - # - JJC don’t care - # - Anything with a loft 200 or below - # - Anything C75 and above won’t qualify - # - Insulated loft = 200mm - # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) - # - Or the insulation required is loft/cavity (floors should be solid) - - # For Westward - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - DATA_FILENAME = "WESTWARD - completed list..xlsx" - SHEET_NAME = "Sheet1" - - POSTCODE_COLUMN = "WFT EDIT Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "house_number_extraction" - - ADDRESS_COLS_TO_CONCAT = [] - MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build date" - UPRN_COLUMN = "UPRN" - # If we have the non-intrusives data, this should be true - HAS_NON_INTRUSIVES = True - PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits - - # Maps addresses to 
uprn in problematic cases - MANUAL_UPRN_MAP = {} - - asset_list = AssetList( - local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), - header=0, - sheet_name=SHEET_NAME, - address1_colname=ADDRESS1_COLUMN, - postcode_colname=POSTCODE_COLUMN, - landlord_property_id="UPRN", - full_address_colname=FULLADDRESS_COLUMN, - full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, - missing_postcodes_method=MISSING_POSTCODES_METHOD, - address1_extraction_method=ADDRESS1_METHOD, - landlord_year_built=PROPERTY_YEAR_BUILT, - landlord_uprn=UPRN_COLUMN, - landlord_property_type=PROPERTY_TYPE_COLUMN, - landlord_wall_construction="Wall Construction (EPC)", - landlord_heating_system="Heat Source", - landlord_existing_pv="PV (Y/N)" - ) - asset_list.init_standardise() - - # We produce the new maps, which can be saved for future useage - - new_property_type_map = PROPERTY_MAPPING.copy().update( - asset_list.variable_mappings[asset_list.landlord_property_type] - ) - new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_wall_construction] - ) - new_heating_map = HEATING_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_heating_system] - ) - new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_existing_pv] - ) - - asset_list.apply_standardiation() - - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = 'Full Address.1' - # FULLADDRESS_COLUMN = "Full Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "first_word" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build Date" - # UPRN_COLUMN = None - # # If we have the non-intrusives data, this should be true - # HAS_NON_INTRUSIVES = True - - ### We retrieve the EPC data - - # We chunk up this data into 
5000 rows at a time - # Create the chunks directory - force_retrieve_data = False - skip = None # Used to skip already completed chunks - chunk_size = 5000 - filename = "Chunk {i}.csv" - download_folder = os.path.join(DATA_FOLDER, "Chunks") - if not os.path.exists(download_folder): - os.makedirs(download_folder) - - chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) - downloaded_files = {filename.format(i=i) for i in chunk_indexes} - - # We check if we have files associated to these files already and if we do, and we do not want to force the - # fetching of the data, we skip - folder_contents = os.listdir(download_folder) - if all(x in folder_contents for x in downloaded_files): - skip = max(chunk_indexes) - - for i in range(0, len(asset_list.standardised_asset_list), chunk_size): - print(f"Processing chunk {i} to {i + chunk_size}") - if skip is not None and not force_retrieve_data: - if i <= skip: - continue - chunk = asset_list.standardised_asset_list[i:i + chunk_size] - epc_data_chunk, errors_chunk, no_epc_chunk = get_data( - asset_list=chunk, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, - address1_column=asset_list.STANDARD_ADDRESS_1, - postcode_column=asset_list.STANDARD_POSTCODE, - manual_uprn_map=MANUAL_UPRN_MAP, - uprn_column=asset_list.STANDARD_UPRN - ) - - # We now retrieve any failed properties - chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] - epc_data_failed, _, _ = get_data( - asset_list=chunk_failed, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP, - epc_api_only=False - ) - - epc_data_chunk.extend(epc_data_failed) - - # Append the failed data to the main data - # Store the chunk locally as a csv - pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) 
- # Store the errors and no-data locally - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: - json.dump(errors_chunk, f) - - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: - json.dump(no_epc_chunk, f) - - # We read in and concatenate the created created chunks - # List the contents - epc_data = [] - for file in downloaded_files: - csv_data = pd.read_csv(os.path.join(download_folder, file)) - # We need to convert the recommendations back to a list - csv_data["recommendations"] = csv_data["recommendations"].apply(eval) - csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) - epc_data.append(csv_data) - - epc_df = pd.concat(epc_data) - # TODO: TEMP!!! - epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) - - # We expand out the recommendations - recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] - - unique_recommendations = set() - for _, row in recommendations_df.iterrows(): - unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) - - columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) - transformed_data = [] - for _, row in recommendations_df.iterrows(): - # Initialize a dictionary for this row with False for all recommendations - row_data = {col: False for col in columns} - row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] - - # Set True for each recommendation present in this row - for rec in row["recommendations"]: - recommendation_text = rec["improvement-summary-text"] - row_data[recommendation_text] = True - - # Append the row data to transformed_data - transformed_data.append(row_data) - - transformed_df = pd.DataFrame(transformed_data) - transformed_df = transformed_df[ - [ - asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", - "Floor insulation", "Floor insulation (suspended floor)" - ] - ] - - 
transformed_df["epc_has_floor_recommendation"] = ( - transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | - transformed_df["Floor insulation (suspended floor)"] - ) - - # Get the find my epc data - find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( - columns=["find_my_epc_data"]).join( - pd.json_normalize(epc_df["find_my_epc_data"]) - ) - find_my_epc_data = find_my_epc_data.merge( - transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], - how="left", on=asset_list.DOMNA_PROPERTY_ID - ) - - # We check if we get the solar pv column: - if "Solar photovoltaics" not in find_my_epc_data.columns: - find_my_epc_data["Solar photovoltaics"] = False - - # Retrieve just the data we need - epc_df = epc_df[ - [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) - ].rename( - columns=asset_list.EPC_API_DATA_NAMES - ) - - epc_df = epc_df.merge( - find_my_epc_data[ - [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) - ] - .rename(columns=asset_list.FIND_EPC_DATA_NAMES), - how="left", - on=asset_list.DOMNA_PROPERTY_ID - ) - - asset_list.merge_data(epc_df) - # TODO: TEMP!!! 
- epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) - asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( - epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" - ) - - asset_list.extract_attributes() - - cleaned = read_from_s3( - s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-dev" - ) - cleaned = msgpack.unpackb(cleaned, raw=False) - - asset_list.identify_worktypes(cleaned) - - # TODO: We should do this breakdown for flats - def flat_analysis(asset_list): - - # We need to deduce the building name - we strip out the house number - def extract_building_name(x): - # TODO: This doesn't really work - if pd.isnull(x): - return None - house_no = SearchEpc.get_house_number(address=x, postcode=None) - if house_no: - return x.replace(house_no, "").strip() - return x.split(",")[0].strip() - - # We want to deduce if flats have 50% of the properties below C75 - # We group by postcode and property type - grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) - - flat_data = [] - for _, group in grouped: - if "flat" in group["Property Type"].str.lower().values: - num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) - num_below_c75 = group["SAP score on register"].lt(75).sum() - - flat_data.append( - { - "Postcode": group[POSTCODE_COLUMN].iloc[0], - "Property Type": "Flat", - "Number of Flats with EPC": num_flats, - "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) - } - ) - - flat_data = pd.DataFrame(flat_data) - - return flat_data - - flat_data = flat_analysis(asset_list) - - # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" - # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data - - with pd.ExcelWriter(filename) as writer: - asset_list.to_excel(writer, sheet_name="EPC Data", index=False) - flat_data.to_excel(writer, sheet_name="Flat Data", index=False) - - matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] - ] diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt deleted file mode 100644 index e69de29b..00000000 From 759e81f6606ee9355612ed9526acd8c77dc12096 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 15:25:38 +0000 Subject: [PATCH 206/255] refactoring --- asset_list/app.py | 20 +++++++++++++++----- asset_list/requirements.txt | 3 ++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index 1a7788fe..df2fe9cc 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,13 +21,21 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data( - asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, + df, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None, epc_api_only=False, row_id_name="row_id" ): + # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs + property_type_map = { + "house": "House", + "flat": "Flat", + "maisonette": "Maisonette", + "bungalow": "Bungalow", + } + epc_data = [] errors = [] no_epc = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + for _, home in tqdm(df.iterrows(), total=len(df)): try: postcode = home[postcode_column] house_number = str(home[address1_column]).strip() @@ -42,19 +50,21 @@ def get_data( if pd.isnull(uprn): uprn = None + property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None) + searcher = SearchEpc( address1=str(house_no), postcode=postcode, auth_token=EPC_AUTH_TOKEN, 
os_api_key="", - property_type=None, + property_type=property_type, fast=True, full_address=full_address, max_retries=5, uprn=uprn ) # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.property_type = property_type searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) @@ -317,7 +327,7 @@ def app(): continue chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( - asset_list=chunk, + df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, address1_column=asset_list.STANDARD_ADDRESS_1, diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index fd045d46..fd43ac64 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -8,4 +8,5 @@ boto3 openpyxl openai tiktoken -msgpack \ No newline at end of file +msgpack +beautifulsoup4 \ No newline at end of file From 33558957df5b718fd81f9a89064f24ceffa2b139 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 17:00:12 +0000 Subject: [PATCH 207/255] adding methodology to estimate the EPC if we don't have it --- asset_list/app.py | 22 +++++++++++++--------- backend/SearchEpc.py | 2 +- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index df2fe9cc..5bbf25d4 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,9 +21,13 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data( - df, fulladdress_column, address1_column, postcode_column, manual_uprn_map, - uprn_column=None, epc_api_only=False, row_id_name="row_id" + df, manual_uprn_map, epc_api_only=False, row_id_name="row_id" ): + uprn_column = AssetList.STANDARD_UPRN + fulladdress_column = AssetList.STANDARD_FULL_ADDRESS + address1_column = AssetList.STANDARD_ADDRESS_1 + postcode_column = AssetList.STANDARD_POSTCODE + # These re-map the 
standard property types to forms accepted by the EPC api, so we can predict EPCs property_type_map = { "house": "House", @@ -57,14 +61,14 @@ def get_data( postcode=postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="", - property_type=property_type, + property_type=None, fast=True, full_address=full_address, max_retries=5, uprn=uprn ) # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.property_type = None searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) @@ -102,6 +106,11 @@ def get_data( searcher.find_property(skip_os=True) + # As a final resort, we estimate the EPC + if property_type is not None: + searcher.ordnance_survey_client.property_type = property_type + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: no_epc.append(home[row_id_name]) continue @@ -328,12 +337,7 @@ def app(): chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=asset_list.STANDARD_FULL_ADDRESS, - address1_column=asset_list.STANDARD_ADDRESS_1, - postcode_column=asset_list.STANDARD_POSTCODE, manual_uprn_map=MANUAL_UPRN_MAP, - uprn_column=asset_list.STANDARD_UPRN ) # We now retrieve any failed properties diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 79a041ec..0d921bec 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -337,7 +337,7 @@ class SearchEpc: if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] - if data: + if data["rows"]: api_response["msg"] = self.SUCCESS return api_response["msg"] From d69baa21dab3c066b20b3823f9bac52da4eba7da Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 17:22:00 +0000 Subject: [PATCH 208/255] estimating epcs --- asset_list/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asset_list/app.py 
b/asset_list/app.py index 5bbf25d4..229bf171 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -337,6 +337,7 @@ def app(): chunk = asset_list.standardised_asset_list[i:i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, + row_id_name=asset_list.DOMNA_PROPERTY_ID, manual_uprn_map=MANUAL_UPRN_MAP, ) From d1dc536ab0c4424ac6fda9c39422659a547e8fbe Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 17:33:18 +0000 Subject: [PATCH 209/255] merging on epc data --- asset_list/AssetList.py | 2 +- asset_list/app.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index ffe53d40..2d224daa 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -751,7 +751,7 @@ class AssetList: # We produce some additional fields # 1) Is the SAP rating below C75 self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].astype(float) <= self.FILLED_CAVITY_SAP_THRESHOLD ) # 2) Flag anything where the EPC is older than 5 years diff --git a/asset_list/app.py b/asset_list/app.py index 229bf171..34cc9579 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -376,8 +376,6 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) - # TODO: TEMP!!! - epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID}) # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] @@ -445,11 +443,6 @@ def app(): ) asset_list.merge_data(epc_df) - # TODO: TEMP!!! 
- epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str) - asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge( - epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn" - ) asset_list.extract_attributes() From ea1a7b559d7fd3fa1c3f4b54365fe2eeebf0a3b3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 21 Feb 2025 22:57:56 +0000 Subject: [PATCH 210/255] fixed bug with calling find epc --- asset_list/app.py | 10 +++++----- etl/find_my_epc/RetrieveFindMyEpc.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index 34cc9579..3c1ab627 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -34,6 +34,9 @@ def get_data( "flat": "Flat", "maisonette": "Maisonette", "bungalow": "Bungalow", + "block house": "House", + "coach house": "House", + "bedsit": "Flat" } epc_data = [] @@ -107,7 +110,7 @@ def get_data( searcher.find_property(skip_os=True) # As a final resort, we estimate the EPC - if property_type is not None: + if property_type is not None and searcher.newest_epc is None: searcher.ordnance_survey_client.property_type = property_type searcher.find_property(skip_os=True) @@ -344,11 +347,8 @@ def app(): # We now retrieve any failed properties chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] epc_data_failed, _, _ = get_data( - asset_list=chunk_failed, + df=chunk_failed, row_id_name=asset_list.DOMNA_PROPERTY_ID, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, manual_uprn_map=MANUAL_UPRN_MAP, epc_api_only=False ) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index eaba1058..9852cc0d 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -330,7 +330,8 @@ class RetrieveFindMyEpc: "roomstat_programmer_trvs", 
"time_temperature_zone_control" ], "Replacement warm air unit": [], - "Secondary glazing": ["secondary_glazing"] + "Secondary glazing": ["secondary_glazing"], + "Condensing heating unit": ["boiler_upgrade"], } survey = True From 7b4218299ff1c3b108d3259cecb7fee13f4d1096 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 12:11:47 +0000 Subject: [PATCH 211/255] adding work reasons --- asset_list/AssetList.py | 78 ++++++++++++++++++++++++++++++++++++----- asset_list/app.py | 37 ++++++++++--------- 2 files changed, 91 insertions(+), 24 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2d224daa..54f6cd96 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -296,7 +296,7 @@ class AssetList: ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below" - ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" + ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"epc_is_pre_{EPC_YEAR_THRESHOLD}" # These are the descriptions that we look for in the EPC data that are indicative of no insulation EPC_NO_WALL_INSULATION_DESCRIPTIONS = [ @@ -775,7 +775,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": None, "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": "No EPC Age Band" + "does_age_band_match_epc_age_band": "No EPC Age Band" } ) continue @@ -800,7 +800,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": year_lower_bound, "epc_year_upper_bound": None, - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) continue @@ -820,7 +820,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": None, "epc_year_upper_bound": 1899, - "Does Age Match EPC Age 
Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) continue @@ -842,7 +842,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) continue @@ -864,7 +864,7 @@ class AssetList: self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": int(lower_date), "epc_year_upper_bound": int(upper_date), - "Does Age Match EPC Age Band?": age_band_matches + "does_age_band_match_epc_age_band": age_band_matches } ) @@ -892,7 +892,12 @@ class AssetList: (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) ) self.standardised_asset_list["epc_indicates_empty_cavity"] = ( @@ -1206,6 +1211,11 @@ class AssetList: self.standardised_asset_list["solar_epc_floor_is_other_insulated"] ) + # Drop anything we don't need + self.standardised_asset_list = self.standardised_asset_list.drop( + columns=["walls_u_value", "roof_u_value", "floor_u_value"] + ) + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from non-intrusives @@ -1219,7 +1229,11 @@ class AssetList: ).sum() ), "Cavity Extraction": ( - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"].sum() + ( + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + 
~self.standardised_asset_list["epc_indicates_empty_cavity"] & + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] + ).sum() ), "Solar PV (Solid Floor)": ( self.standardised_asset_list["solar_eligible_solid_floor"].sum() @@ -1234,3 +1248,51 @@ class AssetList: self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum() ) } + + # Finally, we note why each property has been flagged + self.standardised_asset_list["cavity_reason"] = None + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + "Non-Intrusive Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ), + "EPC Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + # Flag extraction + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Showed Cavity Extraction", + self.standardised_asset_list["cavity_reason"] + ) + + # Flag solar + self.standardised_asset_list["solar_reason"] = None + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor"], + "Solid Floor, Insulated, No Solar", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"], + "Solid Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor"], + "Other Floor, Insulated, No Solar", + 
self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"], + "Other Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_reason"] + ) diff --git a/asset_list/app.py b/asset_list/app.py index 3c1ab627..65d4ab87 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -376,6 +376,7 @@ def app(): epc_data.append(csv_data) epc_df = pd.concat(epc_data) + epc_df["estimated"] = epc_df["estimated"].fillna(False) # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] @@ -454,36 +455,40 @@ def app(): asset_list.identify_worktypes(cleaned) + from pprint import pprint + pprint(asset_list.work_type_figures) + # TODO: We should do this breakdown for flats def flat_analysis(asset_list): # We need to deduce the building name - we strip out the house number - def extract_building_name(x): - # TODO: This doesn't really work - if pd.isnull(x): - return None - house_no = SearchEpc.get_house_number(address=x, postcode=None) - if house_no: - return x.replace(house_no, "").strip() - return x.split(",")[0].strip() # We want to deduce if flats have 50% of the properties below C75 # We group by postcode and property type - grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"]) + grouped = asset_list.standardised_asset_list.groupby( + [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE] + ) flat_data = [] for _, group in grouped: - if "flat" in group["Property Type"].str.lower().values: - num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0) - num_below_c75 = group["SAP score on register"].lt(75).sum() + if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values: + num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0] + num_below_c75 = group[ + asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] + 
].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum() + # Check if any flats are below C69 + num_flats_below_c69 = group[ + asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(69).sum() flat_data.append( { - "Postcode": group[POSTCODE_COLUMN].iloc[0], + "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0], "Property Type": "Flat", "Number of Flats with EPC": num_flats, "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats) + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), + "num_flats_below_c69": num_flats_below_c69, } ) @@ -494,11 +499,11 @@ def app(): flat_data = flat_analysis(asset_list) # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data with pd.ExcelWriter(filename) as writer: - asset_list.to_excel(writer, sheet_name="EPC Data", index=False) + asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) flat_data.to_excel(writer, sheet_name="Flat Data", index=False) matches_review = asset_list[ From 99a0948e2bd3ab14197821a694cbf1d2383baff3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 16:11:02 +0000 Subject: [PATCH 212/255] getting ready to work on the colchester data --- asset_list/AssetList.py | 82 ++++++++++++++++++++++++++++++++-------- asset_list/app.py | 83 ++++++----------------------------------- 2 files changed, 78 insertions(+), 87 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 54f6cd96..2b80287c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -343,6 +343,7 @@ class AssetList: self.standardised_asset_list = self.raw_asset_list.copy() # 
Will be used to store aggregated figures against the various work types self.work_type_figures = {} + self.flat_data = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -649,6 +650,9 @@ class AssetList: logger.info("Applying standardisation to asset list") for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable + "_original_from_landlord"] = ( + self.standardised_asset_list[variable].copy() + ) self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): @@ -663,6 +667,12 @@ class AssetList: # Apply renames to our standard names # Perform final variable selection and renaming: + + # We add the original columns to the keep variables + self.keep_variables += [ + k + "_original_from_landlord" for k in self.variable_mappings.keys() + ] + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( columns=self.rename_map ) @@ -912,18 +922,6 @@ class AssetList: self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) - - self.standardised_asset_list["empty_cavity"] = ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | - self.standardised_asset_list["epc_indicates_empty_cavity"] - ) - # We add a reason - self.standardised_asset_list["empty_cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], - "Non-Intrusive Data", - "EPC Data" - ) - ###################################################### # Extraction ###################################################### @@ -933,7 +931,7 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & 
(self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & - (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"]) + (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"]) ) & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) @@ -996,6 +994,12 @@ class AssetList: ) ) + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: Insulated"].isin( + ["EWI", "RETRO DRILLED", "FILLED AT BUILD"] + ) + ) + # TODO: We don't have information about the roof from this landlord # We merge on the u-value for average thermal transmittance @@ -1146,7 +1150,8 @@ class AssetList: # The walls are insulated ( self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & @@ -1165,7 +1170,8 @@ class AssetList: # The walls are insulated ( self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & @@ -1216,6 +1222,15 @@ class AssetList: columns=["walls_u_value", "roof_u_value", "floor_u_value"] ) + # Adjust flagged extraction jobs to remove anything for solar + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + ~self.standardised_asset_list["solar_eligible_solid_floor"] & + 
~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] + # ~self.standardised_asset_list["solar_eligible_other_floor"] & + # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] + ) + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from non-intrusives @@ -1296,3 +1311,40 @@ class AssetList: "Other Floor, Insulated, Needs Loft", self.standardised_asset_list["solar_reason"] ) + + def flat_analysis(self): + + # We need to deduce the building name - we strip out the house number + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = self.standardised_asset_list.groupby( + [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE] + ) + + flat_data = [] + for _, group in grouped: + if "flat" in group[self.STANDARD_PROPERTY_TYPE].values: + num_flats = group[self.STANDARD_PROPERTY_TYPE].shape[0] + num_below_c75 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum() + # Check if any flats are below C69 + num_flats_below_c69 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(69).sum() + + flat_data.append( + { + "Postcode": group[self.STANDARD_POSTCODE].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), + "Number of Flats Below C69": num_flats_below_c69, + } + ) + + flat_data = pd.DataFrame(flat_data) + + self.flat_data = flat_data diff --git a/asset_list/app.py b/asset_list/app.py index 65d4ab87..f164e94e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -4,6 +4,7 @@ import json import pandas as pd import numpy as np from tqdm import tqdm +from pprint import pprint import msgpack from utils.s3 import read_from_s3 from asset_list.AssetList import AssetList @@ -239,23 +240,18 @@ def app(): # - We want: fully insulated 
property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - # For Westward - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - DATA_FILENAME = "WESTWARD - completed list..xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" SHEET_NAME = "Sheet1" - - POSTCODE_COLUMN = "WFT EDIT Postcode" - FULLADDRESS_COLUMN = "Address" + POSTCODE_COLUMN = 'Full Address.1' + FULLADDRESS_COLUMN = "Full Address" ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "house_number_extraction" - + ADDRESS1_METHOD = "first_word" ADDRESS_COLS_TO_CONCAT = [] MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build date" - UPRN_COLUMN = "UPRN" - # If we have the non-intrusives data, this should be true - HAS_NON_INTRUSIVES = True - PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits + PROPERTY_YEAR_BUILT = "Build Date" + UPRN_COLUMN = None + PROPERTY_TYPE_COLUMN = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} @@ -297,20 +293,6 @@ def app(): asset_list.apply_standardiation() - # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # SHEET_NAME = "Sheet1" - # POSTCODE_COLUMN = 'Full Address.1' - # FULLADDRESS_COLUMN = "Full Address" - # ADDRESS1_COLUMN = None - # ADDRESS1_METHOD = "first_word" - # ADDRESS_COLS_TO_CONCAT = [] - # MISSING_POSTCODES_METHOD = None - # PROPERTY_YEAR_BUILT = "Build Date" - # UPRN_COLUMN = None - # # If we have the non-intrusives data, this should be true - # HAS_NON_INTRUSIVES = True - ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time @@ -455,48 +437,9 @@ def app(): asset_list.identify_worktypes(cleaned) - from pprint import pprint 
pprint(asset_list.work_type_figures) - # TODO: We should do this breakdown for flats - def flat_analysis(asset_list): - - # We need to deduce the building name - we strip out the house number - - # We want to deduce if flats have 50% of the properties below C75 - # We group by postcode and property type - grouped = asset_list.standardised_asset_list.groupby( - [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE] - ) - - flat_data = [] - for _, group in grouped: - if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values: - num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0] - num_below_c75 = group[ - asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] - ].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum() - # Check if any flats are below C69 - num_flats_below_c69 = group[ - asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"] - ].lt(69).sum() - - flat_data.append( - { - "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0], - "Property Type": "Flat", - "Number of Flats with EPC": num_flats, - "Number of Flats below C75": num_below_c75, - "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), - "num_flats_below_c69": num_flats_below_c69, - } - ) - - flat_data = pd.DataFrame(flat_data) - - return flat_data - - flat_data = flat_analysis(asset_list) + asset_list.flat_analysis() # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" @@ -504,8 +447,4 @@ def app(): with pd.ExcelWriter(filename) as writer: asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) - flat_data.to_excel(writer, sheet_name="Flat Data", index=False) - - matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] - ] + asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) From 5391afeaaaa024ff7b1a54fc18f565b9c46a3925 Mon Sep 17 00:00:00 2001 
From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 16:52:42 +0000 Subject: [PATCH 213/255] handling the case of landlord property id being missing --- asset_list/AssetList.py | 2 +- asset_list/app.py | 58 ++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 2b80287c..c2784eb1 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -378,7 +378,7 @@ class AssetList: self.keep_variables = [] # Finally, we handle the case where the landlord's property ID is actually the OS UPRN - if self.landlord_uprn == self.landlord_property_id: + if (self.landlord_uprn == self.landlord_property_id) and (self.landlord_property_id is not None): self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy() # Update the reference to landlord UPRn self.landlord_uprn = self.STANDARD_UPRN diff --git a/asset_list/app.py b/asset_list/app.py index f164e94e..89b15c06 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -240,39 +240,43 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = 'Full Address.1' - FULLADDRESS_COLUMN = "Full Address" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" - ADDRESS_COLS_TO_CONCAT = [] - MISSING_POSTCODES_METHOD = None - PROPERTY_YEAR_BUILT = "Build Date" - UPRN_COLUMN = None - PROPERTY_TYPE_COLUMN = None + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + sheet_name = "Sheet1" + postcode_column = 'Full Address.1' + fulladdress_column = "Full 
Address" + address1_column = None + address1_method = "first_word" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Date" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_wall_construction = "Wallinsul" + landlord_heating_system = "HeatSorc" + landlord_existing_pv = None + landlord_property_id = None # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} asset_list = AssetList( - local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME), + local_filepath=os.path.join(data_folder, data_filename), header=0, - sheet_name=SHEET_NAME, - address1_colname=ADDRESS1_COLUMN, - postcode_colname=POSTCODE_COLUMN, - landlord_property_id="UPRN", - full_address_colname=FULLADDRESS_COLUMN, - full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT, - missing_postcodes_method=MISSING_POSTCODES_METHOD, - address1_extraction_method=ADDRESS1_METHOD, - landlord_year_built=PROPERTY_YEAR_BUILT, - landlord_uprn=UPRN_COLUMN, - landlord_property_type=PROPERTY_TYPE_COLUMN, - landlord_wall_construction="Wall Construction (EPC)", - landlord_heating_system="Heat Source", - landlord_existing_pv="PV (Y/N)" + sheet_name=sheet_name, + address1_colname=address1_column, + postcode_colname=postcode_column, + landlord_property_id=landlord_property_id, + full_address_colname=fulladdress_column, + full_address_cols_to_concat=address_cols_to_concat, + missing_postcodes_method=missing_postcodes_method, + address1_extraction_method=address1_method, + landlord_year_built=landlord_year_built, + landlord_uprn=landlord_os_uprn, + landlord_property_type=landlord_property_type, + landlord_wall_construction=landlord_wall_construction, + landlord_heating_system=landlord_heating_system, + landlord_existing_pv=landlord_existing_pv ) asset_list.init_standardise() From 8fa8307e33dc27793815eccadbb11fa3a28d1c68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 18:36:00 +0000 Subject: [PATCH 214/255] ai mappings --- 
asset_list/AssetList.py | 32 ++++++++++++++++++++++- asset_list/app.py | 2 +- asset_list/mappings/heating_systems.py | 35 ++++++++++++++++++++------ asset_list/mappings/property_type.py | 9 ++++++- asset_list/mappings/walls.py | 13 +++++++--- 5 files changed, 77 insertions(+), 14 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index c2784eb1..06ec5907 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -598,7 +598,35 @@ class AssetList: self.standardised_asset_list[self.landlord_year_built].dt.year ) else: - raise NotImplementedError("Year built column must be a datetime - implement me") + # We attempt to convert the year built to a datetime, by detecting the format and converting + + def extract_year(date_str): + """ + Extracts the year from a date string in the format '01-Jul-YYYY'. + Returns the extracted year as an integer or None if the format is incorrect. + """ + known_errors = ["#MULTIVALUE"] + + if pd.isnull(date_str) or date_str in known_errors: + return None + + if isinstance(date_str, str): + match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str) + if match: + return int(match.group(1)) # Extract the year and convert to integer + + if isinstance(date_str, datetime): + return date_str.year + + # Check if date_str is a year itself + if str(date_str).isdigit() & (len(str(date_str)) == 4): + return int(date_str) + + raise NotImplementedError("Unhandled format for year built - implement me") + + self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[ + self.landlord_year_built + ].apply(extract_year) # We now create standard lookups to_remap = { @@ -619,6 +647,8 @@ class AssetList: "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS } } + # Keep just entries where the key is not None + to_remap = {k: v for k, v in to_remap.items() if k is not None} for variable, config in to_remap.items(): logger.info("Standardising variable: %s", variable) diff --git a/asset_list/app.py 
b/asset_list/app.py index 89b15c06..1cb7808e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -255,7 +255,7 @@ def app(): landlord_wall_construction = "Wallinsul" landlord_heating_system = "HeatSorc" landlord_existing_pv = None - landlord_property_id = None + landlord_property_id = "Property Reference" # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 89bfe0c4..b58f13f2 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -1,3 +1,5 @@ +import numpy as np + STANDARD_HEATING_SYSTEMS = { "gas combi boiler", "electric storage heaters", @@ -35,12 +37,31 @@ HEATING_MAPPINGS = { "Eco Electric Radiators": "electric radiators", "Gas fire": "other", "Backboiler - Solid fuel": "other", - 'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters', - 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', - 'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi', - 'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel', - 'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric radiators', - 'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler', + 'combi - gas': 'gas combi boiler', + 'e7 storage heaters': 'electric storage heaters', + 'district heating system': 'district heating', + 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', + 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', + 'biomass boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', + 'electric oil filled radiators': 'electric radiators', + 'solid fuel': 'other', + 'lpg boiler': 'boiler - other fuel', + 
'electric boiler': 'electric boiler', 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', - 'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other', + 'eco electric radiators': 'electric radiators', + 'gas fire': 'other', 'backboiler - solid fuel': 'other', + 'ASHP': 'air source heat pump', + 'COMMHEAT': 'communal gas boiler', + 'GBB': 'gas combi boiler', + 'GFS': 'gas condensing boiler', + 'GWA': 'gas condensing boiler', + 'GWM': 'gas condensing combi', + 'HDU': 'district heating', + 'OILBLR': 'oil boiler', + 'SOLIDFUEL': 'boiler - other fuel', + 'STORHTR': 'high heat retention storage heaters', + np.nan: 'unknown', } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index ec569123..2612f058 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -1,7 +1,7 @@ # These are the standard categories for property types STANDARD_PROPERTY_TYPES = { "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", - "unknown", "other" + "unknown", "other", "block of flats" } # This is a basic mapping that we use to map values that we've seen commonly to standard values @@ -15,4 +15,11 @@ PROPERTY_MAPPING = { "BEDSIT": "bedsit", "COACHSE": "coach house", "coachse": "coach house", + 'Admin Unit Type': 'unknown', + 'Block': 'block of flats', + 'Bungalow': 'bungalow', + 'Flat': 'flat', + 'House': 'house', + 'Maisonette': 'maisonette', + 'Stairwell': 'other' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 1fc52fcb..82b31d01 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,7 +1,8 @@ STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", - "timber frame", "uninsulated solid brick", - "insulated solid brick", "system built", "granite or whinstone", "other", 
"unknown", "sandstone or limestone", + "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation", + "timber frame", + "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob", "new build - average thermal transmittance", } @@ -70,8 +71,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'average thermal transmittance 0.28 w/m?k': 'unknown', 'Cavity wall, filled cavity': 'filled cavity', 'Cavity wall, filled cavity and external insulation': 'filled cavity', - 'Granite or whinstone, as built, no insulation (assumed)': 'granite or ' - 'whinstone', + 'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone', 'Solid brick, as built, insulated (assumed)': 'insulated solid brick', 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick', 'Solid brick, with external insulation': 'insulated solid brick', @@ -84,4 +84,9 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Timber frame, as built, no insulation (assumed)': 'timber frame', 'Timber frame, as built, partial insulation (assumed)': 'timber frame', 'Timber frame, with additional insulation': 'timber frame', + 'CAVITY': 'partial unknown cavity', + 'COMB': 'unknown', + 'NONE': 'unknown', + 'NOTKNOWN': 'unknown', + 'SOLID': 'solid brick unknown insulation', } From c3049732f0d680a38aa9acacd3f15ff9e16d80f0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 24 Feb 2025 18:44:06 +0000 Subject: [PATCH 215/255] handling block of flats --- asset_list/AssetList.py | 7 +++++++ asset_list/app.py | 25 ++++++++++++++++--------- asset_list/mappings/heating_systems.py | 2 +- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 06ec5907..72086c60 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -344,6 +344,7 @@ class AssetList: # Will be used to store aggregated figures against the various work types self.work_type_figures = {} self.flat_data = None + 
self.duplicated_addresses = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -691,6 +692,12 @@ class AssetList: f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " f"addresses - dropping" ) + + # Keep a record of duplicates + self.duplicated_addresses = self.standardised_asset_list[ + self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ][[self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]].copy() + self.standardised_asset_list = self.standardised_asset_list[ ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() ] diff --git a/asset_list/app.py b/asset_list/app.py index 1cb7808e..a24c4043 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -45,6 +45,12 @@ def get_data( no_epc = [] for _, home in tqdm(df.iterrows(), total=len(df)): try: + + # If we have a block of flats, we cannot retrieve this data + if home[AssetList.STANDARD_PROPERTY_TYPE] == "block of flats": + no_epc.append(home[row_id_name]) + continue + postcode = home[postcode_column] house_number = str(home[address1_column]).strip() full_address = home[fulladdress_column].strip() @@ -283,16 +289,17 @@ def app(): # We produce the new maps, which can be saved for future useage new_property_type_map = PROPERTY_MAPPING.copy().update( - asset_list.variable_mappings[asset_list.landlord_property_type] + asset_list.variable_mappings[asset_list.landlord_property_type] if asset_list.landlord_property_type else {} ) new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_wall_construction] + asset_list.variable_mappings[asset_list.landlord_wall_construction] if + asset_list.landlord_wall_construction else {} ) new_heating_map = HEATING_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_heating_system] + 
asset_list.variable_mappings[asset_list.landlord_heating_system] if asset_list.landlord_heating_system else {} ) new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_existing_pv] + asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} ) asset_list.apply_standardiation() @@ -305,7 +312,7 @@ def app(): skip = None # Used to skip already completed chunks chunk_size = 5000 filename = "Chunk {i}.csv" - download_folder = os.path.join(DATA_FOLDER, "Chunks") + download_folder = os.path.join(data_folder, "Chunks") if not os.path.exists(download_folder): os.makedirs(download_folder) @@ -343,12 +350,12 @@ def app(): # Append the failed data to the main data # Store the chunk locally as a csv - pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False) + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False) # Store the errors and no-data locally - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f: + with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f: json.dump(errors_chunk, f) - with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + with open(os.path.join(data_folder, f"Chunks/Chunk {i} nodata.csv"), "w") as f: json.dump(no_epc_chunk, f) # We read in and concatenate the created created chunks @@ -446,7 +453,7 @@ def app(): asset_list.flat_analysis() # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx" + filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data with pd.ExcelWriter(filename) as writer: diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index b58f13f2..4879efcc 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -62,6 +62,6 @@ HEATING_MAPPINGS = { 'HDU': 'district heating', 'OILBLR': 'oil boiler', 'SOLIDFUEL': 'boiler - other fuel', - 'STORHTR': 'high heat retention storage heaters', + 'STORHTR': 'electric storage heaters', np.nan: 'unknown', } From 0ffc59861c4d70a822c0830838bc740a2598331f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 25 Feb 2025 08:19:08 +0000 Subject: [PATCH 216/255] examining results on colchester --- asset_list/AssetList.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 72086c60..0156a2a3 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -233,7 +233,8 @@ class AssetList: "secondheat-description": "epc_secondary_heating", "transaction-type": "epc_reason", "energy-consumption-current": "epc_heat_demand", - "photo-supply": "epc_photo_supply" + "photo-supply": "epc_photo_supply", + "estimated": "estimated" } FIND_EPC_DATA_NAMES = { "heating_text": "epc_estiamted_heating_kwh", @@ -714,6 +715,22 @@ class AssetList: columns=self.rename_map ) + # We fill any standard columns that are not in the data because they were not provided by the landlord + missing_variables = [ + v for v in [ + self.STANDARD_EXISTING_PV, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_UPRN, + self.STANDARD_PROPERTY_TYPE, + self.STANDARD_YEAR_BUILT, + self.STANDARD_WALL_CONSTRUCTION, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_EXISTING_PV + ] if v not in self.standardised_asset_list.columns + ] + for v in missing_variables: + self.standardised_asset_list[v] = None + def merge_data(self, df: pd.DataFrame): """ Used 
to insert data into the standardised asset list, based on the domna property id @@ -963,7 +980,6 @@ class AssetList: # Extraction ###################################################### - # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged # as needing a CIGA check. What is the logic we should be applying here? self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & @@ -974,6 +990,15 @@ class AssetList: ) ) + z = self.standardised_asset_list[ + self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" + ] + z["non-intrusives: Insulated"].value_counts() + z["non-intrusives: Material"].value_counts() + z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() + z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() + zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] + ###################################################### # Solar ###################################################### From 67f3e8ab703ea2893cdb9f9a6a9bd7bbee9344f8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 25 Feb 2025 08:41:08 +0000 Subject: [PATCH 217/255] reviewing methodology --- asset_list/AssetList.py | 51 +++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 0156a2a3..76f2b145 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -951,7 +951,7 @@ class AssetList: ###################################################### # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled # 2) The age is before 1995 - # TODO: 3) Remove anything that likley has access issues + # 3) We don't remove anything that haas access issues yet self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( 
(~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & @@ -976,6 +976,19 @@ class AssetList: self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) + + z0 = self.standardised_asset_list[ + self.standardised_asset_list["epc_indicates_empty_cavity"] & ( + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ) + ] + z0['non-intrusives: Construction'].value_counts() + z0['non-intrusives: Insulated'].value_counts() + z00 = z0[z0['non-intrusives: Insulated'] == "EWI"] + + # If the EPC is estimated, perhaps we should defer to the non-intrusives? + z00[""] + ###################################################### # Extraction ###################################################### @@ -990,14 +1003,26 @@ class AssetList: ) ) - z = self.standardised_asset_list[ - self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" - ] - z["non-intrusives: Insulated"].value_counts() - z["non-intrusives: Material"].value_counts() - z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() - z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() - zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] + # z3 = self.standardised_asset_list[ + # self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] + # ] + # z3['non-intrusives: Material'].value_counts() + # self.standardised_asset_list['non-intrusives: Material'].value_counts() + # + # z = self.standardised_asset_list[ + # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" + # ] + # z["non-intrusives: Insulated"].value_counts() + # z["non-intrusives: Material"].value_counts() + # z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() + # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() + # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].min() + # 
z[self.STANDARD_YEAR_BUILT].describe() + # + # zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] + # z2 = self.standardised_asset_list[ + # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "NO" + # ] ###################################################### # Solar @@ -1159,6 +1184,10 @@ class AssetList: .lower().str.contains("solid") ) & ( ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + ~self.standardised_asset_list["estimated"] ) ) | ( ( @@ -1180,6 +1209,10 @@ class AssetList: .lower().str.contains("suspended") ) & ( ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + ~self.standardised_asset_list["estimated"] ) ) | ( ( From ddfbf33494f6741b974217fffc5bb4ba784560a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 26 Feb 2025 11:00:12 +0000 Subject: [PATCH 218/255] westward complete --- asset_list/AssetList.py | 95 ++++++++++++++----------- asset_list/app.py | 42 +++++++---- asset_list/mappings/walls.py | 2 +- etl/customers/remote_assessments/app.py | 14 ++-- recommendations/HeatingRecommender.py | 2 +- 5 files changed, 94 insertions(+), 61 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 76f2b145..31b11c66 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -286,7 +286,7 @@ class AssetList: # This SAP threshold is a key search criteria for properties that may be eligible for extraction FILLED_CAVITY_SAP_THRESHOLD = 75 # This SAP the - EMPTY_CAVITY_SAP_THRESHOLD = 71 + EMPTY_CAVITY_SAP_THRESHOLD = 75 # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 @@ -956,13 +956,28 @@ class 
AssetList: (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"] ] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) + # Let's also flag work that looks eligible without the SAP filter + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & + self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) + ) + + # If non_intrusive_indicates_empty_cavity is True, + # set non_intrusive_indicates_empty_cavity_no_sap_filter to False + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + False, + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + ) self.standardised_asset_list["epc_indicates_empty_cavity"] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( @@ -977,17 +992,16 @@ class AssetList: ) ) - z0 = self.standardised_asset_list[ - self.standardised_asset_list["epc_indicates_empty_cavity"] & ( - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] - ) - ] - z0['non-intrusives: Construction'].value_counts() - z0['non-intrusives: Insulated'].value_counts() - z00 = z0[z0['non-intrusives: Insulated'] == "EWI"] - - # If the EPC is estimated, perhaps we should defer to the non-intrusives? 
- z00[""] + # If the EPC is esimtated, we defer to the non-intrusives + self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + self.standardised_asset_list["estimated"] + ), + False, + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) ###################################################### # Extraction @@ -997,33 +1011,14 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & - (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"]) + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + ) ) & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) ) - # z3 = self.standardised_asset_list[ - # self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] - # ] - # z3['non-intrusives: Material'].value_counts() - # self.standardised_asset_list['non-intrusives: Material'].value_counts() - # - # z = self.standardised_asset_list[ - # self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES" - # ] - # z["non-intrusives: Insulated"].value_counts() - # z["non-intrusives: Material"].value_counts() - # z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts() - # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max() - # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].min() - # z[self.STANDARD_YEAR_BUILT].describe() - # - # zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105] - # z2 = self.standardised_asset_list[ - # 
self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "NO" - # ] - ###################################################### # Solar ###################################################### @@ -1114,7 +1109,7 @@ class AssetList: ) | ( self.standardised_asset_list[ "walls_u_value"].apply( - lambda x: x <= 0.3 if not pd.isnull( + lambda x: x <= 0.7 if not pd.isnull( x) else False ) ) @@ -1141,7 +1136,7 @@ class AssetList: "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False ) | ( self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) >= 270 if str(x).isdigit() else False + lambda x: int(x) >= 200 if str(x).isdigit() else False ) ) | ( self.standardised_asset_list["roof_u_value"].apply( @@ -1152,7 +1147,7 @@ class AssetList: self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) < 270 if str(x).isdigit() else False + lambda x: int(x) < 200 if str(x).isdigit() else False ) # TODO: Fill with False - should be temp! 
@@ -1187,7 +1182,7 @@ class AssetList: ) & ( # We do not utilise estimated EPCs for this method because we will always find that # "epc_has_floor_recommendation" is False - ~self.standardised_asset_list["estimated"] + (self.standardised_asset_list["estimated"] == False) ) ) | ( ( @@ -1212,7 +1207,7 @@ class AssetList: ) & ( # We do not utilise estimated EPCs for this method because we will always find that # "epc_has_floor_recommendation" is False - ~self.standardised_asset_list["estimated"] + self.standardised_asset_list["estimated"] == False ) ) | ( ( @@ -1274,6 +1269,7 @@ class AssetList: ) # Other floor type, fully insulated + self.standardised_asset_list["solar_eligible_other_floor"] = ( # Landlord data or EPC data indicates the heating system is appropriate ( @@ -1332,6 +1328,9 @@ class AssetList: "Empty Cavity (non-intrusives)": ( self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum() ), + "Empty Cavity (non-intrusives, no SAP filter)": ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + ), "Empty Cavity (EPC)": ( ( self.standardised_asset_list["epc_indicates_empty_cavity"] & @@ -1359,6 +1358,17 @@ class AssetList: ) } + # We produce a breakdown of the property types, for cavity fills + cavity_fills = self.standardised_asset_list[ + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | ( + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + ] + + self.work_type_breakdowns = { + "empty_cavity": cavity_fills[self.STANDARD_PROPERTY_TYPE].value_counts() + } + # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None self.standardised_asset_list["cavity_reason"] = np.where( @@ -1366,6 +1376,11 @@ class AssetList: "Non-Intrusive Data Showed Empty Cavity", self.standardised_asset_list["cavity_reason"] ) + self.standardised_asset_list["cavity_reason"] = np.where( + 
self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"], + "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) self.standardised_asset_list["cavity_reason"] = np.where( ( self.standardised_asset_list["epc_indicates_empty_cavity"] & diff --git a/asset_list/app.py b/asset_list/app.py index a24c4043..09ccac02 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -246,22 +246,40 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # sheet_name = "Sheet1" + # postcode_column = 'Full Address.1' + # fulladdress_column = "Full Address" + # address1_column = None + # address1_method = "first_word" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build Date" + # landlord_os_uprn = None + # landlord_property_type = "Property Type" + # landlord_wall_construction = "Wallinsul" + # landlord_heating_system = "HeatSorc" + # landlord_existing_pv = None + # landlord_property_id = "Property Reference" + + # For Westward + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + data_filename = "WESTWARD - completed list..xlsx" sheet_name = "Sheet1" - postcode_column = 'Full Address.1' - fulladdress_column = "Full Address" + postcode_column = "WFT EDIT Postcode" + fulladdress_column = "Address" address1_column = None - address1_method = "first_word" + address1_method = "house_number_extraction" address_cols_to_concat = [] missing_postcodes_method = None - 
landlord_year_built = "Build Date" - landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_wall_construction = "Wallinsul" - landlord_heating_system = "HeatSorc" - landlord_existing_pv = None - landlord_property_id = "Property Reference" + landlord_year_built = "Build date" + landlord_os_uprn = "UPRN" + landlord_property_type = "Location type" + landlord_wall_construction = "Wall Construction (EPC)" + landlord_heating_system = "Heat Source" + landlord_existing_pv = "PV (Y/N)" + landlord_property_id = "Place ref" # Maps addresses to uprn in problematic cases MANUAL_UPRN_MAP = {} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 82b31d01..78d64988 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -84,7 +84,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Timber frame, as built, no insulation (assumed)': 'timber frame', 'Timber frame, as built, partial insulation (assumed)': 'timber frame', 'Timber frame, with additional insulation': 'timber frame', - 'CAVITY': 'partial unknown cavity', + 'CAVITY': 'cavity unknown insulation', 'COMB': 'unknown', 'NONE': 'unknown', 'NOTKNOWN': 'unknown', diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 15f59c5e..aac0a1a6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 133 +PORTFOLIO_ID = 137 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,10 +19,10 @@ def app(): asset_list = [ { - "address": "40", - "postcode": "PE4 5BB", - "uprn": 100090220519, - } + "address": "41 Gainsborough Way", + "postcode": "BA21 5XU", + "uprn": 30016708, + }, ] asset_list = pd.DataFrame(asset_list) @@ -52,8 +52,8 @@ def app(): valuation_data = [ { - "uprn": 100090220519, - "valuation": 135_000 + "uprn": 30016708, + 
"valuation": 189000 } ] # Store valuation data to s3 diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index c5c07f89..dd81680a 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -993,7 +993,7 @@ class HeatingRecommender: # We check if there's a mains connection and the hot water is inefficient, as this will improve with a boiler has_inefficient_water = ( self.property.data["mains-gas-flag"] and - self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"] + self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"] ) non_invasive_recommendation = next(( From bb8070967b3f0e8e0234fd07e0428acc9568d208 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Mar 2025 14:38:01 +0000 Subject: [PATCH 219/255] big commit --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 76 ++++++++++++++++++++++--- asset_list/app.py | 67 +++++++++++----------- backend/Funding.py | 12 ++-- backend/app/plan/router.py | 2 +- etl/customers/remote_assessments/app.py | 41 ++++++++++--- etl/find_my_epc/AssetListEpcData.py | 20 +++++-- recommendations/HeatingRecommender.py | 2 + 9 files changed, 159 insertions(+), 65 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 31b11c66..306edd99 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -344,6 +344,7 @@ class AssetList: self.standardised_asset_list = self.raw_asset_list.copy() # Will be used to store aggregated figures against the various work types self.work_type_figures = {} + self.work_type_breakdowns = {} self.flat_data = None self.duplicated_addresses = None @@ -577,7 
+578,7 @@ class AssetList: self.standardised_asset_list[self.landlord_wall_construction] = np.where( self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( "average thermal transmittance" - ), + ) == True, "new build - average thermal transmittance", self.standardised_asset_list[self.landlord_wall_construction] ) @@ -1019,6 +1020,23 @@ class AssetList: ) ) + # Also include work without the SAP filter as optimistic + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + ) + ) + ) + + # Adjust + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"], + False, + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] + ) + ###################################################### # Solar ###################################################### @@ -1109,8 +1127,7 @@ class AssetList: ) | ( self.standardised_asset_list[ "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull( - x) else False + lambda x: x <= 0.7 if not pd.isnull(x) else False ) ) ) @@ -1322,26 +1339,58 @@ class AssetList: # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] ) + blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" + ] + + non_blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" + ] + # Produce some aggregate figures self.work_type_figures = { # Empty cavity from 
non-intrusives - "Empty Cavity (non-intrusives)": ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum() + "Empty Cavity (non-intrusives)": non_blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum(), + "Empty Cavity (non-intrusives, blocks of flats)": ( + blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum() ), "Empty Cavity (non-intrusives, no SAP filter)": ( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + non_blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + ), + "Empty Cavity (non-intrusives, no SAP filter, blocks of flats)": ( + blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() ), "Empty Cavity (EPC)": ( ( - self.standardised_asset_list["epc_indicates_empty_cavity"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + non_blocks_of_flats["epc_indicates_empty_cavity"] & + ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] + ).sum() + ), + "Empty Cavity (EPC, blocks of flat)": ( + ( + blocks_of_flats["epc_indicates_empty_cavity"] & + ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] ).sum() ), "Cavity Extraction": ( + ( + ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] & + ~non_blocks_of_flats["epc_indicates_empty_cavity"] & + non_blocks_of_flats["non_intrusive_indicates_cavity_extraction"] + ).sum() + ), + "Cavity Extraction (blocks of flats)": ( + ( + ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] & + ~blocks_of_flats["epc_indicates_empty_cavity"] & + blocks_of_flats["non_intrusive_indicates_cavity_extraction"] + ).sum() + ), + "Cavity Extraction (no SAP filter)": ( ( ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & ~self.standardised_asset_list["epc_indicates_empty_cavity"] & - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] 
).sum() ), "Solar PV (Solid Floor)": ( @@ -1398,6 +1447,15 @@ class AssetList: "Non-Intrusive Data Showed Cavity Extraction", self.standardised_asset_list["cavity_reason"] ) + # extraction no sap filter + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) # Flag solar self.standardised_asset_list["solar_reason"] = None diff --git a/asset_list/app.py b/asset_list/app.py index 09ccac02..84999e93 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -246,43 +246,43 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - # sheet_name = "Sheet1" - # postcode_column = 'Full Address.1' - # fulladdress_column = "Full Address" - # address1_column = None - # address1_method = "first_word" - # address_cols_to_concat = [] - # missing_postcodes_method = None - # landlord_year_built = "Build Date" - # landlord_os_uprn = None - # landlord_property_type = "Property Type" - # landlord_wall_construction = "Wallinsul" - # landlord_heating_system = "HeatSorc" - # landlord_existing_pv = None - # landlord_property_id = "Property Reference" - - # For Westward - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - data_filename = "WESTWARD - completed list..xlsx" + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" sheet_name = "Sheet1" - postcode_column = "WFT EDIT 
Postcode" - fulladdress_column = "Address" + postcode_column = 'Full Address.1' + fulladdress_column = "Full Address" address1_column = None - address1_method = "house_number_extraction" + address1_method = "first_word" address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = "Build date" - landlord_os_uprn = "UPRN" - landlord_property_type = "Location type" - landlord_wall_construction = "Wall Construction (EPC)" - landlord_heating_system = "Heat Source" - landlord_existing_pv = "PV (Y/N)" - landlord_property_id = "Place ref" + landlord_year_built = "Build Date" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_wall_construction = "Wallinsul" + landlord_heating_system = "HeatSorc" + landlord_existing_pv = None + landlord_property_id = "Property Reference" + + # For Westward + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + # data_filename = "WESTWARD - completed list..xlsx" + # sheet_name = "Sheet1" + # postcode_column = "WFT EDIT Postcode" + # fulladdress_column = "Address" + # address1_column = None + # address1_method = "house_number_extraction" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build date" + # landlord_os_uprn = "UPRN" + # landlord_property_type = "Location type" + # landlord_wall_construction = "Wall Construction (EPC)" + # landlord_heating_system = "Heat Source" + # landlord_existing_pv = "PV (Y/N)" + # landlord_property_id = "Place ref" # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = {} + manual_uprn_map = {} asset_list = AssetList( local_filepath=os.path.join(data_folder, data_filename), @@ -352,7 +352,7 @@ def app(): epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, - manual_uprn_map=MANUAL_UPRN_MAP, + manual_uprn_map=manual_uprn_map, ) # We now retrieve any failed properties @@ -360,7 +360,7 @@ def app(): epc_data_failed, _, _ = 
get_data( df=chunk_failed, row_id_name=asset_list.DOMNA_PROPERTY_ID, - manual_uprn_map=MANUAL_UPRN_MAP, + manual_uprn_map=manual_uprn_map, epc_api_only=False ) @@ -464,6 +464,7 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) + # TODO: We should break out the identification of work types to flag blocks of flats specifically asset_list.identify_worktypes(cleaned) pprint(asset_list.work_type_figures) diff --git a/backend/Funding.py b/backend/Funding.py index f0780c51..2839c7ff 100644 --- a/backend/Funding.py +++ b/backend/Funding.py @@ -149,7 +149,8 @@ class Funding: :return: """ measure_table = pd.DataFrame([ - m for m in self.recommendations if m in measures and m["default"] + m for m in self.recommendations if + (m["type"] in measures) or (m["measure_type"] in measures) and m["default"] ]) measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap @@ -180,13 +181,10 @@ class Funding: measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"] measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"] measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False]) - # Recommend the measure, with estimated funding amount - recommended_measure = measure_table.head(1) - return { - "measure_type": recommended_measure["measure_type"], - "estimated_funding": recommended_measure["estimated_funding"] - } + return measure_table[ + ["type", "measure_type", "Cost Savings", "estimated_funding"] + ].rename(columns={"Cost Savings": "project_score"}).to_dict("records") def sap_to_eco_band(self, sap_points): """ diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 76c172ee..d82e774b 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -825,7 +825,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_recommendations=recommendations[p.id], 
project_scores_matrix=eco_project_scores_matrix, whlg_eligible_postcodes=whlg_eligible_postcodes, - gbis_abs_rate=20, + gbis_abs_rate=15, eco4_abs_rate=15, ) funding_calulator.check_eligibiltiy() diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index aac0a1a6..fc3b7ec6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 137 +PORTFOLIO_ID = 134 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,10 +19,25 @@ def app(): asset_list = [ { - "address": "41 Gainsborough Way", - "postcode": "BA21 5XU", - "uprn": 30016708, + "address": "Flat 2, 42 Malden Road, London NW5 3HG", + "postcode": "NW5 3HG", + "uprn": 5117165, }, + { + "address": "15 Bournville Lane", + "postcode": "B30 2JY", + "uprn": 100070301128 + }, + { + "address": "34 Bournville Lane", + "postcode": "B30 2LN", + "uprn": 100070301140 + }, + { + "address": "36 Bournville Lane", + "postcode": "B30 2LN", + "uprn": 100070301142 + } ] asset_list = pd.DataFrame(asset_list) @@ -52,9 +67,21 @@ def app(): valuation_data = [ { - "uprn": 30016708, - "valuation": 189000 - } + "uprn": 5117165, + "valuation": 467_000 + }, + { + "uprn": 100070301128, + "valuation": 335_000 + }, + { + "uprn": 100070301140, + "valuation": 276_000 + }, + { + "uprn": 100070301142, + "valuation": 276_000 + }, ] # Store valuation data to s3 valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv" diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index bce8cd1f..1d2e1472 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -72,12 +72,20 @@ class AssetListEpcData: epc_searcher.find_property(skip_os=True) if epc_searcher.newest_epc is None: continue - - find_epc_searcher = RetrieveFindMyEpc( - 
address=epc_searcher.newest_epc["address1"], - postcode=epc_searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + # Attempt both methods: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except Exception as e: + logger.error(f"Error retrieving find my epc data: {e}") + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() time.sleep(0.5) # We need uprn diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index dd81680a..e4dd3a78 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -852,6 +852,8 @@ class HeatingRecommender: else: heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"] + # TODO:We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual immersion + # we'll keep this for the moment though if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]: heating_simulation_config["hot_water_energy_eff_ending"] = "Average" else: From a6daeab88928f87cf2da87e482e2eedeea620b61 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Mar 2025 11:27:43 +0000 Subject: [PATCH 220/255] working on funding --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/Funding.py | 157 +++++++++++++++++------- etl/customers/remote_assessments/app.py | 40 ++---- 4 files changed, 129 insertions(+), 72 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml 
b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/Funding.py b/backend/Funding.py index 2839c7ff..f5f85b9f 100644 --- a/backend/Funding.py +++ b/backend/Funding.py @@ -98,11 +98,14 @@ class Funding: self, scheme: str, eligible: bool, + types: List[str], measure_types: List[str], + project_score: float, estimated_funding: float, notify_tenant_benefits_requirements: bool, notify_council_tax_band_requirements: bool, notify_tenant_low_income_requirements: bool, + innovation_required: bool, ): """" """ @@ -113,11 +116,14 @@ class Funding: return { "scheme": scheme, "eligible": eligible, + "type": types, "measure_types": measure_types, + "project_score": project_score, "estimated_funding": estimated_funding, "requires_benefits": notify_tenant_benefits_requirements, "requires_council_tax_band": notify_council_tax_band_requirements, - "requires_low_income": notify_tenant_low_income_requirements + "requires_low_income": notify_tenant_low_income_requirements, + "innovation_required": innovation_required, } @staticmethod @@ -140,7 +146,7 @@ class Funding: """ pass - def find_best_gbis_measure(self, measures): + def find_gbis_measures(self, measures): """ The best measure is one that: 1) Creates some SAP movement, therefore enables eligiblity @@ -247,21 +253,26 @@ class Funding: ) and (self.council_tax_band in [None, "A", "B", "C", "D"]) ): - # We find the best measure for GBIS - recommended_measure = self.find_best_gbis_measure( + # This function pulls out the various measures that can provide funding under GBIS + recommended_measures = self.find_gbis_measures( measures=[m for m in valid_measures if m not in ["cavity_wall_insulation", "loft_insulation"]] ) # If the council tax band is missing, we nofify the customer that this is a requirement that # should be checked - return self.output( - scheme="gbis", - eligible=True, - measure_types=[recommended_measure["measure_type"]], - 
estimated_funding=recommended_measure["estimated_funding"], - notify_tenant_benefits_requirements=False, - notify_council_tax_band_requirements=self.council_tax_band is None, - notify_tenant_low_income_requirements=False, - ) + return [ + self.output( + scheme="gbis", + eligible=True, + types=[m["type"]], # This is single measure so we only have one type + measure_types=[m["measure_type"]], + project_score=m["project_score"], + estimated_funding=m["estimated_funding"], + notify_tenant_benefits_requirements=False, + notify_council_tax_band_requirements=self.council_tax_band is None, + notify_tenant_low_income_requirements=False, + innovation_required=False + ) for m in recommended_measures + ] # Low income/flex if ( @@ -271,28 +282,83 @@ class Funding: # Find the best measure, and can also include CWI/LI but requires the tenant to be # low inome or on benefits # We find the best measure for GBIS - recommended_measure = self.find_best_gbis_measure(measures=valid_measures) - return self.output( - scheme="gbis", - eligible=True, - measure_types=[recommended_measure["measure_type"]], - estimated_funding=recommended_measure["estimated_funding"], - notify_tenant_benefits_requirements=True, - notify_council_tax_band_requirements=False, - notify_tenant_low_income_requirements=True, - ) + recommended_measures = self.find_gbis_measures(measures=valid_measures) + return [ + self.output( + scheme="gbis", + eligible=True, + types=[m["type"]], # This is single measure so we only have one type + measure_types=[m["measure_type"]], + project_score=m["project_score"], + estimated_funding=m["estimated_funding"], + notify_tenant_benefits_requirements=True, + notify_council_tax_band_requirements=False, + notify_tenant_low_income_requirements=True, + innovation_required=False + ) for m in recommended_measures + ] # Otherwise, no funding availability - return self.output( - scheme="gbis", - eligible=False, - measure_types=[], - estimated_funding=0, - 
notify_tenant_benefits_requirements=False, - notify_council_tax_band_requirements=False, - notify_tenant_low_income_requirements=False + return [] + + def gbis_social(self): + """ + Because this is social housing, we have two typical means for eligibility + 1) EPC D, where an innovation measure is required + 2) EPC G-E, where an innovation measure isn't required + :return: + """ + valid_measures = [ + "internal_wall_insulation", + "external_wall_insulation", + "flat_roof_insulation", + "suspended_floor_insulation", + "room_roof_insulation", + # Not available for every eligiblity type + "cavity_wall_insulation", + "loft_insulation", + "heating_control" + ] + + recommended_measures = self.find_gbis_measures( + measures=valid_measures ) + # All measures are available + if self.starting_sap == "D": + return [ + self.output( + scheme="gbis", + eligible=True, + types=[m["type"]], # This is single measure so we only have one type + measure_types=[m["measure_type"]], + project_score=m["project_score"], + estimated_funding=m["estimated_funding"], + notify_tenant_benefits_requirements=False, + notify_council_tax_band_requirements=False, + notify_tenant_low_income_requirements=False, + innovation_required=True + ) for m in recommended_measures + ] + + if self.starting_sap in ["G", "F", "E"]: + return [ + self.output( + scheme="gbis", + eligible=True, + types=[m["type"]], # This is single measure so we only have one type + measure_types=[m["measure_type"]], + project_score=m["project_score"], + estimated_funding=m["estimated_funding"], + notify_tenant_benefits_requirements=False, + notify_council_tax_band_requirements=False, + notify_tenant_low_income_requirements=False, + innovation_required=False + ) for m in recommended_measures + ] + + return [] + def gbis(self): """ Check if a property is eligible for GBIS @@ -303,24 +369,33 @@ class Funding: self.gbis_eligibiltiy = self.gbis_prs() return + if self.tenure == "Social": + self.gbis_eligibiltiy = self.gbis_social() + raise 
NotImplementedError("Implement social/oo") def whlg(self): if self.tenure == "Social": # We can't do anything for social housing - self.whlg_eligibility = self.output( - scheme="whlg", - eligible=False, - measure_types=[], - estimated_funding=0, - notify_tenant_benefits_requirements=False, - notify_council_tax_band_requirements=False, - notify_tenant_low_income_requirements=False - ) + self.whlg_eligibility = [] return if not self.whlg_eligible_postcodes.empty: - print("Eligible implement me!") + raise Exception("Implement me") + # self.whlg_eligibility = [ + # self.output( + # scheme, + # eligible, + # types, + # measure_types, + # project_score: float, + # estimated_funding: float, + # notify_tenant_benefits_requirements: bool, + # notify_council_tax_band_requirements: bool, + # notify_tenant_low_income_requirements: bool, + # innovation_required: bool, + # ) + # ] def eco4(self): if self.tenure == "Private": diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index fc3b7ec6..a4d60d85 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 134 +PORTFOLIO_ID = 138 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,25 +19,15 @@ def app(): asset_list = [ { - "address": "Flat 2, 42 Malden Road, London NW5 3HG", - "postcode": "NW5 3HG", - "uprn": 5117165, + "address": "42 Rippolson Road", + "postcode": "SE18 1NS", + "uprn": 100020999275, }, { - "address": "15 Bournville Lane", - "postcode": "B30 2JY", - "uprn": 100070301128 + "address": "66 Riverdale Road", + "postcode": "DA8 1PX", + "uprn": 100020235516 }, - { - "address": "34 Bournville Lane", - "postcode": "B30 2LN", - "uprn": 100070301140 - }, - { - "address": "36 Bournville Lane", - "postcode": "B30 2LN", - "uprn": 100070301142 - } ] asset_list = 
pd.DataFrame(asset_list) @@ -67,20 +57,12 @@ def app(): valuation_data = [ { - "uprn": 5117165, - "valuation": 467_000 + "valuation": 469_000, + "uprn": 100020999275, }, { - "uprn": 100070301128, - "valuation": 335_000 - }, - { - "uprn": 100070301140, - "valuation": 276_000 - }, - { - "uprn": 100070301142, - "valuation": 276_000 + "valuation": 382_000, + "uprn": 100020235516 }, ] # Store valuation data to s3 From 3ab1e94ea14fa3b8fcdbde795cec7d98c4535b31 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Mar 2025 12:48:46 +0000 Subject: [PATCH 221/255] debugging asset list for ealing without wall --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 24 ++++++++++++----- asset_list/app.py | 39 ++++++++-------------------- asset_list/mappings/property_type.py | 3 ++- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 306edd99..d4288114 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -575,13 +575,18 @@ class AssetList: # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and # we see instances of "average thermal transmittance" in the description - self.standardised_asset_list[self.landlord_wall_construction] = np.where( - self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( - "average thermal transmittance" - ) == True, - "new build - average thermal transmittance", - self.standardised_asset_list[self.landlord_wall_construction] - ) + if self.landlord_wall_construction is not None: + self.standardised_asset_list[self.landlord_wall_construction] = np.where( + 
self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( + "average thermal transmittance" + ) == True, + "new build - average thermal transmittance", + self.standardised_asset_list[self.landlord_wall_construction] + ) + else: + # We want to make sure that we have a column for wall construction + self.landlord_wall_construction = "landlord_wall_construction" + self.standardised_asset_list[self.landlord_wall_construction] = None # Clear our build year column # We attempt to process the year built column @@ -625,6 +630,11 @@ class AssetList: if str(date_str).isdigit() & (len(str(date_str)) == 4): return int(date_str) + # Remove any non-numeric characters + date_str = re.sub(r"\D", "", str(date_str)) + if str(date_str).isdigit() & (len(str(date_str)) == 4): + return int(date_str) + raise NotImplementedError("Unhandled format for year built - implement me") self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[ diff --git a/asset_list/app.py b/asset_list/app.py index 84999e93..45839157 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -246,40 +246,23 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" - data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" - sheet_name = "Sheet1" - postcode_column = 'Full Address.1' - fulladdress_column = "Full Address" + # Ealing + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 04032025" + data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx" + sheet_name = "IGNORE - FULL MAIN" + postcode_column = 'Postcode' + fulladdress_column = "Address" address1_column = None address1_method = "first_word" address_cols_to_concat = [] missing_postcodes_method = None - landlord_year_built = 
"Build Date" + landlord_year_built = "Year Built" landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_wall_construction = "Wallinsul" - landlord_heating_system = "HeatSorc" + landlord_property_type = "Property Type Code" + landlord_wall_construction = None + landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Property Reference" - - # For Westward - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - # data_filename = "WESTWARD - completed list..xlsx" - # sheet_name = "Sheet1" - # postcode_column = "WFT EDIT Postcode" - # fulladdress_column = "Address" - # address1_column = None - # address1_method = "house_number_extraction" - # address_cols_to_concat = [] - # missing_postcodes_method = None - # landlord_year_built = "Build date" - # landlord_os_uprn = "UPRN" - # landlord_property_type = "Location type" - # landlord_wall_construction = "Wall Construction (EPC)" - # landlord_heating_system = "Heat Source" - # landlord_existing_pv = "PV (Y/N)" - # landlord_property_id = "Place ref" + landlord_property_id = "Property Ref" # Maps addresses to uprn in problematic cases manual_uprn_map = {} diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 2612f058..ce3cce27 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -21,5 +21,6 @@ PROPERTY_MAPPING = { 'Flat': 'flat', 'House': 'house', 'Maisonette': 'maisonette', - 'Stairwell': 'other' + 'Stairwell': 'other', + 'MAISON': 'maisonette' } From 61eb2349ba34c85948f0b18b1a706047deb29016 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 5 Mar 2025 12:46:15 +0000 Subject: [PATCH 222/255] preparing data for hubspot upload --- asset_list/AssetList.py | 219 ++++++++++++++++++++++++++++++++++++++++ asset_list/app.py | 61 ++++++++++- 2 files changed, 279 insertions(+), 1 deletion(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 
d4288114..25a40f99 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -376,6 +376,7 @@ class AssetList: } self.variable_mappings = {} + self.hubspot_data = None self.rename_map = {} self.keep_variables = [] @@ -1526,3 +1527,221 @@ class AssetList: flat_data = pd.DataFrame(flat_data) self.flat_data = flat_data + + def prepare_for_crm(self, contact_details, company_domain, crm_pipeline_name, first_dealstage, assigned_surveyors): + """ + This function prepares the data for upload into Hubspot + :return: + """ + # This is a placeholder for now + + # This maps the opportunities as we reference them, to the product data as stored in Hubspot + product_lookup_table = { + "Non-Intrusive Data Showed Cavity Extraction": { + "name": "Extract & Fill - ECO4", "id": 100307905778, "unit_price": 500 + }, + "Non-Intrusive Data Showed Empty Cavity": { + "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000 + }, + "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed": { + "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000 + }, + "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed": { + "name": "Extract & Fill - ECO4", "id": 100307905778, "unit_price": 500 + }, + "EPC Data Showed Empty Cavity": { + "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000 + }, + "Solid Floor, Insulated, No Solar": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + }, + "Solid Floor, Insulated, Needs Loft": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + }, + "Other Floor, Insulated, No Solar": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + }, + "Other Floor, Insulated, Needs Loft": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + } + } + # We check if all products are covered in the lookup table + cavity_products = self.standardised_asset_list["cavity_reason"].unique() + solar_products = 
self.standardised_asset_list["solar_reason"].unique() + # Check if there any options not in out lookup table + if ( + any(x for x in cavity_products if x not in product_lookup_table) or + any(x for x in solar_products if x not in product_lookup_table) + ): + raise ValueError("We have products not referenced in the lookup table - check this") + + programme_data = self.standardised_asset_list.copy() + + # Exclusions - these are properties we won't treat for the moment + product_exclusions = [ + "Other Floor, Insulated, No Solar", + "Other Floor, Insulated, Needs Loft" + ] + if product_exclusions: + logger.warning("Excluding products: %s", product_exclusions) + + programme_data = programme_data[programme_data["solar_reason"].isin(product_exclusions) == False] + + # Merge on the contact details + programme_data = programme_data.merge( + contact_details, + how="left", + left_on=self.STANDARD_LANDLORD_PROPERTY_ID, + right_on=self.landlord_property_id, + ) + + programme_data["Company Domain Name "] = company_domain + # Append the product data onto the programme data + programme_data["cavity_product"] = programme_data["cavity_reason"].map( + lambda x: product_lookup_table.get(x, {"name": None})["name"] + ) + programme_data["solar_product"] = programme_data["solar_reason"].map( + lambda x: product_lookup_table.get(x, {"name": None})["name"] + ) + + programme_data["domna_product"] = programme_data["solar_reason"].copy() + programme_data["domna_product"] = np.where( + pd.isnull(programme_data["domna_product"]), + programme_data["solar_product"], + programme_data["domna_product"] + ) + # We filter just on rows where we have a product + programme_data = programme_data[ + ~pd.isnull(programme_data["domna_product"]) + ] + programme_data = programme_data.drop(columns=["solar_product", "cavity_product"]) + + product_df = ( + pd.DataFrame(product_lookup_table).T[["name", "id", "unit_price"]] + .reset_index() + .rename( + columns={ + "name": "Name ", + "id": 'Product ID ', + 
"unit_price": 'Unit price ', + "index": "domna_product" + } + ) + ) + + product_df['Quantity '] = 1 + + # Append on the product data + programme_data = programme_data.merge( + product_df, + how="left", + on="domna_product", + ) + + # Add in deal and pipeline information + programme_data["dealname"] = programme_data[self.STANDARD_FULL_ADDRESS] + " : " + programme_data[ + "domna_product"] + programme_data['Pipeline '] = crm_pipeline_name + programme_data['Deal Stage '] = first_dealstage + programme_data['Associations: Listing'] = "Property Owner" + + programme_data = programme_data.merge( + assigned_surveyors.rename( + columns={self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID} + ), how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID + ) + + # This maps the hubspot schema to the template. Anything that is not covered in this will be flagged + schema_mappings = { + 'Name ': self.DOMNA_PROPERTY_ID, # TODO: Maybe change this? + 'Company Domain Name ': 'Company Domain Name ', + 'Email ': 'email', # TODO: Review + 'First Name ': 'first name', # TODO: Review + 'Last Name ': 'last name', # TODO: Review + 'Phone ': 'phone', # TODO: Review + 'Full Address ': self.STANDARD_FULL_ADDRESS, + 'Address 1 ': self.STANDARD_ADDRESS_1, + 'Address 2 ': None, # TODO: Don't have this for the moment + 'Postcode ': self.STANDARD_POSTCODE, + 'Property Type ': self.STANDARD_PROPERTY_TYPE, + 'Property Sub Type ': None, # TODO: Don't have this for the moment + 'Bedroom(s) ': None, # TODO: Don't have this for the moment + 'Domna Property ID ': self.DOMNA_PROPERTY_ID, + 'National UPRN ': ( + self.STANDARD_UPRN if self.STANDARD_UPRN is not None else self.EPC_API_DATA_NAMES["uprn"] + ), + 'Owner Property ID ': self.STANDARD_LANDLORD_PROPERTY_ID, + 'Wall Construction ': self.STANDARD_WALL_CONSTRUCTION, + 'Heating System ': self.STANDARD_HEATING_SYSTEM, + 'Year Built ': self.STANDARD_YEAR_BUILT, + 'Boiler Make ': None, # TODO: Don't have this for the moment + 'Boiler Model ': None, # TODO: 
Don't have this for the moment + 'Non-Intrusives: Date Checked ': None, + # TODO: Don't have this for the moment + 'Non-Intrusives: Wall Type ': ( + "non-intrusives: Construction" if self.non_intrusives_present else None + ), + 'Non-intrusives: Insulation ': ( + "non-intrusives: Insulated" if self.non_intrusives_present else None + ), + 'Non-intrusives: Insulation Material ': ( + "non-intrusives: Material" if self.non_intrusives_present else None + ), + 'Non-Intrusives: CIGA Check Required ': ( + 'non-intrusives: CIGA Check Required' if self.non_intrusives_present else None + ), + 'Non-Intrusives: PV Access Issues ': ( + 'non-intrusives: PV, ACCESS ISSUE, SEE NOTES' if self.non_intrusives_present else None + ), + 'Non-Intrusives: Roof Orientation ': ( + 'non-intrusives: OFF GAS - ROOF ORIENTATION' if self.non_intrusives_present else None + ), + 'Non-Intrusives: Surveyor Notes ': ( + 'non-intrusives: Any further surveyor notes' if self.non_intrusives_present else None + ), + 'Non-Intrusives: Surveyor Name ': ( + 'non-intrusives: Surveyors Name' if self.non_intrusives_present else None + ), + 'CIGA: Date Requested ': None, # TODO: Don't have this for the moment + 'CIGA: Cavity Guarantee Found ': None, + 'Last EPC: Is Estimated ': self.EPC_API_DATA_NAMES["estimated"], + 'Last EPC: EPC Rating ': self.EPC_API_DATA_NAMES["current-energy-rating"], + 'Last EPC: SAP Rating ': self.EPC_API_DATA_NAMES["current-energy-efficiency"], + 'Last EPC: Main Heating Description ': self.EPC_API_DATA_NAMES[ + "mainheat-description"], + 'Last EPC: Heating Controls ': self.EPC_API_DATA_NAMES[ + "mainheatcont-description"], + 'Last EPC: Lodgement Date ': self.EPC_API_DATA_NAMES["inspection-date"], + 'Last EPC: Floor Area ': self.EPC_API_DATA_NAMES["total-floor-area"], + 'Last EPC: Wall ': self.EPC_API_DATA_NAMES["walls-description"], + 'Last EPC: Roof ': self.EPC_API_DATA_NAMES["roof-description"], + 'Last EPC: Floor ': self.EPC_API_DATA_NAMES["floor-description"], + 'Last EPC: Room Height 
': self.EPC_API_DATA_NAMES["floor-height"], + 'Last EPC: Age Band ': self.EPC_API_DATA_NAMES["construction-age-band"], + 'Deal Stage ': 'Deal Stage ', + 'Pipeline ': 'Pipeline ', + 'Expected Commencement Date ': None, # TODO: Need to set this, + 'Deal Name ': "dealname", # Need to create this, + 'Product ID ': 'Product ID ', + 'Name ': 'Name ', + 'Unit price ': 'Unit price ', + 'Quantity ': 'Quantity ', + 'Deal Owner': 'surveyor_email', + 'Amount ': 'Unit price ', + } + + # We now create the finalised dataset to be uploaded into Hubspot + variables_required = list(schema_mappings.values()) + variables_required = [v for v in variables_required if v is not None] + # We now flag anything that has a none value, which is information we haven't got right now + none_variables = [k for k, v in schema_mappings.items() if v is None] + # We'll add placeholder columns for the None variables + programme_data = programme_data[variables_required] + for col in none_variables: + programme_data[col] = None + + programme_data = programme_data.rename( + columns={v: k for k, v in schema_mappings.items() if v is not None} + ) + + self.hubspot_data = programme_data diff --git a/asset_list/app.py b/asset_list/app.py index 45839157..475bd7b3 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -262,7 +262,25 @@ def app(): landlord_wall_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Property Ref" + landlord_property_id = "Property ref" + + # For Westward + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + # data_filename = "WESTWARD - completed list..xlsx" + # sheet_name = "Sheet1" + # postcode_column = "WFT EDIT Postcode" + # fulladdress_column = "Address" + # address1_column = None + # address1_method = "house_number_extraction" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build date" + # landlord_os_uprn = "UPRN" + # landlord_property_type = 
"Location type" + # landlord_wall_construction = "Wall Construction (EPC)" + # landlord_heating_system = "Heat Source" + # landlord_existing_pv = "PV (Y/N)" + # landlord_property_id = "Place ref" # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -454,6 +472,47 @@ def app(): asset_list.flat_analysis() + # Convert to a format suitable for CRM + contact_details = pd.DataFrame( + [ + { + asset_list.landlord_property_id: "EXETEMORH0100010", + "first name": "Khalim", + "last name": "Conn-Kowlessar", + "email": "kconnkowlessar@gmail.com", + "phone": "075399248" + } + ] + ) + + assigned_surveyors = pd.DataFrame( + [ + { + asset_list.landlord_property_id: "EXETEMORH0100010", + "surveyor_name": "Khalim Conn-Kowlessar", + "surveyor_email": "khalim@domna.homes", + } + ] + ) + + # TODO: Sort the output by postcode + + company_domain = "ealing.gov.uk" + crm_pipeline_name = "Survey Management" + first_dealstage = "READY TO BEGIN SCHEDULING" + # TODO - temp, upload to either SharePoint or AWS + hubspot_template = pd.read_csv("~/Downloads/Hubspot Upload Template - Demo V2(Template).csv") + hubspot_schema = hubspot_template.columns.tolist() + + asset_list.prepare_for_crm( + contact_details=contact_details, + assigned_surveyors=assigned_surveyors, + company_domain=company_domain, + crm_pipeline_name=crm_pipeline_name, + first_dealstage=first_dealstage + ) + hubspt_data = asset_list.hubspot_data + # Store as an excel filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data From 8012fa44338e078282bebecfb3c3e439730631f7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 6 Mar 2025 07:17:30 +0000 Subject: [PATCH 223/255] preparing for hubspot upload --- asset_list/AssetList.py | 85 ++++++++++++++++++++++++++++++++++++--- asset_list/app.py | 32 +++++++-------- backend/app/plan/utils.py | 4 -- 3 files changed, 95 insertions(+), 26 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 25a40f99..ed1cdf2c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -347,6 +347,8 @@ class AssetList: self.work_type_breakdowns = {} self.flat_data = None self.duplicated_addresses = None + self.contact_details = None + self.contact_detail_fields = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False @@ -1528,7 +1530,70 @@ class AssetList: self.flat_data = flat_data - def prepare_for_crm(self, contact_details, company_domain, crm_pipeline_name, first_dealstage, assigned_surveyors): + @staticmethod + def split_full_name(x): + if pd.isnull(x): + return None, None, None + x = x.lower() + titles = ["mr", "mrs", "ms", "miss", "dr", "prof"] + # Remove titles + detected_title = [title for title in titles if x.startswith(title)] + if detected_title: + for title in detected_title: + x = x.replace(title, "") + x = x.strip() + first_name, last_name = x.split(" ")[0], x.split(" ")[-1] + title = detected_title[0].title() if detected_title else None + return title, first_name.title(), last_name.title() + + def load_contact_details( + self, + local_filepath, + sheet_name, + landlord_property_id, + phone_number_column=None, + email_column=None, + fullname_column=None, + firstname_column=None, + lastname_column=None + ): + + self.contact_detail_fields = { + "landlord_property_id": landlord_property_id, + "phone_number": 
phone_number_column, + "email": email_column, + "fullname": fullname_column, + "firstname": firstname_column, + "lastname": lastname_column + } + + details_colnames = [ + phone_number_column, email_column, fullname_column, firstname_column, lastname_column + ] + # We'll fill them + none_details = [x for x in details_colnames if x is None] + details_colnames = [x for x in details_colnames if x is not None] + + contact_details = pd.read_excel( + local_filepath, sheet_name=sheet_name + )[[self.contact_detail_fields["landlord_property_id"]] + details_colnames] + contact_details = contact_details[ + ~pd.isnull(contact_details[self.contact_detail_fields["landlord_property_id"]]) + ] + # Fill anything we don't have + for detail in none_details: + contact_details[detail] = None + + if fullname_column and not (firstname_column and lastname_column): + contact_details["title"], contact_details["first_name"], contact_details["last_name"] = zip( + *contact_details[fullname_column].apply(self.split_full_name) + ) + else: + raise NotImplementedError("Implement me") + + self.contact_details = contact_details + + def prepare_for_crm(self, company_domain, crm_pipeline_name, first_dealstage, assigned_surveyors): """ This function prepares the data for upload into Hubspot :return: @@ -1589,7 +1654,7 @@ class AssetList: # Merge on the contact details programme_data = programme_data.merge( - contact_details, + self.contact_details, how="left", left_on=self.STANDARD_LANDLORD_PROPERTY_ID, right_on=self.landlord_property_id, @@ -1655,10 +1720,18 @@ class AssetList: schema_mappings = { 'Name ': self.DOMNA_PROPERTY_ID, # TODO: Maybe change this? 
'Company Domain Name ': 'Company Domain Name ', - 'Email ': 'email', # TODO: Review - 'First Name ': 'first name', # TODO: Review - 'Last Name ': 'last name', # TODO: Review - 'Phone ': 'phone', # TODO: Review + 'Email ': ( + self.contact_detail_fields["email"] if self.contact_detail_fields["email"] else None + ), # TODO: Review + 'First Name ': ( + self.contact_detail_fields["firstname"] if self.contact_detail_fields["firstname"] else None + ), # TODO: Review + 'Last Name ': ( + self.contact_detail_fields["lastname"] if self.contact_detail_fields["lastname"] else None + ), # TODO: Review + 'Phone ': ( + self.contact_detail_fields["phone_number"] if self.contact_detail_fields["phone_number"] else None + ), # TODO: Review 'Full Address ': self.STANDARD_FULL_ADDRESS, 'Address 1 ': self.STANDARD_ADDRESS_1, 'Address 2 ': None, # TODO: Don't have this for the moment diff --git a/asset_list/app.py b/asset_list/app.py index 475bd7b3..ba3a1b82 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -472,23 +472,23 @@ def app(): asset_list.flat_analysis() - # Convert to a format suitable for CRM - contact_details = pd.DataFrame( - [ - { - asset_list.landlord_property_id: "EXETEMORH0100010", - "first name": "Khalim", - "last name": "Conn-Kowlessar", - "email": "kconnkowlessar@gmail.com", - "phone": "075399248" - } - ] + asset_list.load_contact_details( + local_filepath=os.path.join(data_folder, "Full property list wth D&V report V look up 12.2.25.xlsx"), + sheet_name="Report 1", + landlord_property_id=asset_list.landlord_property_id, + phone_number_column='Property Current Tel. 
Number', + fullname_column='Proeprty Current Occupant', + firstname_column=None, + lastname_column=None, + email_column=None, # TODO - we need this ) + # Convert to a format suitable for CRM + # TODO: TEMP assigned_surveyors = pd.DataFrame( [ { - asset_list.landlord_property_id: "EXETEMORH0100010", + asset_list.landlord_property_id: "02610001", "surveyor_name": "Khalim Conn-Kowlessar", "surveyor_email": "khalim@domna.homes", } @@ -501,17 +501,14 @@ def app(): crm_pipeline_name = "Survey Management" first_dealstage = "READY TO BEGIN SCHEDULING" # TODO - temp, upload to either SharePoint or AWS - hubspot_template = pd.read_csv("~/Downloads/Hubspot Upload Template - Demo V2(Template).csv") - hubspot_schema = hubspot_template.columns.tolist() asset_list.prepare_for_crm( - contact_details=contact_details, assigned_surveyors=assigned_surveyors, company_domain=company_domain, crm_pipeline_name=crm_pipeline_name, first_dealstage=first_dealstage ) - hubspt_data = asset_list.hubspot_data + hubspot_data = asset_list.hubspot_data # Store as an excel filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" @@ -520,3 +517,6 @@ def app(): with pd.ExcelWriter(filename) as writer: asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) + + # Store the Hubspot export as a csv + hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False) diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index 07d4642d..34fb02e7 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -1,9 +1,5 @@ -import pandas as pd -from backend.Property import Property from utils.s3 import read_from_s3 -from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value - from backend.app.config import get_settings import msgpack From 
e0839628810e463ebb7a2e9e10defec8415616e5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 6 Mar 2025 14:37:01 +0000 Subject: [PATCH 224/255] setting up hubspot for ealing --- asset_list/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asset_list/app.py b/asset_list/app.py index ba3a1b82..ab74f829 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -489,6 +489,7 @@ def app(): [ { asset_list.landlord_property_id: "02610001", + "week_commencing": "10/10/2025", "surveyor_name": "Khalim Conn-Kowlessar", "surveyor_email": "khalim@domna.homes", } From 66e0fdea2828b14a8bde1e54d8696a24ed2f07d8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 6 Mar 2025 17:04:01 +0000 Subject: [PATCH 225/255] preparing data pull for acis --- asset_list/AssetList.py | 31 ++++++--- asset_list/app.py | 91 +++++++++++++++++--------- asset_list/mappings/heating_systems.py | 6 ++ asset_list/mappings/property_type.py | 42 +++++++++++- asset_list/mappings/walls.py | 12 ++++ 5 files changed, 142 insertions(+), 40 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index ed1cdf2c..e0bb73f4 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -283,6 +283,8 @@ class AssetList: "Any further surveyor notes", 'Surveyors Name' ] + OLD_FORMAT_NON_INTRUSIVE_COLNAMES = ['WFT Findings', 'ECO Eligibility'] + # This SAP threshold is a key search criteria for properties that may be eligible for extraction FILLED_CAVITY_SAP_THRESHOLD = 75 # This SAP the @@ -351,7 +353,9 @@ class AssetList: self.contact_detail_fields = None # We detect the presence of the non-intrusive columns - self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False + self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns + # We detect if we have the old format of non-intruvies + self.old_format_non_intrusives_present = "WFT Findings" in self.raw_asset_list.columns # Names of columns 
self.landlord_property_id = landlord_property_id @@ -562,14 +566,19 @@ class AssetList: } self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None} + non_intrusive_columns = [] if self.non_intrusives_present: - self.keep_variables += self.NON_INTRUSIVES_COLNAMES - self.rename_map = { - **self.rename_map, - **dict( - zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) - ) - } + non_intrusive_columns = self.NON_INTRUSIVES_COLNAMES + + if self.old_format_non_intrusives_present: + non_intrusive_columns = self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES + + self.rename_map = { + **self.rename_map, + **dict( + zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in non_intrusive_columns]) + ) + } # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ @@ -616,7 +625,11 @@ class AssetList: Extracts the year from a date string in the format '01-Jul-YYYY'. Returns the extracted year as an integer or None if the format is incorrect. """ - known_errors = ["#MULTIVALUE"] + known_errors = [ + "#MULTIVALUE", + "This cell has an external reference that can't be shown or edited. Editing this cell will " + "remove the external reference." 
+ ] if pd.isnull(date_str) or date_str in known_errors: return None diff --git a/asset_list/app.py b/asset_list/app.py index ab74f829..7275709d 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -247,22 +247,22 @@ def app(): # - Or the insulation required is loft/cavity (floors should be solid) # Ealing - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 04032025" - data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx" - sheet_name = "IGNORE - FULL MAIN" - postcode_column = 'Postcode' - fulladdress_column = "Address" - address1_column = None - address1_method = "first_word" - address_cols_to_concat = [] - missing_postcodes_method = None - landlord_year_built = "Year Built" - landlord_os_uprn = None - landlord_property_type = "Property Type Code" - landlord_wall_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "Property ref" + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 04032025" + # data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx" + # sheet_name = "IGNORE - FULL MAIN" + # postcode_column = 'Postcode' + # fulladdress_column = "Address" + # address1_column = None + # address1_method = "first_word" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Year Built" + # landlord_os_uprn = None + # landlord_property_type = "Property Type Code" + # landlord_wall_construction = None + # landlord_heating_system = None + # landlord_existing_pv = None + # landlord_property_id = "Property ref" # For Westward # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" @@ -282,6 +282,24 @@ def app(): # landlord_existing_pv = "PV (Y/N)" # landlord_property_id = "Place ref" + # For ACIS - programme re-build + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" + 
data_filename = "ACIS asset list.xlsx" + sheet_name = "Assets" + address1_column = "House No" + postcode_column = "Postcode" + landlord_property_id = "UPRN" + fulladdress_column = None + address_cols_to_concat = ["House No", "Street", "Town"] + missing_postcodes_method = None + address1_method = None + landlord_year_built = "YEAR BUILT" + landlord_os_uprn = None + landlord_property_type = "Property type" + landlord_wall_construction = "Wall Constuction" + landlord_heating_system = "Heating" + landlord_existing_pv = None + # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -306,20 +324,33 @@ def app(): asset_list.init_standardise() # We produce the new maps, which can be saved for future useage - - new_property_type_map = PROPERTY_MAPPING.copy().update( - asset_list.variable_mappings[asset_list.landlord_property_type] if asset_list.landlord_property_type else {} - ) - new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_wall_construction] if - asset_list.landlord_wall_construction else {} - ) - new_heating_map = HEATING_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_heating_system] if asset_list.landlord_heating_system else {} - ) - new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update( - asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} - ) + new_property_type_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_property_type] if + asset_list.landlord_property_type else {} + ).items() + if k not in PROPERTY_MAPPING + } + new_wall_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_wall_construction] if + asset_list.landlord_wall_construction else {} + ).items() + if k not in WALL_CONSTRUCTION_MAPPINGS + } + new_heating_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_heating_system] if + asset_list.landlord_heating_system 
else {} + ).items() + if k not in HEATING_MAPPINGS + } + new_existing_pv_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} + ).items() + if k not in EXISTING_PV_MAPPINGS + } asset_list.apply_standardiation() diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 4879efcc..33d3701a 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -64,4 +64,10 @@ HEATING_MAPPINGS = { 'SOLIDFUEL': 'boiler - other fuel', 'STORHTR': 'electric storage heaters', np.nan: 'unknown', + 'Oil': 'boiler - other fuel', + 'Gas': 'gas condensing boiler', + 'Electric': 'electric storage heaters', + 'Solid fuel': 'other', + 'No Heat': 'unknown', + 'GSHP': 'ground source heat pump' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index ce3cce27..1fe1daac 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -1,3 +1,5 @@ +import numpy as np + # These are the standard categories for property types STANDARD_PROPERTY_TYPES = { "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", @@ -22,5 +24,43 @@ PROPERTY_MAPPING = { 'House': 'house', 'Maisonette': 'maisonette', 'Stairwell': 'other', - 'MAISON': 'maisonette' + 'MAISON': 'maisonette', + '3 Bed Semi Detached House': 'house', + '3 Bed Mid Terrace House': 'house', + '2 Bed Semi Detached House': 'house', + '4 Bed Semi Detached House': 'house', + '2 Bed End Terrace House': 'house', + '1 Bed Sheltered Bungalow': 'bungalow', + '1 Bed 1st Floor Sheltered Flat': 'flat', + '2 Bed Second Floor Flat': 'flat', + '1 Bed Mid Terrace House': 'house', + '1 Bed End Terrace House': 'house', + '7 Bed Detached House': 'house', + '4 Bed End Terrace House': 'house', + '1 Bed Link House': 'house', + '1 Bed Second Floor Flat': 'flat', + '2 Bed Detached House': 'house', + '1 
Bed Ground Floor Flat': 'flat', + '2 Bed Sheltered Bungalow': 'bungalow', + '4 Bed Mid Terrace House': 'house', + '2 Bed Mid Terrace House': 'house', + '2 Bed First Floor Flat': 'flat', + '3 Bed Detached House': 'house', + 'Ground Floor Bedsit': 'bedsit', + '3 Bed Bungalow': 'bungalow', + np.nan: 'unknown', + '5 Bed End Terrace House': 'house', + '1 Bed Grd Floor Sheltered Flat': 'flat', + '3 Bed End Terrace House': 'house', + '2 Bed Second Floor Maisonette': 'maisonette', + '2 Bed Ground Floor Flat': 'flat', + '2 Bed First Floor Maisonette': 'maisonette', + '4 Bed Detached House': 'house', + '1 Bed Bungalow': 'bungalow', + '2 Bed Bungalow': 'bungalow', + 'First Floor Bedsit': 'bedsit', + '3 Bed First Floor Maisonette': 'maisonette', + '2 Bed 1st Floor Sheltered Flat': 'flat', + '1 Bed First Floor Flat': 'flat', + '3 Bed First Floor Flat': 'flat' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 78d64988..959701ca 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,3 +1,5 @@ +import numpy as np + STANDARD_WALL_CONSTRUCTIONS = { "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation", @@ -89,4 +91,14 @@ WALL_CONSTRUCTION_MAPPINGS = { 'NONE': 'unknown', 'NOTKNOWN': 'unknown', 'SOLID': 'solid brick unknown insulation', + np.nan: 'unknown', + 'RENDER/TIMBER FRAME': 'timber frame', + 'SYSTEM BUILT': 'system built', + 'PCC PANELS': 'other', + 'NOT APPLICABLE - FLAT': 'unknown', + 'BRICK/TIMBER FRAME': 'timber frame', + 'BRICK/BLOCK CAVITY': 'cavity unknown insulation', + 'STONE SOLID': 'sandstone or limestone', + 'EXT CLADDING SYSTEM': 'system built', + 'BRICK/BLOCK SOLID': 'solid brick unknown insulation' } From 831abc884f2a2fb24c47d73ada019b9d154ce695 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 8 Mar 2025 15:38:05 +0000 Subject: [PATCH 226/255] attempting to match 
masters --- asset_list/AssetList.py | 126 ++++++++++++++++++++++ asset_list/app.py | 16 +++ etl/customers/stonewater/data_cleaning.py | 1 + 3 files changed, 143 insertions(+) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index e0bb73f4..3007269b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -7,6 +7,7 @@ from datetime import datetime from openai import OpenAI import numpy as np import pandas as pd +from tqdm import tqdm from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc @@ -351,6 +352,9 @@ class AssetList: self.duplicated_addresses = None self.contact_details = None self.contact_detail_fields = None + self.outcomes = None + self.outcomes_no_match = None + self.master_surveyed = None # We detect the presence of the non-intrusive columns self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns @@ -758,6 +762,11 @@ class AssetList: for v in missing_variables: self.standardised_asset_list[v] = None + # Convert to string + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = ( + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str) + ) + def merge_data(self, df: pd.DataFrame): """ Used to insert data into the standardised asset list, based on the domna property id @@ -1831,3 +1840,120 @@ class AssetList: ) self.hubspot_data = programme_data + + def flag_outcomes( + self, + outcomes_filepath, + outcomes_sheetname + ): + if outcomes_filepath is None: + pass + + # ToDO: Parameterise for future use? 
+ self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) + self.outcomes["row_id"] = self.outcomes.index + + logger.info("Matching outcomes to ") + # Merge the outcomes onto the asset list - we check we're able to match sufficiently well + lookup = [] + nomatch = [] + for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)): + address_clean = x["Address"].lower().replace(",", "").replace(" ", " ") + + matched = self.standardised_asset_list[ + (self.standardised_asset_list[ + self.STANDARD_FULL_ADDRESS + ].str.lower().str.replace(",", "").str.replace(" ", " ") == address_clean) + ] + + if not matched.empty and matched.shape[0] == 1: + lookup.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + + nomatch.append(x["row_id"]) + + self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)] + lookup = pd.DataFrame(lookup) + + # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times + # Where we have multiple rows, we want to make a call on what the action should be. 
For example, + # there may be properties that have been visited multiple times where the outcome was "See notes" implying + # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has + # happened multiple times, in this case we judge that the work may not be viable + lookup = lookup.merge( + self.outcomes[["row_id", "Outcome", "Notes", "Week Commencing"]], how="left", on="row_id" + ) + + visit_counts = ( + lookup.groupby(self.DOMNA_PROPERTY_ID)["row_id"] + .count() + .reset_index() + .rename(columns={"row_id": "visit_count"}) + .sort_values("visit_count", ascending=False) + ) + + pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index() + pivot_df = pivot_df.merge( + visit_counts, how="left", on="domna_property_id" + ) + + # We merge this data onto outcomes + self.outcomes["matched_to_asset_list"] = self.outcomes["row_id"].isin(lookup["row_id"].values) + self.outcomes = self.outcomes.merge( + lookup, how="left", on="row_id" + ) + + # We merge out pivoted outcomes onto the asset list + self.standardised_asset_list = self.standardised_asset_list.merge( + pivot_df, how="left", left_on=self.DOMNA_PROPERTY_ID, right_on="domna_property_id" + ) + + def flag_survey_master( + self, + master_filepaths + ): + # TODO: This probably needs further expansion + + logger.info("Getting masters and merging onto asset list") + master_surveyed = [] + for filepath in master_filepaths: + master_data = pd.read_csv(filepath) + # Strip columns + master_data.columns = [c.strip() for c in master_data.columns] + + install_col = ( + "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns + else "INSTALL / CANCELLATION DATE" + ) + + # We just need to check if any were cancelled + master_to_append = master_data[ + ["UPRN", install_col, "SUBMISSION DATE"] + ].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"}) + master_to_append["cancelled"] = 
master_to_append["survey_status"].str.lower().str.contains("cancel") + + master_surveyed.append(master_to_append) + + master_surveyed = pd.concat(master_surveyed) + master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])] + master_surveyed = master_surveyed[ + ~master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID].isin( + ["NOT ON ASSET LIST", "Missing From Asset List"] + ) + ] + + master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID] = master_surveyed[ + self.STANDARD_LANDLORD_PROPERTY_ID + ].astype(str) + + # We de-dupe crudely on landlord property id + self.master_surveyed = master_surveyed.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID]) + + self.standardised_asset_list = self.standardised_asset_list.merge( + self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID + ) diff --git a/asset_list/app.py b/asset_list/app.py index 7275709d..8e2df56d 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -299,6 +299,9 @@ def app(): landlord_wall_construction = "Wall Constuction" landlord_heating_system = "Heating" landlord_existing_pv = None + outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" + master_filename_eco3 = "ECO 3 -Table 1.csv" + master_filename_eco4 = "ECO 4 -Table 1.csv" # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -354,6 +357,18 @@ def app(): asset_list.apply_standardiation() + # We now flag properties that have been treated under existing programmes + asset_list.flag_outcomes( + outcomes_filepath=os.path.join(data_folder, outcomes_filename), + outcomes_sheetname="Feedback" + ) + + asset_list.flag_survey_master( + master_filepaths=[ + os.path.join(data_folder, f) for f in [master_filename_eco3, master_filename_eco4] if f is not None + ], + ) + ### We retrieve the EPC data # We chunk up this data into 5000 rows at a time @@ -497,6 +512,7 @@ def app(): cleaned = msgpack.unpackb(cleaned, raw=False) # TODO: We should break out the identification of work 
types to flag blocks of flats specifically + # TODO: Append existing outcomes onto the sheet. asset_list.identify_worktypes(cleaned) pprint(asset_list.work_type_figures) diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index a5da0c79..eedae9b9 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -96,6 +96,7 @@ def download_data_from_sharepoint(): folder for folder in contents["value"] if folder["name"] in folders_to_keep ] for folder_to_pull in folders_to_pull: + # Get the contents folder_contents = sharepoint_client.list_folder_contents( drive_id=sharepoint_client.document_drive["id"], From c4eb72fb92986efab0459bf3f91b7131978044e7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 8 Mar 2025 17:30:54 +0000 Subject: [PATCH 227/255] working on plusdane matching --- asset_list/AssetList.py | 53 ++++++++++++++++- asset_list/app.py | 78 +++++++++++++++++--------- asset_list/mappings/heating_systems.py | 28 ++++++++- asset_list/mappings/property_type.py | 3 +- asset_list/mappings/walls.py | 19 ++++++- 5 files changed, 147 insertions(+), 34 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 3007269b..21b2111f 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -397,6 +397,13 @@ class AssetList: # Update the reference to landlord UPRn self.landlord_uprn = self.STANDARD_UPRN + # Handle the case when full address and address 1 are the same + if self.full_address_colname == self.address1_colname: + self.full_address_colname = self.STANDARD_FULL_ADDRESS + self.standardised_asset_list[self.full_address_colname] = ( + self.standardised_asset_list[self.address1_colname].copy() + ) + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -632,7 +639,8 @@ class AssetList: known_errors = [ "#MULTIVALUE", "This cell has an 
external reference that can't be shown or edited. Editing this cell will " - "remove the external reference." + "remove the external reference.", + "ND" ] if pd.isnull(date_str) or date_str in known_errors: @@ -642,6 +650,9 @@ class AssetList: match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str) if match: return int(match.group(1)) # Extract the year and convert to integer + if "-" in date_str: + # We probably have a range + return int(date_str.split("-")[1].strip()) if isinstance(date_str, datetime): return date_str.year @@ -1853,7 +1864,7 @@ class AssetList: self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) self.outcomes["row_id"] = self.outcomes.index - logger.info("Matching outcomes to ") + logger.info("Matching outcomes to asset list") # Merge the outcomes onto the asset list - we check we're able to match sufficiently well lookup = [] nomatch = [] @@ -1866,7 +1877,7 @@ class AssetList: ].str.lower().str.replace(",", "").str.replace(" ", " ") == address_clean) ] - if not matched.empty and matched.shape[0] == 1: + if matched.shape[0] == 1: lookup.append( { "row_id": x["row_id"], @@ -1875,6 +1886,42 @@ class AssetList: ) continue + if "UPRN" in x: + matched = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == x["UPRN"] + ] + + if matched.shape[0] == 1: + lookup.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + + matched = self.standardised_asset_list[ + (self.standardised_asset_list[self.STANDARD_POSTCODE] == x["Post Code"]) + ].copy() + if not matched.empty: + matched["houseno"] = matched.apply( + lambda x: SearchEpc.get_house_number(x[self.STANDARD_ADDRESS_1], x[self.STANDARD_POSTCODE]), + axis=1 + ) + matched = matched[ + matched["houseno"].astype(str) == str(x["Numb."]) + ] + if matched.shape[0] == 1: + lookup.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: 
matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + elif not matched.empty: + raise NotImplementedError("Implement me - multiple matches on house number") + nomatch.append(x["row_id"]) self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)] diff --git a/asset_list/app.py b/asset_list/app.py index 8e2df56d..fb71a70e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -125,21 +125,22 @@ def get_data( no_epc.append(home[row_id_name]) continue - if epc_api_only: - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy() - } - - epc_data.append(epc) - continue - # Look for EPC recommendatons try: property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) except: property_recommendations = {"rows": []} + if epc_api_only: + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + continue + # Retrieve data from FindMyEPC try: find_epc_searcher = RetrieveFindMyEpc( @@ -283,25 +284,46 @@ def app(): # landlord_property_id = "Place ref" # For ACIS - programme re-build - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" - data_filename = "ACIS asset list.xlsx" - sheet_name = "Assets" - address1_column = "House No" - postcode_column = "Postcode" + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" + # data_filename = "ACIS asset list.xlsx" + # sheet_name = "Assets" + # address1_column = "House No" + # postcode_column = "Postcode" + # landlord_property_id = "UPRN" + # fulladdress_column = None + # address_cols_to_concat = ["House No", "Street", "Town"] + # missing_postcodes_method = None + # address1_method = None + # landlord_year_built = "YEAR BUILT" + # landlord_os_uprn = None + # landlord_property_type = "Property type" + # landlord_wall_construction = 
"Wall Constuction" + # landlord_heating_system = "Heating" + # landlord_existing_pv = None + # outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" + # master_filename_eco3 = "ECO 3 -Table 1.csv" + # master_filename_eco4 = "ECO 4 -Table 1.csv" + + # For plus dane + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane" + data_filename = "PLUS DANE Asset List - for analysis.xlsx" + sheet_name = "Asset List" + address1_column = " Address" + postcode_column = " Postcode" landlord_property_id = "UPRN" - fulladdress_column = None - address_cols_to_concat = ["House No", "Street", "Town"] + fulladdress_column = " Address" + address_cols_to_concat = [] missing_postcodes_method = None address1_method = None - landlord_year_built = "YEAR BUILT" + landlord_year_built = "Property Age" landlord_os_uprn = None - landlord_property_type = "Property type" - landlord_wall_construction = "Wall Constuction" - landlord_heating_system = "Heating" + landlord_property_type = "Property Type" + landlord_wall_construction = "Landlord Wall Full" + landlord_heating_system = "Landlord Heating" landlord_existing_pv = None - outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" - master_filename_eco3 = "ECO 3 -Table 1.csv" - master_filename_eco4 = "ECO 4 -Table 1.csv" + outcomes_filename = "plus dane outcomes.xlsx" + outcomes_sheetname = "EVERYTHING" + master_filepaths = ["JJC Rolling Master.csv", "SCIS Rolling Master.csv"] # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -360,19 +382,18 @@ def app(): # We now flag properties that have been treated under existing programmes asset_list.flag_outcomes( outcomes_filepath=os.path.join(data_folder, outcomes_filename), - outcomes_sheetname="Feedback" + outcomes_sheetname=outcomes_sheetname ) asset_list.flag_survey_master( - master_filepaths=[ - os.path.join(data_folder, f) for f in [master_filename_eco3, master_filename_eco4] if f is not None - ], + master_filepaths=master_filepaths ) ### 
We retrieve the EPC data # We chunk up this data into 5000 rows at a time # Create the chunks directory + epc_api_only = False force_retrieve_data = False skip = None # Used to skip already completed chunks chunk_size = 5000 @@ -400,6 +421,7 @@ def app(): df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, manual_uprn_map=manual_uprn_map, + epc_api_only=epc_api_only ) # We now retrieve any failed properties @@ -408,7 +430,7 @@ def app(): df=chunk_failed, row_id_name=asset_list.DOMNA_PROPERTY_ID, manual_uprn_map=manual_uprn_map, - epc_api_only=False + epc_api_only=epc_api_only ) epc_data_chunk.extend(epc_data_failed) diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 33d3701a..f397391c 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -16,6 +16,7 @@ STANDARD_HEATING_SYSTEMS = { "unknown", "communal gas boiler", "high heat retention storage heaters", + "room heaters" } HEATING_MAPPINGS = { @@ -69,5 +70,30 @@ HEATING_MAPPINGS = { 'Electric': 'electric storage heaters', 'Solid fuel': 'other', 'No Heat': 'unknown', - 'GSHP': 'ground source heat pump' + 'GSHP': 'ground source heat pump', + + 'Boiler Oil': 'oil boiler', + 'Boiler Electricity': 'electric boiler', + 'Boiler ND': 'unknown', + 'ND Mains gas': 'unknown', + 'Room heaters Mains gas': "room heaters", + 'Heat pump (air) Electricity': 'air source heat pump', + 'Room heaters Electricity': 'electric radiators', + 'Room heaters Oil': 'room heaters', + 'No heating system ND': 'unknown', + 'Heat pump (wet) Electricity': 'ground source heat pump', + 'Room heaters Biomass': 'room heaters', + 'ND Solid fuel': 'unknown', + 'Boiler Mains gas': 'gas combi boiler', + 'Boiler LPG': 'boiler - other fuel', + 'Room heaters Solid fuel': 'room heaters', + 'ND ND': 'unknown', + 'Storage heating Electricity': 'electric storage heaters', + 'ND Electricity': 'unknown', + 'Community heating Community (non-gas)': 'district heating', + 'No 
heating system N/A': 'unknown', + 'Boiler Solid fuel': 'boiler - other fuel', + 'Community heating Community (mains gas)': 'communal gas boiler', + 'Boiler Biomass': 'boiler - other fuel', + 'No heating system Mains gas': 'unknown' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 1fe1daac..ccee5d3e 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -62,5 +62,6 @@ PROPERTY_MAPPING = { '3 Bed First Floor Maisonette': 'maisonette', '2 Bed 1st Floor Sheltered Flat': 'flat', '1 Bed First Floor Flat': 'flat', - '3 Bed First Floor Flat': 'flat' + '3 Bed First Floor Flat': 'flat', + 'ND': 'unknown' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 959701ca..2313f063 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -100,5 +100,22 @@ WALL_CONSTRUCTION_MAPPINGS = { 'BRICK/BLOCK CAVITY': 'cavity unknown insulation', 'STONE SOLID': 'sandstone or limestone', 'EXT CLADDING SYSTEM': 'system built', - 'BRICK/BLOCK SOLID': 'solid brick unknown insulation' + 'BRICK/BLOCK SOLID': 'solid brick unknown insulation', + + 'Cavity Filled cavity (with internal/external)': 'filled cavity', + 'ND (inferred) Filled cavity': 'filled cavity', + 'Cavity Filled cavity': 'filled cavity', + 'Cavity Unknown insulation': 'cavity unknown insulation', + 'Timber frame As-built': 'timber frame', + 'System build Unknown insulation': 'system built', + 'Cavity As-built': 'unknown', + 'System build External': 'system built', + 'ND (inferred) ND (inferred)': 'unknown', + 'Solid brick External': 'insulated solid brick', + 'Cavity External': 'filled cavity', + 'System build As-built': 'system built', + 'Solid brick Internal': 'insulated solid brick', + 'Cavity Internal': 'filled cavity', + 'System build Internal': 'system built', + 'Solid brick As-built': 'solid brick unknown insulation' } From 9eba778eb13cbfd6842dec2e698c899abc5816f9 Mon Sep 17 00:00:00 2001 
From: Khalim Conn-Kowlessar Date: Sat, 8 Mar 2025 17:54:38 +0000 Subject: [PATCH 228/255] trying to match master to asset list --- asset_list/AssetList.py | 20 +++++++++++++++++-- asset_list/app.py | 9 +++++++-- .../ha_15_32/ha_analysis_batch_3.py | 11 ++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 21b2111f..05f6b10e 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1932,8 +1932,10 @@ class AssetList: # there may be properties that have been visited multiple times where the outcome was "See notes" implying # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has # happened multiple times, in this case we judge that the work may not be viable + + date_col = "Week Commencing" if "Week Commencing" in self.outcomes else "Survey Date" lookup = lookup.merge( - self.outcomes[["row_id", "Outcome", "Notes", "Week Commencing"]], how="left", on="row_id" + self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id" ) visit_counts = ( @@ -1949,6 +1951,9 @@ class AssetList: visit_counts, how="left", on="domna_property_id" ) + if pivot_df[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise Exception("We have duplicated property IDs in the outcomes data") + # We merge this data onto outcomes self.outcomes["matched_to_asset_list"] = self.outcomes["row_id"].isin(lookup["row_id"].values) self.outcomes = self.outcomes.merge( @@ -1962,10 +1967,16 @@ class AssetList: def flag_survey_master( self, - master_filepaths + master_filepaths, + master_to_asset_list_filepath=None ): # TODO: This probably needs further expansion + if master_to_asset_list_filepath is not None: + id_map = pd.read_csv(master_to_asset_list_filepath) + else: + id_map = pd.DataFrame() + logger.info("Getting masters and merging onto asset list") master_surveyed = [] for filepath in master_filepaths: @@ -1973,6 +1984,11 @@ class AssetList: # Strip columns 
master_data.columns = [c.strip() for c in master_data.columns] + if not id_map.empty: + master_data = master_data.merge( + id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code'] + ) + install_col = ( "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns else "INSTALL / CANCELLATION DATE" diff --git a/asset_list/app.py b/asset_list/app.py index fb71a70e..bea9cdde 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -323,7 +323,11 @@ def app(): landlord_existing_pv = None outcomes_filename = "plus dane outcomes.xlsx" outcomes_sheetname = "EVERYTHING" - master_filepaths = ["JJC Rolling Master.csv", "SCIS Rolling Master.csv"] + master_filepaths = [ + os.path.join(data_folder, "JJC Rolling Master.csv"), + os.path.join(data_folder, "SCIS Rolling Master.csv"), + ] + master_to_asset_list_filepath = os.path.join(data_folder, "surveys_to_assets.csv") # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -386,7 +390,8 @@ def app(): ) asset_list.flag_survey_master( - master_filepaths=master_filepaths + master_filepaths=master_filepaths, + master_to_asset_list_filepath=master_to_asset_list_filepath ) ### We retrieve the EPC data diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index aca36584..e97f0202 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2897,6 +2897,17 @@ class DataLoader: # Merge onto the survey list survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id") + # TEMP FOR NEWER WORK + # matching_lookup = matching_lookup.merge( + # asset_list[["asset_list_row_id", "UPRN"]], how="left", on="asset_list_row_id" + # ).merge( + # survey_list[["survey_list_row_id", "NO.", "Street / Block Name", "Post Code"]], + # how="left", on="survey_list_row_id" + # ) + # matching_lookup.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus 
Dane/surveys_to_assets.csv" + # ) + return survey_list @staticmethod From bb2164ccf859a585caca74aecd1e66bda0d2cf0d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 8 Mar 2025 18:39:25 +0000 Subject: [PATCH 229/255] debugging non-intrusive colnames --- asset_list/AssetList.py | 27 +++++++++++++++++++++------ asset_list/app.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 05f6b10e..fe4be9f5 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -584,10 +584,12 @@ class AssetList: if self.old_format_non_intrusives_present: non_intrusive_columns = self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES + self.keep_variables += non_intrusive_columns + self.rename_map = { **self.rename_map, **dict( - zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in non_intrusive_columns]) + zip(non_intrusive_columns, ["non-intrusives: " + c for c in non_intrusive_columns]) ) } @@ -987,7 +989,7 @@ class AssetList: def identify_worktypes(self, cleaned): - if not self.non_intrusives_present: + if not self.non_intrusives_present and not self.old_format_non_intrusives_present: raise NotImplementedError("Need to implement the case for non-intrusives") # If we have non-intrusives completed, we can use this to identify work types @@ -1855,7 +1857,9 @@ class AssetList: def flag_outcomes( self, outcomes_filepath, - outcomes_sheetname + outcomes_sheetname, + outcomes_postcode, + outcomes_houseno ): if outcomes_filepath is None: pass @@ -1901,7 +1905,7 @@ class AssetList: continue matched = self.standardised_asset_list[ - (self.standardised_asset_list[self.STANDARD_POSTCODE] == x["Post Code"]) + (self.standardised_asset_list[self.STANDARD_POSTCODE] == x[outcomes_postcode]) ].copy() if not matched.empty: matched["houseno"] = matched.apply( @@ -1909,7 +1913,7 @@ class AssetList: axis=1 ) matched = matched[ - matched["houseno"].astype(str) == str(x["Numb."]) + 
matched["houseno"].astype(str) == str(x[outcomes_houseno]) ] if matched.shape[0] == 1: lookup.append( @@ -1920,7 +1924,18 @@ class AssetList: ) continue elif not matched.empty: - raise NotImplementedError("Implement me - multiple matches on house number") + # Use levenstein distance to match + matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE] + + best_match = process.extractOne(x["Address"], matched[self.STANDARD_FULL_ADDRESS].values)[0] + matched = matched[matched[self.STANDARD_FULL_ADDRESS] == best_match] + lookup.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue nomatch.append(x["row_id"]) diff --git a/asset_list/app.py b/asset_list/app.py index bea9cdde..63ca40d8 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -301,8 +301,14 @@ def app(): # landlord_heating_system = "Heating" # landlord_existing_pv = None # outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" - # master_filename_eco3 = "ECO 3 -Table 1.csv" - # master_filename_eco4 = "ECO 4 -Table 1.csv" + # outcomes_sheetname = "Feedback" + # outcomes_postcode = "Postcode" + # outcomes_houseno = "No" + # master_filepaths = [ + # os.path.join(data_folder, "ECO 3 -Table 1.csv"), + # os.path.join(data_folder, "ECO 4 -Table 1.csv"), + # ] + # master_to_asset_list_filepath = None # For plus dane data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane" @@ -323,6 +329,8 @@ def app(): landlord_existing_pv = None outcomes_filename = "plus dane outcomes.xlsx" outcomes_sheetname = "EVERYTHING" + outcomes_postcode = "Post Code" + outcomes_houseno = "Numb." 
master_filepaths = [ os.path.join(data_folder, "JJC Rolling Master.csv"), os.path.join(data_folder, "SCIS Rolling Master.csv"), @@ -386,7 +394,9 @@ def app(): # We now flag properties that have been treated under existing programmes asset_list.flag_outcomes( outcomes_filepath=os.path.join(data_folder, outcomes_filename), - outcomes_sheetname=outcomes_sheetname + outcomes_sheetname=outcomes_sheetname, + outcomes_postcode=outcomes_postcode, + outcomes_houseno=outcomes_houseno ) asset_list.flag_survey_master( @@ -457,7 +467,9 @@ def app(): csv_data = pd.read_csv(os.path.join(download_folder, file)) # We need to convert the recommendations back to a list csv_data["recommendations"] = csv_data["recommendations"].apply(eval) - csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) + # We don't have this if we didn't run the pulling from find my epc + if "find_my_epc_data" in csv_data.columns: + csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) epc_data.append(csv_data) epc_df = pd.concat(epc_data) @@ -499,6 +511,9 @@ def app(): ) # Get the find my epc data + if "find_my_epc_data" not in epc_df.columns: + epc_df["find_my_epc_data"] = None + find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( columns=["find_my_epc_data"]).join( pd.json_normalize(epc_df["find_my_epc_data"]) @@ -519,6 +534,13 @@ def app(): columns=asset_list.EPC_API_DATA_NAMES ) + # Look for columns not in the find my EPC data, which will have happened if we didn't + # retrieve it in the first place + missed_find_epc_cols = [c for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) if c not in find_my_epc_data.columns] + if missed_find_epc_cols: + for c in missed_find_epc_cols: + find_my_epc_data[c] = None + epc_df = epc_df.merge( find_my_epc_data[ [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) From d06791db18ce60240bf2a87141338a719a040e73 Mon Sep 17 00:00:00 2001 From: Khalim 
Conn-Kowlessar Date: Sat, 8 Mar 2025 19:56:15 +0000 Subject: [PATCH 230/255] modifying detecting logic --- asset_list/AssetList.py | 125 +++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index fe4be9f5..d9922a97 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -993,64 +993,93 @@ class AssetList: raise NotImplementedError("Need to implement the case for non-intrusives") # If we have non-intrusives completed, we can use this to identify work types + ###################################################### + # Empty cavity: + ###################################################### + # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled + # 2) The age is before 1995 + # 3) We don't remove anything that haas access issues yet if self.non_intrusives_present: - ###################################################### - # Empty cavity: - ###################################################### - # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled - # 2) The age is before 1995 - # 3) We don't remove anything that haas access issues yet - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter = ( (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & - self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) + ) + elif self.old_format_non_intrusives_present: + non_intrusives_wall_filter = ( + 
self.standardised_asset_list['non-intrusives: WFT Findings'].isin( + [ + "EMPTY CAVITY", "Partial fill" + ] ) ) - # Let's also flag work that looks eligible without the SAP filter - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & - self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) - ) + else: + raise NotImplementedError("need to implement the case for non-intrusives") - # If non_intrusive_indicates_empty_cavity is True, - # set non_intrusive_indicates_empty_cavity_no_sap_filter to False - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], - False, - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) + ) - self.standardised_asset_list["epc_indicates_empty_cavity"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( - self.EPC_NO_WALL_INSULATION_DESCRIPTIONS - ) & ( - self.standardised_asset_list["epc_year_upper_bound"] <= 1995 - ) & ( - ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] - ) & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD - ) - ) + # Let's also flag work that looks eligible without 
the SAP filter + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) + ) - # If the EPC is esimtated, we defer to the non-intrusives - self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( - ( - self.standardised_asset_list["epc_indicates_empty_cavity"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & - self.standardised_asset_list["estimated"] - ), - False, - self.standardised_asset_list["epc_indicates_empty_cavity"] + # If non_intrusive_indicates_empty_cavity is True, + # set non_intrusive_indicates_empty_cavity_no_sap_filter to False + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + False, + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + ) + + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= 1995 + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) + ) + + # If the EPC is esimtated, we defer to the non-intrusives + self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + self.standardised_asset_list["estimated"] + ), + False, + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + + + if self.non_intrusives_present: + 
+ + if self.non_intrusives_present: + + + + + + + + + ###################################################### # Extraction From 816a1fa565116fd92e02cf426d29c8d2f68727d1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 8 Mar 2025 21:58:46 +0000 Subject: [PATCH 231/255] matching for acis done --- asset_list/AssetList.py | 777 +++++++++++++++++++++++----------------- 1 file changed, 457 insertions(+), 320 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index d9922a97..689e752b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -800,7 +800,7 @@ class AssetList: self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | - ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) + ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, "", np.nan]) ) accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"] @@ -1007,10 +1007,8 @@ class AssetList: ) elif self.old_format_non_intrusives_present: non_intrusives_wall_filter = ( - self.standardised_asset_list['non-intrusives: WFT Findings'].isin( - [ - "EMPTY CAVITY", "Partial fill" - ] + self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().isin( + ["empty cavity", "partial fill"] ) ) else: @@ -1018,7 +1016,7 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & + non_intrusives_wall_filter & (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & ( self.standardised_asset_list[ @@ -1066,39 +1064,14 @@ class AssetList: self.standardised_asset_list["epc_indicates_empty_cavity"] ) + ###################################################### + # Extraction + ###################################################### + # as 
needing a CIGA check. What is the logic we should be applying here? if self.non_intrusives_present: - - if self.non_intrusives_present: - - - - - - - - - - - ###################################################### - # Extraction - ###################################################### - - # as needing a CIGA check. What is the logic we should be applying here? - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( - (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & - (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & - (~self.standardised_asset_list['non-intrusives: Material'].isin( - ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] - ) - ) & ( - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - ) - - # Also include work without the SAP filter as optimistic - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( + extraction_wall_filter = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & (~self.standardised_asset_list['non-intrusives: Material'].isin( @@ -1107,314 +1080,446 @@ class AssetList: ) ) - # Adjust - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"], - False, - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + extraction_wall_filter & ( + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) ) - ###################################################### - # Solar - ###################################################### - # Criteria: - # Check 1: Does the property 
have a valid heating system? - self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( - self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] + # Also include work without the SAP filter as optimistic + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( + extraction_wall_filter + ) + + elif self.old_format_non_intrusives_present: + print("Review these categories with Kieran") + extraction_wall_filter = ( + self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( + ["retro drilled", "retro filled", "fibre from build", "polybead"] ) ) - self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] - .str.lower().str.contains("air source heat pump|ground source heat pump") - ) | ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( - "electric storage heaters" - ) & ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES[ - "mainheatcont-description"]] == "Controls for high heat retention storage heaters" - ) + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + extraction_wall_filter & ( + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + ) + + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( + extraction_wall_filter + ) + + else: + raise NotImplementedError("need to implement the case for non-intrusives") + + # Adjust + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"], + False, + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] + ) + + 
###################################################### + # Solar + ###################################################### + # Criteria: + # Check 1: Does the property have a valid heating system? + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] + ) + ) + self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["electric storage heaters", "room heaters"] + ) + ) + + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().str.contains("air source heat pump|ground source heat pump") + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters" + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES[ + "mainheatcont-description"]] == "Controls for high heat retention storage heaters" ) ) + ) - # Check 2: Does the property have solar already - self.standardised_asset_list["property_has_solar"] = ( - (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | - (self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") | - (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters|room heaters" + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheatcont-description"] + ] != "Controls for high heat retention storage heaters" ) + ) - # Check 3: Does the property meet 
the fabric condition - # Solar PV installs are subject to the minimum insulation requirements which means: - # 1) one of the following insulation measures must be installed as part of the same - # ECO4 project: - # • roof insulation (flat roof, pitched roof, room-in-roof) - # • exterior facing wall insulation (cavity wall, solid wall) - # • party cavity wall insulation - # • floor insulation (solid and underfloor) - # - # OR - # - # all measures (except any exempted measure referred to in paragraph 4.28) - # listed in paragraph a) must already be installed - # - # With this in mind, we look for 2 clases - # 1) The property is fully insulated apart from the loft (<200mm insulation) - # 2) THe property is fully insulated + # Basic check - both of the previous two shouldn't be true simultaneously + if ( + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] & + self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] + ).sum(): + raise ValueError("Both heating system checks are true - this should not be possible") - self.standardised_asset_list["solar_landlord_walls_insulated"] = ( - self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( - ["filled cavity", "insulated solid brick"] + # Check 2: Does the property have solar already + if self.non_intrusives_present: + existing_solar_non_intrusives_check = ( + self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF" + ) + elif self.old_format_non_intrusives_present: + existing_solar_non_intrusives_check = ( + self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin( + ["solar pv on roof"] ) ) + else: + raise NotImplementedError("need to implement the case for non-intrusives") + self.standardised_asset_list["property_has_solar"] = ( + (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | + existing_solar_non_intrusives_check | + 
(self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + ) + + # Check 3: Does the property meet the fabric condition + # Solar PV installs are subject to the minimum insulation requirements which means: + # 1) one of the following insulation measures must be installed as part of the same + # ECO4 project: + # • roof insulation (flat roof, pitched roof, room-in-roof) + # • exterior facing wall insulation (cavity wall, solid wall) + # • party cavity wall insulation + # • floor insulation (solid and underfloor) + # + # OR + # + # all measures (except any exempted measure referred to in paragraph 4.28) + # listed in paragraph a) must already be installed + # + # With this in mind, we look for 2 clases + # 1) The property is fully insulated apart from the loft (<200mm insulation) + # 2) THe property is fully insulated + + print("Should we include cavity properties where they might be uninsulated?") + self.standardised_asset_list["solar_landlord_walls_insulated"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( + ["filled cavity", "insulated solid brick"] + ) + ) + + if self.non_intrusives_present: self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EWI", "RETRO DRILLED", "FILLED AT BUILD"] ) ) - - # TODO: We don't have information about the roof from this landlord - - # We merge on the u-value for average thermal transmittance - walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) - walls_uvalue_data = walls_uvalue_data[ - ~pd.isnull(walls_uvalue_data["thermal_transmittance"]) - ][["original_description", "thermal_transmittance"]].rename( - columns={ - "original_description": self.EPC_API_DATA_NAMES["walls-description"], - "thermal_transmittance": "walls_u_value" - } - ) - self.standardised_asset_list = self.standardised_asset_list.merge( - walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"] + elif 
self.old_format_non_intrusives_present: + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin( + ["retro drilled", "retro filled", "ewi", "retro drilled/ solid"] + ) ) + else: + raise NotImplementedError("need to implement the case for non-intrusives") - self.standardised_asset_list["solar_epc_walls_insulated"] = ( + # TODO: We don't have information about the roof from this landlord + + # We merge on the u-value for average thermal transmittance + walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) + walls_uvalue_data = walls_uvalue_data[ + ~pd.isnull(walls_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["walls-description"], + "thermal_transmittance": "walls_u_value" + } + ) + self.standardised_asset_list = self.standardised_asset_list.merge( + walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"] + ) + + self.standardised_asset_list["solar_epc_walls_insulated"] = ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES[ + "walls-description"]].str.lower().str.contains( + "|".join( + self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) | ( + self.standardised_asset_list[ + "walls_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + # We merge on the u-value for average thermal transmittance + roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) + roof_uvalue_data = roof_uvalue_data[ + ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["roof-description"], + "thermal_transmittance": "roof_u_value" + } + ) + + self.standardised_asset_list = self.standardised_asset_list.merge( + roof_uvalue_data, how="left", 
on=self.EPC_API_DATA_NAMES["roof-description"] + ) + + # If the u-value of a roof is less than 0.7 we consider it insulated + self.standardised_asset_list["solar_epc_roof_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False + ) | ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) >= 200 if str(x).isdigit() else False + ) + ) | ( + self.standardised_asset_list["roof_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) < 200 if str(x).isdigit() else False + ) + + # TODO: Fill with False - should be temp! + self.standardised_asset_list["epc_has_floor_recommendation"] = ( + self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) + ) + + # We merge on the u-value for average thermal transmittance + floors_uvalue_data = pd.DataFrame(cleaned["floor-description"]) + floors_uvalue_data = floors_uvalue_data[ + ~pd.isnull(floors_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["floor-description"], + "thermal_transmittance": "floor_u_value" + } + ) + + # Merge on + self.standardised_asset_list = self.standardised_asset_list.merge( + floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"] + ) + + # We assume that a U-value of 0.5 or below is indicative of an insulated floor + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("solid") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + 
# We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + (self.standardised_asset_list["estimated"] == False) + ) + ) | ( ( self.standardised_asset_list[ - self.EPC_API_DATA_NAMES[ - "walls-description"]].str.lower().str.contains( - "|".join( - self.EPC_INSULATED_WALLS_SUBSTRINGS) - ) - ) | ( + self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid") + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() + .str.contains(", insulated") + ) + ) + ) + + # Check for other floor types, insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = ( + # The floor is suspended and insulated + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("suspended") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + self.standardised_asset_list["estimated"] == False + ) + ) | ( + ( self.standardised_asset_list[ - "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull(x) else False - ) + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains("suspended") + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains(", insulated") + ) + ) | ( + self.standardised_asset_list["floor_u_value"].apply( + lambda x: x <= 0.5 if not pd.isnull(x) else False ) ) + ) + #################################### + # Check solar eligibility + #################################### - # We merge on the u-value for average thermal transmittance - roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) - roof_uvalue_data = roof_uvalue_data[ - ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) - ][["original_description", 
"thermal_transmittance"]].rename( - columns={ - "original_description": self.EPC_API_DATA_NAMES["roof-description"], - "thermal_transmittance": "roof_u_value" - } - ) + # Set up the filters to stop repetition + correct_heating_system = ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) - self.standardised_asset_list = self.standardised_asset_list.merge( - roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] - ) + needs_heating_upgrade = ( + self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] | + self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] + ) - # If the u-value of a roof is less than 0.7 we consider it insulated - self.standardised_asset_list["solar_epc_roof_insulated"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( - "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False - ) | ( - self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) >= 200 if str(x).isdigit() else False - ) - ) | ( - self.standardised_asset_list["roof_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull(x) else False - ) - ) - ) + walls_are_insulated = ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] + ) - self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) < 200 if str(x).isdigit() else False - ) + self.standardised_asset_list["solar_eligible_solid_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + 
~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) - # TODO: Fill with False - should be temp! - self.standardised_asset_list["epc_has_floor_recommendation"] = ( - self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) - ) + # With heating upgrade + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] = ( + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + # Because the EPC data can be contradictrory, we remove any overlap + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor"], + False, + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] + ) - # We merge on the u-value for average thermal transmittance - floors_uvalue_data = pd.DataFrame(cleaned["floor-description"]) - floors_uvalue_data = floors_uvalue_data[ - ~pd.isnull(floors_uvalue_data["thermal_transmittance"]) - ][["original_description", "thermal_transmittance"]].rename( - columns={ - "original_description": self.EPC_API_DATA_NAMES["floor-description"], - "thermal_transmittance": "floor_u_value" - } - ) + # We shouldn't have an overlap + if ( + self.standardised_asset_list["solar_eligible_solid_floor"] & + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] + ).sum(): + raise ValueError("Both heating upgrade and no heating upgrade are true - this should not be possible") - # 
Merge on - self.standardised_asset_list = self.standardised_asset_list.merge( - floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"] - ) + # Solid floor but needs a loft top-up + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) - # We assume that a U-value of 0.5 or below is indicative of an insulated floor - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( - ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str - .lower().str.contains("solid") - ) & ( - ~self.standardised_asset_list["epc_has_floor_recommendation"] - ) & ( - # We do not utilise estimated EPCs for this method because we will always find that - # "epc_has_floor_recommendation" is False - (self.standardised_asset_list["estimated"] == False) - ) - ) | ( - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid") - ) & ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() - .str.contains(", insulated") - ) - ) - ) + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_needs_heating_upgrade"] = ( + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) - # Check 
for other floor types, insulated - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = ( - # The floor is suspended and insulated - ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str - .lower().str.contains("suspended") - ) & ( - ~self.standardised_asset_list["epc_has_floor_recommendation"] - ) & ( - # We do not utilise estimated EPCs for this method because we will always find that - # "epc_has_floor_recommendation" is False - self.standardised_asset_list["estimated"] == False - ) - ) | ( - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"] - ].str.lower().str.contains("suspended") - ) & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"] - ].str.lower().str.contains(", insulated") - ) - ) | ( - self.standardised_asset_list["floor_u_value"].apply( - lambda x: x <= 0.5 if not pd.isnull(x) else False - ) - ) - ) + # Other floor type, fully insulated + self.standardised_asset_list["solar_eligible_other_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) - # We now put together the criteria: - # Flag properties that look eligible for solar, that have solid floors - # TODO: We'll need to revise this - self.standardised_asset_list["solar_eligible_solid_floor"] = ( - # Landlord data or EPC data indicates the heating system is appropriate - ( - self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | - self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] - ) & - # The property doesn't currently have solar - 
~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - ( - self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] | - self.standardised_asset_list["solar_non_intrusives_walls_insulated"] - ) & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] - ) + # With heating upgrade + self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"] = ( + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) - # Solid floor but needs a loft top-up - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( - # Landlord data or EPC data indicates the heating system is appropriate - ( - self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | - self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] - ) & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - ( - self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] | - self.standardised_asset_list["solar_non_intrusives_walls_insulated"] - ) & - # Roof is insulated - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] - ) + # Other floor type, needs loft top-up + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + 
correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof need loft top-up + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Floor is not solid, but is insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) - # Other floor type, fully insulated + # With heating upgrade + self.standardised_asset_list["solar_eligible_other_floor_needs_loft_needs_heating_upgrade"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof need loft top-up + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Floor is not solid, but is insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) - self.standardised_asset_list["solar_eligible_other_floor"] = ( - # Landlord data or EPC data indicates the heating system is appropriate - ( - self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | - self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] - ) & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - ( - self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] - ) & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] - ) + # Drop anything we don't need + self.standardised_asset_list = self.standardised_asset_list.drop( + columns=["walls_u_value", "roof_u_value", "floor_u_value"] + ) - # Other floor type, needs loft top-up - 
self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( - # Landlord data or EPC data indicates the heating system is appropriate - ( - self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | - self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] - ) & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - ( - self.standardised_asset_list["solar_landlord_walls_insulated"] | - self.standardised_asset_list["solar_epc_walls_insulated"] - ) & - # Roof need loft top-up - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Floor is not solid, but is insulated - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] - ) - - # Drop anything we don't need - self.standardised_asset_list = self.standardised_asset_list.drop( - columns=["walls_u_value", "roof_u_value", "floor_u_value"] - ) - - # Adjust flagged extraction jobs to remove anything for solar - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & - ~self.standardised_asset_list["solar_eligible_solid_floor"] & - ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] - # ~self.standardised_asset_list["solar_eligible_other_floor"] & - # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] - ) + # Adjust flagged extraction jobs to remove anything for solar + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + ~self.standardised_asset_list["solar_eligible_solid_floor"] & + ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] + # ~self.standardised_asset_list["solar_eligible_other_floor"] & + # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] + ) blocks_of_flats = 
self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" @@ -1484,17 +1589,6 @@ class AssetList: ) } - # We produce a breakdown of the property types, for cavity fills - cavity_fills = self.standardised_asset_list[ - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | ( - self.standardised_asset_list["epc_indicates_empty_cavity"] - ) - ] - - self.work_type_breakdowns = { - "empty_cavity": cavity_fills[self.STANDARD_PROPERTY_TYPE].value_counts() - } - # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None self.standardised_asset_list["cavity_reason"] = np.where( @@ -1538,25 +1632,68 @@ class AssetList: self.standardised_asset_list["solar_reason"] = None self.standardised_asset_list["solar_reason"] = np.where( self.standardised_asset_list["solar_eligible_solid_floor"], - "Solid Floor, Insulated, No Solar", + "Solid Floor, Insulated, No Existing Solar", self.standardised_asset_list["solar_reason"] ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"], + "Solid Floor, Insulated, No Existing Solar, Needs Heating Upgrade", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"], - "Solid Floor, Insulated, Needs Loft", + "Solid Floor, Insulated, Needs Loft, No Existing Solar", self.standardised_asset_list["solar_reason"] ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_needs_heating_upgrade"], + "Solid Floor, Insulated, Needs Loft, No Existing Solar, Needs Heating Upgrade", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( self.standardised_asset_list["solar_eligible_other_floor"], - "Other Floor, 
Insulated, No Solar", + "Other Floor, Insulated, No Existing Solar", self.standardised_asset_list["solar_reason"] ) self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_other_floor_needs_loft"], - "Other Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"], + "Other Floor, Insulated, No Existing Solar, Needs Heating Upgrade", self.standardised_asset_list["solar_reason"] ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"], + "Other Floor, Insulated, Needs Loft, No Existing Solar", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft_needs_heating_upgrade"], + "Other Floor, Insulated, Needs Loft, No Existing Solar, Needs Heating Upgrade", + self.standardised_asset_list["solar_reason"] + ) + + # Flag anything that has existing outcomes + if self.outcomes is not None: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (self.standardised_asset_list["Surveyed"] > 0) | + (self.standardised_asset_list["Installer Refusal"] > 0) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + + if self.master_surveyed is not None: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (~pd.isnull(self.standardised_asset_list["SUBMISSION DATE"])) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + def flat_analysis(self): # We need to deduce the building name - we strip out the house number From 45b372b9ae43dc1c867f5ff425d8fc4b3e6e5c91 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 10 Mar 2025 19:11:09 +0000 Subject: [PATCH 232/255] refining detection for solar and breakdown counts --- asset_list/AssetList.py | 409 +++++++++++++------- asset_list/app.py | 72 +++- asset_list/mappings/heating_systems.py | 
13 +- asset_list/mappings/property_type.py | 10 +- asset_list/mappings/walls.py | 23 +- etl/customers/mod/pilot/1. Create Sample.py | 34 ++ 6 files changed, 406 insertions(+), 155 deletions(-) create mode 100644 etl/customers/mod/pilot/1. Create Sample.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 689e752b..dc22a8a2 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1012,7 +1012,8 @@ class AssetList: ) ) else: - raise NotImplementedError("need to implement the case for non-intrusives") + # We set the filter to False, as we have no non-intrusives + non_intrusives_wall_filter = False self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & @@ -1110,7 +1111,8 @@ class AssetList: ) else: - raise NotImplementedError("need to implement the case for non-intrusives") + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = False + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = False # Adjust self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( @@ -1131,7 +1133,7 @@ class AssetList: ) self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["electric storage heaters", "room heaters"] + ["electric storage heaters", "room heaters", "electric radiators"] ) ) @@ -1179,7 +1181,8 @@ class AssetList: ) ) else: - raise NotImplementedError("need to implement the case for non-intrusives") + # We don't have an indication + existing_solar_non_intrusives_check = False self.standardised_asset_list["property_has_solar"] = ( (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | @@ -1208,7 +1211,7 @@ class AssetList: print("Should we include cavity properties where they might be uninsulated?") 
self.standardised_asset_list["solar_landlord_walls_insulated"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( - ["filled cavity", "insulated solid brick"] + ["filled cavity", "insulated solid brick", "insulated timber frame"] ) ) @@ -1225,7 +1228,7 @@ class AssetList: ) ) else: - raise NotImplementedError("need to implement the case for non-intrusives") + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = False # TODO: We don't have information about the roof from this landlord @@ -1294,7 +1297,6 @@ class AssetList: lambda x: int(x) < 200 if str(x).isdigit() else False ) - # TODO: Fill with False - should be temp! self.standardised_asset_list["epc_has_floor_recommendation"] = ( self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) ) @@ -1339,36 +1341,6 @@ class AssetList: ) ) - # Check for other floor types, insulated - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = ( - # The floor is suspended and insulated - ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str - .lower().str.contains("suspended") - ) & ( - ~self.standardised_asset_list["epc_has_floor_recommendation"] - ) & ( - # We do not utilise estimated EPCs for this method because we will always find that - # "epc_has_floor_recommendation" is False - self.standardised_asset_list["estimated"] == False - ) - ) | ( - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"] - ].str.lower().str.contains("suspended") - ) & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"] - ].str.lower().str.contains(", insulated") - ) - ) | ( - self.standardised_asset_list["floor_u_value"].apply( - lambda x: x <= 0.5 if not pd.isnull(x) else False - ) - ) - ) #################################### # Check solar eligibility #################################### @@ -1390,7 +1362,13 @@ class AssetList: 
self.standardised_asset_list["solar_non_intrusives_walls_insulated"] ) + not_a_flat = ( + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "flat" + ) + self.standardised_asset_list["solar_eligible_solid_floor"] = ( + # Property isn't a flag + not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate correct_heating_system & # The property doesn't currently have solar @@ -1399,11 +1377,32 @@ class AssetList: walls_are_insulated & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + # Floor type check + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + + self.standardised_asset_list["solar_eligible_solid_floor_sap_above_threshold"] = ( + # Property isn't a flag + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + # Floor type check + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) # With heating upgrade self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] = ( + not_a_flat & # Needs heating upgrade needs_heating_upgrade & # The property doesn't currently have solar @@ -1412,14 +1411,43 @@ class AssetList: walls_are_insulated & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + # Floor type check + 
self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP Below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] & + # SAP above threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) + # With heating upgrade, above threshold + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold"] = ( + not_a_flat & + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + # Floor type check + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP Below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + # Because the EPC data can be contradictrory, we remove any overlap self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] = np.where( self.standardised_asset_list["solar_eligible_solid_floor"], False, self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] ) + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor_sap_above_threshold"], + False, + self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold"] + ) # We shouldn't have an overlap if ( @@ -1430,6 +1458,7 @@ class AssetList: # Solid floor but needs a loft top-up self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( + not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate correct_heating_system & # The property doesn't currently have solar @@ 
-1438,10 +1467,31 @@ class AssetList: walls_are_insulated & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + # Check floor + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) + # Solid floor, needs loft, above SAP thresold + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_sap_above_threshold"] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Check floor + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + # Needs loft & heating self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_needs_heating_upgrade"] = ( + not_a_flat & # Needs heating upgrade needs_heating_upgrade & # The property doesn't currently have solar @@ -1450,11 +1500,33 @@ class AssetList: walls_are_insulated & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + # Floor type + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + + self.standardised_asset_list[ + "solar_eligible_solid_floor_needs_loft_needs_heating_upgrade_sap_above_threshold" + ] = ( + not_a_flat & + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + 
~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Floor type + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) # Other floor type, fully insulated self.standardised_asset_list["solar_eligible_other_floor"] = ( + not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate correct_heating_system & # The property doesn't currently have solar @@ -1463,11 +1535,30 @@ class AssetList: walls_are_insulated & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + # Floor type + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + self.standardised_asset_list["solar_eligible_other_floor_sap_above_threshold"] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + # Floor type - other types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) # With heating upgrade self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"] = ( + not_a_flat & # Needs heating upgrade needs_heating_upgrade & # The property doesn't currently have solar @@ -1476,11 +1567,37 @@ class AssetList: walls_are_insulated & # Roof is insulated 
self.standardised_asset_list["solar_epc_roof_insulated"] & - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + # Other floor types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) + # With heating upgrade, SAP above threshold + self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade_sap_above_threshold"] = ( + not_a_flat & + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + # Other floor types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + # Check for overlap + if ( + self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"] & + self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade_sap_above_threshold"] + ).sum(): + raise ValueError("Both heating upgrade and no heating upgrade are true - this should not be possible") # Other floor type, needs loft top-up self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( + not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate correct_heating_system & # The property doesn't currently have solar @@ -1489,12 +1606,31 @@ class AssetList: walls_are_insulated & # Roof need loft top-up self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Floor is not solid, but is insulated - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + # Other floor types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + 
self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + # Other floor type, needs loft top-up, SAP above threshold + self.standardised_asset_list["solar_eligible_other_floor_needs_loft_sap_above_threshold"] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof need loft top-up + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Other floor types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) # With heating upgrade self.standardised_asset_list["solar_eligible_other_floor_needs_loft_needs_heating_upgrade"] = ( + not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate needs_heating_upgrade & # The property doesn't currently have solar @@ -1503,8 +1639,28 @@ class AssetList: walls_are_insulated & # Roof need loft top-up self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Floor is not solid, but is insulated - self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + # Other floor types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + + self.standardised_asset_list[ + "solar_eligible_other_floor_needs_loft_needs_heating_upgrade_sap_above_threshold" + ] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_are_insulated & + # Roof need loft top-up + 
self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Other floor types + ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & + # SAP above threshold + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) # Drop anything we don't need @@ -1529,66 +1685,6 @@ class AssetList: self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" ] - # Produce some aggregate figures - self.work_type_figures = { - # Empty cavity from non-intrusives - "Empty Cavity (non-intrusives)": non_blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum(), - "Empty Cavity (non-intrusives, blocks of flats)": ( - blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum() - ), - "Empty Cavity (non-intrusives, no SAP filter)": ( - non_blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() - ), - "Empty Cavity (non-intrusives, no SAP filter, blocks of flats)": ( - blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() - ), - "Empty Cavity (EPC)": ( - ( - non_blocks_of_flats["epc_indicates_empty_cavity"] & - ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] - ).sum() - ), - "Empty Cavity (EPC, blocks of flat)": ( - ( - blocks_of_flats["epc_indicates_empty_cavity"] & - ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] - ).sum() - ), - "Cavity Extraction": ( - ( - ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] & - ~non_blocks_of_flats["epc_indicates_empty_cavity"] & - non_blocks_of_flats["non_intrusive_indicates_cavity_extraction"] - ).sum() - ), - "Cavity Extraction (blocks of flats)": ( - ( - ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] & - ~blocks_of_flats["epc_indicates_empty_cavity"] & - blocks_of_flats["non_intrusive_indicates_cavity_extraction"] - ).sum() - ), - "Cavity Extraction (no SAP filter)": ( - ( - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & - 
~self.standardised_asset_list["epc_indicates_empty_cavity"] & - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] - ).sum() - ), - "Solar PV (Solid Floor)": ( - self.standardised_asset_list["solar_eligible_solid_floor"].sum() - ), - "Solar PV (Solid Floor, Needs Loft Top-up)": ( - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"].sum() - ), - "Solar PV (Other Floor)": ( - self.standardised_asset_list["solar_eligible_other_floor"].sum() - ), - "Solar PV (Other Floor, Needs Loft Top-up)": ( - self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum() - ) - } - # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None self.standardised_asset_list["cavity_reason"] = np.where( @@ -1628,51 +1724,55 @@ class AssetList: self.standardised_asset_list["cavity_reason"] ) + ###################################################### # Flag solar + ###################################################### self.standardised_asset_list["solar_reason"] = None - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_solid_floor"], - "Solid Floor, Insulated, No Existing Solar", - self.standardised_asset_list["solar_reason"] - ) - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"], - "Solid Floor, Insulated, No Existing Solar, Needs Heating Upgrade", - self.standardised_asset_list["solar_reason"] - ) - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"], - "Solid Floor, Insulated, Needs Loft, No Existing Solar", - self.standardised_asset_list["solar_reason"] - ) - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_needs_heating_upgrade"], - "Solid Floor, Insulated, Needs Loft, No 
Existing Solar, Needs Heating Upgrade", - self.standardised_asset_list["solar_reason"] - ) + # Map of variables and fill values for the solar_reason variable + solar_reason_map = { + "solar_eligible_solid_floor": "Solar Eligible, Solid Floor", + "solar_eligible_solid_floor_sap_above_threshold": "Solar Eligible, Solid Floor, SAP Above Threshold", + "solar_eligible_solid_floor_needs_heating_upgrade": ( + "Solar Eligible, Solid Floor, Needs Heating Upgrade" + ), + "solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold": ( + "Solar Eligible, Solid Floor, Needs Heating Upgrade, SAP Above Threshold" + ), + "solar_eligible_solid_floor_needs_loft": "Solar Eligible, Solid Floor, Needs Loft", + "solar_eligible_solid_floor_needs_loft_sap_above_threshold": ( + "Solar Eligible, Solid Floor, Needs Loft, SAP Above Threshold" + ), + "solar_eligible_solid_floor_needs_loft_needs_heating_upgrade": ( + "Solar Eligible, Solid Floor, Needs Loft, Needs Heating Upgrade" + ), + "solar_eligible_solid_floor_needs_loft_needs_heating_upgrade_sap_above_threshold": ( + "Solar Eligible, Solid Floor, Needs Loft, Needs Heating Upgrade, SAP Above Threshold" + ), + "solar_eligible_other_floor": "Solar Eligible, Other Floor", + "solar_eligible_other_floor_sap_above_threshold": "Solar Eligible, Other Floor, SAP Above Threshold", + "solar_eligible_other_floor_needs_heating_upgrade": "Solar Eligible, Other Floor, Needs Heating Upgrade", + "solar_eligible_other_floor_needs_heating_upgrade_sap_above_threshold": ( + "Solar Eligible, Other Floor, Needs Heating Upgrade, SAP Above Threshold" + ), + "solar_eligible_other_floor_needs_loft": "Solar Eligible, Other Floor, Needs Loft", + "solar_eligible_other_floor_needs_loft_sap_above_threshold": ( + "Solar Eligible, Other Floor, Needs Loft, SAP Above Threshold" + ), + "solar_eligible_other_floor_needs_loft_needs_heating_upgrade": ( + "Solar Eligible, Other Floor, Needs Loft, Needs Heating Upgrade" + ), + 
"solar_eligible_other_floor_needs_loft_needs_heating_upgrade_sap_above_threshold": ( + "Solar Eligible, Other Floor, Needs Loft, Needs Heating Upgrade, SAP Above Threshold" + ) + } - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_other_floor"], - "Other Floor, Insulated, No Existing Solar", - self.standardised_asset_list["solar_reason"] - ) - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"], - "Other Floor, Insulated, No Existing Solar, Needs Heating Upgrade", - self.standardised_asset_list["solar_reason"] - ) - - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_other_floor_needs_loft"], - "Other Floor, Insulated, Needs Loft, No Existing Solar", - self.standardised_asset_list["solar_reason"] - ) - self.standardised_asset_list["solar_reason"] = np.where( - self.standardised_asset_list["solar_eligible_other_floor_needs_loft_needs_heating_upgrade"], - "Other Floor, Insulated, Needs Loft, No Existing Solar, Needs Heating Upgrade", - self.standardised_asset_list["solar_reason"] - ) + for variable, reason in solar_reason_map.items(): + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list[variable], + reason, + self.standardised_asset_list["solar_reason"] + ) # Flag anything that has existing outcomes if self.outcomes is not None: @@ -1694,6 +1794,12 @@ class AssetList: self.standardised_asset_list["cavity_reason"] ) + # Produce some aggregate figures + self.work_type_figures = { + **self.standardised_asset_list["cavity_reason"].value_counts().to_dict(), + **self.standardised_asset_list["solar_reason"].value_counts().to_dict() + } + def flat_analysis(self): # We need to deduce the building name - we strip out the house number @@ -2028,7 +2134,7 @@ class AssetList: outcomes_houseno ): if outcomes_filepath is None: - pass + return # ToDO: 
Parameterise for future use? self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) @@ -2119,6 +2225,8 @@ class AssetList: self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id" ) + df = lookup[lookup["domna_property_id"] == "44beckettavenuegainsboroughdn211en-1d4811cbb046"] + visit_counts = ( lookup.groupby(self.DOMNA_PROPERTY_ID)["row_id"] .count() @@ -2153,6 +2261,9 @@ class AssetList: ): # TODO: This probably needs further expansion + if not master_filepaths: + return + if master_to_asset_list_filepath is not None: id_map = pd.read_csv(master_to_asset_list_filepath) else: diff --git a/asset_list/app.py b/asset_list/app.py index 63ca40d8..1a6dbc6b 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -247,6 +247,30 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # Wates + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - " + data_filename = "ECO 4 Wates.xlsx" + sheet_name = "Roadmap Homes" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "Address Line 1" + address1_method = None + address_cols_to_concat = ["Address Line 1", "Address Line 2", "Address Line 3"] + missing_postcodes_method = None + landlord_year_built = "Build Year" + landlord_os_uprn = None + landlord_property_type = "Archetype" + landlord_wall_construction = "Wall" + landlord_heating_system = "Heating Type" + landlord_existing_pv = None + landlord_property_id = "UPRN" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + master_filepaths = [] + master_to_asset_list_filepath = None + # Ealing # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 04032025" # data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx" @@ -265,6 +289,29 @@ def app(): # 
landlord_existing_pv = None # landlord_property_id = "Property ref" + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # sheet_name = "Sheet1" + # postcode_column = 'Full Address.1' + # fulladdress_column = "Full Address" + # address1_column = None + # address1_method = "first_word" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build Date" + # landlord_os_uprn = None + # landlord_property_type = "Property Type" + # landlord_wall_construction = "Wallinsul" + # landlord_heating_system = "HeatSorc" + # landlord_existing_pv = None + # landlord_property_id = "Property Reference" + # outcomes_filename = None + # outcomes_sheetname = None + # outcomes_postcode = None + # outcomes_houseno = None + # master_filepaths = [] + # master_to_asset_list_filepath = None + # For Westward # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" # data_filename = "WESTWARD - completed list..xlsx" @@ -282,6 +329,12 @@ def app(): # landlord_heating_system = "Heat Source" # landlord_existing_pv = "PV (Y/N)" # landlord_property_id = "Place ref" + # outcomes_filename = None + # outcomes_sheetname = None + # outcomes_postcode = None + # outcomes_houseno = None + # master_filepaths = [] + # master_to_asset_list_filepath = None # For ACIS - programme re-build # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" @@ -393,7 +446,7 @@ def app(): # We now flag properties that have been treated under existing programmes asset_list.flag_outcomes( - outcomes_filepath=os.path.join(data_folder, outcomes_filename), + outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None, outcomes_sheetname=outcomes_sheetname, outcomes_postcode=outcomes_postcode, outcomes_houseno=outcomes_houseno @@ -566,6 +619,22 @@ def app(): 
pprint(asset_list.work_type_figures) + # TODO: Characterise the properties that didn't qualify + eg = asset_list.standardised_asset_list[ + pd.isnull(asset_list.standardised_asset_list["solar_reason"]) + ] + eg[asset_list.EPC_API_DATA_NAMES["floor-description"]].value_counts() + + # TODO: Look into the estimated ones + eg["estimated"].value_counts() + + eg = eg[eg[asset_list.STANDARD_HEATING_SYSTEM] == "high heat retention storage heaters"] + eg[asset_list.STANDARD_WALL_CONSTRUCTION].value_counts() + eg = eg[eg[asset_list.STANDARD_WALL_CONSTRUCTION] == "filled cavity"] + eg[asset_list.EPC_API_DATA_NAMES["roof-description"]].value_counts() + eg[asset_list.EPC_API_DATA_NAMES["floor-description"]].value_counts() + eg["epc_has_floor_recommendation"].value_counts() + asset_list.flat_analysis() asset_list.load_contact_details( @@ -614,6 +683,7 @@ def app(): with pd.ExcelWriter(filename) as writer: asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) + # If we have outcomes, we add a tab with the outcomes # Store the Hubspot export as a csv hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False) diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index f397391c..73e2679e 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -95,5 +95,16 @@ HEATING_MAPPINGS = { 'Boiler Solid fuel': 'boiler - other fuel', 'Community heating Community (mains gas)': 'communal gas boiler', 'Boiler Biomass': 'boiler - other fuel', - 'No heating system Mains gas': 'unknown' + 'No heating system Mains gas': 'unknown', + + 'Storage heaters': 'electric storage heaters', + 'Air Source': 'air source heat pump', + 'Ground source': 'ground source heat pump', + 'OIl': 'boiler - other fuel', + 'Quantum storage heaters (old sh on EPC)': 'high heat retention storage 
heaters', + 'Quanum Storage heaters': 'high heat retention storage heaters', + 'Quantum storage heaters (Old SH on EPC)': 'high heat retention storage heaters', + 'Quantum storage heaters': 'high heat retention storage heaters', + 'Air Source (EPC says SH)': 'air source heat pump', + 'ASHP - Was logged as oil': 'air source heat pump' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index ccee5d3e..3182bd45 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -63,5 +63,13 @@ PROPERTY_MAPPING = { '2 Bed 1st Floor Sheltered Flat': 'flat', '1 Bed First Floor Flat': 'flat', '3 Bed First Floor Flat': 'flat', - 'ND': 'unknown' + 'ND': 'unknown', + 'House (Mid Terrace)': 'house', + 'First Floor Flat General': 'flat', + 'House (End Terrace)': 'house', + 'House (Mid terrace)': 'house', + 'Bungalow (Semi)': 'bungalow', + 'Ground Floor Flat General': 'flat', + 'House (Semi)': 'house' + } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 2313f063..89c97d7e 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -1,10 +1,14 @@ import numpy as np STANDARD_WALL_CONSTRUCTIONS = { + # Cavity "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", + # Solic Brick "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation", - "timber frame", - "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", + # Timber Frame + "timber frame unknown insulation", "insulated timber frame", "uninsulated timber frame", + "system built", "granite or whinstone", "other", + "unknown", "sandstone or limestone", "cob", "new build - average thermal transmittance", } @@ -117,5 +121,18 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Solid brick Internal': 'insulated solid brick', 'Cavity Internal': 'filled cavity', 'System build Internal': 'system built', - 'Solid brick As-built': 'solid 
brick unknown insulation' + 'Solid brick As-built': 'solid brick unknown insulation', + + 'Cavity ': 'cavity unknown insulation', + 'Solid brick ': 'solid brick unknown insulation', + 'Timber frame Timber frame (good insulation)': 'insulated timber frame', + ' ': 'unknown', + 'Cavity No data': 'cavity unknown insulation', + 'Non trad ': 'other', + 'Solid brick / Multiple Attributes ': 'solid brick unknown insulation', + 'Cavity Believe CWI done by Dyson': 'filled cavity', + 'Cavity CWI required': 'uninsulated cavity', + 'Solid brick EWI installed': 'insulated solid brick', + 'Cavity Cavity batts': 'filled cavity', + 'Cavity CWI Completed by Dyson': 'filled cavity' } diff --git a/etl/customers/mod/pilot/1. Create Sample.py b/etl/customers/mod/pilot/1. Create Sample.py new file mode 100644 index 00000000..e1f9b444 --- /dev/null +++ b/etl/customers/mod/pilot/1. Create Sample.py @@ -0,0 +1,34 @@ +import pandas as pd + + +def app(): + """ + Given the sample data and additonal properties, this function prepares the data + :return: + """ + folder_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme" + sample_list = pd.read_excel(f"{folder_path}/20250227_DIO_Accommodation_Sample_Properties.xlsx") + asset_data = pd.read_excel(f"{folder_path}/20250303_DIO_Accommodation_Property_Attribution.xlsx") + asset_data["BLNDG_GOVERMENT_UPRN"] = asset_data["BLNDG_GOVERMENT_UPRN"].astype("Int64") + + asset_data["BLNDG_GOVERMENT_UPRN"].nunique() + for _id in asset_data["ESTB_ID"].unique(): + data = asset_data[asset_data["ESTB_ID"] == _id] + z = data["BLNDG_GOVERMENT_UPRN"] + + data["BLNDG_GOVERMENT_UPRN"].unique() + + asset_data["BLNDG_GOVERMENT_UPRN"].unique() + + df = asset_data.groupby("BLNDG_GOVERMENT_UPRN")["ESTB_ID"].nunique().sort_values(ascending=False).reset_index() + + example = asset_data[asset_data["BLNDG_GOVERMENT_UPRN"] == df.head(1)["BLNDG_GOVERMENT_UPRN"].values[0]] + + asset_data[asset_data["BLNDG_GOVERMENT_UPRN"]] + + asset_data = 
asset_data[asset_data["ESTB_ID"].isin(sample_list["ESTB_ID"].values)] + asset_data.drop_duplicates("ESTB_ID", inplace=True) + + [x for x in asset_data.columns if "uprn" in x.lower()] + + example = asset_data[asset_data["ESTB_ID"] == 1547072] From b6ef41b21b6a8b1539fd71044b819245395c958b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 10 Mar 2025 19:26:58 +0000 Subject: [PATCH 233/255] adding outcomes writing --- asset_list/AssetList.py | 46 +++++++++++++++++++++++++---------------- asset_list/app.py | 2 ++ 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index dc22a8a2..fc021034 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -354,6 +354,7 @@ class AssetList: self.contact_detail_fields = None self.outcomes = None self.outcomes_no_match = None + self.outcomes_for_output = None self.master_surveyed = None # We detect the presence of the non-intrusive columns @@ -1414,8 +1415,6 @@ class AssetList: # Floor type check self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & # SAP Below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] & - # SAP above threshold self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) # With heating upgrade, above threshold @@ -1431,8 +1430,6 @@ class AssetList: self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type check self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP Below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] & # SAP above threshold ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) @@ -1677,14 +1674,6 @@ class AssetList: # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] ) - blocks_of_flats = self.standardised_asset_list[ - self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ] - - non_blocks_of_flats = 
self.standardised_asset_list[ - self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] - # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None self.standardised_asset_list["cavity_reason"] = np.where( @@ -1794,12 +1783,35 @@ class AssetList: self.standardised_asset_list["cavity_reason"] ) + blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" + ] + + non_blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" + ] + # Produce some aggregate figures self.work_type_figures = { - **self.standardised_asset_list["cavity_reason"].value_counts().to_dict(), + **non_blocks_of_flats["cavity_reason"].value_counts().to_dict(), + **{ + k + " (Block of flats)": v for k, v in + blocks_of_flats["solar_reason"].value_counts().to_dict().items() + }, **self.standardised_asset_list["solar_reason"].value_counts().to_dict() } + # We prepare outcomes for output + if self.outcomes: + logger.info("Preparing outcomes for output") + identified_work = self.standardised_asset_list[ + ~pd.isnull(self.standardised_asset_list["cavity_reason"]) | + ~pd.isnull(self.standardised_asset_list["solar_reason"]) + ][self.DOMNA_PROPERTY_ID].values + self.outcomes_for_output = self.outcomes[ + self.outcomes[self.DOMNA_PROPERTY_ID].isin(identified_work) + ] + def flat_analysis(self): # We need to deduce the building name - we strip out the house number @@ -2225,8 +2237,6 @@ class AssetList: self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id" ) - df = lookup[lookup["domna_property_id"] == "44beckettavenuegainsboroughdn211en-1d4811cbb046"] - visit_counts = ( lookup.groupby(self.DOMNA_PROPERTY_ID)["row_id"] .count() @@ -2245,15 +2255,15 @@ class AssetList: # We merge this data onto outcomes self.outcomes["matched_to_asset_list"] = 
self.outcomes["row_id"].isin(lookup["row_id"].values) - self.outcomes = self.outcomes.merge( - lookup, how="left", on="row_id" - ) + self.outcomes = self.outcomes.merge(lookup[["row_id", "domna_property_id"]], how="left", on="row_id") # We merge out pivoted outcomes onto the asset list self.standardised_asset_list = self.standardised_asset_list.merge( pivot_df, how="left", left_on=self.DOMNA_PROPERTY_ID, right_on="domna_property_id" ) + self.outcomes = self.outcomes.sort_values("domna_property_id", ascending=False) + def flag_survey_master( self, master_filepaths, diff --git a/asset_list/app.py b/asset_list/app.py index 1a6dbc6b..0fe09767 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -684,6 +684,8 @@ def app(): asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) # If we have outcomes, we add a tab with the outcomes + if asset_list.outcomes_for_output is not None: + asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) # Store the Hubspot export as a csv hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False) From dc2d108060808f669d1433611c63062b3f14886e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 11 Mar 2025 17:54:48 +0000 Subject: [PATCH 234/255] analysing wates data --- asset_list/AssetList.py | 178 ++++++++++++++++++++++++------ asset_list/app.py | 35 +++--- asset_list/mappings/built_form.py | 20 ++++ asset_list/mappings/walls.py | 2 +- 4 files changed, 185 insertions(+), 50 deletions(-) create mode 100644 asset_list/mappings/built_form.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index fc021034..b3dbd512 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -16,6 +16,7 @@ import asset_list.mappings.property_type as property_type_mappings import asset_list.mappings.walls as walls_mappings import 
asset_list.mappings.heating_systems as heating_mappings import asset_list.mappings.exising_pv as existing_pv_mappings +import asset_list.mappings.built_form as built_form_mappings from recommendations.recommendation_utils import ( estimate_perimeter, @@ -268,6 +269,7 @@ class AssetList: STANDARD_UPRN = "ordnance_survey_uprn" STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id" STANDARD_PROPERTY_TYPE = "landlord_property_type" + STANDARD_BUILT_FORM = "landlord_built_form" STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" STANDARD_HEATING_SYSTEM = "landlord_heating_system" STANDARD_EXISTING_PV = "landlord_existing_pv" @@ -321,6 +323,14 @@ class AssetList: ", ceiling insulated", ] + # List of strings we look for in the EPC data, where substrings indicate that the cavity is empty + UNINSULATED_CAVITY_SUBSTRINGS = [ + "cavity wall, as built, no insulation (assumed)", + "cavity wall, as built, no insulation", + "cavity wall, as built, partial insulation (assumed)", + "cavity wall, as built, partial insulation", + ] + def __init__( self, local_filepath, @@ -335,6 +345,7 @@ class AssetList: landlord_year_built=None, landlord_uprn=None, landlord_property_type=None, + landlord_built_form=None, landlord_wall_construction=None, landlord_heating_system=None, landlord_existing_pv=None, @@ -370,6 +381,7 @@ class AssetList: self.landlord_year_built = landlord_year_built self.landlord_uprn = landlord_uprn self.landlord_property_type = landlord_property_type + self.landlord_built_form = landlord_built_form self.landlord_wall_construction = landlord_wall_construction self.landlord_heating_system = landlord_heating_system self.landlord_existing_pv = landlord_existing_pv @@ -405,6 +417,13 @@ class AssetList: self.standardised_asset_list[self.address1_colname].copy() ) + # Handle the case where the property type column is the same as the built type + if self.landlord_property_type == self.landlord_built_form: + self.landlord_built_form = self.STANDARD_BUILT_FORM + 
self.standardised_asset_list[self.landlord_built_form] = ( + self.standardised_asset_list[self.landlord_property_type].copy() + ) + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -557,6 +576,7 @@ class AssetList: self.full_address_colname, self.landlord_uprn, self.landlord_property_type, + self.landlord_built_form, self.landlord_year_built, self.landlord_wall_construction, self.landlord_heating_system, @@ -571,6 +591,7 @@ class AssetList: self.full_address_colname: self.STANDARD_FULL_ADDRESS, self.landlord_uprn: self.STANDARD_UPRN, self.landlord_property_type: self.STANDARD_PROPERTY_TYPE, + self.landlord_built_form: self.STANDARD_BUILT_FORM, self.landlord_year_built: self.STANDARD_YEAR_BUILT, self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION, self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, @@ -681,6 +702,10 @@ class AssetList: "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES, "standard_map": property_type_mappings.PROPERTY_MAPPING }, + self.landlord_built_form: { + "standard_values": built_form_mappings.STANDARD_BUILT_FORMS, + "standard_map": built_form_mappings.BUILT_FORM_MAPPINGS + }, self.landlord_wall_construction: { "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS, "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS @@ -861,6 +886,9 @@ class AssetList: x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, axis=1 ) + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].str.replace("+", "") + ) # We produce some additional fields # 1) Is the SAP rating below C75 @@ -990,9 +1018,6 @@ class AssetList: def identify_worktypes(self, cleaned): - if not self.non_intrusives_present and not self.old_format_non_intrusives_present: - raise NotImplementedError("Need to implement the case for 
non-intrusives") - # If we have non-intrusives completed, we can use this to identify work types ###################################################### # Empty cavity: @@ -1055,6 +1080,39 @@ class AssetList: ) ) + self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= 1995 + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"]] > self.EMPTY_CAVITY_SAP_THRESHOLD + ) + ) + + self.standardised_asset_list["landlord_data_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) + ) + + self.standardised_asset_list["landlord_data_indicates_empty_cavity_no_sap_filter"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] > self.EMPTY_CAVITY_SAP_THRESHOLD + ) + ) + # If the EPC is esimtated, we defer to the non-intrusives self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( ( @@ -1066,6 +1124,15 @@ class AssetList: self.standardised_asset_list["epc_indicates_empty_cavity"] ) + # Finally, we create a flag to indicate that the cavity is empty, based on the criteria above + self.standardised_asset_list["cavity_is_empty"] = ( + non_intrusives_wall_filter | + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + 
self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) | + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) + ) + ###################################################### # Extraction ###################################################### @@ -1212,7 +1279,9 @@ class AssetList: print("Should we include cavity properties where they might be uninsulated?") self.standardised_asset_list["solar_landlord_walls_insulated"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( - ["filled cavity", "insulated solid brick", "insulated timber frame"] + [ + "filled cavity", "insulated solid brick", "insulated timber frame", + ] ) ) @@ -1264,24 +1333,24 @@ class AssetList: ) # We merge on the u-value for average thermal transmittance - roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) - roof_uvalue_data = roof_uvalue_data[ - ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) - ][["original_description", "thermal_transmittance"]].rename( + roof_roof_data = pd.DataFrame(cleaned["roof-description"]) + roof_roof_data = roof_roof_data[ + ["original_description", "thermal_transmittance", "is_pitched", "is_loft"] + ].rename( columns={ "original_description": self.EPC_API_DATA_NAMES["roof-description"], - "thermal_transmittance": "roof_u_value" + "thermal_transmittance": "roof_u_value", } ) self.standardised_asset_list = self.standardised_asset_list.merge( - roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] + roof_roof_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] ) # If the u-value of a roof is less than 0.7 we consider it insulated self.standardised_asset_list["solar_epc_roof_insulated"] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( - "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False + "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), ) | ( 
self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( lambda x: int(x) >= 200 if str(x).isdigit() else False @@ -1293,9 +1362,19 @@ class AssetList: ) ) - self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( - lambda x: int(x) < 200 if str(x).isdigit() else False + self.standardised_asset_list["solar_epc_loft_needs_topup"] = ( + self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) < 200 if str(x).isdigit() else False + ) | ( + ( + self.standardised_asset_list["is_loft"] | self.standardised_asset_list["is_pitched"] + ) & ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].isin( + ["below average", "none"] + ) + ) + ) ) self.standardised_asset_list["epc_has_floor_recommendation"] = ( @@ -1357,10 +1436,15 @@ class AssetList: self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] ) - walls_are_insulated = ( + # The requirements for walls are: + # 1) walls are insulated + # 2) property is a cavity (can be done insulated or not) + walls_meet_solar_requirements = ( self.standardised_asset_list["solar_landlord_walls_insulated"] | self.standardised_asset_list["solar_epc_walls_insulated"] | - self.standardised_asset_list["solar_non_intrusives_walls_insulated"] + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] | + self.standardised_asset_list["cavity_is_empty"] | + (self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].str.contains("cavity")) ) not_a_flat = ( @@ -1375,7 +1459,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type check @@ -1392,7 +1476,7 @@ class AssetList: # The property 
doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type check @@ -1409,7 +1493,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type check @@ -1425,7 +1509,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type check @@ -1461,7 +1545,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Check floor @@ -1478,7 +1562,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Check floor @@ -1494,7 +1578,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Floor type @@ -1512,7 +1596,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls 
are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Floor type @@ -1529,7 +1613,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type @@ -1544,7 +1628,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Floor type - other types @@ -1561,7 +1645,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Other floor types @@ -1577,7 +1661,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof is insulated self.standardised_asset_list["solar_epc_roof_insulated"] & # Other floor types @@ -1600,7 +1684,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof need loft top-up self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Other floor types @@ -1616,7 +1700,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof 
need loft top-up self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Other floor types @@ -1633,7 +1717,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof need loft top-up self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Other floor types @@ -1651,7 +1735,7 @@ class AssetList: # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated - walls_are_insulated & + walls_meet_solar_requirements & # Roof need loft top-up self.standardised_asset_list["solar_epc_loft_needs_topup"] & # Other floor types @@ -1676,6 +1760,7 @@ class AssetList: # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None + self.standardised_asset_list["cavity_reason"] = np.where( self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], "Non-Intrusive Data Showed Empty Cavity", @@ -1694,6 +1779,33 @@ class AssetList: "EPC Data Showed Empty Cavity", self.standardised_asset_list["cavity_reason"] ) + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + ), + "EPC Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) + # Landlord data + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["landlord_data_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + ~self.standardised_asset_list["epc_indicates_empty_cavity"] + ), + "Landlord Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + self.standardised_asset_list["cavity_reason"] = np.where( + ( + 
self.standardised_asset_list["landlord_data_indicates_empty_cavity_no_sap_filter"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & + ~self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] + ), + "Landlord Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"], + ) # Flag extraction self.standardised_asset_list["cavity_reason"] = np.where( ( @@ -1802,7 +1914,7 @@ class AssetList: } # We prepare outcomes for output - if self.outcomes: + if self.outcomes is not None: logger.info("Preparing outcomes for output") identified_work = self.standardised_asset_list[ ~pd.isnull(self.standardised_asset_list["cavity_reason"]) | diff --git a/asset_list/app.py b/asset_list/app.py index 0fe09767..e9cd7c3f 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -9,6 +9,7 @@ import msgpack from utils.s3 import read_from_s3 from asset_list.AssetList import AssetList from asset_list.mappings.property_type import PROPERTY_MAPPING +from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS from asset_list.mappings.heating_systems import HEATING_MAPPINGS from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS @@ -40,6 +41,13 @@ def get_data( "bedsit": "Flat" } + built_form_map = { + "mid-terrace": "Mid-Terrace", + "end-terrace": "End-Terrace", + "semi-detached": "Semi-Detached", + "detached": "Detached" + } + epc_data = [] errors = [] no_epc = [] @@ -65,6 +73,7 @@ def get_data( uprn = None property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None) + built_form = built_form_map.get(home[AssetList.STANDARD_BUILT_FORM]) searcher = SearchEpc( address1=str(house_no), @@ -119,6 +128,7 @@ def get_data( # As a final resort, we estimate the EPC if property_type is not None and searcher.newest_epc is None: searcher.ordnance_survey_client.property_type = property_type + 
searcher.ordnance_survey_client.built_form = built_form searcher.find_property(skip_os=True) if searcher.newest_epc is None: @@ -260,6 +270,7 @@ def app(): landlord_year_built = "Build Year" landlord_os_uprn = None landlord_property_type = "Archetype" + landlord_built_form = "Archetype" landlord_wall_construction = "Wall" landlord_heating_system = "Heating Type" landlord_existing_pv = None @@ -407,6 +418,7 @@ def app(): landlord_year_built=landlord_year_built, landlord_uprn=landlord_os_uprn, landlord_property_type=landlord_property_type, + landlord_built_form=landlord_built_form, landlord_wall_construction=landlord_wall_construction, landlord_heating_system=landlord_heating_system, landlord_existing_pv=landlord_existing_pv @@ -421,6 +433,13 @@ def app(): ).items() if k not in PROPERTY_MAPPING } + new_built_form_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_built_form] if + asset_list.landlord_built_form else {} + ).items() + if k not in BUILT_FORM_MAPPINGS + } new_wall_map = { k: v for k, v in ( asset_list.variable_mappings[asset_list.landlord_wall_construction] if @@ -619,22 +638,6 @@ def app(): pprint(asset_list.work_type_figures) - # TODO: Characterise the properties that didn't qualify - eg = asset_list.standardised_asset_list[ - pd.isnull(asset_list.standardised_asset_list["solar_reason"]) - ] - eg[asset_list.EPC_API_DATA_NAMES["floor-description"]].value_counts() - - # TODO: Look into the estimated ones - eg["estimated"].value_counts() - - eg = eg[eg[asset_list.STANDARD_HEATING_SYSTEM] == "high heat retention storage heaters"] - eg[asset_list.STANDARD_WALL_CONSTRUCTION].value_counts() - eg = eg[eg[asset_list.STANDARD_WALL_CONSTRUCTION] == "filled cavity"] - eg[asset_list.EPC_API_DATA_NAMES["roof-description"]].value_counts() - eg[asset_list.EPC_API_DATA_NAMES["floor-description"]].value_counts() - eg["epc_has_floor_recommendation"].value_counts() - asset_list.flat_analysis() asset_list.load_contact_details( diff --git 
a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py new file mode 100644 index 00000000..87f36985 --- /dev/null +++ b/asset_list/mappings/built_form.py @@ -0,0 +1,20 @@ +STANDARD_BUILT_FORMS = { + "unknown", + # Houses + "end-terrace", "semi-detached", "detached", "mid-terrace", + # Flats + "ground floor", "mid-floor", "top-floor" +} + +BUILT_FORM_MAPPINGS = { + 'House (End Terrace)': 'end-terrace', + 'Ground Floor Flat General': 'ground floor', + 'House (Semi)': 'semi-detached', + 'House (Mid Terrace)': 'mid-terrace', + 'Bungalow': 'unknown', + 'House (Mid terrace)': 'mid-terrace', + 'Maisonette': 'unknown', + 'Flat': 'unknown', + 'First Floor Flat General': 'mid-floor', + 'Bungalow (Semi)': 'semi-detached' +} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 89c97d7e..f3156860 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -112,7 +112,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Cavity Unknown insulation': 'cavity unknown insulation', 'Timber frame As-built': 'timber frame', 'System build Unknown insulation': 'system built', - 'Cavity As-built': 'unknown', + 'Cavity As-built': 'uninsulated cavity', 'System build External': 'system built', 'ND (inferred) ND (inferred)': 'unknown', 'Solid brick External': 'insulated solid brick', From 0a7fb131ef9bcf647a3e95117e929840159a1320 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Mar 2025 11:08:59 +0000 Subject: [PATCH 235/255] debuyggin solar api when no data found --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/DataMapper.py | 178 ++++++++++++++++++ asset_list/app.py | 181 ++---------------- asset_list/utils.py | 183 ++++++++++++++++++ backend/Property.py | 17 +- backend/SearchEpc.py | 25 ++- backend/apis/GoogleSolarApi.py | 43 +++-- backend/app/assumptions.py | 3 + etl/customers/mod/pilot/1. 
Create Sample.py | 197 ++++++++++++++++++-- recommendations/SolarPvRecommendations.py | 18 ++ 11 files changed, 629 insertions(+), 220 deletions(-) create mode 100644 asset_list/DataMapper.py create mode 100644 asset_list/utils.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/DataMapper.py b/asset_list/DataMapper.py new file mode 100644 index 00000000..ac1b8db3 --- /dev/null +++ b/asset_list/DataMapper.py @@ -0,0 +1,178 @@ +# OpenAI API Key (set this in your environment variables for security) +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + + +class DataRemapper: + def __init__(self, standard_values, standard_map=None, max_tokens=1000): + """ + Initialize the remapper with standard values and a predefined mapping. + + :param standard_values: Set of allowed standardized values. + :param standard_map: Dictionary of common remappings {raw_value: standard_value}. 
+ """ + self.standard_values = standard_values + self.standard_map = standard_map + self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity + self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing + + # Tokenizer for counting tokens + self.tokenizer = tiktoken.encoding_for_model(self.ai_model) + + # Track token usage and remap dictionary + self.total_tokens_used = 0 + self.total_cost = 0 + self.remap_dict = {} # {original_value: standardized_value} + self.max_tokens = max_tokens # Limit for OpenAI API + + # Memoization for AI calls + self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} + # Capture the reponse for debugging + self.ai_response = None + + # OpenAI pricing (as of Feb 2024) + self.pricing = { + "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000}, + "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, + } + + self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + + @staticmethod + def clean_string(text): + """Basic text cleaning: remove extra spaces, punctuation, and normalize case.""" + if not isinstance(text, str): + return None + text = text.strip().lower() + text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + # Replace double strings + text = re.sub(r'\s+', ' ', text) + return text + + def fuzzy_match(self, text): + """Use fuzzy matching to find the closest standard value.""" + match, score = process.extractOne(text, self.standard_values) if text else (None, 0) + return match if score >= self.fuzzy_threshold else None + + def count_tokens(self, text): + """Estimate the number of tokens in a given text.""" + return len(self.tokenizer.encode(text)) if text else 0 + + def ai_standardize(self, unmapped_values): + """Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.""" + if not unmapped_values: + return {} + + unmapped_tuple = tuple(sorted(unmapped_values)) # Ensure consistency for memoization + if unmapped_tuple in self.ai_cache: + 
return self.ai_cache[unmapped_tuple] # Return memoized result + + prompt = f""" + You are an expert in data classification. Standardize each of these values into one of the categories: + {list(self.standard_values)}. + + Return only a JSON dictionary where: + - The keys are the original values. + - The values are the standardized ones. + + Strictly return JSON **without markdown formatting** or extra text. + + Example Output: + {{ + "BLKHOUS": "block house", + "BEDSIT": "bedsit" + }} + + Values to standardize: + {unmapped_values} + """ + + # Count input tokens + input_tokens = self.count_tokens(prompt) + if input_tokens > self.max_tokens: + raise ValueError("Input tokens exceed the maximum limit.") + + logger.info("Calling OpenAI API for standardization...") + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + + output_text = response.choices[0].message.content.strip() + output_tokens = self.count_tokens(output_text) # Count output tokens + + # Track total token usage + self.total_tokens_used += input_tokens + output_tokens + + # Estimate cost + input_cost = input_tokens * self.pricing[self.ai_model]["input"] + output_cost = output_tokens * self.pricing[self.ai_model]["output"] + self.total_cost += input_cost + output_cost + + try: + # Parse response as dictionary + mapping = eval(output_text) # OpenAI should return a valid dictionary + except: + mapping = {val: "unknown" for val in unmapped_values} # Fallback + + # Memoize the AI response + self.ai_cache[unmapped_tuple] = mapping + # We store the raw AI response for debugging + logger.debug(f"AI Response: {mapping}") + self.ai_response = output_text + + return mapping + + def standardize_list(self, values_to_remap): + """ + Standardizes a list of values and returns a dictionary {original_value: standardized_value}. + + :param values_to_remap: List of raw values to standardize. 
+ :return: Dictionary {original_value: standardized_value}. + """ + unique_values = set(values_to_remap) # Process only unique values + + unmapped_values = [] + for value in unique_values: + if pd.isna(value): # Handle NaN values + self.remap_dict[value] = "unknown" + continue + + cleaned_value = self.clean_string(value) + + # Rule-Based Check (Predefined Mapping) + if cleaned_value in self.standard_map or value in self.standard_map: + self.remap_dict[value] = ( + self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value] + ) + continue + + if value.lower() in self.standard_map: + self.remap_dict[value] = self.standard_map[value.lower()] + continue + + # Exact Match in Standard Values + if cleaned_value in self.standard_values: + self.remap_dict[value] = cleaned_value + continue + + # Fuzzy Matching + fuzzy_match = self.fuzzy_match(cleaned_value) + if fuzzy_match: + self.remap_dict[value] = fuzzy_match + continue + + # Capture anything that wasn't mapped + unmapped_values.append(value) + + # AI Model - remap anything unmapped (batch request) + ai_mapping = self.ai_standardize(unmapped_values) + self.remap_dict.update(ai_mapping) + + return self.remap_dict + + def report_usage(self): + """Prints a summary of token usage and cost.""" + print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}") + print(f"💰 Estimated Cost: ${self.total_cost:.4f}") diff --git a/asset_list/app.py b/asset_list/app.py index e9cd7c3f..088f1603 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -1,9 +1,6 @@ import os -import time import json import pandas as pd -import numpy as np -from tqdm import tqdm from pprint import pprint import msgpack from utils.s3 import read_from_s3 @@ -13,181 +10,15 @@ from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS from asset_list.mappings.heating_systems import HEATING_MAPPINGS from asset_list.mappings.exising_pv import 
EXISTING_PV_MAPPINGS +from asset_list.utils import get_data from dotenv import load_dotenv from backend.SearchEpc import SearchEpc -from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data( - df, manual_uprn_map, epc_api_only=False, row_id_name="row_id" -): - uprn_column = AssetList.STANDARD_UPRN - fulladdress_column = AssetList.STANDARD_FULL_ADDRESS - address1_column = AssetList.STANDARD_ADDRESS_1 - postcode_column = AssetList.STANDARD_POSTCODE - - # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs - property_type_map = { - "house": "House", - "flat": "Flat", - "maisonette": "Maisonette", - "bungalow": "Bungalow", - "block house": "House", - "coach house": "House", - "bedsit": "Flat" - } - - built_form_map = { - "mid-terrace": "Mid-Terrace", - "end-terrace": "End-Terrace", - "semi-detached": "Semi-Detached", - "detached": "Detached" - } - - epc_data = [] - errors = [] - no_epc = [] - for _, home in tqdm(df.iterrows(), total=len(df)): - try: - - # If we have a block of flats, we cannot retrieve this data - if home[AssetList.STANDARD_PROPERTY_TYPE] == "block of flats": - no_epc.append(home[row_id_name]) - continue - - postcode = home[postcode_column] - house_number = str(home[address1_column]).strip() - full_address = home[fulladdress_column].strip() - house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) - if house_no is None: - house_no = house_number - uprn = manual_uprn_map.get(full_address, None) - if uprn is None and home.get(uprn_column): - uprn = home[uprn_column] - - if pd.isnull(uprn): - uprn = None - - property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None) - built_form = built_form_map.get(home[AssetList.STANDARD_BUILT_FORM]) - - searcher = SearchEpc( - address1=str(house_no), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - 
property_type=None, - fast=True, - full_address=full_address, - max_retries=5, - uprn=uprn - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - - # Check if we have a flat or appartment - if searcher.newest_epc is None and uprn is None: - # Try again: - if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: - # Backup - add1 = full_address.split(",") - if len(add1) > 1: - add1 = add1[1].strip() - else: - # Try splitting on space - add1 = full_address.split(" ")[0].strip() - - else: - add1 = str(house_number) - searcher = SearchEpc( - address1=add1, - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - - if ( - "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in - house_number.lower() - ): - searcher.ordnance_survey_client.property_type = "Flat" - - searcher.find_property(skip_os=True) - - # As a final resort, we estimate the EPC - if property_type is not None and searcher.newest_epc is None: - searcher.ordnance_survey_client.property_type = property_type - searcher.ordnance_survey_client.built_form = built_form - searcher.find_property(skip_os=True) - - if searcher.newest_epc is None: - no_epc.append(home[row_id_name]) - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - if epc_api_only: - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"] - } - - epc_data.append(epc) - continue - - # Retrieve data from FindMyEPC - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], 
postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No EPC found" in str(e) and "address1" in searcher.newest_epc: - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No EPC found" in str(e): - find_epc_data = {} - else: - find_epc_data = {} - except Exception as e: - raise Exception(f"Error retrieving FindMyEPC data: {e}") - time.sleep(np.random.uniform(0.1, 1)) - - epc = { - row_id_name: home[row_id_name], - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"], - "find_my_epc_data": find_epc_data, - } - - epc_data.append(epc) - except Exception as e: - errors.append(home[row_id_name]) - time.sleep(5) - - return epc_data, errors, no_epc - - def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): if method == "first_two_words": asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") @@ -507,6 +338,12 @@ def app(): epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, + uprn_column=AssetList.STANDARD_UPRN, + fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, + address1_column=AssetList.STANDARD_ADDRESS_1, + postcode_column=AssetList.STANDARD_POSTCODE, + property_type_column=AssetList.STANDARD_PROPERTY_TYPE, + built_form_column=AssetList.STANDARD_BUILT_FORM, manual_uprn_map=manual_uprn_map, epc_api_only=epc_api_only ) @@ -516,6 +353,10 @@ def app(): epc_data_failed, _, _ = get_data( df=chunk_failed, row_id_name=asset_list.DOMNA_PROPERTY_ID, + uprn_column=AssetList.STANDARD_UPRN, + fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, + address1_column=AssetList.STANDARD_ADDRESS_1, + postcode_column=AssetList.STANDARD_POSTCODE, 
manual_uprn_map=manual_uprn_map, epc_api_only=epc_api_only ) diff --git a/asset_list/utils.py b/asset_list/utils.py new file mode 100644 index 00000000..ff9db3f8 --- /dev/null +++ b/asset_list/utils.py @@ -0,0 +1,183 @@ +import time +import numpy as np +import pandas as pd +from backend.SearchEpc import SearchEpc +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from tqdm import tqdm +from utils.logger import setup_logger + +logger = setup_logger() + + +def get_data( + df, + manual_uprn_map, + epc_auth_token, + uprn_column, + fulladdress_column, + address1_column, + postcode_column, + property_type_column, + built_form_column, + epc_api_only=False, + row_id_name="row_id", +): + # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs + property_type_map = { + "house": "House", + "flat": "Flat", + "maisonette": "Maisonette", + "bungalow": "Bungalow", + "block house": "House", + "coach house": "House", + "bedsit": "Flat" + } + + built_form_map = { + "mid-terrace": "Mid-Terrace", + "end-terrace": "End-Terrace", + "semi-detached": "Semi-Detached", + "detached": "Detached" + } + + epc_data = [] + errors = [] + no_epc = [] + for _, home in tqdm(df.iterrows(), total=len(df)): + try: + + # If we have a block of flats, we cannot retrieve this data + if home.get(property_type_column) == "block of flats": + no_epc.append(home[row_id_name]) + continue + + postcode = home[postcode_column] + house_number = str(home[address1_column]).strip() + full_address = home[fulladdress_column].strip() + house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) + if house_no is None: + house_no = house_number + uprn = manual_uprn_map.get(full_address, None) + if uprn is None and home.get(uprn_column): + uprn = home[uprn_column] + + if pd.isnull(uprn): + uprn = None + + property_type = property_type_map.get(home.get(property_type_column), None) + built_form = built_form_map.get(home.get(built_form_column)) + 
+ searcher = SearchEpc( + address1=str(house_no), + postcode=postcode, + auth_token=epc_auth_token, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5, + uprn=uprn + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + + # Check if we have a flat or appartment + if searcher.newest_epc is None and uprn is None: + # Try again: + if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: + # Backup + add1 = full_address.split(",") + if len(add1) > 1: + add1 = add1[1].strip() + else: + # Try splitting on space + add1 = full_address.split(" ")[0].strip() + + else: + add1 = str(house_number) + searcher = SearchEpc( + address1=add1, + postcode=postcode, + auth_token=epc_auth_token, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + + if ( + "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in + house_number.lower() + ): + searcher.ordnance_survey_client.property_type = "Flat" + + searcher.find_property(skip_os=True) + + # As a final resort, we estimate the EPC + if property_type is not None and searcher.newest_epc is None: + searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.built_form = built_form + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + no_epc.append(home[row_id_name]) + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + if epc_api_only: + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + continue + + # Retrieve data from FindMyEPC + try: 
+ find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e) and "address1" in searcher.newest_epc: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except ValueError as e: + if "No EPC found" in str(e): + find_epc_data = {} + else: + logger.error(f"Error retrieving FindMyEPC data: {e}") + raise Exception(f"Error retrieving FindMyEPC data: {e}") + else: + find_epc_data = {} + except Exception as e: + raise Exception(f"Error retrieving FindMyEPC data: {e}") + time.sleep(np.random.uniform(0.1, 1)) + + epc = { + row_id_name: home[row_id_name], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"], + "find_my_epc_data": find_epc_data, + } + + epc_data.append(epc) + except Exception as e: + errors.append(home[row_id_name]) + time.sleep(5) + + return epc_data, errors, no_epc diff --git a/backend/Property.py b/backend/Property.py index eaffd54d..498fe0e0 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -226,25 +226,20 @@ class Property: # as we collect more data from the energy assessment n_bathrooms = kwargs.get("n_bathrooms", None) - if n_bathrooms not in [None, ""]: - # We add on a small value to ensure that the number of bathrooms is rounded up, in case the value is 0.5 - n_bathrooms = int(round(float(n_bathrooms) + 1e-5)) + # We add on a small value to ensure that the number of bathrooms is rounded up, in case the value is 0.5 + n_bathrooms = int(round(float(n_bathrooms) + 1e-5)) if n_bathrooms not in [None, ""] else None n_bedrooms = kwargs.get("n_bedrooms", None) - if n_bedrooms not in [None, ""]: - n_bedrooms = int(round(float(n_bedrooms) + 1e-5)) + n_bedrooms = 
int(round(float(n_bedrooms) + 1e-5)) if n_bedrooms not in [None, ""] else None number_of_floors = kwargs.get("number_of_floors", None) - if number_of_floors not in [None, ""]: - number_of_floors = int(round(float(number_of_floors) + 1e-5)) + number_of_floors = int(round(float(number_of_floors) + 1e-5)) if number_of_floors not in [None, ""] else None insulation_floor_area = kwargs.get("insulation_floor_area", None) - if insulation_floor_area not in [None, ""]: - insulation_floor_area = float(insulation_floor_area) + insulation_floor_area = float(insulation_floor_area) if insulation_floor_area not in [None, ""] else None insulation_wall_area = kwargs.get("insulation_wall_area", None) - if insulation_wall_area not in [None, ""]: - insulation_wall_area = float(insulation_wall_area) + insulation_wall_area = float(insulation_wall_area) if insulation_wall_area not in [None, ""] else None return { "n_bathrooms": n_bathrooms, diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 0d921bec..d33b2e70 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -308,12 +308,20 @@ class SearchEpc: self.data = output["response"] return output["msg"] + if not self.uprn and not self.address1 and not self.postcode: + raise ValueError("No search parameters provided") + uprn_params = {"uprn": self.uprn} if self.uprn else {} - address_params = {"address": self.address1, "postcode": self.postcode} + address_params = {} + if self.address1: + address_params["address"] = self.address1 + if self.postcode: + address_params["postcode"] = self.postcode # We attempt the search with uprn params data = {"rows": []} + api_response = {} if uprn_params: api_response = self._get_epc(params=uprn_params, size=size) if api_response["msg"]["status"] == 200: @@ -321,14 +329,15 @@ class SearchEpc: # If we were unsuccessful, we then make a second attempt to fetch the data. 
We find that # properties are sometimes listed under the wrong UPRN - api_response = self._get_epc(params=address_params, size=size) - if api_response["msg"]["status"] == 200: - # We update the data with the correct uprn - if self.uprn: - for x in api_response["response"]["rows"]: - x["uprn"] = self.uprn + if address_params: + api_response = self._get_epc(params=address_params, size=size) + if api_response["msg"]["status"] == 200: + # We update the data with the correct uprn + if self.uprn: + for x in api_response["response"]["rows"]: + x["uprn"] = self.uprn - data["rows"].extend(api_response["response"]["rows"]) + data["rows"].extend(api_response["response"]["rows"]) # We no de-dupe on lmk-key to avoid duplicates seen = set() diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index 183503d5..31ae39bd 100644 --- a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -9,8 +9,7 @@ from tqdm import tqdm from math import sin, cos, sqrt, atan2, radians from utils.logger import setup_logger -from recommendations.Costs import Costs, MCS_SOLAR_PV_COST_DATA -from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel +from recommendations.Costs import Costs from backend.ml_models.AnnualBillSavings import AnnualBillSavings from backend.Property import Property from backend.app.db.functions.solar_functions import get_solar_data, store_batch_data @@ -54,6 +53,9 @@ class GoogleSolarApi: # Max area of a roof space we allow panels for PERCENTAGE_OF_ROOF_LIMIT = 0.8 + # Error Messages + ENTITY_NOT_FOUND_ERROR = 'Requested entity was not found.' + def __init__(self, api_key, max_retries=5): """ Initialize the GoogleSolarApi class with the provided API key and maximum retries. 
@@ -112,6 +114,13 @@ class GoogleSolarApi: response.raise_for_status() # Raise an error for bad status codes return response.json() except requests.exceptions.RequestException as e: + if ( + (e.response.status_code == 404) & + (e.response.json()["error"]["message"] == self.ENTITY_NOT_FOUND_ERROR) + ): + logger.warning("No building insights found for the given location.") + return {"error": self.ENTITY_NOT_FOUND_ERROR} + attempt += 1 print(f"Attempt {attempt} failed: {e}") time.sleep(2 ** attempt) # Exponential backoff @@ -155,6 +164,11 @@ class GoogleSolarApi: # If we have no data in the db, or updated_at is more than 6 months if self.insights_data is None or is_outdated: self.insights_data = self.get_building_insights(longitude, latitude, required_quality) + if self.insights_data.get("error") == self.ENTITY_NOT_FOUND_ERROR: + # We use default performance since in this case, we couldn't retrieve data. We don't store + self.panel_performance = self.default_panel_performance(property_instance=property_instance) + + return self.need_to_store = True # Extract key data from the insights response @@ -820,7 +834,6 @@ class GoogleSolarApi: if unit["longitude"] is None or unit["latitude"] is None: # At this point, we've checked that solar PV is valid, and so we provide some defaults - property_instance.set_solar_panel_configuration( solar_panel_configuration={ "insights_data": None, @@ -875,19 +888,19 @@ class GoogleSolarApi: cost_instance = Costs(property_instance=property_instance) - # We return a 2.4 and 4 kwp system + # We return a 1.6 and 3.2 kwp system panel_performance = pd.DataFrame( [ { - 'n_panels': 10, - 'yearly_dc_energy': 4000 * 0.99, # Assumed 99% efficient wattage -> dc + 'n_panels': 8, + 'yearly_dc_energy': 3200 * assumptions.MEDIAN_WATTAGE_TO_DC, 'total_cost': cost_instance.solar_pv( - n_panels=10, has_battery=False, n_floors=property_instance.number_of_floors + n_panels=8, has_battery=False, n_floors=property_instance.number_of_floors )["total"], 
'weighted_ratio': None, - 'panneled_roof_area': 10 * assumptions.RDSAP_AREA_PER_PANEL, - 'array_wattage': 4000, - 'initial_ac_kwh_per_year': 4000 * 0.95, # Assumed 95% efficient wattage -> ac + 'panneled_roof_area': 8 * assumptions.RDSAP_AREA_PER_PANEL, + 'array_wattage': 3200, + 'initial_ac_kwh_per_year': 3200 * assumptions.MEDIAN_WATTAGE_TO_AC, 'lifetime_ac_kwh': None, 'lifetime_dc_kwh': None, 'roi': None, @@ -899,15 +912,15 @@ class GoogleSolarApi: 'rank': None }, { - 'n_panels': 6, - 'yearly_dc_energy': 2400 * 0.99, # Assumed 99% efficient wattage -> dc + 'n_panels': 4, + 'yearly_dc_energy': 1600 * assumptions.MEDIAN_WATTAGE_TO_DC, 'total_cost': cost_instance.solar_pv( n_panels=6, has_battery=False, n_floors=property_instance.number_of_floors )["total"], 'weighted_ratio': None, - 'panneled_roof_area': 6 * assumptions.RDSAP_AREA_PER_PANEL, - 'array_wattage': 2400, - 'initial_ac_kwh_per_year': 2400 * 0.95, # Assumed 95% efficient wattage -> ac + 'panneled_roof_area': 4 * assumptions.RDSAP_AREA_PER_PANEL, + 'array_wattage': 1600, + 'initial_ac_kwh_per_year': 1600 * assumptions.MEDIAN_WATTAGE_TO_AC, 'lifetime_ac_kwh': None, 'lifetime_dc_kwh': None, 'roi': None, diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 8d0c05be..261e2b62 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -11,6 +11,9 @@ SOLAR_CONSUMPTION_WITH_BATTERY_PROPORTION = 0.7 # Typically, each solar panel takes up around 3.4 m2 of roof space under RdSAP. This was been verified in Elmhurst RDSAP_AREA_PER_PANEL = 3.4 +# This is a median based on a sample of properties +MEDIAN_WATTAGE_TO_AC = 0.965 +MEDIAN_WATTAGE_TO_DC = 0.99 SOCIAL_TENURES = ["Rented (social)", "rental (social)"] diff --git a/etl/customers/mod/pilot/1. Create Sample.py b/etl/customers/mod/pilot/1. Create Sample.py index e1f9b444..97480d51 100644 --- a/etl/customers/mod/pilot/1. Create Sample.py +++ b/etl/customers/mod/pilot/1. 
Create Sample.py @@ -1,4 +1,17 @@ +import os import pandas as pd +from tqdm import tqdm +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from etl.spatial.OpenUprnClient import OpenUprnClient +from asset_list.utils import get_data +from utils.s3 import save_csv_to_s3 + +PORTFOLIO_ID = 139 +USER_ID = 8 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def app(): @@ -9,26 +22,182 @@ def app(): folder_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme" sample_list = pd.read_excel(f"{folder_path}/20250227_DIO_Accommodation_Sample_Properties.xlsx") asset_data = pd.read_excel(f"{folder_path}/20250303_DIO_Accommodation_Property_Attribution.xlsx") - asset_data["BLNDG_GOVERMENT_UPRN"] = asset_data["BLNDG_GOVERMENT_UPRN"].astype("Int64") - asset_data["BLNDG_GOVERMENT_UPRN"].nunique() - for _id in asset_data["ESTB_ID"].unique(): - data = asset_data[asset_data["ESTB_ID"] == _id] - z = data["BLNDG_GOVERMENT_UPRN"] + sample_list = sample_list[sample_list["BLDNG_COUNTRY_NAME"].isin(["ENGLAND", "WALES"])] - data["BLNDG_GOVERMENT_UPRN"].unique() + # Merge on the UPRN + sample_list = sample_list.merge( + asset_data[["BLDNG_ID", "BLNDG_GOVERMENT_UPRN"]].drop_duplicates(), + how="left", on="BLDNG_ID" + ) + sample_list["BLNDG_GOVERMENT_UPRN"] = sample_list["BLNDG_GOVERMENT_UPRN"].astype("Int64") - asset_data["BLNDG_GOVERMENT_UPRN"].unique() + # Use the EPC API to get corrected postcodes + model_asset_list = [] + missed = [] + for _, x in tqdm(sample_list.iterrows(), total=len(sample_list)): - df = asset_data.groupby("BLNDG_GOVERMENT_UPRN")["ESTB_ID"].nunique().sort_values(ascending=False).reset_index() + if pd.isnull(x["BLNDG_GOVERMENT_UPRN"]): + continue + searcher = SearchEpc( + address1="", + postcode="", + uprn=x["BLNDG_GOVERMENT_UPRN"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="" + ) + searcher.find_property(skip_os=True) + newest_epc = searcher.newest_epc + if newest_epc is None: + 
missed.append(x["BLNDG_GOVERMENT_UPRN"]) + continue - example = asset_data[asset_data["BLNDG_GOVERMENT_UPRN"] == df.head(1)["BLNDG_GOVERMENT_UPRN"].values[0]] + model_asset_list.append(newest_epc) - asset_data[asset_data["BLNDG_GOVERMENT_UPRN"]] + model_asset_list = pd.DataFrame(model_asset_list) + model_asset_list["uprn"] = model_asset_list["uprn"].astype(int) - asset_data = asset_data[asset_data["ESTB_ID"].isin(sample_list["ESTB_ID"].values)] - asset_data.drop_duplicates("ESTB_ID", inplace=True) + spatial_data = OpenUprnClient.get_spatial_data( + uprns=model_asset_list["uprn"].tolist(), bucket_name="retrofit-data-dev" + ) - [x for x in asset_data.columns if "uprn" in x.lower()] + # We determine if the building is listed, heritage or in a conservation area - example = asset_data[asset_data["ESTB_ID"] == 1547072] + # Merge on the property features + features = asset_data.drop( + columns=["BUILDING_SYSTEM_ITEM_NAME", "OBSERVED_CONDITION_DESCRIPTION"] + ).drop_duplicates() + + df = features.merge( + model_asset_list, how="inner", right_on="uprn", left_on="BLNDG_GOVERMENT_UPRN" + ).merge( + pd.DataFrame(spatial_data).rename(columns={"UPRN": "uprn"}), how="left", on="uprn" + ) + + # Store data locally + # df.to_csv(folder_path + "/MOD property data.csv", index=False) + + # Produce as asset list for analysis + + df["row_id"] = df.index + + epc_data, errors, no_epc = get_data( + df=df, + manual_uprn_map={}, + epc_auth_token=EPC_AUTH_TOKEN, + uprn_column="uprn", + fulladdress_column="address", + address1_column="address1", + postcode_column="postcode", + property_type_column=None, + built_form_column=None, + epc_api_only=False, + row_id_name="row_id", + ) + + non_invasive_recommendations = [] + for x in epc_data: + non_invasive_recommendations.append( + { + "uprn": x["uprn"], + "recommendations": x["find_my_epc_data"]["recommendations"] + } + ) + + asset_list = df[ + ["uprn", "address1", "postcode", "NUMBER_OF_BEDROOMS", "BLDNG_STOREYS_QTY", ] + ].rename( + columns={ + 
"address1": "address", + "NUMBER_OF_BEDROOMS": "n_bedrooms", + "BLDNG_STOREYS_QTY": "number_of_floors" + } + ) + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=asset_list, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + # Scenario 1 - EPC C + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Hit EPC C", + "multi_plan": True, + "budget": None, + # "inclusions": [ + # "cavity_wall_insulation", + # "loft_insulation", + # "windows", + # "solar_pv", + # "air_source_heat_pump" + # ] + } + print(body) + + # Scenario 2 - EPC B + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "B", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Hit EPC B", + "multi_plan": True, + "budget": None, + # "inclusions": [ + # "cavity_wall_insulation", + # "loft_insulation", + # "windows", + # "solar_pv", + # "air_source_heat_pump" + # ] + } + print(body) + + # Scenario 3 - EPC B, 3.5 COP ASHP + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "B", + "trigger_file_path": filename, + 
"already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Hit EPC B - 3.5 COP ASHP", + "multi_plan": True, + "budget": None, + "ashp_cop": 3.5 + # "inclusions": [ + # "cavity_wall_insulation", + # "loft_insulation", + # "windows", + # "solar_pv", + # "air_source_heat_pump" + # ] + } + print(body) diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index a97dbcb3..77e8fd10 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import backend.app.assumptions as assumptions from recommendations.Costs import Costs from recommendations.recommendation_utils import override_costs, estimate_pitched_roof_area @@ -24,6 +25,23 @@ class SolarPvRecommendations: SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1 + BACKUP_PANEL_PERFORMANCE = pd.DataFrame( + [ + { + "n_panels": 4, + "array_wattage": 1600, + "initial_ac_kwh_per_year": assumptions.MEDIAN_WATTAGE_TO_AC * 1600, + "panneled_roof_area": 4 * assumptions.RDSAP_AREA_PER_PANEL + }, + { + "n_panels": 8, + "array_warrage": 3200, + "initial_ac_kwh_per_year": assumptions.MEDIAN_WATTAGE_TO_AC * 3200, + "panneled_roof_area": 8 * assumptions.RDSAP_AREA_PER_PANEL + }, + ] + ) + def __init__(self, property_instance): """ :param property_instance: Instance of the Property class, for the home associated to property_id From 792be8468b2c53406e37dafc752123c70b6bb1d9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Mar 2025 15:47:11 +0000 Subject: [PATCH 236/255] updating solar --- backend/apis/GoogleSolarApi.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index 31ae39bd..a5c1e739 100644 --- a/backend/apis/GoogleSolarApi.py +++ 
b/backend/apis/GoogleSolarApi.py @@ -53,6 +53,10 @@ class GoogleSolarApi: # Max area of a roof space we allow panels for PERCENTAGE_OF_ROOF_LIMIT = 0.8 + # If the roof area that comes back from the solar API is more than 25% larger than the estiamted roof area + # that we calcualte based on the property dimensions, we will correct the roof area + ROOF_AREA_TOLERANCE = 1.25 + # Error Messages ENTITY_NOT_FOUND_ERROR = 'Requested entity was not found.' @@ -167,7 +171,6 @@ class GoogleSolarApi: if self.insights_data.get("error") == self.ENTITY_NOT_FOUND_ERROR: # We use default performance since in this case, we couldn't retrieve data. We don't store self.panel_performance = self.default_panel_performance(property_instance=property_instance) - return self.need_to_store = True @@ -182,7 +185,11 @@ class GoogleSolarApi: ): self.exclude_likely_duplicate_surfaces() + # TODO: We need to constrain the roof area, based on the floor area to be more conservative self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2'] + if self.roof_area > property_instance.roof_area * self.ROOF_AREA_TOLERANCE: + self.roof_area = property_instance.roof_area + self.floor_area = self.insights_data["solarPotential"]["wholeRoofStats"]['groundAreaMeters2'] self.panel_wattage = self.insights_data["solarPotential"]["panelCapacityWatts"] if self.panel_wattage != 400: @@ -279,8 +286,6 @@ class GoogleSolarApi: # minimum is 4 min_panels = self.MIN_BUILDING_PANELS if is_building else self.MIN_UNIT_PANELS - cost_instance = Costs(property_instance=property_instance) if property_instance is not None else None - # Remove any north facing roof segments panel_performance = [] for config in self.insights_data["solarPotential"].get("solarPanelConfigs", []): @@ -314,18 +319,12 @@ class GoogleSolarApi: if roi_summary["n_panels"].sum() < min_panels: continue - if cost_instance is None: - total_cost = Costs.solar_pv( - n_panels=roi_summary["n_panels"].sum(), - has_battery=False, - 
n_floors=3, # Assume the most amount of scaffolding - )["total"] - else: - total_cost = cost_instance.solar_pv( - n_panels=roi_summary["n_panels"].sum(), - has_battery=False, - n_floors=property_instance.number_of_floors, - )["total"] + total_cost = Costs.solar_pv( + n_panels=roi_summary["n_panels"].sum(), + has_battery=False, + # Assume the most amount of scaffolding + n_floors=3 if property_instance is None else property_instance.number_of_floors + )["total"] weighted_ratio = np.average( roi_summary["ratio"].values, weights=roi_summary["generated_dc_energy"].values From c2062507cab66b0d9ec37d7b3021b3353a052409 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 15 Mar 2025 17:34:55 +0000 Subject: [PATCH 237/255] implementing mv --- backend/Property.py | 9 +-- etl/epc/Dataset.py | 16 ++--- etl/epc/Record.py | 26 +++++--- recommendations/Costs.py | 7 +++ recommendations/HeatingControlRecommender.py | 43 ++++++++----- recommendations/HeatingRecommender.py | 21 +++---- recommendations/Recommendations.py | 62 ++++++++++++------- recommendations/VentilationRecommendations.py | 12 +++- recommendations/county_to_region.py | 5 +- 9 files changed, 120 insertions(+), 81 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 498fe0e0..b9c88bc2 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -380,7 +380,7 @@ class Property: for rec in property_recommendations_by_phase: # We simulate the impact of the recommendation at this current phase, and all of the prior phases - if rec["type"] in ["mechanical_ventilation", "trickle_vents", "draught_proofing"]: + if rec["type"] in ["trickle_vents", "draught_proofing"]: continue scoring_dict = self.create_recommendation_scoring_data( @@ -388,7 +388,6 @@ class Property: recommendation_record=recommendation_record, recommendations=previous_phase_representatives + [rec], primary_recommendation_id=rec["recommendation_id"], - non_invasive_recommendations=self.non_invasive_recommendations, ) 
self.recommendations_scoring_data.append(scoring_dict) @@ -494,7 +493,6 @@ class Property: recommendation_record, recommendations: list, primary_recommendation_id: int, - non_invasive_recommendations: list = None, ): """ This function will iterate through a list of recommendations and apply a simulation for each recommendation @@ -503,7 +501,6 @@ class Property: :param recommendation_record: The record of the property, which will be updated :param recommendations: The list of recommendations to apply :param primary_recommendation_id: The id of the primary recommendation, which is used to identify the record - :param non_invasive_recommendations: The list of non-invasive recommendations :return: The updated recommendation record """ @@ -532,7 +529,7 @@ class Property: "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation", "cylinder_thermostat", "loft_insulation", "room_roof_insulation", "flat_roof_insulation", "solid_floor_insulation", "suspended_floor_insulation", "mixed_glazing", - "windows_glazing" + "windows_glazing", "mechanical_ventilation" ]: # We update the data, as defined in the recommendaton for prefix in ["walls", "roof", "floor"]: @@ -558,7 +555,7 @@ class Property: "solid_floor_insulation", "suspended_floor_insulation", "windows_glazing", "solar_pv", "heating", "hot_water_tank_insulation", "heating_control", "secondary_heating", "cylinder_thermostat", "mixed_glazing", - "extension_cavity_wall_insulation", + "extension_cavity_wall_insulation", "mechanical_ventilation", ]: raise NotImplementedError( "Implement me, given type %s" % recommendation["type"] diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 3f2e810e..83a85b78 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -203,11 +203,11 @@ class TrainingDataset(BaseDataset): common_cols = [[col + "_starting", col + "_ending"] for col in common_cols] self.df = self.df.loc[ - :, - no_suffix_cols - + only_ending_cols - + [col for cols in common_cols for col in 
cols], - ] + :, + no_suffix_cols + + only_ending_cols + + [col for cols in common_cols for col in cols], + ] def _remove_abnormal_change_in_floor_area(self): """ @@ -511,7 +511,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"] ) - ] + ] elif component == "floor": expanded_df = expanded_df[ (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) @@ -528,7 +528,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"] ) - ] + ] elif component == "roof": expanded_df = expanded_df[ (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) @@ -541,7 +541,7 @@ class TrainingDataset(BaseDataset): expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"] ) - ] + ] return expanded_df diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 558dbacb..9ff1de0a 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -139,28 +139,22 @@ class EPCRecord: self._clean_records_using_epc_records() self._clean_with_data_processor() - self._expand_prepared_epc_to_attributes() - self._identify_delta_between_prepared_and_original_records() # Process to create uvalues for the single epc record - - # selff.df = self.epc_record_as_dataframe('prepared_epc') - + # self.df = self.epc_record_as_dataframe('prepared_epc') # self._feature_generation() # self._drop_features() return - self._expand_description_to_features() - self._expand_description_to_uvalues() - + # self._expand_description_to_features() + # self._expand_description_to_uvalues() + # # self._generate_uvalues() # self._validate_expanded_description() # self._validate_u_values() - # etc - pass def _drop_features(self): """ @@ -360,6 +354,7 @@ class EPCRecord: self._clean_number_lighting_outlets() self._clean_floor_level() self._clean_floor_height() + self._clean_constituency() # self._clean_potential_energy_efficiency() # 
self._clean_environment_impact_potential() @@ -402,6 +397,17 @@ class EPCRecord: if self.prepared_epc["floor-height"] <= 1.665: self.prepared_epc["floor-height"] = average + def _clean_constituency(self): + """ + We handle the single case of finding a missing constituency by using the local authority + """ + if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""): + if self.prepared_epc["local-authority"] != "E06000044": + raise NotImplementedError( + "This function is only implemented for Portsmouth, in the single edgecase seen" + ) + self.prepared_epc["constituency"] = "E14000883" + def _clean_floor_level(self): """ This method will clean the floor level, if empty or invalid diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 2312dff2..4d25ec18 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -234,6 +234,13 @@ class Costs: if self.region is None: # Try and grab using the local-authority-label self.region = county_to_region_map.get(self.property.data["local-authority-label"], None) + + if self.region is None: + # Try and get the region after converting the keys to lower + self.region = { + k.lower(): v for k, v in county_to_region_map.items() + }.get(self.property.data["local-authority-label"].lower(), None) + if self.region is None: raise ValueError("Region not found in county map") diff --git a/recommendations/HeatingControlRecommender.py b/recommendations/HeatingControlRecommender.py index c613aa42..bd015a79 100644 --- a/recommendations/HeatingControlRecommender.py +++ b/recommendations/HeatingControlRecommender.py @@ -12,7 +12,7 @@ class HeatingControlRecommender: self.recommendation = [] - def recommend(self, heating_description, description_prefix="", description_suffix=""): + def recommend(self, heating_description, phase, description_prefix="", description_suffix=""): # TODO: Many of these functions are quite similar. 
We can possibly create a single wrapper function that # takes in the heating description and the description prefix/suffix, and then creates the appropriate @@ -23,32 +23,32 @@ class HeatingControlRecommender: # This first iteration of the recommender will provide very basic recommendation # We recommend heating controls based on the main heating system if heating_description in ["Room heaters, electric"]: - self.recommend_room_heaters_electric_controls() + self.recommend_room_heaters_electric_controls(phase=phase) return if heating_description in ["Electric storage heaters", "Electric storage heaters, radiators"]: - self.recommend_high_heat_retention_controls(description_prefix=description_prefix) + self.recommend_high_heat_retention_controls(description_prefix=description_prefix, phase=phase) return if heating_description in ["Boiler and radiators, mains gas"]: # We can recommend roomstat programmer trvs - self.recommend_roomstat_programmer_trvs(description_suffix=description_suffix) + self.recommend_roomstat_programmer_trvs(description_suffix=description_suffix, phase=phase) # We can also recommend time and temperature zone controls - self.recommend_time_temperature_zone_controls(description_suffix=description_suffix) + self.recommend_time_temperature_zone_controls(description_suffix=description_suffix, phase=phase) return if heating_description in ["Boiler and radiators, electric"]: - self.recommend_roomstat_programmer_trvs() + self.recommend_roomstat_programmer_trvs(phase=phase) return if heating_description in ["Air source heat pump, radiators, electric"]: # For an ASHP, we can recommend time and temperature zone controls, as well as programmer, trvs and a bypass # which are common configurations for ASHPs - self.recommend_time_temperature_zone_controls() + self.recommend_time_temperature_zone_controls(phase=phase) # self.recommend_programmer_trvs_bypass() - def recommend_room_heaters_electric_controls(self): + def 
recommend_room_heaters_electric_controls(self, phase): """ If the home has Room heaters, electric, we start by identifying potential heating controls that could be upgraded, that would provide a practical impact. This will be the least invasive improvement. @@ -88,6 +88,9 @@ class HeatingControlRecommender: self.recommendation.append( { + "phase": phase, + "type": "heating", + "measure_type": "programmer_appliance_thermostat", "description": "upgrade heating controls to Programmer and Appliance or Smart Thermostats", **self.costs.programmer_and_appliance_thermostat(has_programmer=has_programmer), "simulation_config": simulation_config @@ -97,7 +100,7 @@ class HeatingControlRecommender: # We don't implement any other recommendations right now return - def recommend_high_heat_retention_controls(self, description_prefix=""): + def recommend_high_heat_retention_controls(self, phase, description_prefix=""): """ When applicable, we recommend upgrading the heating controls to high heat retention controls. This is a specific type of control system that is designed to work with electric storage heaters. It is a more @@ -133,6 +136,9 @@ class HeatingControlRecommender: self.recommendation.append( { + "phase": phase, + "type": "heating", + "measure_type": "celect_type_controls", "description": "Upgrade heating controls to High Heat Retention Storage Heater Controls", **self.costs.celect_type_controls(), "simulation_config": simulation_config, @@ -143,7 +149,7 @@ class HeatingControlRecommender: # We don't implement any other recommendations right now return - def recommend_roomstat_programmer_trvs(self, description_suffix=""): + def recommend_roomstat_programmer_trvs(self, phase, description_suffix=""): """ If the home has a boiler and radiators, mains gas, we start by identifying potential heating controls that could be upgraded, that would provide a practical impact. 
@@ -208,15 +214,16 @@ class HeatingControlRecommender: description = "Upgrade heating controls to Room thermostat, programmer and TRVs" - already_installed = "heating_control" in self.property.already_installed + already_installed = "roomstat_programmer_trvs" in self.property.already_installed if already_installed: cost_result = override_costs(cost_result) description = "Heating controls have already been upgraded, no further action needed." self.recommendation.append( { - "type": "heating_control", + "type": "heating", "measure_type": "roomstat_programmer_trvs", + "phase": phase, "parts": [], "description": description, **cost_result, @@ -231,7 +238,7 @@ class HeatingControlRecommender: return - def recommend_time_temperature_zone_controls(self, description_suffix=""): + def recommend_time_temperature_zone_controls(self, phase, description_suffix=""): """ If the home has a boiler, we can recommend time and temperature zone controls. This is a more advanced and more efficient control system than the standard controls that come with a boiler. However, it may come @@ -282,14 +289,15 @@ class HeatingControlRecommender: "temperature zone control)" ) - already_installed = "heating_control" in self.property.already_installed + already_installed = "time_temperature_zone_control" in self.property.already_installed if already_installed: cost_result = override_costs(cost_result) description = "Heating controls have already been upgraded, no further action needed." 
self.recommendation.append( { - "type": "heating_control", + "type": "heating", + "phase": phase, "measure_type": "time_temperature_zone_control", "parts": [], "description": description, @@ -335,14 +343,15 @@ class HeatingControlRecommender: description = "Install a Bypass valve, TRVs and a Programmer" - already_installed = "heating_control" in self.property.already_installed + already_installed = "programmer_trvs_bypass" in self.property.already_installed if already_installed: cost_result = override_costs(cost_result) description = "Heating controls have already been upgraded, no further action needed." self.recommendation.append( { - "type": "heating_control", + "type": "heating", + "measure_type": "programmer_trvs_bypass", "parts": [], "description": description, **cost_result, diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index e4dd3a78..20f5e7ad 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -65,7 +65,6 @@ class HeatingRecommender: self.costs = Costs(self.property) self.heating_recommendations = [] - self.heating_control_recommendations = [] self.has_electric_heating_description = ( self.property.main_heating["has_electric"] or self.property.main_heating["has_electricaire"] @@ -259,7 +258,6 @@ class HeatingRecommender: "ashp_only_heating_recommendation", False ) self.heating_recommendations = [] - self.heating_control_recommendations = [] # This first iteration of the recommender will provide very basic recommendation # We recommend heating controls based on the main heating system @@ -302,7 +300,6 @@ class HeatingRecommender: self.recommend_air_source_heat_pump( phase=phase, has_cavity_or_loft_recommendations=has_cavity_or_loft_recommendations, - ) return @@ -360,7 +357,7 @@ class HeatingRecommender: } controls_recommender = HeatingControlRecommender(self.property) - controls_recommender.recommend(heating_description="Boiler and radiators, electric") + 
controls_recommender.recommend(heating_description="Boiler and radiators, electric", phase=phase) self.heating_recommendations.extend([boiler_recommendation] + controls_recommender.recommendation) return @@ -453,7 +450,7 @@ class HeatingRecommender: ), {}) controls_recommender = HeatingControlRecommender(self.property) - controls_recommender.recommend(heating_description="Air source heat pump, radiators, electric") + controls_recommender.recommend(heating_description="Air source heat pump, radiators, electric", phase=phase) ashp_size = self.size_heat_pump() ashp_costs = self.costs.air_source_heat_pump(ashp_size) @@ -805,7 +802,9 @@ class HeatingRecommender: description_prefix = "" controls_recommender.recommend( - heating_description="Electric storage heaters", description_prefix=description_prefix + heating_description="Electric storage heaters", + description_prefix=description_prefix, + phase=phase ) has_hhr = self.is_hhr_already_installed() @@ -1120,10 +1119,10 @@ class HeatingRecommender: description_suffix = "" controls_recommender.recommend( heating_description="Boiler and radiators, mains gas", - description_suffix=description_suffix + description_suffix=description_suffix, + phase=recommendation_phase ) # We may have 2 recommendations from the heating controls - if not controls_recommender.recommendation and not boiler_recommendation: return @@ -1161,10 +1160,6 @@ class HeatingRecommender: # 3) Heating controls only # But they are options that are not mutually exclusive # So, we actually set heating controls as a heating recommendation - for recommendation in controls_recommender.recommendation: - recommendation["phase"] = recommendation_phase - # recommendation["type"] = "heating" - - self.heating_control_recommendations.extend(controls_recommender.recommendation) + self.heating_recommendations.extend(controls_recommender.recommendation) return diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 715332a5..edaa611a 
100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -149,9 +149,10 @@ class Recommendations: (self.wall_recomender.recommendations or self.roof_recommender.recommendations) and ("ventilation" in measures) ): - self.ventilation_recomender.recommend() + self.ventilation_recomender.recommend(phase=phase) if self.ventilation_recomender.recommendation: property_recommendations.append(self.ventilation_recomender.recommendation) + phase += 1 if "trickle_vents" in measures: # This is a recommendatin that typically comes from an energy assessment @@ -208,27 +209,25 @@ class Recommendations: measures=measures, has_cavity_or_loft_recommendations=has_cavity_or_loft_recommendations, ) - if ( - self.heating_recommender.heating_recommendations or - self.heating_recommender.heating_control_recommendations - ): + if self.heating_recommender.heating_recommendations: # We split into first and second phase recommendations first_phase_recommendations = [ r for r in ( - self.heating_recommender.heating_recommendations + - self.heating_recommender.heating_control_recommendations + self.heating_recommender.heating_recommendations ) if r["phase"] == phase ] second_phase_recommendations = [ r for r in ( - self.heating_recommender.heating_recommendations + - self.heating_recommender.heating_control_recommendations + self.heating_recommender.heating_recommendations ) if r["phase"] == phase + 1 ] + if first_phase_recommendations and second_phase_recommendations: + raise Exception("Imeplement me") + if first_phase_recommendations: property_recommendations.append(first_phase_recommendations) @@ -240,8 +239,7 @@ class Recommendations: # otherwise we incremenet by 1 max_used_phase = max( [rec["phase"] for rec in - self.heating_recommender.heating_recommendations + - self.heating_recommender.heating_control_recommendations] + self.heating_recommender.heating_recommendations] ) amount_to_increment = max_used_phase - phase + 1 phase += amount_to_increment @@ 
-306,7 +304,7 @@ class Recommendations: # want to include the cavity wall insulation recommendation in the defaults if recommendations_by_type[0].get("type") in [ - "mechanical_ventilation", "trickle_vents", "draught_proofing" + "trickle_vents", "draught_proofing" ]: continue @@ -480,12 +478,14 @@ class Recommendations: increasing_variables = ["sap"] decreasing_variables = ["carbon", "heat_demand"] + # If the recommendation is mechanical ventilation, we don't apply the rule that the new value should be higher + mv_increasing_variables = ["carbon", "heat_demand"] + mv_decreasing_variables = ["sap"] + impact_summary = [] for recommendations_by_type in property_recommendations: for rec in recommendations_by_type: - if rec["type"] in [ - "mechanical_ventilation", "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation" - ]: + if rec["type"] in ["trickle_vents", "draught_proofing", "extension_cavity_wall_insulation"]: # We don't have a percieved sap impact of mechanical ventilation or trickle vents, and we don't # have the capacity to score draught proofing if rec["type"] == "extension_cavity_wall_insulation": @@ -571,13 +571,23 @@ class Recommendations: # For decreasing variables, the new value should be lower than the previous, otherwise we set it to # the previous # In either case, we adjudge the recommendation to have had no/negligible impact - for v in increasing_variables: + # However, if the recommendation is mechanical ventilation, this can have a negative SAP impact so + # we don't apply this rule + + if rec["type"] == "mechanical_ventilation": + phase_increasing_variables = mv_increasing_variables + phase_decreasing_variables = mv_decreasing_variables + else: + phase_increasing_variables = increasing_variables + phase_decreasing_variables = decreasing_variables + + for v in phase_increasing_variables: current_phase_values[v] = ( current_phase_values[v] if current_phase_values[v] > previous_phase_values[v] else previous_phase_values[v] ) for v in 
previous_phase_values: - if v in decreasing_variables: + if v in phase_decreasing_variables: current_phase_values[v] = ( current_phase_values[v] if current_phase_values[v] < previous_phase_values[v] else previous_phase_values[v] @@ -592,13 +602,19 @@ class Recommendations: "heat_demand": previous_phase_values["heat_demand"] - current_phase_values["heat_demand"], } - # Prevent from being negative + # Prevent from being negative - apart from ventilation for metric in ["sap", "carbon", "heat_demand"]: - property_phase_impact[metric] = ( - 0 if property_phase_impact[metric] < 0 else property_phase_impact[metric] - ) - if metric == "sap": - property_phase_impact[metric] = round(property_phase_impact[metric], 2) + if rec["type"] != "mechanical_ventilation": + property_phase_impact[metric] = ( + 0 if property_phase_impact[metric] < 0 else property_phase_impact[metric] + ) + if metric == "sap": + property_phase_impact[metric] = round(property_phase_impact[metric], 2) + else: + # We prevent these from being positive + property_phase_impact[metric] = ( + 0 if property_phase_impact[metric] > 0 else property_phase_impact[metric] + ) # For the moment, we cap the number of SAP points that can be achieved by LEDs at 2 if rec["type"] == "low_energy_lighting": diff --git a/recommendations/VentilationRecommendations.py b/recommendations/VentilationRecommendations.py index 9738b898..a82e4df5 100644 --- a/recommendations/VentilationRecommendations.py +++ b/recommendations/VentilationRecommendations.py @@ -29,7 +29,7 @@ class VentilationRecommendations(Definitions): def identify_ventilation(self): self.has_ventilaion = self.property.data["mechanical-ventilation"] in self.VENTILATION_DESCRIPTIONS - def recommend(self): + def recommend(self, phase): """ If there is no ventilation, we recommend installing ventilation @@ -63,7 +63,7 @@ class VentilationRecommendations(Definitions): # We recommend installing two mechanical ventilation systems self.recommendation = [ { - "phase": None, + 
"phase": phase, "parts": part, "type": part[0]["type"], "measure_type": "mechanical_ventilation", @@ -79,7 +79,13 @@ class VentilationRecommendations(Definitions): "total": estimated_cost, # We use a very simple and rough estimate of 4 hours per unit "labour_hours": labour_hours, - "labour_days": labour_days # Assume 8 hour day + "labour_days": labour_days, # Assume 8 hour day + "simulation_config": { + "mechanical_ventilation_ending": "mechanical, extract only", + }, + "description_simulation": { + "mechanical-ventilation": "mechanical, extract only" + } } ] diff --git a/recommendations/county_to_region.py b/recommendations/county_to_region.py index e84b5698..13c1cdaa 100644 --- a/recommendations/county_to_region.py +++ b/recommendations/county_to_region.py @@ -135,7 +135,10 @@ county_to_region_map = { 'Merthyr Tydfil': 'Wales', 'Monmouthshire': 'Wales', 'Mountain Ash': 'Wales', 'Neath Port Talbot': 'Wales', 'Newport': 'Wales', 'Pembrokeshire': 'Wales', 'Penarth': 'Wales', 'Pentre': 'Wales', 'Pontyclun': 'Wales', 'Pontypridd': 'Wales', 'Porth': 'Wales', 'Porthcawl': 'Wales', 'Powys': 'Wales', 'Rhondda Cynon Taff': 'Wales', - 'Rhoose': 'Wales', 'Sully': 'Wales', 'Swansea': 'Wales', 'The Vale of Glamorgan': 'Wales', 'Tonypandy': 'Wales', + 'Rhoose': 'Wales', 'Sully': 'Wales', 'Swansea': 'Wales', + 'The Vale of Glamorgan': 'Wales', + 'Vale of Glamorgan': 'Wales', + 'Tonypandy': 'Wales', 'Torfaen': 'Wales', 'Treharris': 'Wales', 'Treorchy': 'Wales', 'Wrexham': 'Wales', 'Birmingham': 'West Midlands', 'Bromsgrove': 'West Midlands', 'Cannock Chase': 'West Midlands', 'Coventry': 'West Midlands', 'Dudley': 'West Midlands', 'East Staffordshire': 'West Midlands', 'Herefordshire': 'West Midlands', From 91b9530578457ba84f7b9db92013dfe57968bdc1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 15 Mar 2025 18:25:15 +0000 Subject: [PATCH 238/255] implemented change to mechanical ventilation --- backend/app/plan/router.py | 58 ++++++++++------------ 
recommendations/LightingRecommendations.py | 2 + recommendations/Recommendations.py | 20 +++++--- 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index d82e774b..4f2b578e 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -445,6 +445,16 @@ async def trigger_plan(body: PlanTriggerRequest): bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", ) + # Set up model api and warm up the lambdas + model_api = ModelApi( + portfolio_id=body.portfolio_id, + timestamp=created_at, + prediction_buckets=get_prediction_buckets() + ) + await model_api.async_warm_up_lambdas( + model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES + ) + input_properties = [] for config in tqdm(plan_input): # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly @@ -539,16 +549,6 @@ async def trigger_plan(body: PlanTriggerRequest): if not input_properties: return Response(status_code=204) - # Set up model api and warm up the lambdas - model_api = ModelApi( - portfolio_id=body.portfolio_id, - timestamp=created_at, - prediction_buckets=get_prediction_buckets() - ) - await model_api.async_warm_up_lambdas( - model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES - ) - # The materials data could be cached or local so we don't need to make # consistent requests to the backend for # the same data @@ -683,8 +683,6 @@ async def trigger_plan(body: PlanTriggerRequest): ) # We now insert kwh estimates and costs into the recommendations - # TODO: We should join the methodology which maps the heating and hot water descriptions to the fuel types in - # Recommendations, but also the Property class logger.info("Calculating tenant savings - kwh and bills") for property_id in tqdm([p.id for p in input_properties]): property_recommendations = recommendations.get(property_id, []) @@ -701,8 +699,6 @@ async def 
trigger_plan(body: PlanTriggerRequest): property_instance.current_energy_bill = property_current_energy_bill # Insert the predictions into the recommendations and run the optimiser - # TODO: If a recommendation has a negative impact on SAP, we should remove it - this seems to have become a - # possibility with heating system? for p in input_properties: if not recommendations.get(p.id): @@ -814,23 +810,23 @@ async def trigger_plan(body: PlanTriggerRequest): # Funding # ~~~~~~~~~~~~~~~~ - for p in input_properties: - funding_calulator = Funding( - tenure=body.housing_type, - starting_epc=p.data["current-energy-rating"], - starting_sap=int(p.data["current-energy-efficiency"]), - postcode=p.postcode, - floor_area=p.floor_area, - council_tax_band=None, # This is seemingly always None at the moment - property_recommendations=recommendations[p.id], - project_scores_matrix=eco_project_scores_matrix, - whlg_eligible_postcodes=whlg_eligible_postcodes, - gbis_abs_rate=15, - eco4_abs_rate=15, - ) - funding_calulator.check_eligibiltiy() - # Insert finding - p.insert_funding(funding_calulator) + # for p in input_properties: + # funding_calulator = Funding( + # tenure=body.housing_type, + # starting_epc=p.data["current-energy-rating"], + # starting_sap=int(p.data["current-energy-efficiency"]), + # postcode=p.postcode, + # floor_area=p.floor_area, + # council_tax_band=None, # This is seemingly always None at the moment + # property_recommendations=recommendations[p.id], + # project_scores_matrix=eco_project_scores_matrix, + # whlg_eligible_postcodes=whlg_eligible_postcodes, + # gbis_abs_rate=15, + # eco4_abs_rate=15, + # ) + # funding_calulator.check_eligibiltiy() + # # Insert finding + # p.insert_funding(funding_calulator) logger.info("Uploading recommendations to the database") # If we have any work to do, we create a new scenario diff --git a/recommendations/LightingRecommendations.py b/recommendations/LightingRecommendations.py index f9a1d63a..3447394d 100644 --- 
a/recommendations/LightingRecommendations.py +++ b/recommendations/LightingRecommendations.py @@ -4,6 +4,7 @@ from backend.Property import Property from typing import List from recommendations.Costs import Costs from recommendations.recommendation_utils import override_costs +from backend.ml_models.AnnualBillSavings import AnnualBillSavings class LightingRecommendations: @@ -161,6 +162,7 @@ class LightingRecommendations: # the proportion of lights that will be set to low energy "sap_points": sap_points, "kwh_savings": heat_demand_change, + "energy_cost_savings": heat_demand_change * AnnualBillSavings.ELECTRICITY_PRICE_CAP, "co2_equivalent_savings": carbon_change, "description_simulation": { "lighting-energy-eff": "Very Good", diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index edaa611a..813f5a80 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -858,7 +858,7 @@ class Recommendations: for recs in property_recommendations: for rec in recs: if rec["type"] in [ - "mechanical_ventilation", "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation" + "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation" ]: # We cannot score the impact on draught proofing continue @@ -883,13 +883,18 @@ class Recommendations: heating_kwh_savings = ( previous_phase_impact["predictions_heating"].mean() - rec_impact["predictions_heating"].values[0] ) - heating_cost_savings = ( - previous_phase_impact["heating_cost"].mean() - rec_impact["heating_cost"].values[0] - ) - hotwater_kwh_savings = ( previous_phase_impact["predictions_hotwater"].mean() - rec_impact["predictions_hotwater"].values[0] ) + + # Shouldn't be positive + if rec["type"] == "mechanical_ventilation": + heating_kwh_savings = 0 if heating_kwh_savings > 0 else heating_kwh_savings + hotwater_kwh_savings = 0 if hotwater_kwh_savings > 0 else hotwater_kwh_savings + + heating_cost_savings = ( + 
previous_phase_impact["heating_cost"].mean() - rec_impact["heating_cost"].values[0] + ) hotwater_host = ( previous_phase_impact["hotwater_cost"].mean() - rec_impact["hotwater_cost"].values[0] ) @@ -897,9 +902,8 @@ class Recommendations: total_kwh_savings = heating_kwh_savings + hotwater_kwh_savings energy_cost_savings = heating_cost_savings + hotwater_host - if rec["type"] == "lighting": - # In this case, we should probably just SKIP but check when we have one! - raise Exception("Implement me 3") + if rec["type"] == "low_energy_lighting": + continue rec["kwh_savings"] = total_kwh_savings rec["energy_cost_savings"] = energy_cost_savings From 292da782a076d3da1e90d9ff4c081f50a64524e5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 16 Mar 2025 18:47:41 +0000 Subject: [PATCH 239/255] changing simulation methodology to use corrected floor area --- backend/Property.py | 34 +++++++------ backend/app/db/models/materials.py | 1 + backend/app/plan/router.py | 35 +++++++++----- backend/ml_models/api.py | 37 +++++++++++--- etl/costs/app.py | 6 ++- recommendations/Costs.py | 10 ++-- recommendations/Recommendations.py | 1 + recommendations/RoofRecommendations.py | 31 ++++++------ recommendations/SecondaryHeating.py | 18 +------ recommendations/SolarPvRecommendations.py | 59 +++-------------------- 10 files changed, 105 insertions(+), 127 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index b9c88bc2..e6e43efe 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -70,6 +70,10 @@ class Property: # Contains the solar panel optimisation results from the Google Solar API solar_panel_configuration = None + # If true, indicates the floor area has actually been given to us by the owner, and we should use this figure + # instead of the one in the EPC, when we simulate + owner_floor_area = False + def __init__( self, id, @@ -241,6 +245,10 @@ class Property: insulation_wall_area = kwargs.get("insulation_wall_area", None) insulation_wall_area = 
float(insulation_wall_area) if insulation_wall_area not in [None, ""] else None + # We allow for the asset owner to provide us with total floor area, in the event of it being incorrect + floor_area = kwargs.get("floor_area", None) + floor_area = float(floor_area) if floor_area not in [None, ""] else None + return { "n_bathrooms": n_bathrooms, "n_bedrooms": n_bedrooms, @@ -248,12 +256,15 @@ class Property: "insulation_floor_area": insulation_floor_area, "insulation_wall_area": insulation_wall_area, "building_id": kwargs.get("building_id", None), + "floor_area": floor_area } def parse_kwargs(self, kwargs): # We extract the elements from kwargs that we recognise. Anything additional is ignored for arg, val in kwargs.items(): if val is not None: + if arg == "floor_area": + self.owner_floor_area = True setattr(self, arg, val) def create_base_difference_epc_record(self, cleaned_lookup: dict): @@ -263,14 +274,7 @@ class Property: It will be the same starting and ending EPC, as we don't have the expected EPC yet """ - # difference_record = self.epc_record - self.epc_record - - # TODO: change these lower and replace in the settings file - # print( - # "CHANGE THE LATEST FIELD TO REMOVE NUMBER HABITABLE ROOMS IF WE WANT TO USE STARTING/ENDING" - # ) fixed_data_col_names = MANDATORY_FIXED_FEATURES + LATEST_FIELD - # print("NEED TO CHANGE THE DASH TO LOWER CASE") fixed_data_col_names = [ x.lower().replace("_", "-") for x in fixed_data_col_names ] @@ -281,8 +285,6 @@ class Property: if k in fixed_data_col_names } - # difference_record.append_fixed_data(fixed_data) - difference_record = self.epc_record.create_EPCDifferenceRecord( self.epc_record, fixed_data ) @@ -291,10 +293,11 @@ class Property: datasets=[difference_record], cleaned_lookup=cleaned_lookup ) - # TODO: adjust the base difference record with the previously calculated u values + features - # estimated_perimeter is different to the perimeter in the epc record - - # self.base_difference_record.df + # If we have 
variables that have been given to us by the landlord that we know are correct, whereas the EPC + # may not be, we use them + if self.owner_floor_area is not None: + self.base_difference_record.df["total_floor_area_ending"] = self.floor_area + self.base_difference_record.df["estimated_perimeter_ending"] = self.perimeter def simulate_all_representative_recommendations( self, property_representative_recommendations, @@ -1254,7 +1257,10 @@ class Property: # If the property is in a conservation area, is listed or is a heriage building, solar panels # become a difficult measure to generally get through planning restrictions and so we do not recommend # solar panels - if self.restricted_measures: + if self.is_listed or self.is_heritage: + # If the property is in a conservation area, we can still recommend solar panels + # but they need to be done in a way that is sympathetic to the building. E.g. the panels + # may be installed such that they are not visible from the street return False is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"] diff --git a/backend/app/db/models/materials.py b/backend/app/db/models/materials.py index f0af3343..9f8abbf4 100644 --- a/backend/app/db/models/materials.py +++ b/backend/app/db/models/materials.py @@ -19,6 +19,7 @@ class MaterialType(enum.Enum): flat_roof_insulation = "flat_roof_insulation" room_roof_insulation = "room_roof_insulation" windows_glazing = "windows_glazing" + cavity_wall_extraction = "cavity_wall_extraction" iwi_wall_demolition = "iwi_wall_demolition" iwi_vapour_barrier = "iwi_vapour_barrier" diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 4f2b578e..b6b576b3 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -445,16 +445,6 @@ async def trigger_plan(body: PlanTriggerRequest): bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", ) - # Set up model api and warm up the lambdas - model_api = 
ModelApi( - portfolio_id=body.portfolio_id, - timestamp=created_at, - prediction_buckets=get_prediction_buckets() - ) - await model_api.async_warm_up_lambdas( - model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES - ) - input_properties = [] for config in tqdm(plan_input): # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly @@ -549,6 +539,17 @@ async def trigger_plan(body: PlanTriggerRequest): if not input_properties: return Response(status_code=204) + # Set up model api and warm up the lambdas + model_api = ModelApi( + portfolio_id=body.portfolio_id, + timestamp=created_at, + prediction_buckets=get_prediction_buckets(), + max_retries=1 + ) + await model_api.async_warm_up_lambdas( + model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES + ) + # The materials data could be cached or local so we don't need to make # consistent requests to the backend for # the same data @@ -699,7 +700,6 @@ async def trigger_plan(body: PlanTriggerRequest): property_instance.current_energy_bill = property_current_energy_bill # Insert the predictions into the recommendations and run the optimiser - for p in input_properties: if not recommendations.get(p.id): continue @@ -712,8 +712,13 @@ async def trigger_plan(body: PlanTriggerRequest): else: current_sap_points = int(p.data["current-energy-efficiency"]) - target_sap_points = epc_to_sap_lower_bound(body.goal_value) - sap_gain = CostOptimiser.calculate_sap_gain_with_slack(target_sap_points - current_sap_points) + ventilation_impact = next( + (r[0]["sap_points"] for r in recommendations[p.id] if r[0]["type"] == "mechanical_ventilation"), + 0 + ) + sap_gain = CostOptimiser.calculate_sap_gain_with_slack( + epc_to_sap_lower_bound(body.goal_value) - current_sap_points + ) + abs(ventilation_impact) if not body.optimise: if body.goal != "Increasing EPC": @@ -778,6 +783,10 @@ async def trigger_plan(body: PlanTriggerRequest): final_recommendations = [ rec for 
recommendations_by_type in final_recommendations for rec in recommendations_by_type ] + # Get defaults + defaults = [r for r in final_recommendations if r["default"]] + sum([r['sap_points'] for r in defaults]) + recommendations[p.id] = final_recommendations # when we have buildings, we tweak our solar PV recommendations as if one unit needs it, we apply it to all diff --git a/backend/ml_models/api.py b/backend/ml_models/api.py index c2f2dcd9..c108f1b7 100644 --- a/backend/ml_models/api.py +++ b/backend/ml_models/api.py @@ -39,6 +39,7 @@ class ModelApi: timestamp, prediction_buckets, base_url="https://api.dev.hestia.homes", + max_retries=2, ): """ This class handles the communication with the Model APIs. These models include SAP change, heat demain change @@ -54,6 +55,8 @@ class ModelApi: self.timestamp = timestamp self.prediction_buckets = prediction_buckets + self.max_retries = max_retries + @staticmethod def predictions_template(): return { @@ -295,15 +298,33 @@ class ModelApi: async def run_batches(): for chunk in tqdm(to_loop_over, total=len(to_loop_over)): - predictions_dict = await self.predict_all_async( - df=data.iloc[chunk:chunk + batch_size], - bucket=bucket, - model_prefixes=model_prefixes, - extract_ids=extract_ids - ) - for key, scored in predictions_dict.items(): - all_predictions[key] = pd.concat([all_predictions[key], scored]) + attempts = 0 + success = False + while attempts <= self.max_retries and not success: + try: + predictions_dict = await self.predict_all_async( + df=data.iloc[chunk:chunk + batch_size], + bucket=bucket, + model_prefixes=model_prefixes, + extract_ids=extract_ids + ) + + for key, scored in predictions_dict.items(): + all_predictions[key] = pd.concat([all_predictions[key], scored]) + + success = True + except Exception as e: + attempts += 1 + logger.error( + f"Batch {chunk}-{chunk + batch_size} failed (Attempt {attempts}/{self.max_retries}). 
" + f"Error: {e}" + ) + + if attempts > self.max_retries: + logger.error( + f"Skipping batch {chunk}-{chunk + batch_size} after {self.max_retries} failed attempts." + ) # Check if there is an existing event loop try: diff --git a/etl/costs/app.py b/etl/costs/app.py index 797191d2..f2bf365b 100644 --- a/etl/costs/app.py +++ b/etl/costs/app.py @@ -11,7 +11,7 @@ import inspect src_file_path = inspect.getfile(lambda: None) -DATA_DIRECTORY = Path(src_file_path).parent / "local_data" / "20240917 Hestia Materials.xlsx" +DATA_DIRECTORY = Path(src_file_path).parent / "local_data" / "20250316 Domna Materials.xlsx" # Environment file is at the same level as this file ENV_FILE = Path(src_file_path).parent / "etl" / "costs" / ".env" dotenv.load_dotenv(ENV_FILE) @@ -91,6 +91,7 @@ def app(): lel_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="low_energy_lighting", header=0) flat_roof_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="flat_roof_insulation", header=0) window_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="window_glazing", header=0) + rir_insulation_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="room_roof_insulation", header=0) # Form a single table to be uploaded costs = pd.concat( @@ -104,7 +105,8 @@ def app(): ewi_costs, lel_costs, flat_roof_costs, - window_costs + window_costs, + rir_insulation_costs, ] ) diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 4d25ec18..5a39bee3 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -101,10 +101,10 @@ INSTALLER_ASHP_COSTS = [ BOILER_UPGRADE_SCHEME_ASHP_VALUE = 7500 INSTALLER_SOLAR_BATTERY_COSTS = [ - {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 2700.00, 'installer': 'CEG'}, - {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, - {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, - {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 
5950.00, 'installer': 'CEG'} + {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 2030.40, 'installer': 'CEG'}, + # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, + # {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, + # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} ] # This is based on https://www.checkatrade.com/blog/cost-guides/cost-smart-thermostat/ @@ -149,7 +149,7 @@ CONDENSING_BOILER_COSTS = { ELECTRIC_BOILER_COSTS = 1800 # Assumes 1 hours to remove each heater (including re-decorating) -ROOM_HEATER_REMOVAL_COST = 50 +ROOM_HEATER_REMOVAL_COST = 25 ROOM_HEATER_REMOVAL_LABOUR_HOURS = 3 # This is a cost quoted by Jim for a system flush - existig system will run more efficiently diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 813f5a80..8a6b01ab 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -461,6 +461,7 @@ class Recommendations: :param property_instance: Instance of the Property class, for the home associated to property_id :param all_predictions: dictionary of predictions from the model apis :param recommendations: dictionary of recommendations for the property + :param representative_recommendations: dictionary of representative recommendations for the property :return: """ diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index b7e34406..5f9707d9 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -52,6 +52,10 @@ class RoofRecommendations: part for part in materials if part["type"] == "flat_roof_insulation" ] + self.room_roof_insulation_materials = [ + part for part in materials if part["type"] == "room_roof_insulation" + ] + # Extract the insulation thickness from the roof, which is used throughout this 
method self.insulation_thickness = convert_thickness_to_numeric( self.property.roof["insulation_thickness"], @@ -496,29 +500,22 @@ class RoofRecommendations: :return: """ - # TODO: We temporarilty use costs from SCIS for RIR insulation. The costing was £180/m2 floor - roof_roof_insulation_materials = [ - { - "type": "room_roof_insulation", - "measure_type": "room_roof_insulation", - "description": "Insulating the ceiling of the roof roof and re-decorate", - "depths": [100], - "depth_unit": "mm", - "r_value_per_mm": 0.038, - "thermal_conductivity": 0.022, - "cost": [180], - } - ] + # We have a list of materials that can be used for room roof insulation + # We will iterate over these materials and recommend them based on the current u-value of the roof + # and the cost of the materials rir_non_invasive_recommendation = next( (x for x in self.property.non_invasive_recommendations if x["type"] == "room_roof_insulation"), {} ) + insulation_materials = pd.DataFrame(self.room_roof_insulation_materials) + # lowest_selected_u_value = None recommendations = [] - for material in roof_roof_insulation_materials: - for depth, cost_per_unit in zip(material["depths"], material["cost"]): - part_u_value = r_value_per_mm_to_u_value(depth, material["r_value_per_mm"]) + for _, material_group in insulation_materials.groupby("description"): + for material in material_group.itertuples(): + + part_u_value = r_value_per_mm_to_u_value(material.depth, material.r_value_per_mm) _, new_u_value = calculate_u_value_uplift(u_value, part_u_value) new_u_value = math.ceil(new_u_value * 100.0) / 100.0 @@ -526,7 +523,7 @@ class RoofRecommendations: # We allow a small tolerance for error so we don't discount the recommendation entirely estimated_cost = ( - cost_per_unit * self.property.insulation_floor_area if + material.total_cost * self.property.insulation_floor_area if rir_non_invasive_recommendation.get("cost") is None else rir_non_invasive_recommendation.get("cost") ) diff --git 
a/recommendations/SecondaryHeating.py b/recommendations/SecondaryHeating.py index a9d5de04..e63951d9 100644 --- a/recommendations/SecondaryHeating.py +++ b/recommendations/SecondaryHeating.py @@ -9,12 +9,6 @@ class SecondaryHeating: system. """ - # The list of existing heating systems that are accepted - ACCEPTED_MAINHEAT_DESCRIPTIONS = ["Boiler and radiators, mains gas", "Electric storage heaters"] - ACCEPTED_SECONDHEAT_DESCRIPTIONS = ["Room heaters, electric", 'Portable electric heaters (assumed)'] - # These are the heaters where works are required to remove them - FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric"] - def __init__(self, property_instance: Property): self.property = property_instance self.costs = Costs(self.property) @@ -25,18 +19,10 @@ class SecondaryHeating: # Reset self.recommendation = [] - if self.property.main_heating["clean_description"] not in self.ACCEPTED_MAINHEAT_DESCRIPTIONS: - return - - # TODO: We need to clean secondary data - if self.property.data['secondheat-description'] not in self.ACCEPTED_SECONDHEAT_DESCRIPTIONS: - return - - if self.property.data['secondheat-description'] in self.FIXED_HEATER_DESCRIPTIONS: - # We have an associated cost otherwise, there is no cost + if self.property.data['number-habitable-rooms'] > self.property.data['number-heated-rooms']: n_rooms = self.property.data['number-habitable-rooms'] - self.property.data['number-heated-rooms'] else: - n_rooms = 0 + n_rooms = self.property.data["number-heated-rooms"] costs = self.costs.heater_removal(n_rooms=n_rooms) diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index 77e8fd10..ee07ff28 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -7,14 +7,6 @@ from recommendations.recommendation_utils import override_costs, estimate_pitche class SolarPvRecommendations: - # Solar panel specs based on Eurener 400s solar panels - # 
https://midsummerwholesale.co.uk/buy/eurener/eurener-400w-mepv-zebra-ab-half-cut-mono - # Approximate area of the solar panels - SOLAR_PANEL_AREA = 1.79 - # Wattage per panel - this is based on the average wattage of a solar panel being between 250w and 420w - # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group - SOLAR_PANEL_WATTAGE = 400 - # For domestic properties, we don't recommend a solar PV system with wattage outside of these # bounds MAX_SYSTEM_WATTAGE = 6000 @@ -65,46 +57,6 @@ class SolarPvRecommendations: return trimmed_list - def mds_recommend(self, phase=None, solar_pv_percentage=0.5): - # For specific usage within the mds report - - solar_pv_roof_area = self.property.get_solar_pv_roof_area(solar_pv_percentage) - - number_solar_panels = np.floor(solar_pv_roof_area / self.SOLAR_PANEL_AREA) - solar_panel_wattage = number_solar_panels * self.SOLAR_PANEL_WATTAGE - - solar_panel_wattage = np.clip( - a=solar_panel_wattage, a_min=self.MIN_SYSTEM_WATTAGE, a_max=self.MAX_SYSTEM_WATTAGE - ) - - # We now have a property which is potentially suitable for solar PV - roof_coverage_percent = round(solar_pv_percentage * 100) - # Given the wattage, we estimate the cost of the solar PV system. This is based on the MCS database - # of solar PV installations - cost_result = self.costs.solar_pv(wattage=solar_panel_wattage, has_battery=False) - kw = np.floor(solar_panel_wattage / 100) / 10 - - description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) p" - f"anel system on {round(roof_coverage_percent)}% the roof.") - - return [ - { - "phase": phase, - "parts": [], - "type": "solar_pv", - "description": description, - "starting_u_value": None, - "new_u_value": None, - "sap_points": None, - "already_installed": False, - **cost_result, - # This is required for simulating the SAP impact. 
solar_pv_percentage is between 0 & 1 so we scale - # back up here - "photo_supply": roof_coverage_percent, - "has_battery": False - } - ] - def recommend_building_analysis(self, phase): """ This recommendation approach handles the case of producing solar PV recommendations at the building level, @@ -258,11 +210,14 @@ class SolarPvRecommendations: ) kw = np.floor(recommendation_config["array_wattage"] / 100) / 10 if has_battery: - description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) panel system on " - f"{round(roof_coverage_percent)}% the roof, with a battery storage system.") + description = ( + f"Install a {kw} kilowatt-peak (kWp) solar panel system, with a battery." + ) else: - description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) p" - f"anel system on {round(roof_coverage_percent)}% the roof.") + description = f"Install a {kw} kilowatt-peak (kWp) solar panel system." + + if self.property.in_conservation_area: + description += " Property is in a consevation area - please check with local planning authority." already_installed = "solar_pv" in self.property.already_installed if already_installed: From 7111f1a43af2912cdcb735a45866aaef33600d38 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Mar 2025 18:50:21 +0000 Subject: [PATCH 240/255] implemented required measures for mod --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 5 +- asset_list/app.py | 25 ++ backend/apis/GoogleSolarApi.py | 2 +- backend/app/plan/router.py | 86 +++- backend/app/plan/schemas.py | 2 + etl/customers/mod/pilot/1. Create Sample.py | 6 +- .../mod/pilot/2. Create Excel Model.py | 398 ++++++++++++++++++ etl/customers/united living/get_data.py | 73 ++++ recommendations/Costs.py | 60 ++- recommendations/FloorRecommendations.py | 4 +- .../optimiser/optimiser_functions.py | 49 ++- 13 files changed, 661 insertions(+), 53 deletions(-) create mode 100644 etl/customers/mod/pilot/2. 
Create Excel Model.py create mode 100644 etl/customers/united living/get_data.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index b3dbd512..81b973b9 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -545,7 +545,10 @@ class AssetList: raise ValueError("Missing full address - please specify columns to concatenate") self.full_address_colname = self.STANDARD_FULL_ADDRESS self.standardised_asset_list[self.full_address_colname] = ( - self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1) + self.standardised_asset_list[self.full_address_cols_to_concat].apply( + lambda x: ", ".join([y for y in x if not pd.isnull(y)]), + axis=1 + ) ) else: diff --git a/asset_list/app.py b/asset_list/app.py index 088f1603..bf5234dd 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -88,6 +88,31 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # PFP + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East" + data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx" + sheet_name = "PFP EAST" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype" + landlord_built_form = "Archetype" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + 
landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + master_filepaths = [] + master_to_asset_list_filepath = None + # Wates data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - " data_filename = "ECO 4 Wates.xlsx" diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index a5c1e739..ea8650b6 100644 --- a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -185,7 +185,7 @@ class GoogleSolarApi: ): self.exclude_likely_duplicate_surfaces() - # TODO: We need to constrain the roof area, based on the floor area to be more conservative + # We constrain the roof area, based on the floor area to be more conservative self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2'] if self.roof_area > property_instance.roof_area * self.ROOF_AREA_TOLERANCE: self.roof_area = property_instance.roof_area diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index b6b576b3..d55a4f73 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -704,21 +704,70 @@ async def trigger_plan(body: PlanTriggerRequest): if not recommendations.get(p.id): continue - input_measures = prepare_input_measures(recommendations[p.id], body.goal) + # we need to double unlist because we have a list of lists + property_measure_types = {rec["type"] for recs in recommendations[p.id] for rec in recs} + + measures_to_optimise = recommendations[p.id] + property_required_measures = [] + if body.required_measures: + property_required_measures = [ + m for m in measures_to_optimise if m[0]["type"] in body.required_measures + ] + measures_to_optimise = [ + m for m in measures_to_optimise if m[0]["type"] not in body.required_measures + ] + + # If we have a wall insulation measure, we MUST include mechanical ventilation + # Additionally, if we have required measures, they should also be included. 
Therefore + # we can discount the number of points required to get to the target SAP band (or increase) + # in the case of ventilation + measures_needing_ventilation = [ + "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" + ] + needs_ventilation = any(x in property_measure_types for x in measures_needing_ventilation) + + input_measures = prepare_input_measures( + measures_to_optimise, body.goal, needs_ventilation, measures_needing_ventilation + ) if not input_measures[0]: # This means that we have no defaults selected_recommendations = {} else: + fixed_gain = 0 + if property_required_measures: + # We get the SAP points for the required measures + if body.goal != "Increasing EPC": + raise NotImplementedError("Only EPC optimisation is currently supported") + sap_by_type = [ + {"type": rec["type"], "sap_points": rec["sap_points"]} for recs in property_required_measures + for rec in recs + ] + # We get a MAX sap points per type + max_per_type = ( + pd.DataFrame(sap_by_type).groupby("type")["sap_points"].max().to_dict() + ) + fixed_gain = sum(max_per_type.values()) + + property_required_measure_types = {rec["type"] for rec in sap_by_type} + + # if the property needs ventilation, but the measure we optimise didn't include + # venilation we add the points for ventilation as a fixed gain + if needs_ventilation and any( + r in property_required_measure_types for r in measures_needing_ventilation + ): + fixed_gain += next( + (r[0]["sap_points"] for r in recommendations[p.id] if + r[0]["type"] == "mechanical_ventilation"), + 0 + ) + current_sap_points = int(p.data["current-energy-efficiency"]) - ventilation_impact = next( - (r[0]["sap_points"] for r in recommendations[p.id] if r[0]["type"] == "mechanical_ventilation"), - 0 - ) + sap_gain = CostOptimiser.calculate_sap_gain_with_slack( epc_to_sap_lower_bound(body.goal_value) - current_sap_points - ) + abs(ventilation_impact) + ) - fixed_gain if not body.optimise: if body.goal != "Increasing EPC": 
@@ -748,6 +797,31 @@ async def trigger_plan(body: PlanTriggerRequest): selected_recommendations = {r["id"] for r in solution} + if property_required_measures: + # We select the cheapest of the required measures, into selected + for recs in property_required_measures: + # We select the cheapest of the required measures + cost_to_id = { + rec["recommendation_id"]: rec["total"] for rec in recs + if rec["recommendation_id"] not in selected_recommendations + } + # Take the recommendation id with the lowers cost + + selected_recommendations.add(min(cost_to_id, key=cost_to_id.get)) + # Update the solution with the selected recommendaitons + solution = [] + for recs in recommendations[p.id]: + for rec in recs: + if rec["recommendation_id"] in selected_recommendations: + solution.append( + { + "id": rec["recommendation_id"], + "cost": rec["total"], + "gain": rec["sap_points"], + "type": rec["type"] + } + ) + # If wall insulation is selected, we also include mechanical ventilation as a best practice measure if any(x in [r["type"] for r in solution] for x in [ "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index 618bec90..7db0f16f 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -75,6 +75,8 @@ class PlanTriggerRequest(BaseModel): valuation_file_path: Optional[str] = None exclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) inclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) + # This is a list of measures that we want to be included, if they are options + required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) scenario_name: Optional[str] = "" multi_plan: Optional[bool] = False diff --git a/etl/customers/mod/pilot/1. Create Sample.py b/etl/customers/mod/pilot/1. 
Create Sample.py index 97480d51..fd045294 100644 --- a/etl/customers/mod/pilot/1. Create Sample.py +++ b/etl/customers/mod/pilot/1. Create Sample.py @@ -104,13 +104,15 @@ def app(): } ) + # also include the floor area asset_list = df[ - ["uprn", "address1", "postcode", "NUMBER_OF_BEDROOMS", "BLDNG_STOREYS_QTY", ] + ["uprn", "address1", "postcode", "NUMBER_OF_BEDROOMS", "BLDNG_STOREYS_QTY", "BLDNG_MSRMNT_VAL"] ].rename( columns={ "address1": "address", "NUMBER_OF_BEDROOMS": "n_bedrooms", - "BLDNG_STOREYS_QTY": "number_of_floors" + "BLDNG_STOREYS_QTY": "number_of_floors", + "BLDNG_MSRMNT_VAL": "floor_area" } ) diff --git a/etl/customers/mod/pilot/2. Create Excel Model.py b/etl/customers/mod/pilot/2. Create Excel Model.py new file mode 100644 index 00000000..0e057a25 --- /dev/null +++ b/etl/customers/mod/pilot/2. Create Excel Model.py @@ -0,0 +1,398 @@ +from pprint import pprint +import pandas as pd +import numpy as np +from backend.app.utils import sap_to_epc +from sqlalchemy.orm import sessionmaker +from backend.app.db.connection import db_engine +from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations +from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel + + +def get_data(portfolio_id, scenario_ids): + session = sessionmaker(bind=db_engine)() + session.begin() + + # Get properties and their details for a specific portfolio + properties_query = session.query( + PropertyModel, + PropertyDetailsEpcModel + ).join( + PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id + ).filter( + PropertyModel.portfolio_id == portfolio_id # Filter by portfolio ID + ).all() + + # Transform properties data to include all fields dynamically + properties_data = [ + {**{col.name: getattr(prop.PropertyModel, col.name) for col in PropertyModel.__table__.columns}, + **{col.name: getattr(prop.PropertyDetailsEpcModel, col.name) for col in + PropertyDetailsEpcModel.__table__.columns}} + for prop in 
properties_query + ] + + # Get property IDs from fetched properties + + # Get plans linked to the fetched properties + plans_query = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all() + + # Transform plans data to include all fields dynamically + plans_data = [ + {col.name: getattr(plan, col.name) for col in Plan.__table__.columns} + for plan in plans_query + ] + + # Extract plan IDs for filtering recommendations through PlanRecommendations + plan_ids = [plan['id'] for plan in plans_data] + + # Get recommendations through PlanRecommendations for those plans and that are default + recommendations_query = session.query( + Recommendation, + Plan.scenario_id + ).join( + PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id + ).join( + Plan, Plan.id == PlanRecommendations.plan_id # Join with Plan to access scenario_id + ).filter( + PlanRecommendations.plan_id.in_(plan_ids), + Recommendation.default == True # Filtering for default recommendations + ).all() + + # Transform recommendations data to include all fields dynamically and include scenario_id + recommendations_data = [ + {**{col.name: getattr(rec.Recommendation, col.name) if hasattr(rec, 'Recommendation') + else getattr(rec, col.name) for + col in Recommendation.__table__.columns}, + "Scenario ID": rec.scenario_id} + for rec in recommendations_query + ] + + session.close() + + return properties_data, plans_data, recommendations_data + + +def app(): + """ + Given a portfolio and a scenario, this function prepares an excel model to present the data + """ + + # Set the inputs: + portfolio_id = 139 + scenario_ids = [233, 234] + + properties_data, plans_data, recommendations_data = get_data( + portfolio_id=portfolio_id, scenario_ids=scenario_ids + ) + + properties_df = pd.DataFrame(properties_data) + plans_df = pd.DataFrame(plans_data) + recommendations_df = pd.DataFrame(recommendations_data) + + # Merge on the orignal data + mod_property_data = pd.read_csv( + 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD property data.csv" + ) + + property_asset_data = properties_df.merge( + mod_property_data.drop(columns=["address", "postcode", "tenure"]), how="left", on="uprn" + ) + + property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains("pitched", case=False) + property_asset_data["pre_2002"] = property_asset_data["BUILD_YEAR"] < 2002 + property_asset_data["wall_type"] = property_asset_data["walls"].str.split(" ").str[0].str.strip() + property_asset_data["is_insulated"] = ( + property_asset_data["walls"].str.split(",").str[1].str.strip().isin( + ["filled cavity", "with external insulation", "filled cavity and external insulation"] + ) | property_asset_data["walls"].str.split(",").str[2].str.strip().isin(["insulated"]) + ) + property_asset_data["is_insulated"] = np.where( + property_asset_data["is_insulated"], "Insulated", "Uninsulated" + ) + property_asset_data["is_pitched"] = np.where( + property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof" + ) + property_asset_data["pre_2002"] = np.where( + property_asset_data["pre_2002"], "Pre 2002", "Post 2002" + ) + + archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_2002"] + + assigned_archetypes = ( + property_asset_data.groupby( + archetype_variables + ).size().reset_index().rename(columns={0: "n_properties"}).sort_values("n_properties", ascending=False) + ) + + # Make the archetype ID a concatenation of the variables + assigned_archetypes["archetype_id"] = assigned_archetypes[archetype_variables].apply( + lambda x: "_".join(x.astype(str)), axis=1 + ) + + # Most prominent archetypes + prominent_archetypes = assigned_archetypes.head(3) + other_archetypes = assigned_archetypes.tail(-3) + # 2 or fewer properties in the other archetypes + + property_asset_data = property_asset_data.merge( + assigned_archetypes[archetype_variables + ["archetype_id"]], + how="left", + 
on=archetype_variables + ) + + # Create age bands: + # 1960-1969 + # 1970-1979 + # 1980-1989 + # 1990-1999 + # 2000+ + property_asset_data["age_band"] = pd.cut( + property_asset_data["BUILD_YEAR"], + bins=[1959, 1969, 1979, 1989, 1999, 2022], + labels=["1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000+"] + ) + + # Create floor area bands + # 0-73 + # 74-97 + # 98-199 + # 200+ + property_asset_data["floor_area_band"] = pd.cut( + property_asset_data["total_floor_area"], + bins=[0, 73, 97, 199, 10000], + labels=["0-73", "74-97", "98-199", "200+"] + ) + + property_asset_data["archetype_group"] = property_asset_data["archetype_id"].copy() + property_asset_data["archetype_group"] = np.where( + property_asset_data["archetype_id"].isin(other_archetypes["archetype_id"].values), + "other", + property_asset_data["archetype_group"] + ) + + # For colour + wall_types = ( + property_asset_data[["wall_type"]].value_counts().to_frame().reset_index().rename( + columns={"wall_type": "Wall Type"} + ) + ) + # Group into age bands + ages = ( + property_asset_data[["age_band"]].value_counts() + .to_frame() + .reset_index().sort_values("age_band", ascending=True) + .rename(columns={"age_band": "Age Band"}) + ) + floor_area_bands = ( + property_asset_data[["floor_area_band"]].value_counts() + .to_frame() + .reset_index().sort_values("floor_area_band", ascending=True) + .rename(columns={"floor_area_band": "Floor Area Band"}) + ) + archetype_counts = ( + property_asset_data[["archetype_group"]]. + value_counts(). + to_frame(). 
+ reset_index() + .rename(columns={"archetype_group": "Archetype"}) + ) + + # epc breakdown + epc_breakdown = ( + property_asset_data["current_epc_rating"] + .apply(lambda x: x.value) + .value_counts() + .to_frame() + .reset_index() + ) + + # Figures for the deck + # Carbon per property + totals = property_asset_data[ + [ + "Total_household_members", + "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater", + "heating_cost_current", "hot_water_cost_current", "lighting_cost_current", + "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge" + ] + ].copy() + totals["total_cost"] = ( + totals["heating_cost_current"] + + totals["hot_water_cost_current"] + + totals["lighting_cost_current"] + + totals["appliances_cost_current"] + + totals["gas_standing_charge"] + + totals["electricity_standing_charge"] + ) + print( + totals[ + [ + "Total_household_members", + "co2_emissions", + "current_energy_demand", + "total_cost", + ] + ].mean() + ) + + # Store these to an excel + # with pd.ExcelWriter( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD archetype breakdowns.xlsx" + # ) as writer: + # wall_types.to_excel(writer, sheet_name="Wall Types", index=False) + # ages.to_excel(writer, sheet_name="Ages", index=False) + # floor_area_bands.to_excel(writer, sheet_name="Floor Area Bands", index=False) + # archetype_counts.to_excel(writer, sheet_name="Archetype Counts", index=False) + # epc_breakdown.to_excel(writer, sheet_name="EPC Rating", index=False) + + contingency = 0.26 + + # We prepare the outputs, by scenario + scenario_data = {} + for scenario in scenario_ids: + + scenario_recommendations_df = recommendations_df[ + recommendations_df["Scenario ID"] == scenario + ].copy() + + scenario_recommendations_df["contingency"] = contingency * scenario_recommendations_df["estimated_cost"] + scenario_recommendations_df["total_cost"] = ( + scenario_recommendations_df["estimated_cost"] + 
scenario_recommendations_df["contingency"] + ) + + recommended_measures_df = scenario_recommendations_df[ + ["property_id", "measure_type", "estimated_cost", "default"] + ] + + recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]] + recommended_measures_df = recommended_measures_df.drop(columns=["default"]) + + # Metrics by property ID + aggregated_metrics = scenario_recommendations_df[ + [ + "property_id", "type", "default", "sap_points", + "energy_cost_savings", "kwh_savings", "co2_equivalent_savings", "estimated_cost", "contingency", + "total_cost" + ] + ] + aggregated_metrics = aggregated_metrics[aggregated_metrics["default"]] + aggregated_metrics = aggregated_metrics.groupby("property_id")[ + ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost", + "total_cost", "contingency"] + ].sum().reset_index() + + recommendations_measures_pivot = recommended_measures_df.pivot( + index='property_id', + columns='measure_type', + values='estimated_cost' + ) + recommendations_measures_pivot = recommendations_measures_pivot.reset_index() + recommendations_measures_pivot = recommendations_measures_pivot.fillna(0) + + # We flag with boolean if the measure is recommended + for c in recommendations_measures_pivot.columns: + if c == "property_id": + continue + recommendations_measures_pivot["Recommendation: " + c] = recommendations_measures_pivot[c] > 0 + + # We now create a final output + df = properties_df[ + [ + "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows", + "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms", + ] + ].merge( + recommendations_measures_pivot, how="left", on="property_id" + ).merge( + aggregated_metrics, how="left", on="property_id" + ) + + df = df.drop(columns=["property_id"]) + for c in ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings"]: + df[c] = df[c].fillna(0) + + df = 
df.rename( + columns={ + "uprn": "UPRN", + "address": "Address", + "postcode": "Postcode", + "walls": "Walls", + "roof": "Roof", + "heating": "Heating", + "windows": "Windows", + "current_epc_rating": "Current EPC Rating", + "current_sap_points": "Current SAP Points", + "total_floor_area": "Total Floor Area", + "number_of_rooms": "Number of Habitable Rooms", + "floor_height": "Floor Height", + } + ) + + # Calculate post SAP + df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] + df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() + df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) + + # For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it + # the bills go up recommending HHRSH, so it doesn't make it to EPC B + # For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump? + # DO it manually??? + + # Doesn't make it + # misses = df[df["Predicted Post Works EPC"] == "C"] + # # 5 of them are flats and so are difficult to get to EPC B without renewables. 
Possibly not worth it from an + # # ROI perspective + # + # misses[["UPRN", "Address", "Postcode", "property_type"]] + + # UPRN Address Postcode property_type + # 2 100120988937 13 Sidbury Circular Road SP9 7HX Flat No further action + # 3 100120988998 74 Sidbury Circular Road SP9 7JA Flat No further action + # 4 100120989416 47 Zouch Avenue SP9 7LR Flat No further action + # 6 100060585002 42, Muscott Close, Shipton Bellinger SP9 7TX House Can probably take a heat pump + # 37 10000801072 34 Luffenham Place, Chicksands SG17 5XH House Already surveyed as having + # an ASHP - should be looked at + # 121 100120988259 8, Karachi Close SP9 7LW Flat + # 122 100121101217 599, Pepper Place BA12 0DW Flat + # 140 100021455241 33 Blenheim Crescent, Ruislip HA4 7HA House - Solar isnt recommended + # due to bug + # 149 100120915656 10 Bower Green, Shrivenham SN6 8TU House - Solar isn't recommended + # due to bug + + scenario_data[scenario] = df + + measure_counts = {} + for scenario in scenario_ids: + recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c] + measure_counts[scenario] = scenario_data[scenario][recommendation_cols].sum().to_dict() + + pprint(measure_counts[scenario_ids[0]]) + pprint(measure_counts[scenario_ids[1]]) + + df = scenario_data[scenario_ids[1]] + z = df[ + (df["Walls"] == "Cavity wall, as built, no insulation") & (~df["Recommendation: cavity_wall_insulation"]) + ] + + # Scenario adjustments: + # Exclude: boiler_upgrade + # Make ASHP COP 3.5 + + # Metrics we need by scenario: + # Cost + # contingency + # Carbon + # kwh + # bill savings + scenario_metrics = {} + for scenario in scenario_ids: + df = scenario_data[scenario].copy() + df["cost_per_sap_point"] = df["total_cost"] / df["sap_points"] + df["cost_per_carbon"] = df["total_cost"] / df["co2_equivalent_savings"] + avg_savings = df[ + ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost", + "cost_per_sap_point", 
"cost_per_carbon", "total_cost", "contingency"] + ].mean().to_dict() + + # TODO: Add a slide on valuation improvement, on a sample of properties? + + # TODO: Read in costing data and breakdown diff --git a/etl/customers/united living/get_data.py b/etl/customers/united living/get_data.py new file mode 100644 index 00000000..bc4ab400 --- /dev/null +++ b/etl/customers/united living/get_data.py @@ -0,0 +1,73 @@ +import os +import pandas as pd +import numpy as np +from asset_list.utils import get_data +from backend.SearchEpc import SearchEpc +from etl.spatial.OpenUprnClient import OpenUprnClient + +from dotenv import load_dotenv + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def app(): + filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/United Living/Potential GMCA props 05.03.xlsx" + + df = pd.read_excel(filepath) + df["row_id"] = df.index + + df["house_number"] = df.apply( + lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), + axis=1 + ) + + properties_data, _, _ = get_data( + df=df, + manual_uprn_map={}, + epc_auth_token=EPC_AUTH_TOKEN, + uprn_column=None, + fulladdress_column="Address", + address1_column="house_number", + postcode_column="Postcode", + property_type_column=None, + built_form_column=None, + epc_api_only=True, + row_id_name="row_id", + ) + + no_data = df[df["row_id"].isin(_)] + no_data[["Address", "Postcode"]] + + # 53 108 Alexandra Street OL6 9QP 100011536830 + # 56 301 Whiteacre Road OL6 9QF 100011557437 + # 65 97 Princess Street OL6 9QJ 100011551813 + + data = df.merge( + pd.DataFrame(properties_data)[["uprn", "row_id"]], + how="left", left_on="row_id", right_on="row_id" + ) + + # Fill missing UPRNS + data["uprn"] = np.where(data["Address"] == "108 Alexandra Street", 100011536830, data["uprn"]) + data["uprn"] = np.where(data["Address"] == "301 Whiteacre Road", 100011557437, data["uprn"]) + data["uprn"] = np.where(data["Address"] == "97 Princess Street", 100011551813, 
data["uprn"]) + + # We now get whether the property is listed, heritage or in a conservation area + spatial_data = OpenUprnClient.get_spatial_data(uprns=data["uprn"].tolist(), bucket_name="retrofit-data-dev") + spatial_data = spatial_data.rename(columns={"UPRN": "uprn"}) + + data["uprn"] = data["uprn"].astype(int) + + merged = data.merge( + spatial_data, how="left", on="uprn" + ) + # fill NAs + for c in ['conservation_status', 'is_listed_building', 'is_heritage_building']: + merged[c] = merged[c].fillna(False) + + merged.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/United Living/Potential GMCA props 05.03 - data " + "pulled.xlsx", + index=False + ) diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 5a39bee3..5e90be79 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -37,22 +37,25 @@ MCS_SOLAR_PV_COST_DATA = { "average_cost_per_kwh-Northern Ireland": 1347, } +# Installers are now working with 435 watt panels +PANEL_SIZE = 0.435 + INSTALLER_SOLAR_COSTS = [ - {'n_panels': 4, 'array_kwp': 1.6, 'cost': 3040.00, 'installer': 'CEG'}, - {'n_panels': 5, 'array_kwp': 2.1, 'cost': 3201.00, 'installer': 'CEG'}, - {'n_panels': 6, 'array_kwp': 2.5, 'cost': 3363.00, 'installer': 'CEG'}, - {'n_panels': 7, 'array_kwp': 2.9, 'cost': 3524.00, 'installer': 'CEG'}, - {'n_panels': 8, 'array_kwp': 3.3, 'cost': 3686.00, 'installer': 'CEG'}, - {'n_panels': 9, 'array_kwp': 3.7, 'cost': 3847.00, 'installer': 'CEG'}, - {'n_panels': 10, 'array_kwp': 4.1, 'cost': 4009.00, 'installer': 'CEG'}, - {'n_panels': 11, 'array_kwp': 4.5, 'cost': 4170.00, 'installer': 'CEG'}, - {'n_panels': 12, 'array_kwp': 4.9, 'cost': 4332.00, 'installer': 'CEG'}, - {'n_panels': 13, 'array_kwp': 5.3, 'cost': 4835.00, 'installer': 'CEG'}, - {'n_panels': 14, 'array_kwp': 5.7, 'cost': 5015.00, 'installer': 'CEG'}, - {'n_panels': 15, 'array_kwp': 6.2, 'cost': 5176.00, 'installer': 'CEG'}, - {'n_panels': 16, 'array_kwp': 6.6, 'cost': 5338.00, 'installer': 
'CEG'}, - {'n_panels': 17, 'array_kwp': 7.0, 'cost': 5500.00, 'installer': 'CEG'}, - {'n_panels': 18, 'array_kwp': 7.4, 'cost': 6021.00, 'installer': 'CEG'} + {'n_panels': 4, 'array_kwp': 4 * PANEL_SIZE, 'cost': 4089.25, 'installer': 'CEG'}, + {'n_panels': 5, 'array_kwp': 5 * PANEL_SIZE, 'cost': 4242.48, 'installer': 'CEG'}, + {'n_panels': 6, 'array_kwp': 6 * PANEL_SIZE, 'cost': 4395.71, 'installer': 'CEG'}, + {'n_panels': 7, 'array_kwp': 7 * PANEL_SIZE, 'cost': 4548.94, 'installer': 'CEG'}, + {'n_panels': 8, 'array_kwp': 8 * PANEL_SIZE, 'cost': 4702.17, 'installer': 'CEG'}, + {'n_panels': 9, 'array_kwp': 9 * PANEL_SIZE, 'cost': 4855.41, 'installer': 'CEG'}, + {'n_panels': 10, 'array_kwp': 10 * PANEL_SIZE, 'cost': 5010.95, 'installer': 'CEG'}, + {'n_panels': 11, 'array_kwp': 11 * PANEL_SIZE, 'cost': 5166.49, 'installer': 'CEG'}, + {'n_panels': 12, 'array_kwp': 12 * PANEL_SIZE, 'cost': 5322.04, 'installer': 'CEG'}, + {'n_panels': 13, 'array_kwp': 13 * PANEL_SIZE, 'cost': 5657.6, 'installer': 'CEG'}, + {'n_panels': 14, 'array_kwp': 14 * PANEL_SIZE, 'cost': 5993.16, 'installer': 'CEG'}, + {'n_panels': 15, 'array_kwp': 15 * PANEL_SIZE, 'cost': 6328.71, 'installer': 'CEG'}, + {'n_panels': 16, 'array_kwp': 16 * PANEL_SIZE, 'cost': 6483.33, 'installer': 'CEG'}, + {'n_panels': 17, 'array_kwp': 17 * PANEL_SIZE, 'cost': 6637.95, 'installer': 'CEG'}, + {'n_panels': 18, 'array_kwp': 18 * PANEL_SIZE, 'cost': 6792.57, 'installer': 'CEG'} ] # This is the maximum number of panels that we have a cost from the installers for INSTALLER_MAX_PANELS = 18 @@ -62,11 +65,11 @@ INSTALLER_MAX_PANELS = 18 INSTALLER_SOLAR_PV_INVERTER_COST = 7500 INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST = 500 # Just a rough guess to labour costs -INSTALLER_SCAFFOLDING_COSTS = [ - {'stories': 1, 'description': '1 Story Scaffold', 'cost': 531.00, 'installer': 'CEG'}, - {'stories': 2, 'description': '2 Story Scaffold', 'cost': 841.00, 'installer': 'CEG'}, - {'stories': 3, 'description': '3 Story Scaffold', 'cost': 
1077.00, 'installer': 'CEG'} -] +# INSTALLER_SCAFFOLDING_COSTS = [ +# {'stories': 1, 'description': '1 Story Scaffold', 'cost': 531.00, 'installer': 'CEG'}, +# {'stories': 2, 'description': '2 Story Scaffold', 'cost': 841.00, 'installer': 'CEG'}, +# {'stories': 3, 'description': '3 Story Scaffold', 'cost': 1077.00, 'installer': 'CEG'} +# ] # This data is based on the MCS database, We use the larger figure between the 2023 and 2024 average, # to be conservative @@ -772,18 +775,14 @@ class Costs: battery_cost = [c for c in INSTALLER_SOLAR_BATTERY_COSTS if c["capacity_kwh"] == battery_kwh][0]["cost"] subtotal += battery_cost - scaffolding_cost = [c for c in INSTALLER_SCAFFOLDING_COSTS if c["stories"] == n_floors][0]["cost"] - subtotal += scaffolding_cost - if needs_inverter: subtotal += INSTALLER_SOLAR_PV_INVERTER_COST # We also add an additional labour cost subtotal += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST - # We add an additional cost for scaffolding - # The costs from installers exclude VAT - vat = subtotal * cls.VAT_RATE - total_cost = subtotal + vat + # Solar doesn't have VAT but we add a high risk contingency + # to account for design variation that we see in practice + total_cost = subtotal * (1 + cls.HIGH_RISK_CONTINGENCY) # Labour hours are based on estimates from online research but an average team seems to consist of 3 people # and most jobs take around 2 days. Assuming an 8 hour day for 3 people across 2 days, gives us 48 hours of @@ -791,7 +790,7 @@ class Costs: return { "total": total_cost, "subtotal": subtotal, - "vat": vat, + "vat": 0, "labour_hours": 48, "labour_days": 2, } @@ -1161,7 +1160,6 @@ class Costs: pump. 
This cost will include the boiler upgrade scheme grant """ - # This is the average cost of a project, we'll add some additional contingency if ashp_size is None: @@ -1170,7 +1168,7 @@ class Costs: cost = [x for x in INSTALLER_ASHP_COSTS if x][0]["cost"] # We add some contingency since there are additional costs such as resizing radiators, that could be required - subtotal = cost * (1 + self.CONTINGENCY) + subtotal = cost * (1 + self.HIGH_RISK_CONTINGENCY) # The costs from installers exclude VAT vat = subtotal * self.VAT_RATE total_cost = subtotal + vat @@ -1180,7 +1178,7 @@ class Costs: labour_hours = labour_days * 8 return { - "total": subtotal, + "total": total_cost, "subtotal": subtotal, "vat": vat, "labour_hours": labour_hours, diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index ed00bbe9..85e1a8dc 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -145,7 +145,9 @@ class FloorRecommendations(Definitions): ) return - raise NotImplementedError("Implement me!") + # In this case, we have no recommendation to make. 
E.g., if we have a solid floor property + # but solid floor insulation has been excluded as a measure, we get here + return @staticmethod def _make_floor_description(material): diff --git a/recommendations/optimiser/optimiser_functions.py b/recommendations/optimiser/optimiser_functions.py index 8c15673d..a0c3719d 100644 --- a/recommendations/optimiser/optimiser_functions.py +++ b/recommendations/optimiser/optimiser_functions.py @@ -1,10 +1,12 @@ -def prepare_input_measures(property_recommendations, goal): +def prepare_input_measures(property_recommendations, goal, needs_ventilation, measures_needing_ventilation): """ Basic function to convert recommendations_to_upload to a format that is suitable for the optimiser - large :param property_recommendations: object containing the recommendations, created in the plan trigger api :param goal: goal to be optimised for, should be one of the keys in gain_map. E.g. if the gain is SAP points, the goal should reflect that desired gain + :param needs_ventilation: boolean to indicate if the property needs ventilation + :param measures_needing_ventilation: list of measures that need ventilation :return: Nested list of input measures """ @@ -16,9 +18,20 @@ def prepare_input_measures(property_recommendations, goal): if not goal_key: raise NotImplementedError("Not implemented this gain type - investigate me") + # We ony ever have one ventilation measure with now + ventilation_recommendation = next( + (measure[0] for measure in property_recommendations if measure[0]["type"] == "mechanical_ventilation"), + {} + ) + input_measures = [] for recs in property_recommendations: + if needs_ventilation and recs[0]["type"] == "mechanical_ventilation": + # If we house needs ventilation, ventilation will be packaged with the fabric measure so + # we don't need to optimise it independently + continue + if recs[0]["type"] == "solar_pv": # if the recommendation is a solar recommendation with a battery, we exclude it from the optimisation. 
recs = [r for r in recs if ~r["has_battery"]] @@ -27,16 +40,34 @@ def prepare_input_measures(property_recommendations, goal): if not recs_to_append: continue - input_measures.append( - [ + to_append = [] + for rec in recs: + # We bundle the impact of ventilation with the measure + total = ( + rec["total"] + ventilation_recommendation["total"] if rec["type"] in measures_needing_ventilation + else rec["total"] + ) + gain = ( + rec[goal_key] + ventilation_recommendation[goal_key] if rec["type"] in measures_needing_ventilation + else rec[goal_key] + ) + + rec_type = ( + "+".join( + [rec["type"], ventilation_recommendation["type"]] + ) if rec["type"] in measures_needing_ventilation + else rec["type"] + ) + + to_append.append( { "id": rec["recommendation_id"], - "cost": rec["total"], - "gain": rec[goal_key], - "type": rec["type"] + "cost": total, + "gain": gain, + "type": rec_type } - for rec in recs if rec["energy_cost_savings"] >= 0 - ] - ) + ) + + input_measures.append(to_append) return input_measures From b35d021db98c620c3cd45ea6bf3afb749bea0acf Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 19 Mar 2025 19:21:11 +0000 Subject: [PATCH 241/255] starting to tidy up tweaks to optimiser --- asset_list/mappings/built_form.py | 28 ++++++++++++- asset_list/mappings/property_type.py | 25 +++++++++++- asset_list/mappings/walls.py | 3 +- backend/app/assumptions.py | 6 +++ backend/app/plan/router.py | 39 +++++++------------ .../optimiser/optimiser_functions.py | 14 ++++--- 6 files changed, 79 insertions(+), 36 deletions(-) diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 87f36985..915f84c6 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -3,7 +3,7 @@ STANDARD_BUILT_FORMS = { # Houses "end-terrace", "semi-detached", "detached", "mid-terrace", # Flats - "ground floor", "mid-floor", "top-floor" + "ground floor", "mid-floor", "top-floor", "basement" } BUILT_FORM_MAPPINGS = { @@ -16,5 
+16,29 @@ BUILT_FORM_MAPPINGS = { 'Maisonette': 'unknown', 'Flat': 'unknown', 'First Floor Flat General': 'mid-floor', - 'Bungalow (Semi)': 'semi-detached' + 'Bungalow (Semi)': 'semi-detached', + + 'Detached House': 'detached', + 'End Terraced House': 'end-terrace', + 'Studio (Ground floor)': 'ground floor', + 'Mid Terraced House': 'mid-terrace', + 'Ground Floor Flat': 'ground floor', + 'Semi Detached House': 'semi-detached', + 'Detached Property': 'detached', + 'Level not confirmed': 'unknown', + 'Bedsit': 'unknown', + 'Cottage': 'detached', + 'Terraced House': 'mid-terrace', + 'Studio (1st Floor)': 'ground floor', + 'Standard Maisonette': 'unknown', + 'Third Floor Flat or Above': 'top-floor', + 'Town House': 'end-terrace', + 'Guest room in a complex': 'unknown', + 'Back To Back House': 'mid-terrace', + 'PIMSS EMPTY': 'unknown', + 'Flat Basement': 'basement', + 'House': 'unknown', + 'Second Floor Flat': 'mid-floor', + 'First Floor Flat': 'ground floor', + 'Room Only': 'unknown' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 3182bd45..add53cd8 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -70,6 +70,27 @@ PROPERTY_MAPPING = { 'House (Mid terrace)': 'house', 'Bungalow (Semi)': 'bungalow', 'Ground Floor Flat General': 'flat', - 'House (Semi)': 'house' - + 'House (Semi)': 'house', + 'Detached House': 'house', + 'Bedsit': 'bedsit', + 'Terraced House': 'house', + 'Standard Maisonette': 'maisonette', + 'End Terraced House': 'house', + 'Third Floor Flat or Above': 'flat', + 'Town House': 'house', + 'Mid Terraced House': 'house', + 'Back To Back House': 'house', + 'Flat Basement': 'flat', + 'Ground Floor Flat': 'flat', + 'Semi Detached House': 'house', + 'Second Floor Flat': 'flat', + 'First Floor Flat': 'flat', + 'Level not confirmed': 'flat', + 'Cottage': 'house', + 'Studio (1st Floor)': 'flat', + 'Studio (Ground floor)': 'flat', + 'Guest room in a complex': 'other', + 
'PIMSS EMPTY': 'bedsit', + 'Room Only': 'other', + 'Detached Property': 'house' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index f3156860..894d9e01 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -134,5 +134,6 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Cavity CWI required': 'uninsulated cavity', 'Solid brick EWI installed': 'insulated solid brick', 'Cavity Cavity batts': 'filled cavity', - 'Cavity CWI Completed by Dyson': 'filled cavity' + 'Cavity CWI Completed by Dyson': 'filled cavity', + None: "unknown" } diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 261e2b62..f1090ef3 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -59,3 +59,9 @@ DESCRIPTIONS_TO_FUEL_TYPES = { "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85}, } + +# These are the measure types where if there is a ventilation recommendation, we force the inclusion of it +# if one of these has been recommended. 
+measures_needing_ventilation = [ + "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" +] diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index d55a4f73..cce47566 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -27,6 +27,7 @@ from backend.app.dependencies import validate_token from backend.app.plan.schemas import PlanTriggerRequest from backend.app.plan.utils import get_cleaned from backend.app.utils import epc_to_sap_lower_bound, sap_to_epc +import backend.app.assumptions as assumptions from backend.ml_models.api import ModelApi from backend.Property import Property @@ -707,32 +708,25 @@ async def trigger_plan(body: PlanTriggerRequest): # we need to double unlist because we have a list of lists property_measure_types = {rec["type"] for recs in recommendations[p.id] for rec in recs} - measures_to_optimise = recommendations[p.id] - property_required_measures = [] - if body.required_measures: - property_required_measures = [ - m for m in measures_to_optimise if m[0]["type"] in body.required_measures - ] - measures_to_optimise = [ - m for m in measures_to_optimise if m[0]["type"] not in body.required_measures - ] + property_required_measures = [ + m for m in recommendations[p.id] if m[0]["type"] in body.required_measures + ] + measures_to_optimise = [ + m for m in recommendations[p.id] if m[0]["type"] not in body.required_measures + ] # If we have a wall insulation measure, we MUST include mechanical ventilation # Additionally, if we have required measures, they should also be included. 
Therefore # we can discount the number of points required to get to the target SAP band (or increase) # in the case of ventilation - measures_needing_ventilation = [ - "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" - ] - needs_ventilation = any(x in property_measure_types for x in measures_needing_ventilation) + needs_ventilation = any(x in property_measure_types for x in assumptions.measures_needing_ventilation) - input_measures = prepare_input_measures( - measures_to_optimise, body.goal, needs_ventilation, measures_needing_ventilation - ) + input_measures = prepare_input_measures(measures_to_optimise, body.goal, needs_ventilation) if not input_measures[0]: # This means that we have no defaults selected_recommendations = {} + solution = [] else: fixed_gain = 0 @@ -755,7 +749,7 @@ async def trigger_plan(body: PlanTriggerRequest): # if the property needs ventilation, but the measure we optimise didn't include # venilation we add the points for ventilation as a fixed gain if needs_ventilation and any( - r in property_required_measure_types for r in measures_needing_ventilation + r in property_required_measure_types for r in assumptions.measures_needing_ventilation ): fixed_gain += next( (r[0]["sap_points"] for r in recommendations[p.id] if @@ -823,9 +817,7 @@ async def trigger_plan(body: PlanTriggerRequest): ) # If wall insulation is selected, we also include mechanical ventilation as a best practice measure - if any(x in [r["type"] for r in solution] for x in [ - "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" - ]): + if any(x in [r["type"] for r in solution] for x in assumptions.measures_needing_ventilation): ventilation_rec = next( (r[0] for r in recommendations[p.id] if r[0]["type"] == "mechanical_ventilation"), None @@ -854,14 +846,9 @@ async def trigger_plan(body: PlanTriggerRequest): ] # We'll also unlist the recommendations so they're a bit easier to handle from here onwards - 
final_recommendations = [ + recommendations[p.id] = [ rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type ] - # Get defaults - defaults = [r for r in final_recommendations if r["default"]] - sum([r['sap_points'] for r in defaults]) - - recommendations[p.id] = final_recommendations # when we have buildings, we tweak our solar PV recommendations as if one unit needs it, we apply it to all # of them diff --git a/recommendations/optimiser/optimiser_functions.py b/recommendations/optimiser/optimiser_functions.py index a0c3719d..05b9ec42 100644 --- a/recommendations/optimiser/optimiser_functions.py +++ b/recommendations/optimiser/optimiser_functions.py @@ -1,4 +1,7 @@ -def prepare_input_measures(property_recommendations, goal, needs_ventilation, measures_needing_ventilation): +import backend.app.assumptions as assumptions + + +def prepare_input_measures(property_recommendations, goal, needs_ventilation): """ Basic function to convert recommendations_to_upload to a format that is suitable for the optimiser - large @@ -6,7 +9,6 @@ def prepare_input_measures(property_recommendations, goal, needs_ventilation, me :param goal: goal to be optimised for, should be one of the keys in gain_map. E.g. 
if the gain is SAP points, the goal should reflect that desired gain :param needs_ventilation: boolean to indicate if the property needs ventilation - :param measures_needing_ventilation: list of measures that need ventilation :return: Nested list of input measures """ @@ -44,18 +46,20 @@ def prepare_input_measures(property_recommendations, goal, needs_ventilation, me for rec in recs: # We bundle the impact of ventilation with the measure total = ( - rec["total"] + ventilation_recommendation["total"] if rec["type"] in measures_needing_ventilation + rec["total"] + ventilation_recommendation["total"] + if rec["type"] in assumptions.measures_needing_ventilation else rec["total"] ) gain = ( - rec[goal_key] + ventilation_recommendation[goal_key] if rec["type"] in measures_needing_ventilation + rec[goal_key] + ventilation_recommendation[goal_key] + if rec["type"] in assumptions.measures_needing_ventilation else rec[goal_key] ) rec_type = ( "+".join( [rec["type"], ventilation_recommendation["type"]] - ) if rec["type"] in measures_needing_ventilation + ) if rec["type"] in assumptions.measures_needing_ventilation else rec["type"] ) From f322e55b19483c8b7f06af818f529526f79c9b14 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Mar 2025 09:02:24 +0000 Subject: [PATCH 242/255] fixing extraction of attributes for pfp --- asset_list/AssetList.py | 19 ++++++++------ asset_list/app.py | 55 ++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 81b973b9..df16a314 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -841,8 +841,8 @@ class AssetList: self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply( lambda x: estimate_number_of_floors( property_type=( - x[self.STANDARD_PROPERTY_TYPE].title() if - x[self.STANDARD_PROPERTY_TYPE].title() in accepted_epc_property_types else ( + 
str(x[self.STANDARD_PROPERTY_TYPE]).title() if + str(x[self.STANDARD_PROPERTY_TYPE]).title() in accepted_epc_property_types else ( x[self.EPC_API_DATA_NAMES["property-type"]] if not pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None ) @@ -996,13 +996,16 @@ class AssetList: age_band = x[self.EPC_API_DATA_NAMES["construction-age-band"]].split(": ")[1] lower_date, upper_date = age_band.split("-") - age_band_matches = ( - "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) and ( - x[self.STANDARD_YEAR_BUILT] <= float(upper_date) + if not x[self.STANDARD_YEAR_BUILT]: + age_band_matches = "No Year Built From Landlord" + else: + age_band_matches = ( + "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) and ( + x[self.STANDARD_YEAR_BUILT] <= float(upper_date) + ) + else "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) + else "EPC Age Band is newer than Year Built" ) - else "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) - else "EPC Age Band is newer than Year Built" - ) processed_age_band.append( { diff --git a/asset_list/app.py b/asset_list/app.py index bf5234dd..d7b1b6cd 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -88,7 +88,7 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) - # PFP + # PFP East data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East" data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx" sheet_name = "PFP EAST" @@ -180,28 +180,29 @@ def app(): # master_to_asset_list_filepath = None # For Westward - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" - # data_filename = "WESTWARD - completed list..xlsx" - # sheet_name = "Sheet1" - # postcode_column = "WFT EDIT 
Postcode" - # fulladdress_column = "Address" - # address1_column = None - # address1_method = "house_number_extraction" - # address_cols_to_concat = [] - # missing_postcodes_method = None - # landlord_year_built = "Build date" - # landlord_os_uprn = "UPRN" - # landlord_property_type = "Location type" - # landlord_wall_construction = "Wall Construction (EPC)" - # landlord_heating_system = "Heat Source" - # landlord_existing_pv = "PV (Y/N)" - # landlord_property_id = "Place ref" - # outcomes_filename = None - # outcomes_sheetname = None - # outcomes_postcode = None - # outcomes_houseno = None - # master_filepaths = [] - # master_to_asset_list_filepath = None + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + data_filename = "WESTWARD - completed list - 20.03.2025.xlsx" + sheet_name = "Sheet1" + postcode_column = "WFT EDIT Postcode" + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build date" + landlord_os_uprn = "UPRN" + landlord_property_type = "Location type" + landlord_built_form = None + landlord_wall_construction = "Wall Construction (EPC)" + landlord_heating_system = "Heat Source" + landlord_existing_pv = "PV (Y/N)" + landlord_property_id = "Place ref" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + master_filepaths = [] + master_to_asset_list_filepath = None # For ACIS - programme re-build # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" @@ -370,7 +371,8 @@ def app(): property_type_column=AssetList.STANDARD_PROPERTY_TYPE, built_form_column=AssetList.STANDARD_BUILT_FORM, manual_uprn_map=manual_uprn_map, - epc_api_only=epc_api_only + epc_api_only=epc_api_only, + epc_auth_token=EPC_AUTH_TOKEN ) # We now retrieve any failed properties @@ -382,8 +384,11 @@ def 
app(): fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, address1_column=AssetList.STANDARD_ADDRESS_1, postcode_column=AssetList.STANDARD_POSTCODE, + property_type_column=AssetList.STANDARD_PROPERTY_TYPE, + built_form_column=AssetList.STANDARD_BUILT_FORM, manual_uprn_map=manual_uprn_map, - epc_api_only=epc_api_only + epc_api_only=epc_api_only, + epc_auth_token=EPC_AUTH_TOKEN ) epc_data_chunk.extend(epc_data_failed) From 746c42594c559791e6f50964724793516e2b9251 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Mar 2025 15:15:36 +0000 Subject: [PATCH 243/255] debugging flattening of find my epc data --- asset_list/AssetList.py | 200 ++++++++++++++---- asset_list/app.py | 113 +++++++++- backend/apis/GoogleSolarApi.py | 4 +- .../mod/pilot/2. Create Excel Model.py | 14 +- 4 files changed, 278 insertions(+), 53 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index df16a314..eddeabdc 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1024,6 +1024,28 @@ class AssetList: def identify_worktypes(self, cleaned): + # Before we being, we identify if a property has solar already as we use this + # for identifying cavity jobs + if self.non_intrusives_present: + existing_solar_non_intrusives_check = ( + self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF" + ) + elif self.old_format_non_intrusives_present: + existing_solar_non_intrusives_check = ( + self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin( + ["solar pv on roof"] + ) + ) + else: + # We don't have an indication + existing_solar_non_intrusives_check = False + + self.standardised_asset_list["property_has_solar"] = ( + (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | + existing_solar_non_intrusives_check | + (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + ) + # If we have non-intrusives completed, we can use this to identify work types 
###################################################### # Empty cavity: @@ -1047,22 +1069,76 @@ class AssetList: # We set the filter to False, as we have no non-intrusives non_intrusives_wall_filter = False - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + if self.landlord_year_built is None: + # The landlord won't always give us year built + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + ( + self.standardised_asset_list["epc_year_upper_bound"] <= 2002 + ) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) ) - ) - # Let's also flag work that looks eligible without the SAP filter - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( + # Let's also flag work that looks eligible without the SAP filter + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) + ) + + else: + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | + (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) + ) & + ( + self.standardised_asset_list[ + 
self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + # If the property has solar, there's a chance it won't qualify + ~self.standardised_asset_list["property_has_solar"] + ) + ) + + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | + (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) + ) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + # If the property has solar, there's a chance it won't qualify + self.standardised_asset_list["property_has_solar"] + ) + ) + + # Let's also flag work that looks eligible without the SAP filter + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & + # If the property has solar, there's a chance it won't qualify + ~self.standardised_asset_list["property_has_solar"] + ) + + # We also add a filter on anything that was generally identified by the none-intrusives + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) + non_intrusives_wall_filter ) # If non_intrusive_indicates_empty_cavity is True, @@ -1073,7 +1149,15 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] ) + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] = np.where( + 
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"], + False, + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] + ) + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( self.EPC_NO_WALL_INSULATION_DESCRIPTIONS ) & ( @@ -1083,6 +1167,8 @@ class AssetList: ) & ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) ) ) @@ -1096,26 +1182,38 @@ class AssetList: ) & ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"]] > self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) ) ) self.standardised_asset_list["landlord_data_indicates_empty_cavity"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & + ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | + (self.standardised_asset_list["epc_year_upper_bound"] <= 1995) + ) & ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"] ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) ) ) self.standardised_asset_list["landlord_data_indicates_empty_cavity_no_sap_filter"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & + ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | + (self.standardised_asset_list["epc_year_upper_bound"] <= 1995) + ) & ( 
self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"] ] > self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) ) ) @@ -1243,27 +1341,6 @@ class AssetList: ).sum(): raise ValueError("Both heating system checks are true - this should not be possible") - # Check 2: Does the property have solar already - if self.non_intrusives_present: - existing_solar_non_intrusives_check = ( - self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF" - ) - elif self.old_format_non_intrusives_present: - existing_solar_non_intrusives_check = ( - self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin( - ["solar pv on roof"] - ) - ) - else: - # We don't have an indication - existing_solar_non_intrusives_check = False - - self.standardised_asset_list["property_has_solar"] = ( - (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | - existing_solar_non_intrusives_check | - (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) - ) - # Check 3: Does the property meet the fabric condition # Solar PV installs are subject to the minimum insulation requirements which means: # 1) one of the following insulation measures must be installed as part of the same @@ -1627,6 +1704,26 @@ class AssetList: # SAP below threshold self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) + + self.standardised_asset_list["test"] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_meet_solar_requirements & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + # SAP below threshold + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + 
self.standardised_asset_list["test"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor"], + False, + self.standardised_asset_list["test"] + ) + self.standardised_asset_list["solar_eligible_other_floor_sap_above_threshold"] = ( not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate @@ -1773,14 +1870,30 @@ class AssetList: self.standardised_asset_list["cavity_reason"] ) self.standardised_asset_list["cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"], - "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]), + "Non-Intrusive Data Showed Empty Cavity - property already has solar", self.standardised_asset_list["cavity_reason"] ) + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]), + "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) + + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]), + "Non-Intrusive Data Showed Empty Cavity but all SAP scores and year built allowed", + self.standardised_asset_list["cavity_reason"] + ) + self.standardised_asset_list["cavity_reason"] = np.where( ( self.standardised_asset_list["epc_indicates_empty_cavity"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) ), "EPC Data Showed Empty Cavity", 
self.standardised_asset_list["cavity_reason"] @@ -1788,7 +1901,8 @@ class AssetList: self.standardised_asset_list["cavity_reason"] = np.where( ( self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) ), "EPC Data Showed Empty Cavity but all SAP scores allowed", self.standardised_asset_list["cavity_reason"] @@ -1798,7 +1912,8 @@ class AssetList: ( self.standardised_asset_list["landlord_data_indicates_empty_cavity"] & ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & - ~self.standardised_asset_list["epc_indicates_empty_cavity"] + ~self.standardised_asset_list["epc_indicates_empty_cavity"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) ), "Landlord Data Showed Empty Cavity", self.standardised_asset_list["cavity_reason"] @@ -1807,7 +1922,8 @@ class AssetList: ( self.standardised_asset_list["landlord_data_indicates_empty_cavity_no_sap_filter"] & ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & - ~self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] + ~self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) ), "Landlord Data Showed Empty Cavity but all SAP scores allowed", self.standardised_asset_list["cavity_reason"], diff --git a/asset_list/app.py b/asset_list/app.py index d7b1b6cd..13621448 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -100,8 +100,8 @@ def app(): missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Archetype" - landlord_built_form = "Archetype" + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" landlord_wall_construction = None 
landlord_heating_system = None landlord_existing_pv = None @@ -418,6 +418,8 @@ def app(): epc_df = pd.concat(epc_data) epc_df["estimated"] = epc_df["estimated"].fillna(False) + z = epc_df[epc_df["domna_property_id"] == eg["domna_property_id"].values[0]] + # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] @@ -457,10 +459,24 @@ def app(): if "find_my_epc_data" not in epc_df.columns: epc_df["find_my_epc_data"] = None - find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop( - columns=["find_my_epc_data"]).join( - pd.json_normalize(epc_df["find_my_epc_data"]) - ) + find_my_epc_data = [] + for _, x in epc_df.iterrows(): + if x["find_my_epc_data"]: + find_my_epc_data.append( + { + asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID], + **x["find_my_epc_data"] + } + ) + else: + find_my_epc_data.append( + { + asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID] + } + ) + + find_my_epc_data = pd.DataFrame(find_my_epc_data) + find_my_epc_data = find_my_epc_data.merge( transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], how="left", on=asset_list.DOMNA_PROPERTY_ID @@ -511,6 +527,91 @@ def app(): asset_list.flat_analysis() + ################################################################ + # WESTWARD - comparison between Kieran's method & automated + ################################################################ + + # Check 1) + cavity_fills = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="Straight Fill" + ) + cavity_fills = cavity_fills.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + cavity_fills["cavity_reason"] = cavity_fills["cavity_reason"].fillna("Not identified") + cavity_fills["cavity_reason"].value_counts() 
+ # Didn't identify 3 properties because they're bedsits + # 4 properties were identified, not based on the non-intrusives but instead because + # Westward said they were built in 2003/2007. Have adjusted this to use the age from the + # epc as well, as EPC says 1975 and they look like 1975 properties + # 58 properties flagged as already having solar: + # + + z = cavity_fills[ + cavity_fills["cavity_reason"] == "Non-Intrusive Data Showed Empty Cavity - property already has solar" + ] + + df = asset_list.standardised_asset_list[ + asset_list.standardised_asset_list[asset_list.STANDARD_LANDLORD_PROPERTY_ID].isin( + z[asset_list.landlord_property_id].values) + ] + eg = df[df[asset_list.STANDARD_LANDLORD_PROPERTY_ID] == "TOTNEWINA0102300"] + + z[["Address", "WFT EDIT Postcode", asset_list.landlord_property_id]] + z[[asset_list.STANDARD_FULL_ADDRESS, asset_list.STANDARD_POSTCODE, asset_list.ATTRIBUTE_HAS_SOLAR]] + + # Check 2) + cavity_fills_with_solar = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="Solar PV - Straight Fill" + ) + cavity_fills_with_solar = cavity_fills_with_solar.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + cavity_fills_with_solar["cavity_reason"] = cavity_fills_with_solar["cavity_reason"].fillna("Not identified") + # 203 properties total + # 140 properties were flagged up based on non-intrusives (Non-Intrusive Data Showed Empty Cavity) + + check = cavity_fills_with_solar[ + cavity_fills_with_solar["cavity_reason"] == "Non-Intrusive Data Showed Empty Cavity" + ] + z = asset_list.standardised_asset_list[ + asset_list.standardised_asset_list[asset_list.STANDARD_LANDLORD_PROPERTY_ID].isin( + check[asset_list.landlord_property_id].values) + ] + z[asset_list.ATTRIBUTE_HAS_SOLAR].value_counts() + pd.set_option('display.max_columns', 
None) + z[[asset_list.STANDARD_FULL_ADDRESS, asset_list.STANDARD_POSTCODE, asset_list.ATTRIBUTE_HAS_SOLAR]] + + not_flagged = asset_list.standardised_asset_list[ + pd.isnull(asset_list.standardised_asset_list["solar_reason"]) + ] + # For everything not flagged for solar, identify why + reasons = [] + for _, x in not_flagged.iterrows(): + if x[asset_list.STANDARD_PROPERTY_TYPE] == "flat": + reason = "property is a flat" + else: + x[asset_list.EPC_API_DATA_NAMES["mainheat-description"]] + + reasons.append( + { + asset_list.DOMNA_PROPERTY_ID: x["asset_list.DOMNA_PROPERTY_ID"], + "solar_exclusion_reason": reason, + } + ) + asset_list.load_contact_details( local_filepath=os.path.join(data_folder, "Full property list wth D&V report V look up 12.2.25.xlsx"), sheet_name="Report 1", diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index ea8650b6..cda32faa 100644 --- a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -187,7 +187,9 @@ class GoogleSolarApi: # We constrain the roof area, based on the floor area to be more conservative self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2'] - if self.roof_area > property_instance.roof_area * self.ROOF_AREA_TOLERANCE: + if ( + self.roof_area > property_instance.roof_area * self.ROOF_AREA_TOLERANCE + ) | (self.roof_area < (2 - self.ROOF_AREA_TOLERANCE) * property_instance.roof_area): self.roof_area = property_instance.roof_area self.floor_area = self.insights_data["solarPotential"]["wholeRoofStats"]['groundAreaMeters2'] diff --git a/etl/customers/mod/pilot/2. Create Excel Model.py b/etl/customers/mod/pilot/2. Create Excel Model.py index 0e057a25..e656c96e 100644 --- a/etl/customers/mod/pilot/2. Create Excel Model.py +++ b/etl/customers/mod/pilot/2. 
Create Excel Model.py @@ -368,10 +368,16 @@ def app(): pprint(measure_counts[scenario_ids[0]]) pprint(measure_counts[scenario_ids[1]]) - df = scenario_data[scenario_ids[1]] - z = df[ - (df["Walls"] == "Cavity wall, as built, no insulation") & (~df["Recommendation: cavity_wall_insulation"]) - ] + # Do not get to EPC B: + # 5 are flats + # 1) 34 Luffenham Place, Chicksands SG17 5XH, has been surveyed as having a low performing heat pump - + # should be looked at but several surrounding properties have been surveyed in a similar fashion + # 2) 42, Muscott Close, Shipton Bellinger SP9 7TX, has an oil boiler and the bills go up recommending HHRSH. + # we could non-intrusively recommend a heat pump. + # 3) 33 Blenheim Crescent, Ruislip, HA4 7HA, 100021455241 Solar potential modelling returned nothing - + # manual review indicates that there are multiple trees surrouding the south facing side of the property + # 4) 10 Bower Green, Shrivenham, SN6 8TU - Solar isn't recommended without further survey due to the local + # area being surrounded by trees # Scenario adjustments: # Exclude: boiler_upgrade From 1d48ede60eb47dd41f5de9b7f28736ef5bfb6a38 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 23 Mar 2025 18:48:22 +0000 Subject: [PATCH 244/255] fixing ventilation negative kwh --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 11 +-- asset_list/app.py | 95 ++++++++++--------- backend/Property.py | 4 +- .../mod/pilot/2. 
Create Excel Model.py | 74 +++++++++++++-- recommendations/Costs.py | 6 +- recommendations/Recommendations.py | 15 ++- 8 files changed, 141 insertions(+), 68 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index eddeabdc..a6b8f973 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1249,20 +1249,19 @@ class AssetList: (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & (~self.standardised_asset_list['non-intrusives: Material'].isin( ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] - ) - ) + )) ) self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( extraction_wall_filter & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - ) + )) # Also include work without the SAP filter as optimistic self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( - extraction_wall_filter - ) + extraction_wall_filter & ( + ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + )) elif self.old_format_non_intrusives_present: print("Review these categories with Kieran") diff --git a/asset_list/app.py b/asset_list/app.py index 13621448..f2a85ac3 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -418,7 +418,7 @@ def app(): epc_df = pd.concat(epc_data) epc_df["estimated"] = epc_df["estimated"].fillna(False) - z = epc_df[epc_df["domna_property_id"] == eg["domna_property_id"].values[0]] + epc_df["number-habitable-rooms"].mean() + 1 # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] @@ -545,26 +545,19 @@ def 
app(): right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID ) cavity_fills["cavity_reason"] = cavity_fills["cavity_reason"].fillna("Not identified") - cavity_fills["cavity_reason"].value_counts() + print(cavity_fills["cavity_reason"].value_counts()) # Didn't identify 3 properties because they're bedsits # 4 properties were identified, not based on the non-intrusives but instead because # Westward said they were built in 2003/2007. Have adjusted this to use the age from the # epc as well, as EPC says 1975 and they look like 1975 properties - # 58 properties flagged as already having solar: - # - - z = cavity_fills[ - cavity_fills["cavity_reason"] == "Non-Intrusive Data Showed Empty Cavity - property already has solar" - ] - - df = asset_list.standardised_asset_list[ - asset_list.standardised_asset_list[asset_list.STANDARD_LANDLORD_PROPERTY_ID].isin( - z[asset_list.landlord_property_id].values) - ] - eg = df[df[asset_list.STANDARD_LANDLORD_PROPERTY_ID] == "TOTNEWINA0102300"] - - z[["Address", "WFT EDIT Postcode", asset_list.landlord_property_id]] - z[[asset_list.STANDARD_FULL_ADDRESS, asset_list.STANDARD_POSTCODE, asset_list.ATTRIBUTE_HAS_SOLAR]] + # 37 properties flagged as already having solar - these are all because the landlord said they have solar + # e.g. 
+ # https://earth.google.com/web/search/11+Winsland+Avenue+TOTNES+TQ9+5FT/@50.43354465,-3.71318276,46.57468503a, + # 59.14004365d,35y,0h,0t, + # 0r/data=CpABGmISXAolMHg0ODZkMWQxOGE4NWRiZjdkOjB4YjBhM2E5M2Q3YWVlMWEwYhlZYgp7fzdJQCHFfC9027QNwCohMTEgV2luc2xhbmQgQXZlbnVlIFRPVE5FUyBUUTkgNUZUGAIgASImCiQJbxsQEoo3SUARXQcp_HE3SUAZBmiZGJ6yDcAhCA0fqq63DcBCAggBOgMKATBCAggASg0I____________ARAA + # https://earth.google.com/web/search/15+St+Anne%27s+Ct,+Newton+Abbot+TQ12+1TL/@50.53068337,-3.61611128, + # 11.74908956a,135.73212429d,35y,0h,0t, + # 0r/data=CpUBGmcSYQolMHg0ODZkMDVkMjFhODhjZjgxOjB4MjBmMzE2Zjc3MGI2NGMwYxlCxHLw8UNJQCFZqyzALe4MwComMTUgU3QgQW5uZSdzIEN0LCBOZXd0b24gQWJib3QgVFExMiAxVEwYAiABIiYKJAm-r6U2iDdJQBHS5ICRdDdJQBmYGVpmiLINwCG8wcrtqbYNwEICCAE6AwoBMEICCABKDQj___________8BEAA # Check 2) cavity_fills_with_solar = pd.read_excel( @@ -580,37 +573,51 @@ def app(): right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID ) cavity_fills_with_solar["cavity_reason"] = cavity_fills_with_solar["cavity_reason"].fillna("Not identified") + print(cavity_fills_with_solar["cavity_reason"].value_counts()) # 203 properties total # 140 properties were flagged up based on non-intrusives (Non-Intrusive Data Showed Empty Cavity) + # 63 property already has solar - check = cavity_fills_with_solar[ - cavity_fills_with_solar["cavity_reason"] == "Non-Intrusive Data Showed Empty Cavity" - ] - z = asset_list.standardised_asset_list[ - asset_list.standardised_asset_list[asset_list.STANDARD_LANDLORD_PROPERTY_ID].isin( - check[asset_list.landlord_property_id].values) + # Check 3) RDF + rdf = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="RDF CIGA checks" + ) + rdf = rdf.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason", "solar_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + rdf["cavity_reason"] = rdf["cavity_reason"].fillna("Not 
identified") + print(rdf["cavity_reason"].value_counts()) + # 264 properties are not identified, 261 of which are due to the fact they contain materials + # The other 3 were determined to be eligible for solar instead + # Many of these units that were identified for rdf works could be solar jobs + + rdf_with_solar = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="Solar PV - RDF CIGA Checks" + ) + rdf_with_solar = rdf_with_solar.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason", "solar_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + rdf_with_solar["cavity_reason"] = rdf_with_solar["cavity_reason"].fillna("Not identified") + rdf_with_solar["cavity_reason"].value_counts() + + # All others identified - some flagged as empties due to EPC or landlord data suggesting as much + # 5 not identified due to containing COMPACTED BEAD + + asset_list.standardised_asset_list = asset_list.standardised_asset_list[ + asset_list.standardised_asset_list[asset_list.landlord_property_id] ] - z[asset_list.ATTRIBUTE_HAS_SOLAR].value_counts() - pd.set_option('display.max_columns', None) - z[[asset_list.STANDARD_FULL_ADDRESS, asset_list.STANDARD_POSTCODE, asset_list.ATTRIBUTE_HAS_SOLAR]] - - not_flagged = asset_list.standardised_asset_list[ - pd.isnull(asset_list.standardised_asset_list["solar_reason"]) - ] - # For everything not flagged for solar, identify why - reasons = [] - for _, x in not_flagged.iterrows(): - if x[asset_list.STANDARD_PROPERTY_TYPE] == "flat": - reason = "property is a flat" - else: - x[asset_list.EPC_API_DATA_NAMES["mainheat-description"]] - - reasons.append( - { - asset_list.DOMNA_PROPERTY_ID: x["asset_list.DOMNA_PROPERTY_ID"], - "solar_exclusion_reason": reason, - } - ) asset_list.load_contact_details( local_filepath=os.path.join(data_folder, "Full property list wth D&V report V look up 
12.2.25.xlsx"), diff --git a/backend/Property.py b/backend/Property.py index e6e43efe..5dcc76da 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -462,7 +462,7 @@ class Property: if self.simulation_epcs is None: raise ValueError("Simulation EPCs have not been created") - rec_ids = sorted(list(self.simulation_epcs.keys())) + rec_ids = list(self.simulation_epcs.keys()) updated_simulation_epcs = [] for rec_id in rec_ids: sim_epc = self.simulation_epcs[rec_id].copy() @@ -488,8 +488,6 @@ class Property: # Now we havet this data inthe self.updated_simulation_epcs = updated_simulation_epcs - return updated_simulation_epcs - @staticmethod def create_recommendation_scoring_data( property_id, diff --git a/etl/customers/mod/pilot/2. Create Excel Model.py b/etl/customers/mod/pilot/2. Create Excel Model.py index e656c96e..a74e22ec 100644 --- a/etl/customers/mod/pilot/2. Create Excel Model.py +++ b/etl/customers/mod/pilot/2. Create Excel Model.py @@ -78,7 +78,7 @@ def app(): # Set the inputs: portfolio_id = 139 - scenario_ids = [233, 234] + scenario_ids = [237, 238] properties_data, plans_data, recommendations_data = get_data( portfolio_id=portfolio_id, scenario_ids=scenario_ids @@ -299,6 +299,9 @@ def app(): [ "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows", "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms", + "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater", + "heating_cost_current", "hot_water_cost_current", "lighting_cost_current", + "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge" ] ].merge( recommendations_measures_pivot, how="left", on="property_id" @@ -306,6 +309,11 @@ def app(): aggregated_metrics, how="left", on="property_id" ) + df["bills_total_cost"] = ( + df["heating_cost_current"] + df["hot_water_cost_current"] + df["lighting_cost_current"] + + df["appliances_cost_current"] + df["gas_standing_charge"] + 
df["electricity_standing_charge"] + ) + df = df.drop(columns=["property_id"]) for c in ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings"]: df[c] = df[c].fillna(0) @@ -332,6 +340,11 @@ def app(): df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) + # Calculate the relative savings on carbon, kwh, and bills + df["relative_carbon_savings"] = df["co2_equivalent_savings"] / df["co2_emissions"] + df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"] + df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"] + # For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it # the bills go up recommending HHRSH, so it doesn't make it to EPC B # For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump? @@ -360,13 +373,47 @@ def app(): scenario_data[scenario] = df - measure_counts = {} - for scenario in scenario_ids: - recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c] - measure_counts[scenario] = scenario_data[scenario][recommendation_cols].sum().to_dict() + printing_scenario_id = scenario_ids[0] + # EPC breakdown + print(scenario_data[printing_scenario_id]['Predicted Post Works EPC'].value_counts()) + # Cost + # Total cost + print(scenario_data[printing_scenario_id]["total_cost"].sum()) + # Base cost + print(scenario_data[printing_scenario_id]["estimated_cost"].sum()) + # Contingency + print(scenario_data[printing_scenario_id]["contingency"].sum()) + # Costs averaged per unit + print(scenario_data[printing_scenario_id]["total_cost"].mean()) + print(scenario_data[printing_scenario_id]["estimated_cost"].mean()) + print(scenario_data[printing_scenario_id]["contingency"].mean()) - pprint(measure_counts[scenario_ids[0]]) - pprint(measure_counts[scenario_ids[1]]) + # Average 
relative savings + print(scenario_data[printing_scenario_id]["relative_carbon_savings"].mean()) + print(scenario_data[printing_scenario_id]["relative_kwh_savings"].mean()) + print(scenario_data[printing_scenario_id]["relative_bill_savings"].mean()) + + measure_details = {} + for scenario in scenario_ids: + measure_details[scenario] = {} + recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c] + measure_details[scenario]["count"] = scenario_data[scenario][recommendation_cols].sum().to_dict() + # Get average cost per measure + measure_columns = [ + c.split("Recommendation: ")[1] for c in scenario_data[scenario].columns if "Recommendation:" in c + ] + # Take the mean, drop zero columns + measure_costs = {} + for m in measure_columns: + measure_costs[m] = float(scenario_data[scenario][scenario_data[scenario][m] > 0][m].mean()) + measure_details[scenario]["cost_per_measure"] = measure_costs + + pprint(measure_details[scenario_ids[0]]["count"]) + pprint(measure_details[scenario_ids[1]]["count"]) + + # Cost per measures + pprint(measure_details[scenario_ids[0]]["cost_per_measure"]) + pprint(measure_details[scenario_ids[1]]["cost_per_measure"]) # Do not get to EPC B: # 5 are flats @@ -392,13 +439,20 @@ def app(): scenario_metrics = {} for scenario in scenario_ids: df = scenario_data[scenario].copy() - df["cost_per_sap_point"] = df["total_cost"] / df["sap_points"] - df["cost_per_carbon"] = df["total_cost"] / df["co2_equivalent_savings"] + avg_savings = df[ ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost", - "cost_per_sap_point", "cost_per_carbon", "total_cost", "contingency"] + "total_cost", "contingency"] ].mean().to_dict() + avg_savings["cost_per_sap_point"] = avg_savings["total_cost"] / avg_savings["sap_points"] + avg_savings["cost_per_carbon"] = avg_savings["total_cost"] / avg_savings["co2_equivalent_savings"] + scenario_metrics[scenario] = avg_savings + + 
pprint(scenario_metrics[scenario_ids[0]]) + pprint(scenario_metrics[scenario_ids[1]]) # TODO: Add a slide on valuation improvement, on a sample of properties? # TODO: Read in costing data and breakdown + + zz = scenario_recommendations_df[scenario_recommendations_df["type"] == "mechanical_ventilation"] diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 5e90be79..2d486191 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -104,7 +104,7 @@ INSTALLER_ASHP_COSTS = [ BOILER_UPGRADE_SCHEME_ASHP_VALUE = 7500 INSTALLER_SOLAR_BATTERY_COSTS = [ - {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 2030.40, 'installer': 'CEG'}, + {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 3769.89, 'installer': 'JJC'}, # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, # {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} @@ -193,6 +193,8 @@ class Costs: # fittings and trimming doors, as well as scope for damage to the existing wall during preparation. 
IWI_CONTINGENCY = 0.2 + # For air source heat pumps, we inflate the assume cost by quite a bit to account for design and installation + ASHP_CONTINGENCY = 0.35 # Where there is more uncertainty, a higher contingency rate is used HIGH_RISK_CONTINGENCY = 0.2 # When there is less uncertainty, a lower contingency rate is used @@ -1168,7 +1170,7 @@ class Costs: cost = [x for x in INSTALLER_ASHP_COSTS if x][0]["cost"] # We add some contingency since there are additional costs such as resizing radiators, that could be required - subtotal = cost * (1 + self.HIGH_RISK_CONTINGENCY) + subtotal = cost * (1 + self.ASHP_CONTINGENCY) # The costs from installers exclude VAT vat = subtotal * self.VAT_RATE total_cost = subtotal + vat diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 8a6b01ab..2e044e12 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -793,13 +793,26 @@ class Recommendations: ] ).sort_values(["phase", "recommendation_id"], ascending=True).reset_index(drop=True) + # We need the recommendaion type + rec_id_to_type = { + rec["recommendation_id"]: rec["type"] for recs in property_recommendations for rec in recs + } + rec_id_to_type[STARTING_DUMMY_ID_VALUE] = "starting_dummy" + for i in range(0, len(kwh_impact_table)): - current_phase = kwh_impact_table.loc[i, 'phase'] + current = kwh_impact_table.loc[i] + current_phase = current['phase'] previous_phase_id = (current_phase - 1) if (current_phase > 0) else -9999 previous_phase = kwh_impact_table[kwh_impact_table['phase'] == previous_phase_id] if not previous_phase.empty: for col in ["predictions_heating", "predictions_hotwater"]: + # Check if the recommendation type is ventilation + if rec_id_to_type[current["recommendation_id"]] == "mechanical_ventilation": + # We expect the kwh to increase + if kwh_impact_table.loc[i, col] > previous_phase[col].max(): + continue + if kwh_impact_table.loc[i, col] > previous_phase[col].max(): 
kwh_impact_table.loc[i, col] = previous_phase[col].max() From 1d0c8a3e43e25387404cb6d59deb56fa78d48a53 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 27 Mar 2025 18:58:57 +0000 Subject: [PATCH 245/255] standardising asset list for livewest --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 76 +++++-- asset_list/app.py | 71 +++++- asset_list/mappings/built_form.py | 17 +- asset_list/mappings/heating_systems.py | 20 +- asset_list/mappings/property_type.py | 25 ++- asset_list/mappings/walls.py | 3 +- backend/SearchEpc.py | 8 +- .../mod/pilot/2. Create Excel Model.py | 212 +++++++++++++++++- .../mod/pilot/3. Past Project Costs.py | 76 +++++++ 11 files changed, 468 insertions(+), 44 deletions(-) create mode 100644 etl/customers/mod/pilot/3. Past Project Costs.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index a6b8f973..b7c79c79 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -354,7 +354,10 @@ class AssetList: self.local_filepath = local_filepath self.sheet_name = sheet_name # Read in the data - self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + if local_filepath.endswith(".xlsx"): + self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + else: + self.raw_asset_list = pd.read_csv(local_filepath) self.standardised_asset_list = self.raw_asset_list.copy() # Will be used to store aggregated figures against the various work types self.work_type_figures = {} @@ -442,6 +445,9 @@ class AssetList: lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), axis=1 ) + + for _, x in asset_list.iterrows(): + 
SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]) return asset_list raise ValueError(f"Method {method} not recognized") @@ -509,6 +515,18 @@ class AssetList: return str(int(x)) return x + @staticmethod + def _clean_postcode(postcode): + # Remove double spaces + postcode = postcode.replace(" ", " ") + if " " not in postcode: + # Restructure it + return " ".join( + [postcode[:-3], postcode[-3:]] + ) + + return postcode + def init_standardise(self): """ This function is used to standardise the asset list @@ -518,6 +536,10 @@ class AssetList: # Remove rows without a postcode if self.postcode_colname is not None: self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname]) + # We also clean postcode columns where if there is not space, we create one + self.standardised_asset_list[self.postcode_colname] = self.standardised_asset_list[ + self.postcode_colname + ].apply(self._clean_postcode) # We clean up portential non-breaking spaces, and double spaces for col in [ @@ -667,7 +689,8 @@ class AssetList: "#MULTIVALUE", "This cell has an external reference that can't be shown or edited. Editing this cell will " "remove the external reference.", - "ND" + "ND", + 'PIMSS EMPTY' ] if pd.isnull(date_str) or date_str in known_errors: @@ -693,7 +716,7 @@ class AssetList: if str(date_str).isdigit() & (len(str(date_str)) == 4): return int(date_str) - raise NotImplementedError("Unhandled format for year built - implement me") + raise NotImplementedError(f"Unhandled format for year built, value is {date_str} - implement me") self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[ self.landlord_year_built @@ -2376,12 +2399,12 @@ class AssetList: outcomes_filepath, outcomes_sheetname, outcomes_postcode, - outcomes_houseno + outcomes_houseno, + outcomes_id ): if outcomes_filepath is None: return - # ToDO: Parameterise for future use? 
self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) self.outcomes["row_id"] = self.outcomes.index @@ -2390,6 +2413,26 @@ class AssetList: lookup = [] nomatch = [] for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)): + + # Check if we have an id + oid = x[outcomes_id] if outcomes_id is not None else None + + if oid is not None: + matched = self.standardised_asset_list[ + (self.standardised_asset_list[ + self.STANDARD_LANDLORD_PROPERTY_ID + ].str.strip() == oid) + ] + + if matched.shape[0] == 1: + lookup.append( + { + "row_id": x["row_id"], + self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] + } + ) + continue + address_clean = x["Address"].lower().replace(",", "").replace(" ", " ") matched = self.standardised_asset_list[ @@ -2407,20 +2450,6 @@ class AssetList: ) continue - if "UPRN" in x: - matched = self.standardised_asset_list[ - self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == x["UPRN"] - ] - - if matched.shape[0] == 1: - lookup.append( - { - "row_id": x["row_id"], - self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0] - } - ) - continue - matched = self.standardised_asset_list[ (self.standardised_asset_list[self.STANDARD_POSTCODE] == x[outcomes_postcode]) ].copy() @@ -2459,6 +2488,9 @@ class AssetList: self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)] lookup = pd.DataFrame(lookup) + if lookup.empty: + return + # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times # Where we have multiple rows, we want to make a call on what the action should be. 
For example, # there may be properties that have been visited multiple times where the outcome was "See notes" implying @@ -2529,9 +2561,13 @@ class AssetList: else "INSTALL / CANCELLATION DATE" ) + submission_col = ( + "SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS" + ) + # We just need to check if any were cancelled master_to_append = master_data[ - ["UPRN", install_col, "SUBMISSION DATE"] + ["UPRN", install_col, submission_col] ].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"}) master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") diff --git a/asset_list/app.py b/asset_list/app.py index f2a85ac3..78ad1a29 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -88,6 +88,67 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # Live West (2018 Asset list) + data_folder = ( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/2018 Asset List" + ) + data_filename = "LIVEWEST STOCK - 23rd October 2018.xlsx" + sheet_name = "Assets" + postcode_column = 'Postcode' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Year" + landlord_os_uprn = None + landlord_property_type = "Property Archetype" + landlord_built_form = None + landlord_wall_construction = None + landlord_heating_system = "Heating Fuel Type" + landlord_existing_pv = None + landlord_property_id = "Uprn - DO NOT DELETE" + outcomes_filename = "RT - LiveWest.xlsx" + outcomes_sheetname = "Feedback" + outcomes_postcode = "Poscode" + outcomes_houseno = "No." 
+ outcomes_id = "UPRN" + master_filepaths = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/Rolling Master " + "- redacted for analysis/CAVITY-Table 1.csv" + ] + master_to_asset_list_filepath = None + + # Live West (South West asset list) + data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March " + "2025/Livewest Asset List (Original) - csv") + data_filename = "Report-Table 1.csv" + sheet_name = None + postcode_column = 'Postcode' + fulladdress_column = "T1_Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Yr" + landlord_os_uprn = None + landlord_property_type = "T1_AssetType" + landlord_built_form = "T1_AssetType" + landlord_wall_construction = "Wall Type Cavity" + landlord_heating_system = "Heating Fuel" + landlord_existing_pv = None + landlord_property_id = "T1_UPRN" + outcomes_filename = "RT - LiveWest.xlsx" + outcomes_sheetname = "Feedback" + outcomes_postcode = "Poscode" + outcomes_houseno = "No." 
+ outcomes_id = "UPRN" + master_filepaths = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/Rolling Master " + "- redacted for analysis/CAVITY-Table 1.csv" + ] + master_to_asset_list_filepath = None + # PFP East data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East" data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx" @@ -218,6 +279,7 @@ def app(): # landlord_year_built = "YEAR BUILT" # landlord_os_uprn = None # landlord_property_type = "Property type" + # landlord_built_form = None # landlord_wall_construction = "Wall Constuction" # landlord_heating_system = "Heating" # landlord_existing_pv = None @@ -325,7 +387,8 @@ def app(): outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None, outcomes_sheetname=outcomes_sheetname, outcomes_postcode=outcomes_postcode, - outcomes_houseno=outcomes_houseno + outcomes_houseno=outcomes_houseno, + outcomes_id=outcomes_id ) asset_list.flag_survey_master( @@ -340,7 +403,7 @@ def app(): epc_api_only = False force_retrieve_data = False skip = None # Used to skip already completed chunks - chunk_size = 5000 + chunk_size = 2000 filename = "Chunk {i}.csv" download_folder = os.path.join(data_folder, "Chunks") if not os.path.exists(download_folder): @@ -355,6 +418,8 @@ def app(): if all(x in folder_contents for x in downloaded_files): skip = max(chunk_indexes) + # folder_contents = [f for f in folder_contents if "nodata" not in f and f.endswith(".csv")] + for i in range(0, len(asset_list.standardised_asset_list), chunk_size): print(f"Processing chunk {i} to {i + chunk_size}") if skip is not None and not force_retrieve_data: @@ -418,8 +483,6 @@ def app(): epc_df = pd.concat(epc_data) epc_df["estimated"] = epc_df["estimated"].fillna(False) - epc_df["number-habitable-rooms"].mean() + 1 - # We expand out the recommendations recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] 
diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 915f84c6..0da1f412 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -40,5 +40,20 @@ BUILT_FORM_MAPPINGS = { 'House': 'unknown', 'Second Floor Flat': 'mid-floor', 'First Floor Flat': 'ground floor', - 'Room Only': 'unknown' + 'Room Only': 'unknown', + + 'End Terrace Housex': 'end-terrace', + 'Mid Terrace Bungalow': 'mid-terrace', + 'End Terrace Bungalow': 'end-terrace', + 'Mid Terrace House': 'mid-terrace', + 'Detached Bungalow': 'detached', + 'End Terrace House': 'end-terrace', + 'Mid Terrace Housekeeping ': 'mid-terrace', + 'Semi Detached Bung': 'semi-detached', + 'Guest Room': 'unknown', + 'Coach House': 'detached', + 'Office Buildings': 'unknown', + 'Maisonnette': 'mid-floor', + 'Bedspace': 'unknown' + } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 73e2679e..a11ce418 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -16,7 +16,12 @@ STANDARD_HEATING_SYSTEMS = { "unknown", "communal gas boiler", "high heat retention storage heaters", - "room heaters" + "room heaters", + 'electric fuel', + 'oil fuel', + 'solid fuel', + 'gas combi boiler', + 'unknown' } HEATING_MAPPINGS = { @@ -106,5 +111,16 @@ HEATING_MAPPINGS = { 'Quantum storage heaters (Old SH on EPC)': 'high heat retention storage heaters', 'Quantum storage heaters': 'high heat retention storage heaters', 'Air Source (EPC says SH)': 'air source heat pump', - 'ASHP - Was logged as oil': 'air source heat pump' + 'ASHP - Was logged as oil': 'air source heat pump', + 'Ground Source': 'ground source heat pump', + 'District Heating': 'district heating', + 'Mains Gas (Communal)': 'communal gas boiler', + 'LPG': 'boiler - other fuel', + 'Mains Gas': 'gas condensing boiler', + + 'ELECTRIC': 'electric fuel', + 'OIL': 'oil fuel', + 'SOLID FUEL': 'solid fuel', + 'GAS': 'gas combi boiler', + 
'DO NOT SURVEY': 'unknown' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index add53cd8..4a4bcb54 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -92,5 +92,28 @@ PROPERTY_MAPPING = { 'Guest room in a complex': 'other', 'PIMSS EMPTY': 'bedsit', 'Room Only': 'other', - 'Detached Property': 'house' + 'Detached Property': 'house', + 'End Terrace Housex': 'house', + 'Coach House': 'coach house', + 'Mid Terrace Bungalow': 'bungalow', + 'End Terrace Bungalow': 'bungalow', + 'Mid Terrace House': 'house', + 'Detached Bungalow': 'bungalow', + 'End Terrace House': 'house', + 'Mid Terrace Housekeeping ': 'house', + 'Maisonnette': 'maisonette', + 'Guest Room': 'unknown', + 'Office Buildings': 'unknown', + 'Semi Detached Bung': 'bungalow', + 'Bedspace': 'bedsit', + + 'Houses/Bungalows': 'bungalow', + 'Bedsits': 'bedsit', + 'Unknown': 'unknown', + 'Sheltered Flats/besits': 'flat', + 'House/Bungalow ': 'bungalow', + 'Low/Med Rise Flats/Mais': 'flat', + 'Staff/Comm': 'other', + 'A Rooms': 'other' + } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 894d9e01..065aa988 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -135,5 +135,6 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Solid brick EWI installed': 'insulated solid brick', 'Cavity Cavity batts': 'filled cavity', 'Cavity CWI Completed by Dyson': 'filled cavity', - None: "unknown" + None: "unknown", + "Cavity": "cavity unknown insulation", } diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index d33b2e70..2b3f0c02 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -207,12 +207,12 @@ class SearchEpc: try: # Updated regex to catch house numbers including alphanumeric ones - pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' + pattern = r'(?i)(?:flat|apartment|room)\s*(\d+\w*)|^\s*(\d+\w*)' match1 = re.search(pattern, address) if match1: return next(g for g in 
match1.groups() if g is not None) - pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)' + pattern2 = r'(?i)(flat|apartment|room)\s*([a-zA-Z]?\d+[a-zA-Z]?)' match2 = re.search(pattern2, address) if match2: return match2.group(2) @@ -226,8 +226,8 @@ class SearchEpc: continue if part == postcode.split(" ")[1]: continue - return part.rstrip( - ",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + return part.rstrip(",") + # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found diff --git a/etl/customers/mod/pilot/2. Create Excel Model.py b/etl/customers/mod/pilot/2. Create Excel Model.py index a74e22ec..9a9eda86 100644 --- a/etl/customers/mod/pilot/2. Create Excel Model.py +++ b/etl/customers/mod/pilot/2. Create Excel Model.py @@ -98,7 +98,7 @@ def app(): ) property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains("pitched", case=False) - property_asset_data["pre_2002"] = property_asset_data["BUILD_YEAR"] < 2002 + property_asset_data["pre_1970"] = property_asset_data["BUILD_YEAR"] < 1970 property_asset_data["wall_type"] = property_asset_data["walls"].str.split(" ").str[0].str.strip() property_asset_data["is_insulated"] = ( property_asset_data["walls"].str.split(",").str[1].str.strip().isin( @@ -111,11 +111,11 @@ def app(): property_asset_data["is_pitched"] = np.where( property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof" ) - property_asset_data["pre_2002"] = np.where( - property_asset_data["pre_2002"], "Pre 2002", "Post 2002" + property_asset_data["pre_1970"] = np.where( + property_asset_data["pre_1970"], "Pre 1970", "Post 1970" ) - archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_2002"] + archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_1970"] assigned_archetypes = ( property_asset_data.groupby( @@ -129,8 
+129,8 @@ def app(): ) # Most prominent archetypes - prominent_archetypes = assigned_archetypes.head(3) - other_archetypes = assigned_archetypes.tail(-3) + prominent_archetypes = assigned_archetypes.head(6) + other_archetypes = assigned_archetypes.tail(-6) # 2 or fewer properties in the other archetypes property_asset_data = property_asset_data.merge( @@ -195,6 +195,13 @@ def app(): reset_index() .rename(columns={"archetype_group": "Archetype"}) ) + property_types = ( + (property_asset_data["property_type"] + ": " + property_asset_data["built_form"]). + value_counts(). + to_frame(). + reset_index() + .rename(columns={"index": "Property Type", 0: "Count"}) + ) # epc breakdown epc_breakdown = ( @@ -345,6 +352,11 @@ def app(): df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"] df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"] + # Add on the archetype + df = df.merge( + property_asset_data[["uprn", "archetype_group"]], how="left", left_on="UPRN", right_on="uprn" + ) + # For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it # the bills go up recommending HHRSH, so it doesn't make it to EPC B # For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump? @@ -451,8 +463,190 @@ def app(): pprint(scenario_metrics[scenario_ids[0]]) pprint(scenario_metrics[scenario_ids[1]]) - # TODO: Add a slide on valuation improvement, on a sample of properties? 
+ scenario_data[scenario_ids[0]]["loft_insulation"][ + scenario_data[scenario_ids[0]]["loft_insulation"] > 0 + ].mean() - # TODO: Read in costing data and breakdown + scenario_data[scenario_ids[0]]["cavity_wall_insulation"][ + scenario_data[scenario_ids[0]]["cavity_wall_insulation"] > 0 + ].mean() - zz = scenario_recommendations_df[scenario_recommendations_df["type"] == "mechanical_ventilation"] + # Testing checking floor risk + + import requests + + def get_flood_risk(lat, lon, radius_km=1): + url = "https://environment.data.gov.uk/flood-monitoring/id/floods" + params = { + 'lat': lat, + 'long': lon, + 'dist': radius_km # search radius in km + } + + response = requests.get(url, params=params) + response.raise_for_status() + data = response.json() + + flood_warnings = data.get("items", []) + + if not flood_warnings: + print("No active flood warnings near this location.") + else: + print(f"{len(flood_warnings)} warning(s) found near the location:") + for warning in flood_warnings: + print(f"- Area: {warning.get('description')}") + print(f" Severity: {warning.get('severity')} (Level {warning.get('severityLevel')})") + print(f" Message changed at: {warning.get('timeMessageChanged')}") + print() + + return flood_warnings + + from shapely.geometry import shape, Point + def get_flood_areas_near_point(lat, lon, radius_km=2): + url = "https://environment.data.gov.uk/flood-monitoring/id/floodAreas" + params = { + 'lat': lat, + 'long': lon, + 'dist': radius_km + } + + response = requests.get(url, params=params) + response.raise_for_status() + return response.json().get("items", []) + + def point_in_flood_area(lat, lon): + flood_areas = get_flood_areas_near_point(lat, lon, radius_km=1) + point = Point(lon, lat) # GeoJSON uses (lon, lat) format + + for area in flood_areas: + polygon_url = area.get("polygon") + if not polygon_url: + continue + + polygon_response = requests.get(polygon_url) + polygon_response.raise_for_status() + polygon_geojson = polygon_response.json() + + 
features = polygon_geojson.get("features", []) + if not features: + continue + + flood_polygon = shape(features[0]['geometry']) + + try: + is_inside = flood_polygon.contains(point) + except: + is_inside = False + + if is_inside: + print(f"📍 Point is inside flood area: {area['label']} ({area['notation']})") + return area + + from tqdm import tqdm + floor_warnings_data = [] + for _, property in tqdm(property_asset_data.iterrows(), total=len(property_asset_data)): + # warnings = floor_warnings_data.extend( + # get_flood_risk(lat=property["LATITUDE"], lon=property["LONGITUDE"], radius_km=1) + # ) + + resp = point_in_flood_area(lat=property["LATITUDE"], lon=property["LONGITUDE"]) + if resp: + floor_warnings_data.append( + { + "uprn": property["uprn"], + "address": property["address"], + "postcode": property["postcode"], + "area": resp + } + ) + continue + + import plotly.graph_objects as go + + labels = [ + "House_Cavity_Insulated_Pitched roof_Pre 1970", + "House_Cavity_Insulated_Pitched roof_Post 1970", + "House_Cavity_Uninsulated_Pitched roof_Pre 1970", + "House_Cavity_Uninsulated_Pitched roof_Post 1970", + "other", + "House_System_Uninsulated_Pitched roof_Pre 1970", + "House_Solid_Uninsulated_Not Pitched Roof_Pre 1970" + ] + + values = [62, 36, 21, 16, 16, 4, 2] + + hovertext = [ + "Loft insulation, draft proofing", + "Top-up loft insulation", + "Cavity wall insulation, loft insulation", + "Cavity wall insulation, ventilation", + "Bespoke retrofit measures", + "External wall insulation, roof insulation", + "Flat roof insulation, internal wall insulation" + ] + + fig = go.Figure(go.Treemap( + labels=labels, + parents=[""] * len(labels), # No root + values=values, + hovertext=hovertext, + hoverinfo="text", + textinfo="none", + marker=dict( + line=dict(color="white", width=4), + colors=values, + colorscale="Blues" + ) + )) + + fig.update_layout( + margin=dict(t=10, l=10, r=10, b=10), + plot_bgcolor="white", + paper_bgcolor="white" + ) + + fig.show() + + # Get the 
recommended measures by scenario id + recommendation_cols = [c for c in scenario_data[scenario_ids[1]].columns if "Recommendation:" in c] + measure_counts_by_scenario = scenario_data[scenario_ids[1]].groupby("archetype_group")[ + recommendation_cols + ].sum().reset_index() + + measure_counts_by_scenario.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/measure_counts_by_scenario.csv" + ) + + # Estimate average valuation improvment by scenarios + valuation_data = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/property_valuation.csv" + ) + + from backend.ml_models.Valuation import PropertyValuation + + uplift = [] + for _, x in valuation_data.iterrows(): + uprn = x["uprn"] + + to_append = {"uprn": uprn} + for _id in scenario_ids: + scenario = scenario_data[_id][ + scenario_data[_id]["uprn"] == uprn + ].squeeze() + + val = PropertyValuation.estimate_valuation_improvement( + current_value=x["valuation"], + current_epc=scenario["Current EPC Rating"].value, + target_epc=scenario["Predicted Post Works EPC"], + total_cost=None + ) + + to_append[_id] = val["average_increase"] + + uplift.append(to_append) + + uplift = pd.DataFrame(uplift) + print(uplift[scenario_ids[0]].mean()) + # £8,161 + print(uplift[scenario_ids[1]].mean()) + # £16,938 diff --git a/etl/customers/mod/pilot/3. Past Project Costs.py b/etl/customers/mod/pilot/3. Past Project Costs.py new file mode 100644 index 00000000..79a0493c --- /dev/null +++ b/etl/customers/mod/pilot/3. 
Past Project Costs.py @@ -0,0 +1,76 @@ +import pandas as pd + +# Get the wave 2 costing data and produce some breakdowns +costs = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/Measure cost study for MOD.xlsx", + header=2 +) + +# Get the EPC data for these + + +# Cavity +cwi_costs = costs[ + ['Model', 'Total invoiced (including VAT)'] +].copy() +cwi_costs["Model"] = "CWI - " + cwi_costs["Model"] +cwi_costs = cwi_costs[~pd.isnull(cwi_costs["Total invoiced (including VAT)"])] + +# Loft +li_costs = costs[ + ['Model.2', 'Total invoiced (including VAT).2'] +].copy() +li_costs["Model.2"] = "LI - " + li_costs["Model.2"] +li_costs = li_costs[~pd.isnull(li_costs["Total invoiced (including VAT).2"])] +# Rename +li_costs.columns = ["Model", "Total invoiced (including VAT)"] + +# Windows +windows_costs = costs[ + ['Model.3', 'Total invoiced (including VAT).3'] +].copy() +windows_costs["Model.3"] = "Windows - " + windows_costs["Model.3"] +windows_costs = windows_costs[~pd.isnull(windows_costs["Total invoiced (including VAT).3"])] +# Rename +windows_costs.columns = ["Model", "Total invoiced (including VAT)"] + +# Doors +doors_costs = costs[ + ['Model.4', 'Total invoiced (including VAT).4'] +].copy() +doors_costs["Model.4"] = "Doors - " + doors_costs["Model.4"] +doors_costs = doors_costs[~pd.isnull(doors_costs["Total invoiced (including VAT).4"])] +# Rename +doors_costs.columns = ["Model", "Total invoiced (including VAT)"] + +# ASHP +ashps_costs = costs[ + ['Model.5', 'Total invoiced (including VAT).5'] +].copy() +ashps_costs["Model.5"] = "ASHP - " + ashps_costs["Model.5"] +ashps_costs = ashps_costs[~pd.isnull(ashps_costs["Total invoiced (including VAT).5"])] +# Rename +ashps_costs.columns = ["Model", "Total invoiced (including VAT)"] + +# Solar +solar_costs = costs[ + ['Model.6', 'Total invoiced (including VAT).6'] +].copy() +solar_costs["Model.6"] = "Solar - " + solar_costs["Model.6"] +solar_costs = 
solar_costs[~pd.isnull(solar_costs["Total invoiced (including VAT).6"])] +# Rename +solar_costs.columns = ["Model", "Total invoiced (including VAT)"] + +fabric_costing_data = pd.concat([cwi_costs, li_costs]) +windows_doors_costing_data = pd.concat([windows_costs, doors_costs]) + +windows_doors_costing_data.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/windows_doors_costs.csv" +) +fabric_costing_data.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/fabric_costing_data.csv" +) +ashps_costs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/ashps_costs.csv") +solar_costs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/solar_costs.csv") + +project_cost_by_age = costs[["Property age ", "TOTAL Cost of Works"]].groupby("Property age ").mean().reset_index() From 2d69c671d337ab1f7b58b07bb4cf08ed4d20e62d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 1 Apr 2025 22:43:50 +0100 Subject: [PATCH 246/255] debugging handling of year built --- asset_list/AssetList.py | 146 ++++++++++++++++++------- asset_list/app.py | 116 +++++++++++++++++++- asset_list/mappings/built_form.py | 11 +- asset_list/mappings/heating_systems.py | 15 ++- asset_list/mappings/property_type.py | 9 +- etl/customers/benyon/epc_data.py | 71 ++++++++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 10 ++ 7 files changed, 327 insertions(+), 51 deletions(-) create mode 100644 etl/customers/benyon/epc_data.py diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index b7c79c79..e1df5342 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -368,7 +368,7 @@ class AssetList: self.contact_detail_fields = None self.outcomes = None self.outcomes_no_match = None - self.outcomes_for_output = None + self.outcomes_for_output = pd.DataFrame() self.master_surveyed = None # We detect the presence of the non-intrusive columns @@ -701,6 +701,13 @@ 
class AssetList: if match: return int(match.group(1)) # Extract the year and convert to integer if "-" in date_str: + + # Count the number of times we have "-", as we've seen double ranges + # (when we have extensions) so the format is like this: + # 'G: 1983-1990, H: 1991-1995' + if date_str.count("-") == 2: + # We have a range + return int(date_str.split("-")[1].split(",")[0]) # We probably have a range return int(date_str.split("-")[1].strip()) @@ -1084,8 +1091,15 @@ class AssetList: ) elif self.old_format_non_intrusives_present: non_intrusives_wall_filter = ( - self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().isin( + self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( ["empty cavity", "partial fill"] + ) | ( + ( + self.standardised_asset_list['non-intrusives: WFT Findings'] + .str.lower().str.strip().str.contains("empty cavity|partial fill") & + ~self.standardised_asset_list['non-intrusives: WFT Findings'] + .astype(str).str.lower().str.strip().str.contains("major access issues") + ) ) ) else: @@ -1114,6 +1128,20 @@ class AssetList: (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) ) + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) & + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) & ( + # If the property has solar, there's a chance it won't qualify + self.standardised_asset_list["property_has_solar"] + ) + ) + else: self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & @@ -1158,7 +1186,7 @@ class AssetList: ~self.standardised_asset_list["property_has_solar"] ) - # We also add a filter on 
anything that was generally identified by the none-intrusives + # We also add a filter on anything that was generally identified by the non-intrusives self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & non_intrusives_wall_filter @@ -1290,7 +1318,8 @@ class AssetList: print("Review these categories with Kieran") extraction_wall_filter = ( self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( - ["retro drilled", "retro filled", "fibre from build", "polybead"] + ["retro drilled", "retro filled", "fibre from build", "polybead", "retro drilled and filled", + "retro drilled & filled", "blown in white wool", "blown in yellow wool"] ) ) @@ -1727,25 +1756,6 @@ class AssetList: self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) - self.standardised_asset_list["test"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - self.standardised_asset_list["test"] = np.where( - self.standardised_asset_list["solar_eligible_other_floor"], - False, - self.standardised_asset_list["test"] - ) - self.standardised_asset_list["solar_eligible_other_floor_sap_above_threshold"] = ( not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate @@ -1869,6 +1879,32 @@ class AssetList: ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) + # Check if the boiler is electric + # We check if it contains both the terms boiler & electric + has_electric_boiler = ( + ( + 
self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().isin( + ["boiler and radiators, electric", "boiler and underfloor heating, electric"]) + ) | ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] == "electric boiler" + ) + ) + + # We check for a specific sub-set of properties which are uninsulated solid wall properties that are EPC E + # or below (we'll use 57 as a threshold) - These are for a pilot with Net Zero Renewables + self.standardised_asset_list["solar_eligible_solid_wall_uninsulated"] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate - in this case, we can also take + # electric boilers + (correct_heating_system | has_electric_boiler) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are uninsulated solic + ~walls_meet_solar_requirements & + (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 57) + ) + # Drop anything we don't need self.standardised_asset_list = self.standardised_asset_list.drop( columns=["walls_u_value", "roof_u_value", "floor_u_value"] @@ -2009,7 +2045,8 @@ class AssetList: ), "solar_eligible_other_floor_needs_loft_needs_heating_upgrade_sap_above_threshold": ( "Solar Eligible, Other Floor, Needs Loft, Needs Heating Upgrade, SAP Above Threshold" - ) + ), + "solar_eligible_solid_wall_uninsulated": "Solar Eligible, Solid Wall Uninsulated, EPC E or Below", } for variable, reason in solar_reason_map.items(): @@ -2020,20 +2057,30 @@ class AssetList: ) # Flag anything that has existing outcomes - if self.outcomes is not None: - self.standardised_asset_list["cavity_reason"] = np.where( - ( - (self.standardised_asset_list["Surveyed"] > 0) | - (self.standardised_asset_list["Installer Refusal"] > 0) - ), - None, - self.standardised_asset_list["cavity_reason"] - ) + if (self.outcomes is not None) and ("Surveyed" in self.standardised_asset_list.columns): 
+ + if "Installer Refusal" not in self.standardised_asset_list.columns: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (self.standardised_asset_list["Surveyed"] > 0) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + else: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (self.standardised_asset_list["Surveyed"] > 0) | + (self.standardised_asset_list["Installer Refusal"] > 0) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) if self.master_surveyed is not None: self.standardised_asset_list["cavity_reason"] = np.where( ( - (~pd.isnull(self.standardised_asset_list["SUBMISSION DATE"])) + (~pd.isnull(self.standardised_asset_list["submission_date"])) ), None, self.standardised_asset_list["cavity_reason"] @@ -2064,9 +2111,11 @@ class AssetList: ~pd.isnull(self.standardised_asset_list["cavity_reason"]) | ~pd.isnull(self.standardised_asset_list["solar_reason"]) ][self.DOMNA_PROPERTY_ID].values - self.outcomes_for_output = self.outcomes[ - self.outcomes[self.DOMNA_PROPERTY_ID].isin(identified_work) - ] + + if self.DOMNA_PROPERTY_ID in self.outcomes.columns: + self.outcomes_for_output = self.outcomes[ + self.outcomes[self.DOMNA_PROPERTY_ID].isin(identified_work) + ] def flat_analysis(self): @@ -2398,6 +2447,7 @@ class AssetList: self, outcomes_filepath, outcomes_sheetname, + outcomes_address, outcomes_postcode, outcomes_houseno, outcomes_id @@ -2408,6 +2458,12 @@ class AssetList: self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname) self.outcomes["row_id"] = self.outcomes.index + if outcomes_houseno is None: + outcomes_houseno = "houseno" + self.outcomes["houseno"] = self.outcomes[outcomes_address].apply( + lambda x: SearchEpc.get_house_number(x, self.outcomes[outcomes_postcode]) + ) + logger.info("Matching outcomes to asset list") # Merge the outcomes onto the asset list - we check we're able to match sufficiently well lookup = [] @@ -2433,7 +2489,7 @@ class AssetList: ) continue - 
address_clean = x["Address"].lower().replace(",", "").replace(" ", " ") + address_clean = x[outcomes_address].lower().replace(",", "").replace(" ", " ") matched = self.standardised_asset_list[ (self.standardised_asset_list[ @@ -2451,13 +2507,14 @@ class AssetList: continue matched = self.standardised_asset_list[ - (self.standardised_asset_list[self.STANDARD_POSTCODE] == x[outcomes_postcode]) + (self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip() == x[outcomes_postcode]) ].copy() if not matched.empty: matched["houseno"] = matched.apply( lambda x: SearchEpc.get_house_number(x[self.STANDARD_ADDRESS_1], x[self.STANDARD_POSTCODE]), axis=1 ) + matched = matched[ matched["houseno"].astype(str) == str(x[outcomes_houseno]) ] @@ -2469,6 +2526,8 @@ class AssetList: } ) continue + elif matched.shape[0] > 1: + raise NotImplementedError("Check me") elif not matched.empty: # Use levenstein distance to match matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE] @@ -2498,6 +2557,7 @@ class AssetList: # happened multiple times, in this case we judge that the work may not be viable date_col = "Week Commencing" if "Week Commencing" in self.outcomes else "Survey Date" + lookup = lookup.merge( self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id" ) @@ -2568,7 +2628,13 @@ class AssetList: # We just need to check if any were cancelled master_to_append = master_data[ ["UPRN", install_col, submission_col] - ].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"}) + ].rename( + columns={ + "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, + install_col: "survey_status", + submission_col: "submission_date" + } + ) master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") master_surveyed.append(master_to_append) diff --git a/asset_list/app.py b/asset_list/app.py index 78ad1a29..2925e82f 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ 
-88,6 +88,32 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # Southern Midlands + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025" + data_filename = "Southern Housing Midlands Property List - combined.xlsx" + sheet_name = "Sheet 1" + postcode_column = 'Post Code' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Age_1" + landlord_os_uprn = None + landlord_property_type = "Prop_Type" + landlord_built_form = "Prop_Type" + landlord_wall_construction = "Walls_P" + landlord_heating_system = "Heating System" + landlord_existing_pv = None + landlord_property_id = "AssetID" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + # Live West (2018 Asset list) data_folder = ( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/2018 Asset List" @@ -149,6 +175,84 @@ def app(): ] master_to_asset_list_filepath = None + # PFP London + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/London" + data_filename = "PFP AREAS SURROUNDING LONDON - JAY, RUTH & LANE.xlsx" + sheet_name = "PFP SURROUNDING LONDON" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = 
None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # PFP North-West + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-West" + data_filename = "Places for People NORTH WEST - INSPECTIONS MASTER - UPDATE.xlsx" + sheet_name = "CHECKED" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # PFP North-East + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-East" + data_filename = "Places for People NORTH EAST - INSPECTIONS MASTER.xlsx" + sheet_name = "CHECKED" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + 
outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + # PFP East data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East" data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx" @@ -171,6 +275,7 @@ def app(): outcomes_sheetname = None outcomes_postcode = None outcomes_houseno = None + outcomes_id = None master_filepaths = [] master_to_asset_list_filepath = None @@ -264,6 +369,7 @@ def app(): outcomes_houseno = None master_filepaths = [] master_to_asset_list_filepath = None + outcomes_id = None # For ACIS - programme re-build # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" @@ -386,6 +492,7 @@ def app(): asset_list.flag_outcomes( outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None, outcomes_sheetname=outcomes_sheetname, + outcomes_address=outcomes_address, outcomes_postcode=outcomes_postcode, outcomes_houseno=outcomes_houseno, outcomes_id=outcomes_id @@ -403,7 +510,7 @@ def app(): epc_api_only = False force_retrieve_data = False skip = None # Used to skip already completed chunks - chunk_size = 2000 + chunk_size = 5000 filename = "Chunk {i}.csv" download_folder = os.path.join(data_folder, "Chunks") if not os.path.exists(download_folder): @@ -418,6 +525,9 @@ def app(): if all(x in folder_contents for x in downloaded_files): skip = max(chunk_indexes) + if any(x in folder_contents for x in downloaded_files): + skip = max([i for i in chunk_indexes if filename.format(i=i) in folder_contents]) + # folder_contents = [f for f in folder_contents if "nodata" not in f and f.endswith(".csv")] for i in range(0, len(asset_list.standardised_asset_list), chunk_size): @@ -582,8 +692,6 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) - # TODO: We should break out the identification of work types to flag blocks of flats specifically - # TODO: Append existing 
outcomes onto the sheet. asset_list.identify_worktypes(cleaned) pprint(asset_list.work_type_figures) @@ -729,7 +837,7 @@ def app(): asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) # If we have outcomes, we add a tab with the outcomes - if asset_list.outcomes_for_output is not None: + if not asset_list.outcomes_for_output.empty: asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) # Store the Hubspot export as a csv diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 0da1f412..dbb25e9b 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -1,3 +1,5 @@ +import numpy as np + STANDARD_BUILT_FORMS = { "unknown", # Houses @@ -54,6 +56,13 @@ BUILT_FORM_MAPPINGS = { 'Coach House': 'detached', 'Office Buildings': 'unknown', 'Maisonnette': 'mid-floor', - 'Bedspace': 'unknown' + 'Bedspace': 'unknown', + 'Studio (3rd floor and above)': 'top-floor', + 'Adapted Property For Disabled': 'unknown', + 'Studio (2nd floor)': 'mid-floor', + np.nan: 'unknown', + 'Third Floor Flat': 'mid-floor', + '2 Ext. 
Wall Flat': 'mid-terrace', + 'Hostel': 'unknown' } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index a11ce418..f6b0d0ea 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -26,7 +26,7 @@ STANDARD_HEATING_SYSTEMS = { HEATING_MAPPINGS = { "Combi - GAS": "gas combi boiler", - "E7 Storage Heaters": "electric storage heaters", + "E7 Storage Heaters": "high heat retention storage heaters", "District heating system": "district heating", "Condensing Boiler - GAS": "gas condensing boiler", "Boiler Oil/other": "oil boiler", @@ -44,7 +44,7 @@ HEATING_MAPPINGS = { "Gas fire": "other", "Backboiler - Solid fuel": "other", 'combi - gas': 'gas combi boiler', - 'e7 storage heaters': 'electric storage heaters', + 'e7 storage heaters': 'high heat retention storage heaters', 'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler', 'boiler oil/other': 'oil boiler', @@ -117,10 +117,17 @@ HEATING_MAPPINGS = { 'Mains Gas (Communal)': 'communal gas boiler', 'LPG': 'boiler - other fuel', 'Mains Gas': 'gas condensing boiler', - 'ELECTRIC': 'electric fuel', 'OIL': 'oil fuel', 'SOLID FUEL': 'solid fuel', 'GAS': 'gas combi boiler', - 'DO NOT SURVEY': 'unknown' + 'DO NOT SURVEY': 'unknown', + + 'Gas Boiler': 'gas combi boiler', + 'Communal Gas ': 'communal gas boiler', + 'Communal': 'communal gas boiler', + 'Communal Gas': 'communal gas boiler', + 'Wood Burning Boiler': "boiler - other fuel", + 'Oil Fired Boiler': 'oil boiler' + } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 4a4bcb54..be4aa797 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -106,7 +106,6 @@ PROPERTY_MAPPING = { 'Office Buildings': 'unknown', 'Semi Detached Bung': 'bungalow', 'Bedspace': 'bedsit', - 'Houses/Bungalows': 'bungalow', 'Bedsits': 'bedsit', 'Unknown': 'unknown', @@ -114,6 +113,12 @@ 
PROPERTY_MAPPING = { 'House/Bungalow ': 'bungalow', 'Low/Med Rise Flats/Mais': 'flat', 'Staff/Comm': 'other', - 'A Rooms': 'other' + 'A Rooms': 'other', + 'Studio (3rd floor and above)': 'flat', + 'Adapted Property For Disabled': 'unknown', + 'Studio (2nd floor)': 'flat', + 'Third Floor Flat': 'flat', + '2 Ext. Wall Flat': 'flat', + 'Hostel': 'other' } diff --git a/etl/customers/benyon/epc_data.py b/etl/customers/benyon/epc_data.py new file mode 100644 index 00000000..9ba71f2f --- /dev/null +++ b/etl/customers/benyon/epc_data.py @@ -0,0 +1,71 @@ +""" +Rough script to get the EPC data for Benyon +""" + +import pandas as pd +import os +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from asset_list.utils import get_data + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/List of All Properties ecl Grd Rents in " + "Alphabetical Order.xlsx", + header=1 +) +asset_list.columns = ["tennancy", "landlord_id", "landlord_address"] +# Get postcode as the last 2 parts of the address, split on space +asset_list["postcode"] = asset_list["landlord_address"].apply(lambda x: x.split(" ")[-2] + " " + x.split(" ")[-1]) + +asset_list["house_no"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x["landlord_address"], postcode=x["postcode"]), axis=1 +) + +epc_data, errors, no_epc = get_data( + df=asset_list, + manual_uprn_map={}, + epc_auth_token=EPC_AUTH_TOKEN, + uprn_column=None, + fulladdress_column="landlord_address", + address1_column="house_no", + postcode_column="postcode", + property_type_column=None, + built_form_column=None, + epc_api_only=True, + row_id_name="landlord_id", +) + +df = asset_list[asset_list["landlord_id"].isin(no_epc)] +epc_df = pd.DataFrame(epc_data) +epc_df["current-energy-rating"].value_counts() +epc_df["property-type"].value_counts() 
+epc_df["walls-description"].value_counts(normalize=True) + +asset_list = asset_list.merge( + epc_df[ + [ + "landlord_id", "current-energy-rating", "property-type", "total-floor-area", "roof-description", + "walls-description", "co2-emissions-current" + ] + ], + how="left", + left_on="landlord_id", + right_on="landlord_id" +) +asset_list.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list.csv", index=False +) + +asset_list_big = asset_list.merge( + epc_df, + how="left", + left_on="landlord_id", + right_on="landlord_id" +) +asset_list_big.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list_full_data.csv", + index=False +) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 9852cc0d..7da21012 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -332,6 +332,16 @@ class RetrieveFindMyEpc: "Replacement warm air unit": [], "Secondary glazing": ["secondary_glazing"], "Condensing heating unit": ["boiler_upgrade"], + '???': [], + 'Solar photovoltaic panels, 2.5kWp': ["solar_pv"], + 'Heating controls (programmer, room thermostat and thermostatic radiator valves)': [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + 'Translation missing: en.improvement_code.41.title': [], + "Condensing boiler (separate from the range cooker)": ["boiler_upgrade"], + "Heating controls (programmer and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ] } survey = True From 9e2914156f801a99e32bf8690b79e7e87e04fd6e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 3 Apr 2025 09:06:58 +0100 Subject: [PATCH 247/255] adding retrieve find my epc to engine --- asset_list/AssetList.py | 5 +++-- asset_list/app.py | 1 + asset_list/mappings/built_form.py | 20 ++++++++++++++++++-- asset_list/mappings/heating_systems.py | 20 +++++++++++++++++--- 
asset_list/mappings/property_type.py | 20 ++++++++++++++++++-- asset_list/mappings/walls.py | 11 +++++++++++ backend/app/plan/router.py | 8 ++++++++ backend/app/plan/schemas.py | 11 +++++++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 25 +++++++++++++++++++++++++ 9 files changed, 112 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index e1df5342..5ae3029f 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -1351,7 +1351,8 @@ class AssetList: # Check 1: Does the property have a valid heating system? self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] + ["air source heat pump", "ground source heat pump", "high heat retention storage heaters", + "electric boiler"] ) ) self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = ( @@ -1363,7 +1364,7 @@ class AssetList: self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] - .str.lower().str.contains("air source heat pump|ground source heat pump") + .str.lower().str.contains("air source heat pump|ground source heat pump|boiler and radiators, electric") ) | ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( diff --git a/asset_list/app.py b/asset_list/app.py index 2925e82f..67e18dac 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -111,6 +111,7 @@ def app(): outcomes_postcode = None outcomes_houseno = None outcomes_id = None + outcomes_address = None master_filepaths = [] master_to_asset_list_filepath = None diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index dbb25e9b..aad36fce 100644 --- a/asset_list/mappings/built_form.py +++ 
b/asset_list/mappings/built_form.py @@ -63,6 +63,22 @@ BUILT_FORM_MAPPINGS = { np.nan: 'unknown', 'Third Floor Flat': 'mid-floor', '2 Ext. Wall Flat': 'mid-terrace', - 'Hostel': 'unknown' - + 'Hostel': 'unknown', + 'Flat: Mid Terrace: Mid Floor': 'mid-terrace', + 'Bungalow: SemiDetached': 'semi-detached', + 'Flat: End Terrace: Top Floor': 'end-terrace', + 'Flat: Enclosed End Terrace: Top Floor': 'end-terrace', + 'Maisonette: End Terrace: Ground Floor': 'end-terrace', + 'Flat: End Terrace: Ground Floor': 'end-terrace', + 'Flat: Mid Terrace: Top Floor': 'mid-terrace', + 'House: Detached': 'detached', + 'Flat: End Terrace: Mid Floor': 'end-terrace', + 'House: SemiDetached': 'semi-detached', + 'Flat: Semi Detached: Ground Floor': 'semi-detached', + 'Flat: Semi Detached: Top Floor': 'semi-detached', + 'Flat: Mid Terrace: Ground Floor': 'mid-terrace', + 'House: MidTerrace': 'mid-terrace', + 'House: EndTerrace': 'end-terrace', + 'Bungalow: EndTerrace': 'end-terrace', + 'Bungalow: MidTerrace': 'mid-terrace' } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index f6b0d0ea..714f5434 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -122,12 +122,26 @@ HEATING_MAPPINGS = { 'SOLID FUEL': 'solid fuel', 'GAS': 'gas combi boiler', 'DO NOT SURVEY': 'unknown', - 'Gas Boiler': 'gas combi boiler', 'Communal Gas ': 'communal gas boiler', 'Communal': 'communal gas boiler', 'Communal Gas': 'communal gas boiler', 'Wood Burning Boiler': "boiler - other fuel", - 'Oil Fired Boiler': 'oil boiler' - + 'Oil Fired Boiler': 'oil boiler', + 'Electric (direct acting) room heaters: Panel, convector or radiant heaters Electricity: Electricity': 'room ' + 'heaters', + 'Electric Storage Systems: Integrated storage+direct-acting heater Electricity: Electricity': 'electric storage ' + 'heaters', + 'Community Heating Systems: Community CHP and boilers (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler', + 
'Boiler: D rated Regular Boiler Gas: Mains Gas': 'gas boiler', + 'Boiler: C rated Combi Gas: Mains Gas': 'gas combi boiler', + 'Electric Storage Systems: Fan storage heaters Electricity: Electricity': 'electric storage heaters', + ' ': 'unknown', + 'Boiler: G rated Regular Boiler Gas: Mains Gas': 'gas boiler', + 'Electric Storage Systems: Modern (slimline) storage heaters Electricity: Electricity': 'electric storage heaters', + 'Boiler: E rated Regular Boiler Gas: Mains Gas': 'gas boiler', + 'Boiler: A rated Regular Boiler Electricity: Electricity': 'electric boiler', + 'Community Heating Systems: Community boilers only (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler', + 'Boiler: A rated Combi Gas: Mains Gas': 'gas condensing combi', + 'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index be4aa797..139b1622 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -119,6 +119,22 @@ PROPERTY_MAPPING = { 'Studio (2nd floor)': 'flat', 'Third Floor Flat': 'flat', '2 Ext. 
Wall Flat': 'flat', - 'Hostel': 'other' - + 'Hostel': 'other', + 'House: MidTerrace': 'house', + 'House: EndTerrace': 'house', + 'Flat: Mid Terrace: Mid Floor': 'flat', + 'Bungalow: SemiDetached': 'bungalow', + 'Bungalow: EndTerrace': 'bungalow', + 'Flat: End Terrace: Top Floor': 'flat', + 'Maisonette: End Terrace: Ground Floor': 'maisonette', + 'Flat: End Terrace: Ground Floor': 'flat', + 'Flat: Mid Terrace: Top Floor': 'flat', + 'House: Detached': 'house', + 'Flat: End Terrace: Mid Floor': 'flat', + 'House: SemiDetached': 'house', + 'Flat: Semi Detached: Ground Floor': 'flat', + 'Flat: Semi Detached: Top Floor': 'flat', + 'Flat: Mid Terrace: Ground Floor': 'flat', + 'Bungalow: MidTerrace': 'bungalow', + 'Flat: Enclosed End Terrace: Top Floor': 'flat' } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 065aa988..e5f22f13 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -137,4 +137,15 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Cavity CWI Completed by Dyson': 'filled cavity', None: "unknown", "Cavity": "cavity unknown insulation", + 'SolidBrick: Unknown': 'solid brick unknown insulation', + 'Cavity: Unknown': 'cavity unknown insulation', + 'Cavity: AsBuilt (Post 1995)': 'filled cavity', + 'Cavity: AsBuilt (1976-1982)': 'cavity unknown insulation', + 'SystemBuilt: AsBuilt': 'system built', + 'TimberFrame: AsBuilt': "timber frame unknown insulation", + 'Cavity: AsBuilt (1983-1995)': 'cavity unknown insulation', + 'Cavity: AsBuilt (1983-1995), Cavity: FilledCavity': 'filled cavity', + 'SolidBrick: AsBuilt': 'solid brick unknown insulation', + 'Cavity: FilledCavity': 'filled cavity', + 'SolidBrick: Internal': 'insulated solid brick' } diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index cce47566..45c19484 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -44,6 +44,7 @@ from backend.ml_models.Valuation import PropertyValuation from etl.bill_savings.KwhData import KwhData 
from etl.spatial.OpenUprnClient import OpenUprnClient +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc logger = setup_logger() @@ -514,6 +515,13 @@ async def trigger_plan(body: PlanTriggerRequest): ) ) + # if we have a remote assment data type, we pull the additional data and include it + if body.event_type == "remote_assessment": + logger.info("Retrieving find my epc data") + property_non_invasive_recommendations = RetrieveFindMyEpc.get_from_epc( + epc_searcher.newest_epc + ) + epc_records = patch_epc(patch, epc_records) prepared_epc = EPCRecord( diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index 7db0f16f..4237472d 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -37,6 +37,7 @@ MEASURE_MAP = { VALID_GOALS = ["Increasing EPC"] VALID_HOUSING_TYPES = ["Social", "Private"] +VALID_EVENT_TYPES = ["remote_assessment"] # Define the validation function for inclusions/exclusions @@ -56,10 +57,16 @@ def check_housing_type(value: str) -> str: return value +def check_event_type(value: str) -> str: + assert value in VALID_EVENT_TYPES, f"{value} is not a valid event type" + return value + + # Use Annotated with BeforeValidator for each list item validation InclusionOrExclusionItem = Annotated[str, BeforeValidator(check_inclusion_or_exclusion)] Goal = Annotated[str, BeforeValidator(check_goals)] HousingType = Annotated[str, BeforeValidator(check_housing_type)] +EventType = Annotated[str, BeforeValidator(check_event_type)] class PlanTriggerRequest(BaseModel): @@ -84,3 +91,7 @@ class PlanTriggerRequest(BaseModel): default_u_values: Optional[bool] = True ashp_cop: Optional[float] = 2.8 + + # When performing a remote assessment, if this has been set, it will allow the engine to + # pull data from the find my epc website, to utilise as part of a remote assessment + event_type: Optional[float] = "remote_assessment", diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 
7da21012..5e05d56f 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -3,6 +3,10 @@ import requests from bs4 import BeautifulSoup from datetime import datetime +from utils.logger import setup_logger + +logger = setup_logger() + class RetrieveFindMyEpc: SEARCH_POSTCODE_URL = ( @@ -366,3 +370,24 @@ class RetrieveFindMyEpc: formatted_recommendations.append(to_append) return formatted_recommendations + + @classmethod + def get_from_epc(cls, epc): + # Attempt both methods: + try: + searcher = cls(address=epc["address"], postcode=epc["postcode"]) + find_epc_data = searcher.retrieve_newest_find_my_epc_data() + except Exception as e: + logger.error(f"Error retrieving find my epc data: {e}") + # We attempt with the backup add + searcher = cls(address=epc["address1"], postcode=epc["postcode"]) + find_epc_data = searcher.retrieve_newest_find_my_epc_data() + + non_invasive_recommendations = { + "uprn": epc["uprn"], + "address": epc["address"], + "postcode": epc["postcode"], + "recommendations": find_epc_data["recommendations"], + } + + return non_invasive_recommendations From 777d5ccf06500ed276c30dcbdd72a03df00600a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 3 Apr 2025 09:12:18 +0100 Subject: [PATCH 248/255] updating git ignore with redundant files --- .gitignore | 9 ++++++++- input_property_list.csv | 12 ------------ keyzy_pilot.csv | 3 --- 3 files changed, 8 insertions(+), 16 deletions(-) delete mode 100644 input_property_list.csv delete mode 100644 keyzy_pilot.csv diff --git a/.gitignore b/.gitignore index 63884ad7..5e247d77 100644 --- a/.gitignore +++ b/.gitignore @@ -268,4 +268,11 @@ adhoc adhoc/* etl-router-venv/ -refactor_datasets/ \ No newline at end of file +refactor_datasets/ + +etl/eligibility/ha_15_32/ +cache/ +*/.idea + +*.png +*.pptx \ No newline at end of file diff --git a/input_property_list.csv b/input_property_list.csv deleted file mode 100644 index dc677c88..00000000 --- 
a/input_property_list.csv +++ /dev/null @@ -1,12 +0,0 @@ -address,postcode,Notes,,,, -28 Distillery Wharf,W6 9bf,,,,, -Flat 14 Godley V C House,E2 0LP,,,,, -49 Elderfield Road,E5 0LF,,,,, -26 Stanhope Road,N6 5NG,,,,, -Flat 3 Frederick Building,N1 4BD,,,,, -Flat 4 Frederick Building,N1 4BD,,,,, -"Flat 28, 22 Adelina Grove",E1 3BX,,,,, -"Flat 39, 239 Long Lane",SE1 4PT,,,,, -"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,, -"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,, -88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,, \ No newline at end of file diff --git a/keyzy_pilot.csv b/keyzy_pilot.csv deleted file mode 100644 index b972bcf9..00000000 --- a/keyzy_pilot.csv +++ /dev/null @@ -1,3 +0,0 @@ -address,postcode,Notes,,,, -2 South Terrace,NN1 5JY,,,,, -25 Albert Street,PO12 4TY,,,,, \ No newline at end of file From d25bbab7167a1170e5a82f094b57b874ba68ee86 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 3 Apr 2025 09:14:52 +0100 Subject: [PATCH 249/255] adding retrieval of find my epc data to backend and requirements --- backend/requirements/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/requirements/requirements.txt b/backend/requirements/requirements.txt index dd5c34ca..577776be 100644 --- a/backend/requirements/requirements.txt +++ b/backend/requirements/requirements.txt @@ -29,3 +29,5 @@ mip==1.15.0 pyarrow==17.0.0 fastparquet==2024.5.0 aiohttp==3.10.10 +# find my epc +beautifulsoup4 From fd2600b9ba380e25d7b34ae155c4a7be4d9eddd6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 3 Apr 2025 09:18:22 +0100 Subject: [PATCH 250/255] modifying the handling of non-intrusive frecommendations as strings --- backend/Property.py | 5 +---- backend/app/plan/router.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 5dcc76da..424242fd 100644 --- a/backend/Property.py +++ 
b/backend/Property.py @@ -107,10 +107,7 @@ class Property: # cost and instead, provide a message that the measure has already been installed self.already_installed = ast.literal_eval(already_installed['already_installed']) if already_installed else [] - self.non_invasive_recommendations = ( - ast.literal_eval(non_invasive_recommendations['recommendations']) if - non_invasive_recommendations else [] - ) + self.non_invasive_recommendations = non_invasive_recommendations # This is a list of measures that have been recommended for the property if isinstance(measures, list): self.measures = measures diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 45c19484..3028e45f 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -1,3 +1,4 @@ +import ast import json from datetime import datetime @@ -358,7 +359,6 @@ def extract_property_request_data( ), {}) if isinstance(property_non_invasive_recommendations.get("recommendations"), str): - import ast property_non_invasive_recommendations["recommendations"] = ast.literal_eval( property_non_invasive_recommendations["recommendations"] ) @@ -369,7 +369,7 @@ def extract_property_request_data( else: transformed.append(rec) - property_non_invasive_recommendations["recommendations"] = str(transformed) + property_non_invasive_recommendations["recommendations"] = transformed # Check if the valuation data has uprn valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False From 2d71ad25efced2412edb987f80a977da0c291018 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 10 Apr 2025 23:10:52 +0100 Subject: [PATCH 251/255] added a patch method to scraping epc data --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 808 +++++++----------------- asset_list/app.py | 74 ++- asset_list/mappings/built_form.py | 29 +- asset_list/mappings/exising_pv.py | 8 + asset_list/mappings/heating_systems.py | 31 +- asset_list/mappings/property_type.py | 17 +- 
asset_list/mappings/roof.py | 26 + asset_list/mappings/walls.py | 12 +- backend/SearchEpc.py | 4 + etl/customers/remote_assessments/app.py | 36 +- etl/find_my_epc/AssetListEpcData.py | 47 +- etl/find_my_epc/RetrieveFindMyEpc.py | 89 ++- 14 files changed, 564 insertions(+), 621 deletions(-) create mode 100644 asset_list/mappings/roof.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 5ae3029f..0dedc1fd 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -17,6 +17,7 @@ import asset_list.mappings.walls as walls_mappings import asset_list.mappings.heating_systems as heating_mappings import asset_list.mappings.exising_pv as existing_pv_mappings import asset_list.mappings.built_form as built_form_mappings +import asset_list.mappings.roof as roof_mappings from recommendations.recommendation_utils import ( estimate_perimeter, @@ -271,8 +272,10 @@ class AssetList: STANDARD_PROPERTY_TYPE = "landlord_property_type" STANDARD_BUILT_FORM = "landlord_built_form" STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" + STANDARD_ROOF_CONSTRUCTION = "landlord_roof_construction" STANDARD_HEATING_SYSTEM = "landlord_heating_system" STANDARD_EXISTING_PV = "landlord_existing_pv" + STANDARD_SAP = "landlord_sap_rating" DOMNA_PROPERTY_ID = "domna_property_id" @@ -286,6 +289,8 @@ class AssetList: "Any further surveyor notes", 'Surveyors Name' ] + NON_INTRUSIVES_ELIGIBILITY_COLUMN = "Eligibility (Red/Yellow/Green)" + OLD_FORMAT_NON_INTRUSIVE_COLNAMES = ['WFT Findings', 'ECO Eligibility'] # This SAP threshold is a key search criteria for properties that may be eligible for extraction @@ -295,6 +300,9 @@ class AssetList: # Any EPC deemed to have been conducted prior to this 
year is deemed to be unreliable EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 + # Properties before this year are more likely to have lower EPC ratings and more likely to qualify + EMPTY_CAVITY_YEAR_THRESHOLD = 2002 + # Attributes - these are columns that we produce, calcualted based on other pieces of data ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" @@ -347,8 +355,11 @@ class AssetList: landlord_property_type=None, landlord_built_form=None, landlord_wall_construction=None, + landlord_roof_construction=None, landlord_heating_system=None, landlord_existing_pv=None, + landlord_sap=None, + phase=False, header=0 ): self.local_filepath = local_filepath @@ -361,7 +372,6 @@ class AssetList: self.standardised_asset_list = self.raw_asset_list.copy() # Will be used to store aggregated figures against the various work types self.work_type_figures = {} - self.work_type_breakdowns = {} self.flat_data = None self.duplicated_addresses = None self.contact_details = None @@ -371,11 +381,19 @@ class AssetList: self.outcomes_for_output = pd.DataFrame() self.master_surveyed = None + # When this is True, we intend to break the programme into multiple phases. We may need to review + # how this is structured in the future, as depending on how we get future data, we may need to + # remove some existing phases from the reporting, or specifically highlight the phase (1 to n-1) + # properties, assuming the current phase is n. 
+ self.phase = phase + # We detect the presence of the non-intrusive columns self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns # We detect if we have the old format of non-intruvies self.old_format_non_intrusives_present = "WFT Findings" in self.raw_asset_list.columns + self.non_intrusives_eligibility = "Eligibility (Red/Yellow/Green)" in self.raw_asset_list.columns + # Names of columns self.landlord_property_id = landlord_property_id self.address1_colname = address1_colname @@ -386,8 +404,10 @@ class AssetList: self.landlord_property_type = landlord_property_type self.landlord_built_form = landlord_built_form self.landlord_wall_construction = landlord_wall_construction + self.landlord_roof_construction = landlord_roof_construction self.landlord_heating_system = landlord_heating_system self.landlord_existing_pv = landlord_existing_pv + self.landlord_sap = landlord_sap # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat @@ -427,6 +447,13 @@ class AssetList: self.standardised_asset_list[self.landlord_property_type].copy() ) + # If landlord built form is None (which it often is) we use the built for from inspections + if (self.landlord_built_form is None) and self.non_intrusives_present: + self.landlord_built_form = self.STANDARD_BUILT_FORM + self.standardised_asset_list[self.landlord_built_form] = ( + self.standardised_asset_list["Archetype"].copy() + ) + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: @@ -604,8 +631,10 @@ class AssetList: self.landlord_built_form, self.landlord_year_built, self.landlord_wall_construction, + self.landlord_roof_construction, self.landlord_heating_system, - self.landlord_existing_pv + self.landlord_existing_pv, + self.landlord_sap, ] # Keep just non-null variables (e.g landlord may not provide uprn self.keep_variables = [v for v in variables if v is not None] @@ 
-619,8 +648,10 @@ class AssetList: self.landlord_built_form: self.STANDARD_BUILT_FORM, self.landlord_year_built: self.STANDARD_YEAR_BUILT, self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION, + self.landlord_roof_construction: self.STANDARD_ROOF_CONSTRUCTION, self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, - self.landlord_existing_pv: self.STANDARD_EXISTING_PV + self.landlord_existing_pv: self.STANDARD_EXISTING_PV, + self.landlord_sap: self.STANDARD_SAP, } self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None} @@ -628,6 +659,9 @@ class AssetList: if self.non_intrusives_present: non_intrusive_columns = self.NON_INTRUSIVES_COLNAMES + if self.non_intrusives_eligibility: + non_intrusive_columns.append(self.NON_INTRUSIVES_ELIGIBILITY_COLUMN) + if self.old_format_non_intrusives_present: non_intrusive_columns = self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES @@ -657,9 +691,13 @@ class AssetList: ) else: # We want to make sure that we have a column for wall construction - self.landlord_wall_construction = "landlord_wall_construction" + self.landlord_wall_construction = self.STANDARD_WALL_CONSTRUCTION self.standardised_asset_list[self.landlord_wall_construction] = None + if self.landlord_roof_construction is None: + self.landlord_roof_construction = self.STANDARD_ROOF_CONSTRUCTION + self.standardised_asset_list[self.landlord_roof_construction] = None + # Clear our build year column # We attempt to process the year built column if self.landlord_year_built is not None: @@ -750,6 +788,10 @@ class AssetList: self.landlord_existing_pv: { "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV, "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS + }, + self.landlord_roof_construction: { + "standard_values": roof_mappings.STANDARD_ROOF_CONSTRUCTIONS, + "standard_map": roof_mappings.ROOF_CONSTRUCTION_MAPPINGS } } # Keep just entries where the key is not None @@ -757,6 +799,8 @@ class AssetList: for variable, config in 
to_remap.items(): logger.info("Standardising variable: %s", variable) + # Strip each of these columns + self.standardised_asset_list[variable] = self.standardised_asset_list[variable].str.strip() values_to_remap = self.standardised_asset_list[variable].unique() # We want to map this to our standardised list of property types we're interested in remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"]) @@ -779,6 +823,13 @@ class AssetList: if there are no categories which need remapping which is highly unlikely :return: """ + + if self.phase: + # We filter on just the properties that have had an inspection + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list['Surveyors Name'].isin(["YET TO BE SURVEYED"]) + ] + if not self.variable_mappings and not override_empty_mappings: raise ValueError("Please run init_standardise first") @@ -854,7 +905,7 @@ class AssetList: df, how="left", on=self.DOMNA_PROPERTY_ID ) - def extract_attributes(self): + def extract_attributes(self, pull_epc=True): # Used to extracty the typical attributes that we use to identify viable work self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( @@ -1054,6 +1105,40 @@ class AssetList: def identify_worktypes(self, cleaned): + if self.STANDARD_SAP is not None: + # We add a SAP category for all work type identification + self.standardised_asset_list["SAP Category"] = np.where( + ( + (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 68) | + (self.standardised_asset_list[self.STANDARD_SAP] <= 68) + ), + "SAP Rating 68 or less", + np.where( + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.EMPTY_CAVITY_SAP_THRESHOLD + ) | (self.standardised_asset_list[self.STANDARD_SAP] <= self.EMPTY_CAVITY_SAP_THRESHOLD) + ), + f"SAP Rating 69-{self.EMPTY_CAVITY_SAP_THRESHOLD}", + f"SAP Rating {self.EMPTY_CAVITY_SAP_THRESHOLD + 1} or more" + 
) + ) + else: + # We add a SAP category for all work type identification + self.standardised_asset_list["SAP Category"] = np.where( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 68, + "SAP Rating 68 or less", + np.where( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= + self.EMPTY_CAVITY_SAP_THRESHOLD + ), + f"SAP Rating 69-{self.EMPTY_CAVITY_SAP_THRESHOLD}", + f"SAP Rating {self.EMPTY_CAVITY_SAP_THRESHOLD + 1} or more" + ) + ) + # Before we being, we identify if a property has solar already as we use this # for identifying cavity jobs if self.non_intrusives_present: @@ -1107,132 +1192,53 @@ class AssetList: non_intrusives_wall_filter = False if self.landlord_year_built is None: - # The landlord won't always give us year built - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - ( - self.standardised_asset_list["epc_year_upper_bound"] <= 2002 - ) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD - ) - ) - - # Let's also flag work that looks eligible without the SAP filter - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) - ) - - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD - ) & ( - # If the property has solar, 
there's a chance it won't qualify - self.standardised_asset_list["property_has_solar"] - ) - ) - + year_built_filter = self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD else: - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - ( - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | - (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) - ) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD - ) & ( - # If the property has solar, there's a chance it won't qualify - ~self.standardised_asset_list["property_has_solar"] - ) + year_built_filter = ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) | + (self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) ) - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - ( - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | - (self.standardised_asset_list["epc_year_upper_bound"] <= 2002) - ) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD - ) & ( - # If the property has solar, there's a chance it won't qualify - self.standardised_asset_list["property_has_solar"] - ) - ) - - # Let's also flag work that looks eligible without the SAP filter - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = ( - (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & - non_intrusives_wall_filter & - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) & - # If the property has 
solar, there's a chance it won't qualify + # Criteria: + # The property isn't a bedsit + # Non-intrusives indicate it needs a fill + # The EPC year is before 2002 + # We also flag where the property has solar on the roof, because this is a signal of a high EPC rating + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + year_built_filter & + ( ~self.standardised_asset_list["property_has_solar"] ) + ) + + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( + pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) & + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + year_built_filter & + ( + # If the property has solar, there's a chance it won't qualify + self.standardised_asset_list["property_has_solar"] + ) + ) # We also add a filter on anything that was generally identified by the non-intrusives - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] = ( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_year_filter"] = ( + pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) & + pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"]) & (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & non_intrusives_wall_filter ) - # If non_intrusive_indicates_empty_cavity is True, - # set non_intrusive_indicates_empty_cavity_no_sap_filter to False - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], - False, - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] - ) - - 
self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"], - False, - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] - ) - self.standardised_asset_list["epc_indicates_empty_cavity"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( self.EPC_NO_WALL_INSULATION_DESCRIPTIONS ) & ( - self.standardised_asset_list["epc_year_upper_bound"] <= 1995 + self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD ) & ( ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] - ) & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD - ) & ( - ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) - ) - ) - - self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] = ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( - self.EPC_NO_WALL_INSULATION_DESCRIPTIONS - ) & ( - self.standardised_asset_list["epc_year_upper_bound"] <= 1995 - ) & ( - ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] - ) & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"]] > self.EMPTY_CAVITY_SAP_THRESHOLD ) & ( ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) ) @@ -1241,44 +1247,13 @@ class AssetList: self.standardised_asset_list["landlord_data_indicates_empty_cavity"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & ( - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | - (self.standardised_asset_list["epc_year_upper_bound"] <= 1995) - ) & - ( - 
self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] <= self.EMPTY_CAVITY_SAP_THRESHOLD + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) | + (self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) ) & ( ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) ) ) - self.standardised_asset_list["landlord_data_indicates_empty_cavity_no_sap_filter"] = ( - self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & - ( - (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) | - (self.standardised_asset_list["epc_year_upper_bound"] <= 1995) - ) & - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["current-energy-efficiency"] - ] > self.EMPTY_CAVITY_SAP_THRESHOLD - ) & ( - ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) - ) - ) - - # If the EPC is esimtated, we defer to the non-intrusives - self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( - ( - self.standardised_asset_list["epc_indicates_empty_cavity"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & - self.standardised_asset_list["estimated"] - ), - False, - self.standardised_asset_list["epc_indicates_empty_cavity"] - ) - # Finally, we create a flag to indicate that the cavity is empty, based on the criteria above self.standardised_asset_list["cavity_is_empty"] = ( non_intrusives_wall_filter | @@ -1303,19 +1278,21 @@ class AssetList: )) ) - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( - extraction_wall_filter & ( - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - )) + if self.non_intrusives_eligibility: + # If we have the eligibility column, we check if the wall is eligible + extraction_wall_filter = ( + extraction_wall_filter & + ~self.standardised_asset_list["non-intrusives: Eligibility 
(Red/Yellow/Green)"].isin( + ["RED"] + ) + ) - # Also include work without the SAP filter as optimistic - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( - extraction_wall_filter & ( - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - )) + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + extraction_wall_filter & year_built_filter + ) elif self.old_format_non_intrusives_present: - print("Review these categories with Kieran") + print("Review these categories!!!!") extraction_wall_filter = ( self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( ["retro drilled", "retro filled", "fibre from build", "polybead", "retro drilled and filled", @@ -1324,12 +1301,6 @@ class AssetList: ) self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( - extraction_wall_filter & ( - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - ) - - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( extraction_wall_filter ) @@ -1337,13 +1308,6 @@ class AssetList: self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = False self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = False - # Adjust - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"], - False, - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] - ) - ###################################################### # Solar ###################################################### @@ -1351,8 +1315,12 @@ class AssetList: # Check 1: Does the property have a valid heating system? 
self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["air source heat pump", "ground source heat pump", "high heat retention storage heaters", - "electric boiler"] + [ + "air source heat pump", + "ground source heat pump", + "high heat retention storage heaters", + "electric boiler" + ] ) ) self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = ( @@ -1435,8 +1403,6 @@ class AssetList: else: self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = False - # TODO: We don't have information about the roof from this landlord - # We merge on the u-value for average thermal transmittance walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) walls_uvalue_data = walls_uvalue_data[ @@ -1454,22 +1420,16 @@ class AssetList: self.standardised_asset_list["solar_epc_walls_insulated"] = ( ( self.standardised_asset_list[ - self.EPC_API_DATA_NAMES[ - "walls-description"]].str.lower().str.contains( - "|".join( - self.EPC_INSULATED_WALLS_SUBSTRINGS) + self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) ) ) | ( - self.standardised_asset_list[ - "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull(x) else False - ) + self.standardised_asset_list["walls_u_value"].apply(lambda x: x <= 0.7 if not pd.isnull(x) else False) ) ) # We merge on the u-value for average thermal transmittance - roof_roof_data = pd.DataFrame(cleaned["roof-description"]) - roof_roof_data = roof_roof_data[ + roof_roof_data = pd.DataFrame(cleaned["roof-description"])[ ["original_description", "thermal_transmittance", "is_pitched", "is_loft"] ].rename( columns={ @@ -1516,43 +1476,15 @@ class AssetList: self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) ) - # We merge on the u-value for average thermal transmittance - floors_uvalue_data = 
pd.DataFrame(cleaned["floor-description"]) - floors_uvalue_data = floors_uvalue_data[ - ~pd.isnull(floors_uvalue_data["thermal_transmittance"]) - ][["original_description", "thermal_transmittance"]].rename( - columns={ - "original_description": self.EPC_API_DATA_NAMES["floor-description"], - "thermal_transmittance": "floor_u_value" - } - ) - - # Merge on - self.standardised_asset_list = self.standardised_asset_list.merge( - floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"] - ) - - # We assume that a U-value of 0.5 or below is indicative of an insulated floor - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( + # Check if the boiler is electric + # We check if it contains both the terms boiler & electric + self.standardised_asset_list["has_electric_boiler"] = ( ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str - .lower().str.contains("solid") - ) & ( - ~self.standardised_asset_list["epc_has_floor_recommendation"] - ) & ( - # We do not utilise estimated EPCs for this method because we will always find that - # "epc_has_floor_recommendation" is False - (self.standardised_asset_list["estimated"] == False) - ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().isin( + ["boiler and radiators, electric"]) ) | ( - ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid") - ) & ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() - .str.contains(", insulated") - ) + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] == "electric boiler" ) ) @@ -1563,7 +1495,8 @@ class AssetList: # Set up the filters to stop repetition correct_heating_system = ( self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | - self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + 
self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] | + self.standardised_asset_list["has_electric_boiler"] ) needs_heating_upgrade = ( @@ -1574,11 +1507,17 @@ class AssetList: # The requirements for walls are: # 1) walls are insulated # 2) property is a cavity (can be done insulated or not) + walls_meet_solar_requirements = ( + # The landlord is saying the walls are insulated self.standardised_asset_list["solar_landlord_walls_insulated"] | + # EPC data is saying the walls are insulated self.standardised_asset_list["solar_epc_walls_insulated"] | + # Non-intrusives are saying the walls are insulated self.standardised_asset_list["solar_non_intrusives_walls_insulated"] | + # It's empty cavity self.standardised_asset_list["cavity_is_empty"] | + # It's a cavity wall (self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].str.contains("cavity")) ) @@ -1586,24 +1525,12 @@ class AssetList: self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "flat" ) - self.standardised_asset_list["solar_eligible_solid_floor"] = ( - # Property isn't a flag - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Floor type check - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + solar_roof_meets_criteria = ( + self.standardised_asset_list["solar_epc_roof_insulated"] | + self.standardised_asset_list["solar_epc_loft_needs_topup"] ) - self.standardised_asset_list["solar_eligible_solid_floor_sap_above_threshold"] = ( + self.standardised_asset_list["solar_eligible"] = ( # Property isn't a flag not_a_flat & # Landlord data or 
EPC data indicates the heating system is appropriate @@ -1612,16 +1539,12 @@ class AssetList: ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Floor type check - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + # Roof meets criteria + solar_roof_meets_criteria ) # With heating upgrade - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] = ( + self.standardised_asset_list["solar_eligible_needs_heating_upgrade"] = ( not_a_flat & # Needs heating upgrade needs_heating_upgrade & @@ -1629,322 +1552,81 @@ class AssetList: ~self.standardised_asset_list["property_has_solar"] & # The walls are insulated walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Floor type check - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP Below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - # With heating upgrade, above threshold - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold"] = ( - not_a_flat & - # Needs heating upgrade - needs_heating_upgrade & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Floor type check - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - # Because the EPC data can be contradictrory, we remove any overlap - 
self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] = np.where( - self.standardised_asset_list["solar_eligible_solid_floor"], - False, - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] - ) - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold"] = np.where( - self.standardised_asset_list["solar_eligible_solid_floor_sap_above_threshold"], - False, - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold"] + # Roof meets criteria + solar_roof_meets_criteria ) # We shouldn't have an overlap if ( - self.standardised_asset_list["solar_eligible_solid_floor"] & - self.standardised_asset_list["solar_eligible_solid_floor_needs_heating_upgrade"] + self.standardised_asset_list["solar_eligible"] & + self.standardised_asset_list["solar_eligible_needs_heating_upgrade"] ).sum(): raise ValueError("Both heating upgrade and no heating upgrade are true - this should not be possible") - # Solid floor but needs a loft top-up - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Check floor - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - # Solid floor, needs loft, above SAP thresold - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_sap_above_threshold"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't 
currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Check floor - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - # Needs loft & heating - self.standardised_asset_list["solar_eligible_solid_floor_needs_loft_needs_heating_upgrade"] = ( - not_a_flat & - # Needs heating upgrade - needs_heating_upgrade & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Floor type - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - self.standardised_asset_list[ - "solar_eligible_solid_floor_needs_loft_needs_heating_upgrade_sap_above_threshold" - ] = ( - not_a_flat & - # Needs heating upgrade - needs_heating_upgrade & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Floor type - self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - # Other floor type, fully insulated - self.standardised_asset_list["solar_eligible_other_floor"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - 
~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Floor type - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - self.standardised_asset_list["solar_eligible_other_floor_sap_above_threshold"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Floor type - other types - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - # With heating upgrade - self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"] = ( - not_a_flat & - # Needs heating upgrade - needs_heating_upgrade & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Other floor types - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - # With heating upgrade, SAP above threshold - self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade_sap_above_threshold"] = ( - not_a_flat & - # Needs heating upgrade - needs_heating_upgrade & - # The property doesn't currently have solar - 
~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof is insulated - self.standardised_asset_list["solar_epc_roof_insulated"] & - # Other floor types - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - # Check for overlap - if ( - self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade"] & - self.standardised_asset_list["solar_eligible_other_floor_needs_heating_upgrade_sap_above_threshold"] - ).sum(): - raise ValueError("Both heating upgrade and no heating upgrade are true - this should not be possible") - - # Other floor type, needs loft top-up - self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof need loft top-up - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Other floor types - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - # Other floor type, needs loft top-up, SAP above threshold - self.standardised_asset_list["solar_eligible_other_floor_needs_loft_sap_above_threshold"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - correct_heating_system & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof need loft top-up - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Other floor types - 
~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - # With heating upgrade - self.standardised_asset_list["solar_eligible_other_floor_needs_loft_needs_heating_upgrade"] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - needs_heating_upgrade & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof need loft top-up - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Other floor types - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP below threshold - self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - self.standardised_asset_list[ - "solar_eligible_other_floor_needs_loft_needs_heating_upgrade_sap_above_threshold" - ] = ( - not_a_flat & - # Landlord data or EPC data indicates the heating system is appropriate - needs_heating_upgrade & - # The property doesn't currently have solar - ~self.standardised_asset_list["property_has_solar"] & - # The walls are insulated - walls_meet_solar_requirements & - # Roof need loft top-up - self.standardised_asset_list["solar_epc_loft_needs_topup"] & - # Other floor types - ~self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] & - # SAP above threshold - ~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] - ) - - # Check if the boiler is electric - # We check if it contains both the terms boiler & electric - has_electric_boiler = ( - ( - self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] - .str.lower().isin( - ["boiler and radiators, electric", "boiler and underfloor heating, electric"]) - ) | ( - self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] == "electric boiler" - ) - ) - # We check for a 
specific sub-set of properties which are uninsulated solid wall properties that are EPC E # or below (we'll use 57 as a threshold) - These are for a pilot with Net Zero Renewables self.standardised_asset_list["solar_eligible_solid_wall_uninsulated"] = ( not_a_flat & # Landlord data or EPC data indicates the heating system is appropriate - in this case, we can also take # electric boilers - (correct_heating_system | has_electric_boiler) & + correct_heating_system & # The property doesn't currently have solar ~self.standardised_asset_list["property_has_solar"] & - # The walls are uninsulated solic + # The walls are uninsulated solid ~walls_meet_solar_requirements & (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 57) ) # Drop anything we don't need self.standardised_asset_list = self.standardised_asset_list.drop( - columns=["walls_u_value", "roof_u_value", "floor_u_value"] + columns=["walls_u_value", "roof_u_value"] ) # Adjust flagged extraction jobs to remove anything for solar self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & - ~self.standardised_asset_list["solar_eligible_solid_floor"] & - ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] - # ~self.standardised_asset_list["solar_eligible_other_floor"] & - # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] + ~self.standardised_asset_list["solar_eligible"] ) # Finally, we note why each property has been flagged self.standardised_asset_list["cavity_reason"] = None + empty_cavity_map = { + "non_intrusive_indicates_empty_cavity": "Non-Intrusive Data Shows Empty Cavity: ", + "non_intrusive_indicates_empty_cavity_has_solar": "Non-Intrusive Data Shows Empty Cavity - property " + "already has solar: ", + "non_intrusive_indicates_empty_cavity_no_year_filter": f"Non-Intrusive Data Shows Empty Cavity, " + f"built after 
{self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", + + } + for variable, description in empty_cavity_map.items(): + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list[variable] & + pd.isnull(self.standardised_asset_list["cavity_reason"]), + description + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + + # We break the cavity reason into a few different categories, when the EPC is different from inspections self.standardised_asset_list["cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], - "Non-Intrusive Data Showed Empty Cavity", - self.standardised_asset_list["cavity_reason"] - ) - self.standardised_asset_list["cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] & - pd.isnull(self.standardised_asset_list["cavity_reason"]), - "Non-Intrusive Data Showed Empty Cavity - property already has solar", - self.standardised_asset_list["cavity_reason"] - ) - self.standardised_asset_list["cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & - pd.isnull(self.standardised_asset_list["cavity_reason"]), - "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed", + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + (self.standardised_asset_list['non-intrusives: Insulated'] == "RETRO DRILLED") & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "EPC Shows Empty Cavity, inspections show retro drilled: " + self.standardised_asset_list["SAP Category"], self.standardised_asset_list["cavity_reason"] ) self.standardised_asset_list["cavity_reason"] = np.where( - self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] & - pd.isnull(self.standardised_asset_list["cavity_reason"]), - 
"Non-Intrusive Data Showed Empty Cavity but all SAP scores and year built allowed", + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + (self.standardised_asset_list['non-intrusives: Insulated'] == "FILLED AT BUILD") & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "EPC Shows Empty Cavity, inspections show filled at build: " + self.standardised_asset_list["SAP Category"], self.standardised_asset_list["cavity_reason"] ) @@ -1954,19 +1636,12 @@ class AssetList: ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), - "EPC Data Showed Empty Cavity", + "EPC Shows Empty Cavity, inspections show non-cavity build: " + self.standardised_asset_list[ + "SAP Category"], self.standardised_asset_list["cavity_reason"] ) - self.standardised_asset_list["cavity_reason"] = np.where( - ( - self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & - pd.isnull(self.standardised_asset_list["cavity_reason"]) - ), - "EPC Data Showed Empty Cavity but all SAP scores allowed", - self.standardised_asset_list["cavity_reason"] - ) - # Landlord data + # Landlord data: The landlord's data indicates that the wall is an uninsulated cavity wall, but EPC and + # inspections show filled self.standardised_asset_list["cavity_reason"] = np.where( ( self.standardised_asset_list["landlord_data_indicates_empty_cavity"] & @@ -1974,35 +1649,18 @@ class AssetList: ~self.standardised_asset_list["epc_indicates_empty_cavity"] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), - "Landlord Data Showed Empty Cavity", + "Landlord Data Shows Empty Cavity, EPC & Inspections Shows Filled: " + self.standardised_asset_list[ + "SAP Category"], self.standardised_asset_list["cavity_reason"] ) - 
self.standardised_asset_list["cavity_reason"] = np.where( - ( - self.standardised_asset_list["landlord_data_indicates_empty_cavity_no_sap_filter"] & - ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] & - ~self.standardised_asset_list["epc_indicates_empty_cavity_no_sap_filter"] & - pd.isnull(self.standardised_asset_list["cavity_reason"]) - ), - "Landlord Data Showed Empty Cavity but all SAP scores allowed", - self.standardised_asset_list["cavity_reason"], - ) + # Flag extraction self.standardised_asset_list["cavity_reason"] = np.where( ( self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), - "Non-Intrusive Data Showed Cavity Extraction", - self.standardised_asset_list["cavity_reason"] - ) - # extraction no sap filter - self.standardised_asset_list["cavity_reason"] = np.where( - ( - self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] & - pd.isnull(self.standardised_asset_list["cavity_reason"]) - ), - "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed", + "Non-Intrusive Data Shows Cavity Extraction: " + self.standardised_asset_list["SAP Category"], self.standardised_asset_list["cavity_reason"] ) @@ -2013,47 +1671,17 @@ class AssetList: # Map of variables and fill values for the solar_reason variable solar_reason_map = { - "solar_eligible_solid_floor": "Solar Eligible, Solid Floor", - "solar_eligible_solid_floor_sap_above_threshold": "Solar Eligible, Solid Floor, SAP Above Threshold", - "solar_eligible_solid_floor_needs_heating_upgrade": ( - "Solar Eligible, Solid Floor, Needs Heating Upgrade" + "solar_eligible": "Solar Eligible: ", + "solar_eligible_needs_heating_upgrade": ( + "Solar Eligible, Solid Floor, Needs Heating Upgrade: " ), - "solar_eligible_solid_floor_needs_heating_upgrade_sap_above_threshold": ( - "Solar Eligible, Solid Floor, Needs Heating Upgrade, SAP Above Threshold" - ), - 
"solar_eligible_solid_floor_needs_loft": "Solar Eligible, Solid Floor, Needs Loft", - "solar_eligible_solid_floor_needs_loft_sap_above_threshold": ( - "Solar Eligible, Solid Floor, Needs Loft, SAP Above Threshold" - ), - "solar_eligible_solid_floor_needs_loft_needs_heating_upgrade": ( - "Solar Eligible, Solid Floor, Needs Loft, Needs Heating Upgrade" - ), - "solar_eligible_solid_floor_needs_loft_needs_heating_upgrade_sap_above_threshold": ( - "Solar Eligible, Solid Floor, Needs Loft, Needs Heating Upgrade, SAP Above Threshold" - ), - "solar_eligible_other_floor": "Solar Eligible, Other Floor", - "solar_eligible_other_floor_sap_above_threshold": "Solar Eligible, Other Floor, SAP Above Threshold", - "solar_eligible_other_floor_needs_heating_upgrade": "Solar Eligible, Other Floor, Needs Heating Upgrade", - "solar_eligible_other_floor_needs_heating_upgrade_sap_above_threshold": ( - "Solar Eligible, Other Floor, Needs Heating Upgrade, SAP Above Threshold" - ), - "solar_eligible_other_floor_needs_loft": "Solar Eligible, Other Floor, Needs Loft", - "solar_eligible_other_floor_needs_loft_sap_above_threshold": ( - "Solar Eligible, Other Floor, Needs Loft, SAP Above Threshold" - ), - "solar_eligible_other_floor_needs_loft_needs_heating_upgrade": ( - "Solar Eligible, Other Floor, Needs Loft, Needs Heating Upgrade" - ), - "solar_eligible_other_floor_needs_loft_needs_heating_upgrade_sap_above_threshold": ( - "Solar Eligible, Other Floor, Needs Loft, Needs Heating Upgrade, SAP Above Threshold" - ), - "solar_eligible_solid_wall_uninsulated": "Solar Eligible, Solid Wall Uninsulated, EPC E or Below", + "solar_eligible_solid_wall_uninsulated": "Solar Eligible, Solid Wall Uninsulated, EPC E or Below: ", } for variable, reason in solar_reason_map.items(): self.standardised_asset_list["solar_reason"] = np.where( self.standardised_asset_list[variable], - reason, + reason + self.standardised_asset_list["SAP Category"], self.standardised_asset_list["solar_reason"] ) diff --git 
a/asset_list/app.py b/asset_list/app.py index 67e18dac..ae4b3cef 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -10,6 +10,7 @@ from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS from asset_list.mappings.heating_systems import HEATING_MAPPINGS from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS +from asset_list.mappings.roof import ROOF_CONSTRUCTION_MAPPINGS from asset_list.utils import get_data from dotenv import load_dotenv @@ -88,6 +89,63 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # Torus + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1" + data_filename = "Torus Property Asset List - Phase 1.xlsx" + sheet_name = "TORUS" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = "Property Age" + landlord_os_uprn = "NatUPRN" + landlord_property_type = "Property Type" + landlord_built_form = "Built Form" + landlord_wall_construction = "Wall Construction" + landlord_roof_construction = "Roof Construction" + landlord_heating_system = "Space Heating Source" + landlord_existing_pv = "Low Carbon Technology (Solar PV)" + landlord_property_id = "UPRN" + landlord_sap = "SAP Score" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_to_asset_list_filepath = None + phase = True + + # Ealing - houses + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing" + data_filename = "Ealing_rechecked_cleaned_05042025.csv" + sheet_name = None + postcode_column = 
'Postcode' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Year Built" + landlord_os_uprn = None + landlord_property_type = "Property Type Code" + landlord_built_form = None + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Property ref" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_to_asset_list_filepath = None + # Southern Midlands data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025" data_filename = "Southern Housing Midlands Property List - combined.xlsx" @@ -446,8 +504,11 @@ def app(): landlord_property_type=landlord_property_type, landlord_built_form=landlord_built_form, landlord_wall_construction=landlord_wall_construction, + landlord_roof_construction=landlord_roof_construction, landlord_heating_system=landlord_heating_system, - landlord_existing_pv=landlord_existing_pv + landlord_existing_pv=landlord_existing_pv, + landlord_sap=landlord_sap, + phase=phase ) asset_list.init_standardise() @@ -486,6 +547,13 @@ def app(): ).items() if k not in EXISTING_PV_MAPPINGS } + new_roof_construction_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_roof_construction] if + asset_list.landlord_roof_construction else {} + ).items() + if k not in ROOF_CONSTRUCTION_MAPPINGS + } asset_list.apply_standardiation() @@ -511,7 +579,7 @@ def app(): epc_api_only = False force_retrieve_data = False skip = None # Used to skip already completed chunks - chunk_size = 5000 + chunk_size = 1000 filename = "Chunk {i}.csv" download_folder = os.path.join(data_folder, "Chunks") if not os.path.exists(download_folder): @@ -529,8 +597,6 @@ def app(): if 
any(x in folder_contents for x in downloaded_files): skip = max([i for i in chunk_indexes if filename.format(i=i) in folder_contents]) - # folder_contents = [f for f in folder_contents if "nodata" not in f and f.endswith(".csv")] - for i in range(0, len(asset_list.standardised_asset_list), chunk_size): print(f"Processing chunk {i} to {i + chunk_size}") if skip is not None and not force_retrieve_data: diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index aad36fce..cabd970e 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -80,5 +80,32 @@ BUILT_FORM_MAPPINGS = { 'House: MidTerrace': 'mid-terrace', 'House: EndTerrace': 'end-terrace', 'Bungalow: EndTerrace': 'end-terrace', - 'Bungalow: MidTerrace': 'mid-terrace' + 'Bungalow: MidTerrace': 'mid-terrace', + 'Flat: Semi Detached: Mid Floor': 'semi-detached', + 'Maisonette: Mid Terrace: Top Floor': 'mid-terrace', + 'Flat: Enclosed Mid Terrace: Mid Floor': 'mid-terrace', + 'Flat: Enclosed Mid Terrace: Ground Floor': 'mid-terrace', + 'Flat: Detached: Ground Floor': 'detached', + 'Flat: Detached: Mid Floor': 'detached', + 'Flat: Detached: Top Floor': 'detached', + 'Flat: Enclosed End Terrace: Mid Floor': 'end-terrace', + 'Bungalow: Detached': 'detached', + 'Maisonette: End Terrace: Mid Floor': 'end-terrace', + 'Maisonette: Detached: Top Floor': 'detached', + 'Flat: Enclosed End Terrace: Ground Floor': 'end-terrace', + 'Flat: Enclosed Mid Terrace: Top Floor': 'mid-terrace', + 'House: EnclosedEndTerrace': 'end-terrace', + '3 Ext. Wall Flat': 'semi-detached', + 'Bungalow Detached': 'detached', + 'Bungalow End Terrace': 'end-terrace', + 'Bungalow Mid Terrace': 'mid-terrace', + 'Bungalow Semi Detached': 'detached', + 'Maisonette 2 Ext. Wall': 'mid-terrace', + 'Maisonette 3 Ext. 
Wall': 'semi-detached', + 'End-terrace': 'end-terrace', + 'Mid-terrace': 'mid-terrace', + 'Semi-detached': 'semi-detached', + 'Detached': 'detached', + 'Flat / maisonette': 'unknown', + '2014 onwards': 'unknown' } diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py index 06e77bba..51f5f922 100644 --- a/asset_list/mappings/exising_pv.py +++ b/asset_list/mappings/exising_pv.py @@ -1,3 +1,5 @@ +import numpy as np + STANDARD_EXISTING_PV = { "already has PV", "no PV", "unknown" } @@ -9,4 +11,10 @@ EXISTING_PV_MAPPINGS = { "yes": "already has PV", True: "already has PV", False: "no PV", + np.nan: 'unknown', + 'PV: 2kWp array': 'already has PV', + 'PV: 25% roof area, PV: 3.6kWp array': 'already has PV', + 'PV: 10% roof area, PV: 2kWp array': 'already has PV', + 'PV: 50% roof area': 'already has PV', + 'Solar PV': 'already has PV' } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 714f5434..42326575 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -21,7 +21,9 @@ STANDARD_HEATING_SYSTEMS = { 'oil fuel', 'solid fuel', 'gas combi boiler', - 'unknown' + 'unknown', + "electric ceiling", + "electric underfloor" } HEATING_MAPPINGS = { @@ -143,5 +145,30 @@ HEATING_MAPPINGS = { 'Boiler: A rated Regular Boiler Electricity: Electricity': 'electric boiler', 'Community Heating Systems: Community boilers only (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler', 'Boiler: A rated Combi Gas: Mains Gas': 'gas condensing combi', - 'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler' + 'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler', + 'Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C': 'ground source heat pump', + 'Heat Pump: Electric Heat pumps: Ground source heat pump in other cases': 'ground source heat pump', + 'Electric Storage Systems: High heat retention storage heaters': 'high 
heat retention storage heaters', + 'Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C': 'air source heat pump', + 'Electric (direct acting) room heaters: Panel, convector or radiant heaters': 'room heaters', + 'Boiler: C rated Combi': 'gas combi boiler', + 'Boiler: B rated Regular Boiler': 'gas condensing boiler', + 'Boiler: E rated Combi': 'gas combi boiler', + 'Boiler: A rated Combi': 'gas combi boiler', + 'Boiler: E rated Regular Boiler': 'gas condensing boiler', + 'Community Heating Systems: Community boilers only (RdSAP)': 'district heating', + 'Boiler: C rated Regular Boiler': 'gas condensing boiler', + 'Boiler: A rated Regular Boiler': 'gas condensing boiler', + 'Electric Storage Systems: Fan storage heaters': 'electric storage heaters', + 'Boiler: F rated Combi': 'gas combi boiler', + + 'Room heaters': 'room heaters', + 'Room Heaters': 'room heaters', + 'Boiler': 'gas condensing boiler', + 'Heat Pump (Wet)': 'air source heat pump', + 'Community Heating': 'district heating', + 'Heat pump (wet)': 'air source heat pump', + 'Electric ceiling heating': 'electric ceiling', + 'Electric under floor heating': 'electric underfloor', + 'Community heating': 'district heating' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 139b1622..f208081a 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -136,5 +136,20 @@ PROPERTY_MAPPING = { 'Flat: Semi Detached: Top Floor': 'flat', 'Flat: Mid Terrace: Ground Floor': 'flat', 'Bungalow: MidTerrace': 'bungalow', - 'Flat: Enclosed End Terrace: Top Floor': 'flat' + 'Flat: Enclosed End Terrace: Top Floor': 'flat', + 'Flat: Semi Detached: Mid Floor': 'flat', + 'Maisonette: Mid Terrace: Top Floor': 'maisonette', + 'House: EnclosedEndTerrace': 'house', + 'Flat: Detached: Ground Floor': 'flat', + 'Flat: Detached: Mid Floor': 'flat', + 'Flat: Detached: Top Floor': 'flat', + 'Bungalow: Detached': 'bungalow', + 
'Maisonette: End Terrace: Mid Floor': 'maisonette', + 'Maisonette: Detached: Top Floor': 'maisonette', + 'Flat: Enclosed Mid Terrace: Mid Floor': 'flat', + 'Flat: Enclosed Mid Terrace: Ground Floor': 'flat', + 'Flat: Enclosed End Terrace: Mid Floor': 'flat', + 'Flat: Enclosed End Terrace: Ground Floor': 'flat', + 'Flat: Enclosed Mid Terrace: Top Floor': 'flat', + '2013 onwards': 'unknown' } diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py new file mode 100644 index 00000000..b98a773c --- /dev/null +++ b/asset_list/mappings/roof.py @@ -0,0 +1,26 @@ +import numpy as np + +STANDARD_ROOF_CONSTRUCTIONS = { + "pitched access to loft", + "pitched no access to loft", + "pitched unknown access to loft", + "piched unknown insulation", + "pitched insulated", + "another dwelling above", + "flat unknown insulation", + "unknown insulated", + "unknown", +} + +ROOF_CONSTRUCTION_MAPPINGS = { + 'Flat': 'flat unknown insulation', + 'Pitched (access to loft)': 'pitched access to loft', + 'Pitched (no access to loft)': 'pitched no access to loft', + 'Another dwelling above': 'another dwelling above', + 'Same dwelling above': 'another dwelling above', + 'As-built': 'unknown', + 'ND (inferred)': 'unknown', + '2018 onwards': 'unknown', + 'Pitched (vaulted ceiling)': 'pitched insulated', + np.nan: "unknown" +} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index e5f22f13..128e84af 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -147,5 +147,15 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Cavity: AsBuilt (1983-1995), Cavity: FilledCavity': 'filled cavity', 'SolidBrick: AsBuilt': 'solid brick unknown insulation', 'Cavity: FilledCavity': 'filled cavity', - 'SolidBrick: Internal': 'insulated solid brick' + 'SolidBrick: Internal': 'insulated solid brick', + 'Cavity: External': 'filled cavity', + 'Sandstone: Internal': 'sandstone or limestone', + 'Cavity: AsBuilt (Pre 1976)': 'cavity unknown insulation', + 'System build': 
'system built', + 'Solid brick': 'solid brick unknown insulation', + 'Stone': 'sandstone or limestone', + 'Timber frame': 'timber frame unknown insulation', + '2017 onwards': 'new build - average thermal transmittance', + 'ND (inferred)': 'unknown', + 'Flat / maisonette': 'other' } diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 2b3f0c02..96b7c5de 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -755,6 +755,10 @@ class SearchEpc: "photo-supply"] ) + estimated_epc["co2-emiss-curr-per-floor-area"] = ( + estimated_epc["co2-emissions-current"] / estimated_epc["total-floor-area"] + ) + estimated_epc["postcode"] = self.postcode if not self.uprn: # Update self.uprn too diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index a4d60d85..7e15c1f4 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 138 +PORTFOLIO_ID = 140 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,14 +19,17 @@ def app(): asset_list = [ { - "address": "42 Rippolson Road", - "postcode": "SE18 1NS", - "uprn": 100020999275, + "address": "Brow Cottage", + "postcode": "YO18 7PZ", + "uprn": 10007630752, + "property_type": "House", + "built_form": "Semi-Detached", + "patch": True }, { - "address": "66 Riverdale Road", - "postcode": "DA8 1PX", - "uprn": 100020235516 + "address": "Wyburn", + "postcode": "DT1 2LL", + "uprn": 100040630290 }, ] asset_list = pd.DataFrame(asset_list) @@ -46,6 +49,7 @@ def app(): ) asset_list_epc_client.get_data() asset_list_epc_client.get_non_invasive_recommendations() + asset_list_epc_client.get_patch() # Store non-invasive recommendations in S3 non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" @@ -55,14 +59,24 @@ def app(): 
file_name=non_invasive_recommendations_filename ) + # Store patches in S3 + patches_filename = "" + if asset_list_epc_client.patches: + patches_filename = f"{USER_ID}/{PORTFOLIO_ID}/patches.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list_epc_client.patches), + bucket_name="retrofit-plan-inputs-dev", + file_name=patches_filename + ) + valuation_data = [ { "valuation": 469_000, - "uprn": 100020999275, + "uprn": 10007630752, }, { - "valuation": 382_000, - "uprn": 100020235516 + "valuation": 373_000, + "uprn": 100040630290 }, ] # Store valuation data to s3 @@ -80,7 +94,7 @@ def app(): "goal_value": "C", "trigger_file_path": filename, "already_installed_file_path": "", - "patches_file_path": "", + "patches_file_path": patches_filename, "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, "valuation_file_path": valuation_filename, "scenario_name": "Full package remote assessment", diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index 1d2e1472..f085c8fb 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -26,6 +26,7 @@ class AssetListEpcData: self.extracted_data = None self.non_invasive_recommendations = None + self.patches = None @staticmethod def check_asset_list(asset_list): @@ -52,6 +53,21 @@ class AssetListEpcData: } for r in self.extracted_data ] + def get_patch(self): + """ + + :return: + """ + if self.extracted_data is None: + raise ValueError("extracted data is missing - run get_data first") + + self.patches = [ + { + "uprn": r.get("uprn"), + **r.get("patch") + } for r in self.extracted_data if r.get("patch") + ] + def get_data(self): logger.info("Retrieving data for given asset list") @@ -67,11 +83,18 @@ class AssetListEpcData: postcode=pc, uprn=home.get("uprn"), auth_token=self.epc_auth_token, - os_api_key="" + os_api_key="", ) + epc_searcher.ordnance_survey_client.property_type = home.get("property_type") + 
epc_searcher.ordnance_survey_client.built_form = home.get("built_form") epc_searcher.find_property(skip_os=True) + if epc_searcher.newest_epc is None: continue + + if not pd.isnull(home.get("patch")): + epc_searcher.newest_epc["address1"] = add1 + # Attempt both methods: try: find_epc_searcher = RetrieveFindMyEpc( @@ -89,14 +112,22 @@ class AssetListEpcData: time.sleep(0.5) # We need uprn - extracted_data.append( - { - "uprn": home.get("uprn"), - "address": home["address"], - "postcode": home["postcode"], - **find_epc_data, + to_append = { + "uprn": home.get("uprn"), + "address": home["address"], + "postcode": home["postcode"], + **find_epc_data, + } + if not pd.isnull(home.get("patch")): + to_append["patch"] = { + "current-energy-rating": find_epc_data["current_epc_rating"], + "current-energy-efficiency": find_epc_data["current_epc_efficiency"], + "potential-energy-rating": find_epc_data["potential_epc_rating"], + "potential-energy-efficiency": find_epc_data["potential_epc_efficiency"], + **find_epc_data["epc_data"] } - ) + + extracted_data.append(to_append) self.extracted_data = extracted_data logger.info("Data Extrction complete") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 5e05d56f..86c3fda1 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -1,3 +1,4 @@ +import re import pandas as pd import requests from bs4 import BeautifulSoup @@ -45,6 +46,85 @@ class RetrieveFindMyEpc: sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")} return sources + @staticmethod + def get_text(elem): + return elem.get_text(strip=True) if elem else None + + def extract_epc_data(self, soup): + + results = {} + + # 1. 
Total floor area + results['total-floor-area'] = int(self.get_text( + soup.find("dt", string="Total floor area").find_next_sibling("dd") + ).split(" ")[0]) + + # Table with features + rows = soup.select("table.govuk-table tbody tr") + + rating_map = { + "Very poor": "Very Poor", + "Very good": "Very Good" + } + + def get_feature_row_text(feature_name, index=0): + matches = [row for row in rows if row.find("th") and feature_name in row.find("th").text] + if len(matches) > index: + cells = matches[index].find_all("td") + description = self.get_text(cells[0]) + rating = self.get_text(cells[1]) + return description, rating_map.get(rating, rating) + return None, None + + # 2-3. First wall description and rating + results['walls-description'], results['walls-energy-eff'] = get_feature_row_text("Wall", 0) + + # 4-5. First roof description and rating + results['roof-description'], results['roof-energy-eff'] = get_feature_row_text("Roof", 0) + + # 6-7. Windows description and rating + results['windows-description'], results['windows-energy-eff'] = get_feature_row_text("Window") + + # 8-9. Main heating description and rating + results['mainheat-description'], results['mainheat-energy-eff'] = get_feature_row_text("Main heating") + + # 10-11. Main heating control description and rating + results['mainheatcont-description'], results['mainheatc-energy-eff'] = get_feature_row_text( + "Main heating control" + ) + + # 12-13. Hot water description and rating + results['hotwater-description'], results['hot-water-energy-ef'] = get_feature_row_text("Hot water") + + # 14-15. Lighting description and rating + results['lighting-description'], results['lighting-energy-eff'] = get_feature_row_text("Lighting") + + # 16. Floor description + results['floor-description'], _ = get_feature_row_text("Floor") + + # 17. Secondary heating description + results['secondheat-description'], _ = get_feature_row_text("Secondary heating") + + # 18. 
Primary energy use + p_energy = soup.find(string=lambda t: "primary energy use for this property per year" in t.lower()) + # We should always have this + match = re.search(r"(\d+)\s+kilowatt", p_energy) + results['energy-consumption-current'] = int(match.group(1)) if match else None + + # 19. Current CO2 emissions + co2_now = soup.find("dd", id="eir-property-produces") + # We should always have this + match = re.search(r"([\d.]+)", co2_now.text) + results['co2-emissions-current'] = float(match.group(1)) if match else None + # Need co2-emiss-curr-per-floor-area + + # 20. Potential CO2 emissions + co2_pot = soup.find("dd", id="eir-potential-production") + match = re.search(r"([\d.]+)", co2_pot.text) + results['co2-emissions-potential'] = float(match.group(1)) if match else None + + return results + def retrieve_newest_find_my_epc_data(self, sap_2012_date=None): """ For a post code and address, we pull out all the required data from the find my epc website @@ -115,6 +195,9 @@ class RetrieveFindMyEpc: potential_rating = ratings.split(".")[1] current_sap = int(current_rating.split(' ')[-1]) + # Floor area + address_res.find() + # Retrieve the energy consumption bills = address_res.find('div', {'id': 'bills-affected'}) bills_list = bills.find_all('li') @@ -232,6 +315,9 @@ class RetrieveFindMyEpc: # 4) Low and zero carbon energy sources low_carbon_energy_sources = self.extract_low_carbon_sources(address_res) + # 5) Pull out the EPC data + epc_data = self.extract_epc_data(address_res) + resulting_data = { 'epc_certificate': epc_certificate, 'current_epc_rating': current_rating.split(' ')[-6], @@ -241,8 +327,9 @@ class RetrieveFindMyEpc: "heating_text": heating_text, "hot_water_text": hot_water_text, "recommendations": recommendations, + "epc_data": epc_data, **assessment_data, - **low_carbon_energy_sources + **low_carbon_energy_sources, } return resulting_data From 3cfe938e273ab6e75a54ced3da5f970fd9c658eb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 13 
Apr 2025 21:39:35 +0100 Subject: [PATCH 252/255] adding matcing from sumissions sheet to asset list --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/AssetList.py | 142 +++++++++++-- asset_list/app.py | 36 ++++ asset_list/mappings/built_form.py | 39 +++- asset_list/mappings/heating_systems.py | 42 +++- asset_list/mappings/property_type.py | 29 ++- asset_list/mappings/roof.py | 3 +- asset_list/mappings/walls.py | 11 +- backend/Property.py | 5 +- backend/app/plan/schemas.py | 3 +- etl/customers/bromford/data_cleanup.py | 192 ++++++++++++++++++ etl/customers/remote_assessments/app.py | 33 +-- .../ha_15_32/ha_analysis_batch_3.py | 5 +- recommendations/Recommendations.py | 2 +- recommendations/RoofRecommendations.py | 8 +- 16 files changed, 509 insertions(+), 45 deletions(-) create mode 100644 etl/customers/bromford/data_cleanup.py diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..96ad7a95 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 0dedc1fd..48ea22f4 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -4,6 +4,8 @@ import re import tiktoken from pprint import pprint from datetime import datetime + +from docutils.utils.math.tex2mathml_extern import blahtexml from openai import OpenAI import numpy as np import pandas as pd @@ -663,7 +665,10 @@ class AssetList: non_intrusive_columns.append(self.NON_INTRUSIVES_ELIGIBILITY_COLUMN) if self.old_format_non_intrusives_present: - non_intrusive_columns = self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES + # We check if we have the ECO Eligibility column, which we might not have + non_intrusive_columns = [ + c for c in self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES if c in self.standardised_asset_list.columns + ] self.keep_variables += non_intrusive_columns @@ -731,7 +736,7 @@ 
class AssetList: 'PIMSS EMPTY' ] - if pd.isnull(date_str) or date_str in known_errors: + if pd.isnull(date_str) or date_str in known_errors or (date_str == 0): return None if isinstance(date_str, str): @@ -752,6 +757,10 @@ class AssetList: if isinstance(date_str, datetime): return date_str.year + if isinstance(date_str, float): + if str(int(date_str)).isdigit() & (len(str(int(date_str))) == 4): + return int(date_str) + # Check if date_str is a year itself if str(date_str).isdigit() & (len(str(date_str)) == 4): return int(date_str) @@ -1325,7 +1334,7 @@ class AssetList: ) self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["electric storage heaters", "room heaters", "electric radiators"] + ["electric storage heaters", "room heaters", "electric radiators", "no heating"] ) ) @@ -2099,6 +2108,9 @@ class AssetList: nomatch = [] for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)): + if pd.isnull(x[outcomes_address]): + continue + # Check if we have an id oid = x[outcomes_id] if outcomes_id is not None else None @@ -2120,6 +2132,8 @@ class AssetList: address_clean = x[outcomes_address].lower().replace(",", "").replace(" ", " ") + self.outcomes["Outcome"] = self.outcomes["Outcome"].str.lower() + matched = self.standardised_asset_list[ (self.standardised_asset_list[ self.STANDARD_FULL_ADDRESS @@ -2140,7 +2154,9 @@ class AssetList: ].copy() if not matched.empty: matched["houseno"] = matched.apply( - lambda x: SearchEpc.get_house_number(x[self.STANDARD_ADDRESS_1], x[self.STANDARD_POSTCODE]), + lambda x: SearchEpc.get_house_number( + str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE]) + ), axis=1 ) @@ -2155,8 +2171,6 @@ class AssetList: } ) continue - elif matched.shape[0] > 1: - raise NotImplementedError("Check me") elif not matched.empty: # Use levenstein distance to match matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + 
matched[self.STANDARD_POSTCODE] @@ -2254,19 +2268,123 @@ class AssetList: "SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS" ) - # We just need to check if any were cancelled - master_to_append = master_data[ - ["UPRN", install_col, submission_col] - ].rename( + if "UPRN" in master_data.columns: + # We just need to check if any were cancelled + master_to_append = master_data[ + ["UPRN", install_col, submission_col] + ].rename( + columns={ + "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, + install_col: "survey_status", + submission_col: "submission_date" + } + ) + master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") + + master_surveyed.append(master_to_append) + continue + + master_data["row_id"] = master_data.index + + self.standardised_asset_list["house_no"] = self.standardised_asset_list.apply( + lambda x: SearchEpc.get_house_number( + str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE]) + ), + axis=1 + ) + + # Otherwise, we need to match algorithmically + logger.info("Matching master data to asset list") + matched = [] + unmatched = [] + for _, row in tqdm(master_data.iterrows(), total=len(master_data)): + if pd.isnull(row["POSTCODE"]): + continue + postcode_no_space = row["POSTCODE"].strip().replace(" ", "").lower() + + df = self.standardised_asset_list[ + ( + self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ", + "") + == postcode_no_space + ) + ] + + house_no = row["NO"] + + if house_no in df["house_no"].values: + df = df[df["house_no"] == house_no] + if df.shape[0] != 1: + # Levenstein distance + + if any(df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"])): + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"]) + ] + else: + # Levenstein distance + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.lower().apply( + lambda x: process.extractOne( + " ".join([row["NO"], 
row["Street / Block Name"], row["TOWN"]]).lower(), + x + )[1] + ) > 90 + ] + + if df.shape[0] == 0: + unmatched.append(row["row_id"]) + continue + + if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( + " ".join([row["NO"], row["Street / Block Name"]]).lower() + )): + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( + " ".join([row["NO"], row["Street / Block Name"]]).lower() + ) + ] + + if any( + df[self.STANDARD_PROPERTY_TYPE].str.contains( + row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower() + ) + ): + # We ignore "block of flats" entries + df = df[ + df[self.STANDARD_PROPERTY_TYPE].str.contains( + row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower() + ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats") + ] + + if df.shape[0] != 1: + # We have multiple matches + raise NotImplementedError("FIX ME") + matched.append( + { + "row_id": row["row_id"], + self.STANDARD_LANDLORD_PROPERTY_ID: df[self.STANDARD_LANDLORD_PROPERTY_ID].values[0], + } + ) + + self.standardised_asset_list = self.standardised_asset_list.drop(columns="house_no") + + # We match the "UPRN" which is the landlords ID, onto the master sheet + matched = pd.DataFrame(matched) + master_to_append = master_data[["row_id", install_col, submission_col]].merge( + matched, how="left", on="row_id" + ).rename( columns={ - "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status", submission_col: "submission_date" } ) master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") - master_surveyed.append(master_to_append) + unmatched_df = master_data[ + master_data["row_id"].isin(unmatched) + ] + submissions_unmatched.append(unmatched_df) master_surveyed = pd.concat(master_surveyed) master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])] diff --git a/asset_list/app.py b/asset_list/app.py index ae4b3cef..ee74b337 100644 --- a/asset_list/app.py +++ 
b/asset_list/app.py @@ -89,6 +89,42 @@ def app(): # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) # - Or the insulation required is loft/cavity (floors should be solid) + # Bromford + data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme " + "Rebuild/Prepared data/") + data_filename = "asset_list.xlsx" + sheet_name = "Sheet1" + postcode_column = 'PostCode' + fulladdress_column = "FullAddress" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "ConYear" + landlord_os_uprn = None + landlord_property_type = "AssetTypeDesc" + landlord_built_form = "PropTypeDesc" + landlord_wall_construction = "Construction type" + landlord_roof_construction = None + landlord_heating_system = "Heating Type" + landlord_existing_pv = None + landlord_property_id = "Asset" + landlord_sap = None + outcomes_filename = "outcomes.xlsx" + outcomes_sheetname = "Sheet1" + outcomes_postcode = "Postcode" + outcomes_houseno = "No" + outcomes_id = None + outcomes_address = "Address" + master_filepaths = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO " + "3 submissions.csv", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO " + "4 submissions.csv", + ] + master_to_asset_list_filepath = None + phase = False + # Torus data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1" data_filename = "Torus Property Asset List - Phase 1.xlsx" diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index cabd970e..e103f794 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -107,5 +107,42 @@ BUILT_FORM_MAPPINGS = { 'Semi-detached': 'semi-detached', 'Detached': 'detached', 'Flat / maisonette': 
'unknown', - '2014 onwards': 'unknown' + '2014 onwards': 'unknown', + + 'Semi Detached': 'semi-detached', + 'End Terraced': 'end-terrace', + 'Basement': 'basement', + 'No': 'unknown', + 'Mid Terrace': 'mid-terrace', + 'Link Detached': 'detached', + 'Mid Terraced': 'mid-terrace', + 'Ground Floor': 'ground floor', + 'End Terrace': 'end-terrace', + 'Sheltrd Semi Det': 'semi-detached', + 'Shop': 'unknown', + 'Fourth Floor': 'mid-floor', + 'Terraced': 'mid-terrace', + 'Leasehold Terr': 'mid-terrace', + 'Room': 'unknown', + 'Second Floor': 'mid-floor', + 'Third Floor': 'mid-floor', + 'Office': 'unknown', + 'First Floor Over Arch': 'ground floor', + '16-25 IND-PPL': 'unknown', + 'Seventh Floor': 'top-floor', + 'Sheltered': 'unknown', + 'Shelt Bung End': 'end-terrace', + 'Room In Shared Accommodation': 'unknown', + 'Sheltred Bung Terrace': 'mid-terrace', + 'Garage In Block': 'unknown', + 'First Floor': 'ground floor', + 'First Floor Over Garage': 'ground floor', + 'Leasehold': 'unknown', + 'Sheltred Bung': 'unknown', + 'Garage': 'unknown', + 'Sixth Floor': 'top-floor', + 'Sheltered Bung': 'semi-detached', + 'Guest': 'unknown', + 'Fifth Floor': 'mid-floor' + } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 42326575..7f2f81f2 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -23,7 +23,8 @@ STANDARD_HEATING_SYSTEMS = { 'gas combi boiler', 'unknown', "electric ceiling", - "electric underfloor" + "electric underfloor", + "no heating" } HEATING_MAPPINGS = { @@ -87,7 +88,7 @@ HEATING_MAPPINGS = { 'Heat pump (air) Electricity': 'air source heat pump', 'Room heaters Electricity': 'electric radiators', 'Room heaters Oil': 'room heaters', - 'No heating system ND': 'unknown', + 'No heating system ND': 'no heating', 'Heat pump (wet) Electricity': 'ground source heat pump', 'Room heaters Biomass': 'room heaters', 'ND Solid fuel': 'unknown', @@ -98,11 +99,11 @@ HEATING_MAPPINGS = { 
'Storage heating Electricity': 'electric storage heaters', 'ND Electricity': 'unknown', 'Community heating Community (non-gas)': 'district heating', - 'No heating system N/A': 'unknown', + 'No heating system N/A': 'no heating', 'Boiler Solid fuel': 'boiler - other fuel', 'Community heating Community (mains gas)': 'communal gas boiler', 'Boiler Biomass': 'boiler - other fuel', - 'No heating system Mains gas': 'unknown', + 'No heating system Mains gas': 'no heating', 'Storage heaters': 'electric storage heaters', 'Air Source': 'air source heat pump', @@ -170,5 +171,36 @@ HEATING_MAPPINGS = { 'Heat pump (wet)': 'air source heat pump', 'Electric ceiling heating': 'electric ceiling', 'Electric under floor heating': 'electric underfloor', - 'Community heating': 'district heating' + 'Community heating': 'district heating', + + 'Wet - Radiators Air Source Heat Pump': 'air source heat pump', + 'Wet - Radiators Electric': 'electric boiler', + 'Storage Heaters': 'high heat retention storage heaters', + 'Wet - Radiators Oil': 'oil boiler', + 'Communal Wet - Radiators Gas': 'communal gas boiler', + 'Electric - Storage/Panel Heaters Electric': 'electric storage heaters', + 'Gas Central Heating': 'gas combi boiler', + 'Wet - Radiators Solar': 'other', + 'Electric - Storage/Panel Heaters LPG': 'electric storage heaters', + 'No Heating Solid': 'no heating', + 'Wet - Underfloor Gas': 'gas condensing boiler', + 'No Heating Electric': 'no heating', + 'Oil Fired Central Heating': 'oil boiler', + 'Warm Air Gas': 'other', + 'Communal Boilers': 'communal gas boiler', + 'Wet - Radiators Gas': 'gas combi boiler', + 'Wet - Radiators Solid': 'solid fuel', + 'Wet - Radiators LPG': 'other', + 'No Heating Gas': 'no heating', + 'No Heating': 'no heating', + 'Panel Heaters': 'electric radiators', + 'Rointe Electric Heating': 'electric storage heaters', + 'Underfloor Heating': 'electric underfloor', + 'Air Source Heating': 'air source heat pump', + 'Warm Air Electric': 'other', + 'Communal Wet - 
Radiators Electric': 'communal gas boiler', + 'Wet - Underfloor Solar': 'other', + 'No Heating Required Gas': 'unknown', + 'Electric - Storage/Panel Heaters Gas': 'electric storage heaters', + 'Electric - Storage/Panel Heaters Solid': 'electric storage heaters' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index f208081a..dc8dbf21 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -151,5 +151,32 @@ PROPERTY_MAPPING = { 'Flat: Enclosed End Terrace: Mid Floor': 'flat', 'Flat: Enclosed End Terrace: Ground Floor': 'flat', 'Flat: Enclosed Mid Terrace: Top Floor': 'flat', - '2013 onwards': 'unknown' + '2013 onwards': 'unknown', + + 'House 2 Storey': 'house', + 'Bung': 'bungalow', + 'House 3 Storey': 'house', + 'Shared Flat': 'flat', + 'd': 'unknown', + 'Mais': 'maisonette', + 'e': 'unknown', + 'Shared House': 'house', + 'House 4 Storey': 'house', + 'Shared Bungalow': 'bungalow', + 'Detch': 'house', + 'Shop': 'other', + 'Terr': 'house', + 'Terrace': 'house', + 'Description': 'unknown', + 'Hse': 'house', + 'Room': 'other', + 'Office': 'other', + 'Room In Shared Accommodation': 'other', + 'Apartment': 'flat', + 'm': 'unknown', + 'Garage': 'other', + 'Parking Space': 'other', + 'Community Centre': 'other', + 'Communal Facility': 'other', + 'Semi': 'house' } diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py index b98a773c..a95f0529 100644 --- a/asset_list/mappings/roof.py +++ b/asset_list/mappings/roof.py @@ -22,5 +22,6 @@ ROOF_CONSTRUCTION_MAPPINGS = { 'ND (inferred)': 'unknown', '2018 onwards': 'unknown', 'Pitched (vaulted ceiling)': 'pitched insulated', - np.nan: "unknown" + np.nan: "unknown", + None: "unknown" } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 128e84af..c327338a 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -157,5 +157,14 @@ WALL_CONSTRUCTION_MAPPINGS = { 'Timber frame': 'timber 
frame unknown insulation', '2017 onwards': 'new build - average thermal transmittance', 'ND (inferred)': 'unknown', - 'Flat / maisonette': 'other' + 'Flat / maisonette': 'other', + + 'Other': 'other', + 'Timber Frame': 'timber frame unknown insulation', + 'Cavity Wall': 'cavity unknown insulation', + 'Non-Traditional': 'system built', + 'PRC': 'system built', + 'Cross Wall': 'system built', + 'Solid Wall': 'solid brick unknown insulation', + 'Traditional': 'other' } diff --git a/backend/Property.py b/backend/Property.py index 424242fd..52e8c213 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -107,7 +107,10 @@ class Property: # cost and instead, provide a message that the measure has already been installed self.already_installed = ast.literal_eval(already_installed['already_installed']) if already_installed else [] - self.non_invasive_recommendations = non_invasive_recommendations + self.non_invasive_recommendations = ( + non_invasive_recommendations['recommendations'] if + non_invasive_recommendations else [] + ) # This is a list of measures that have been recommended for the property if isinstance(measures, list): self.measures = measures diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index 4237472d..5db3d4d1 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -83,7 +83,8 @@ class PlanTriggerRequest(BaseModel): exclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) inclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) # This is a list of measures that we want to be included, if they are options - required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) + # Default to empty + required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=[], min_length=1) scenario_name: Optional[str] = "" multi_plan: Optional[bool] = False diff --git a/etl/customers/bromford/data_cleanup.py 
b/etl/customers/bromford/data_cleanup.py new file mode 100644 index 00000000..45429523 --- /dev/null +++ b/etl/customers/bromford/data_cleanup.py @@ -0,0 +1,192 @@ +""" +12th April 2025 +This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a +standardised asset list +""" + +import pandas as pd + +# Step 1 +# The inspectons data is spread across three different files. We attempt to produce one finalised asset list, with +# comprehensive inspections + +# Primary asset list +asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford Asset " + "List.xlsx", + sheet_name="Asset List" +) + +# +inspections_1 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD " + "MDS.xlsx", + sheet_name="Data list" +) +inspections_1["Heating Type"] = (inspections_1["Heating Type"] + " " + inspections_1["Heating fuel"]).str.strip() + +inspections_2 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD " + "MERLIN LANE.xlsx", + sheet_name="Report" +) +inspections_2["AssetTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[-1] +inspections_2["PropTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[:-1].str.join(" ") + +inspections_3 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD " + "SEVERN VALE - KLARKE.xlsx", + sheet_name="Asset report" +) + +inspections_3["FullAddress"] = inspections_3["T1_Address1"] + ", " + inspections_3["T1_Address2"] + +# On inspections 3, we have multiple sheets which describe the heating +heating_systems = [] +for sheet_name in [ + "Storage Heaters", "No Heating", "Underfloor Heating", "Rointe Electric Heating", "Air Source Heating", + "Gas Central Heating", "Electric Boiler", 
"Oil Fired Central Heating", + "Communal Boilers", "Panel Heaters" +]: + df = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme " + "Rebuild/Inspections/BROMFORD " + "SEVERN VALE - KLARKE.xlsx", + sheet_name=sheet_name + ) + df = df[["UPRN"]] + df["Heating Type"] = sheet_name + heating_systems.append(df) + +heating_systems = pd.concat(heating_systems) +# We have no clue which one is correct, we have some dupes +heating_systems = heating_systems.drop_duplicates("UPRN") +heating_systems = heating_systems.rename(columns={"UPRN": "Asset"}) +heating_systems["Asset"] = heating_systems["Asset"].astype(int) + +inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset") + +# Create a consolidated inspections sheet +inspections = pd.concat( + [ + inspections_1[["Asset", "Construction type", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]], + inspections_2[["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]], + inspections_3[["Asset", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]], + ] +) + +inspections_address_data = pd.concat( + [ + inspections_1[ + ["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", 'ManAreaDesc', ] + ], + inspections_2[ + ['Asset', 'FullAddress', 'AccomType', "AssetTypeDesc", "PropTypeDesc", 'ConYear', 'Postcode'] + ].rename(columns={"Postcode": "PostCode"}), + inspections_3[ + ['Asset', "FullAddress", 'T1_Postcode', 'T1_Build Year', 'T1_AssetType'] + ].rename( + columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"} + ), + ] +) + +# Remove some error values +inspections = inspections[~inspections["Asset"].isin( + [ + "They're all green partial fill they're all green this", + "South Staffordshire District Council", + 'Blk Milton Crt F9-10, Perton, Wolverhampton' + ] +)] + +inspections["Asset"] = inspections["Asset"].astype(str) 
+asset_list["Asset"] = asset_list["Asset"].astype(str) +inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str) +inspections['WFT Findings'] = inspections['WFT Findings'].replace(r'^\s*$', pd.NA, regex=True) + +# We have some cases where the inspetions data has dupes on Asset (the ID column). We take the instance that is +# populated +inspections = inspections.sort_values(by='WFT Findings', na_position='last') +inspections = inspections.drop_duplicates(subset='Asset', keep='first') + +# We have dupes in the asset list +asset_list = asset_list.drop_duplicates("Asset") + +# Merge on +missed_asset_ids = inspections[ + ~inspections["Asset"].isin(asset_list["Asset"].values) +]["Asset"].values + +missed_assets = inspections_address_data[ + inspections_address_data["Asset"].isin(missed_asset_ids) +] +missed_assets = missed_assets.drop_duplicates("Asset") + +# We produce a larger asset list +asset_list = pd.concat([asset_list, missed_assets]) + +asset_list = asset_list.merge( + inspections, how="left", on="Asset" +) +asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note") + +# Store +# asset_list.to_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared " +# "data/asset_list.xlsx" +# ) + +# We now prepare outcomes into a single file +pv_outcomes = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford PV " + "Outcomes.csv", + encoding='cp1252' +) +pv_outcomes["measure_type"] = "solar" + +other_outcomes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/(Bromford) " + "15.04.2024.xlsx", + sheet_name="ECO4 & GBIS", + header=1 +) +other_outcomes["measure_type"] = "cwi" + +combined_outcomes = pd.concat( + [ + other_outcomes[["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES"]].rename( + columns={ + "NO": "No", "ADDRESS": "Address", 
"POSTCODE": "Postcode", "WEEK COMMENCING": "Week Commencing", + "OUTCOMES": "Outcome", "NOTES": "Notes" + } + ), + pv_outcomes[['No', 'Address', 'Postcode', "Week Commencing", "Outcome", "Notes"]] + ] +) + +# Store +# combined_outcomes.to_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared " +# "data/outcomes.xlsx" +# ) + +# Submissions sheet - +eco3_submissions = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 Submissions.csv", + encoding='cp1252' +) +# Get rid of the unnamed columns +unnamed_columns = [c for c in eco3_submissions.columns if "Unnamed: " in c] +eco3_submissions = eco3_submissions.drop(columns=unnamed_columns) +# Store +eco3_submissions.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 submissions.csv", + index=False +) + +eco4_submissions = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 4 submissions.csv", +) + +same_cols = [c for c in eco4_submissions.columns if c in eco3_submissions.columns] diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 7e15c1f4..a8805a71 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 140 +PORTFOLIO_ID = 141 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,17 +19,20 @@ def app(): asset_list = [ { - "address": "Brow Cottage", - "postcode": "YO18 7PZ", - "uprn": 10007630752, - "property_type": "House", - "built_form": "Semi-Detached", + "address": "196 Merrow Street", + "postcode": "SE17 2NP", + "uprn": 200003423454, "patch": True }, { - "address": "Wyburn", - "postcode": "DT1 2LL", - "uprn": 100040630290 + "address": 
"65 Liverpool Grove", + "postcode": "SE17 2HP", + "uprn": 200003423194 + }, + { + "address": "2 Brettell Street", + "postcode": "SE17 2NZ", + "uprn": 200003423607 }, ] asset_list = pd.DataFrame(asset_list) @@ -71,12 +74,16 @@ def app(): valuation_data = [ { - "valuation": 469_000, - "uprn": 10007630752, + "valuation": 339_000, + "uprn": 200003423454, }, { - "valuation": 373_000, - "uprn": 100040630290 + "valuation": 374_000, + "uprn": 200003423194 + }, + { + "valuation": 719_000, + "uprn": 200003423607 }, ] # Store valuation data to s3 diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index e97f0202..76087a76 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,7 +1,7 @@ import os import re import openpyxl -import Levenshtein +from fuzzywuzzy import fuzz from pathlib import Path import msgpack from datetime import datetime @@ -2771,7 +2771,8 @@ class DataLoader: match_to = [x.replace(" ", "") for x in match_to] # Perform matching between full key and match_to - distances = [Levenshtein.distance(matching_string, s) for s in match_to] + distances = [100 - fuzz.ratio(matching_string, s) for s in match_to] + best_match_index = distances.index(min(distances)) # We might want to consider a threshold for the distance, however for the momeny, # we don't consider this for the moment diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 2e044e12..0e73cffe 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -635,7 +635,7 @@ class Recommendations: # By limiting here, we don't change the value in current_phase_values. 
This means that the # future recommendations won't have an impact that is too large li_sap_limit = RoofRecommendations.get_loft_insulation_sap_limit( - property_instance.data["roof-energy-eff"], property_instance.data["extension-count"] + property_instance.data["roof-energy-eff"], property_instance.roof["insulation_thickness"] ) if li_sap_limit is not None: property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit) diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index 5f9707d9..cd7f82c4 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -64,16 +64,16 @@ class RoofRecommendations: ) @classmethod - def get_loft_insulation_sap_limit(cls, roof_energy_eff, extension_count): + def get_loft_insulation_sap_limit(cls, roof_energy_eff, existing_thickness): """ Get the SAP limit for loft insulation :param roof_energy_eff: :return: """ - if extension_count == 0: - # No limit - return None + if str(existing_thickness).isdigit(): + if float(existing_thickness) >= 250: + return 0 if roof_energy_eff in ["Good", "Very Good"]: return 1 From 83a1ac8cf347bdb7538b45f577263771da86b0a9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 13 Apr 2025 21:54:32 +0100 Subject: [PATCH 253/255] matched submissions --- asset_list/AssetList.py | 26 +++++++++++++++++--------- asset_list/app.py | 6 ++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 48ea22f4..9657f289 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -5,7 +5,6 @@ import tiktoken from pprint import pprint from datetime import datetime -from docutils.utils.math.tex2mathml_extern import blahtexml from openai import OpenAI import numpy as np import pandas as pd @@ -379,9 +378,10 @@ class AssetList: self.contact_details = None self.contact_detail_fields = None self.outcomes = None - self.outcomes_no_match = None + 
self.outcomes_no_match = pd.DataFrame() self.outcomes_for_output = pd.DataFrame() self.master_surveyed = None + self.unmatched_submissions = pd.DataFrame() # When this is True, we intend to break the programme into multiple phases. We may need to review # how this is structured in the future, as depending on how we get future data, we may need to @@ -2249,6 +2249,7 @@ class AssetList: logger.info("Getting masters and merging onto asset list") master_surveyed = [] + unmatched_submissions = [] for filepath in master_filepaths: master_data = pd.read_csv(filepath) # Strip columns @@ -2293,14 +2294,17 @@ class AssetList: axis=1 ) + postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code" + house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO" + # Otherwise, we need to match algorithmically logger.info("Matching master data to asset list") matched = [] unmatched = [] for _, row in tqdm(master_data.iterrows(), total=len(master_data)): - if pd.isnull(row["POSTCODE"]): + if pd.isnull(row[postcode_col]): continue - postcode_no_space = row["POSTCODE"].strip().replace(" ", "").lower() + postcode_no_space = row[postcode_col].strip().replace(" ", "").lower() df = self.standardised_asset_list[ ( @@ -2310,7 +2314,7 @@ class AssetList: ) ] - house_no = row["NO"] + house_no = row[house_no_col] if house_no in df["house_no"].values: df = df[df["house_no"] == house_no] @@ -2326,7 +2330,7 @@ class AssetList: df = df[ df[self.STANDARD_FULL_ADDRESS].str.lower().apply( lambda x: process.extractOne( - " ".join([row["NO"], row["Street / Block Name"], row["TOWN"]]).lower(), + " ".join([row[house_no_col], row["Street / Block Name"], row["TOWN"]]).lower(), x )[1] ) > 90 @@ -2337,11 +2341,11 @@ class AssetList: continue if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( - " ".join([row["NO"], row["Street / Block Name"]]).lower() + " ".join([row[house_no_col], row["Street / Block Name"]]).lower() )): df = df[ 
df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( - " ".join([row["NO"], row["Street / Block Name"]]).lower() + " ".join([row[house_no_col], row["Street / Block Name"]]).lower() ) ] @@ -2384,7 +2388,7 @@ class AssetList: unmatched_df = master_data[ master_data["row_id"].isin(unmatched) ] - submissions_unmatched.append(unmatched_df) + unmatched_submissions.append(unmatched_df) master_surveyed = pd.concat(master_surveyed) master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])] @@ -2404,3 +2408,7 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list.merge( self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID ) + + # Finally, we keep a record of the unmatched + if unmatched_submissions: + self.unmatched_submissions = pd.concat(unmatched_submissions) diff --git a/asset_list/app.py b/asset_list/app.py index ee74b337..a284371e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -943,5 +943,11 @@ def app(): if not asset_list.outcomes_for_output.empty: asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) + if not asset_list.unmatched_submissions.empty: + asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False) + + if not asset_list.outcomes_no_match.empty: + asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False) + # Store the Hubspot export as a csv hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False) From d585e173bee655a832362651dc0c506424bcef5a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 13 Apr 2025 22:11:37 +0100 Subject: [PATCH 254/255] cleaning up unmatched submissions --- asset_list/AssetList.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 9657f289..af5a3faf 100644 --- a/asset_list/AssetList.py +++ 
b/asset_list/AssetList.py @@ -2388,6 +2388,26 @@ class AssetList: unmatched_df = master_data[ master_data["row_id"].isin(unmatched) ] + + scheme_col = ( + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in master_data.columns else "AFFORDABLE WARMTH" + ) + # The columns are massively different - we take just a few + unmatched_df = unmatched_df[ + [ + scheme_col, house_no_col, "Street / Block Name", postcode_col, install_col, submission_col + ] + ].rename( + columns={ + scheme_col: "Funding Scheme", + house_no_col: "House Number", + postcode_col: "Postcode", + install_col: "survey_status", + submission_col: "submission_date" + } + ) + unmatched_submissions.append(unmatched_df) master_surveyed = pd.concat(master_surveyed) @@ -2411,4 +2431,6 @@ class AssetList: # Finally, we keep a record of the unmatched if unmatched_submissions: - self.unmatched_submissions = pd.concat(unmatched_submissions) + self.unmatched_submissions = pd.concat( + unmatched_submissions + ) From bd9ad3f0d6f302adb87fa60c464076154d243ed3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 14 Apr 2025 12:01:59 +0100 Subject: [PATCH 255/255] minor --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/app/plan/router.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 96ad7a95..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 3028e45f..80a531bf 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -521,6 +521,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_non_invasive_recommendations = RetrieveFindMyEpc.get_from_epc( epc_searcher.newest_epc ) + # TODO: We need to determine if we should make a 
patch, if the EPC is new epc_records = patch_epc(patch, epc_records)